Open Daniel-Mietchen opened 4 years ago
I have tested and refined the SPARQL query for weeks and thought that putting it into Scholia would be a quick task, but I could not get it to work there.
Progress so far is in https://github.com/fnielsen/scholia/tree/feat/ngrams-for-topics-1129 , which also has a debug section in topic_missing.html that I am using to try to identify what the problem is.
Some intermediate results from the debugging are shown in the screenshots for /topic/Q1659584/missing (ImageJ) and /topic/Q84263196/missing (COVID-19).
Note that the focus of the debugging queries is on seeing whether the results render, with some of the missing bits hardcoded, which explains why ImageJ is showing up under COVID-19.
Here is a more refined query for this — using climate change (Q125928) as an example, it generates a list of strings commonly found in titles of publications tagged as being about that topic:
# Topic under analysis; the query refers to it as the bare prefixed name "target:".
PREFIX target: <http://www.wikidata.org/entity/Q125928>

# Ranks word n-grams (n <= 8) that recur in the titles of up to 1000
# sampled publications about the target topic.
SELECT DISTINCT
  ?Ngram            # normalised, lower-cased n-gram text
  ?N                # number of words in the n-gram
  ?Count            # number of distinct titles that yielded the n-gram
  ?Length           # character length of the n-gram
  ?Dashes           # number of "-" characters in the n-gram
  ?Score            # ?Count * ?Length * (?Dashes + 1) / ?N
  ?ExamplePub       # one sampled publication whose title yielded the n-gram
  ?ExamplePubTitle  # English title of that publication
WITH
{ # Publications to analyse: at most 1000 works carrying a P921 (main subject)
  # statement that points at the target topic, drawn through the bd:sample service.
  SELECT ?Publication
  WHERE
  {
    SERVICE bd:sample
    {
      ?Publication wdt:P921 target: .
      bd:serviceParam bd:sample.limit 1000 .
    }
  }
} AS %items
WITH
{ # Title preparation: keep English titles, strip punctuation, lower-case,
  # and pad both ends so that n-grams touching the title boundaries can be extracted.
  SELECT ?Title ?Publication ?Seeds ?ClearTitleLength
  WHERE
  {
    INCLUDE %items
    ?Publication wdt:P1476 ?Title .
    FILTER( LANG(?Title) = "en" )

    # Drop frequent special characters (including colons and semicolons, which
    # are reserved for the padding tokens below).
    BIND( REPLACE(STR(?Title),"[\\.:,;\\[\\]\\?()$]","") AS ?StrippedTitle )
    # Length of the stripped title; projected so that later stages can compare
    # n-gram lengths against it.
    BIND( STRLEN(?StrippedTitle) AS ?ClearTitleLength )

    # Eight colon tokens in front and eight semicolon tokens behind the title.
    BIND( "::: ::: ::: ::: ::: ::: ::: ::: " AS ?LeadPad )
    BIND( " ;;; ;;; ;;; ;;; ;;; ;;; ;;; ;;;" AS ?TailPad )
    BIND( LCASE( CONCAT( ?LeadPad, ?StrippedTitle, ?TailPad ) ) AS ?Seeds )
  }
} AS %titles
WITH
{ # Builds, for each position value, four regexes of the form
  #   ^([^ ]+ ){k}([^ ]+) .*
  # After k space-terminated words, group 2 captures the next word and group 1
  # holds the last of the k repeated words, so "$1" and "$2" give two adjacent words.
  # ?Regex1..?Regex4 use k = value-1, value+1, value+3 and value+5 respectively.
  # Based on https://w.wiki/KG$ by Jura1
  SELECT ?Regex1 ?Regex2 ?Regex3 ?Regex4 ?NumericValue
  WHERE
  {
    # Position values come from items that have both a P5176 and a P1181 statement.
    ?Number wdt:P5176 [] ;
            wdt:P1181 ?NumericValue .
    FILTER( ?NumericValue > 0 && ?NumericValue < 151 )

    BIND( "^([^ ]+ ){" AS ?Head )
    BIND( "}([^ ]+) .*" AS ?Tail )
    BIND( CONCAT( ?Head, STR( ?NumericValue - 1 ), ?Tail ) AS ?Regex1 )
    BIND( CONCAT( ?Head, STR( ?NumericValue + 1 ), ?Tail ) AS ?Regex2 )
    BIND( CONCAT( ?Head, STR( ?NumericValue + 3 ), ?Tail ) AS ?Regex3 )
    BIND( CONCAT( ?Head, STR( ?NumericValue + 5 ), ?Tail ) AS ?Regex4 )
  }
} AS %regexes
WITH
{ # Applies every regex set to every padded title to cut out a window of up to
  # eight consecutive tokens, normalises it into an n-gram, and counts in how
  # many distinct titles each n-gram occurs.
  #
  # Output columns:
  #   ?Ngram      normalised n-gram text
  #   ?N          number of words in the n-gram
  #   ?Count      number of distinct titles that yielded the n-gram
  #   ?Length     character length of the n-gram
  #   ?Dashes     number of "-" characters in the n-gram
  #   ?Score      ?Count * ?Length * (?Dashes + 1) / ?N
  #   ?ExamplePub one arbitrary publication whose title yielded the n-gram
  SELECT
    ?Ngram
    ?N
    (COUNT(DISTINCT ?Title) AS ?Count)
    ?Length
    ?Dashes
    (( ?Count * ?Length * ( (?Dashes + 1) / ?N ) ) AS ?Score)
    (SAMPLE(?Publication) AS ?ExamplePub)
  {
    INCLUDE %regexes
    INCLUDE %titles

    # Each regex contributes two adjacent tokens ("$1" and "$2"); the four
    # regexes together give eight consecutive tokens of the padded title.
    BIND(
      CONCAT(
        REPLACE(?Seeds, ?Regex1, "$1"), " ",
        REPLACE(?Seeds, ?Regex1, "$2"), " ",
        REPLACE(?Seeds, ?Regex2, "$1"), " ",
        REPLACE(?Seeds, ?Regex2, "$2"), " ",
        REPLACE(?Seeds, ?Regex3, "$1"), " ",
        REPLACE(?Seeds, ?Regex3, "$2"), " ",
        REPLACE(?Seeds, ?Regex4, "$1"), " ",
        REPLACE(?Seeds, ?Regex4, "$2")
      ) AS ?NgramCandidate)

    # Normalisation, innermost first: remove the padding characters (";" and ":"),
    # trim leading whitespace, trim trailing whitespace, collapse runs of spaces.
    BIND(
      REPLACE(
        REPLACE(
          REPLACE(
            REPLACE(STR(?NgramCandidate), "([;:])", ""),
            "(^\\s+)", ""),
          "(\\s+$)", ""),
        "([ ]{2,})", " ")
      AS ?Ngram)

    BIND(STRLEN(?Ngram) AS ?Length)
    FILTER (?Length > 3)
    # REPLACE returns its input unchanged when the regex does not match, which
    # would put the whole padded title into the candidate; discarding anything
    # longer than the stripped title removes such rows.
    FILTER (?Length <= ?ClearTitleLength)

    # Word count = number of whitespace characters + 1.
    BIND(STRLEN(REPLACE(?Ngram, "\\S", "")) + 1 AS ?N)
    # Dash count = length difference after deleting every "-".
    BIND((STRLEN(?Ngram) - STRLEN(REPLACE(?Ngram, "-", ""))) AS ?Dashes)
  }
  # Group only by variables bound in the pattern. ?Count, ?Score and ?ExamplePub
  # are aggregate results computed per group and therefore cannot be group keys.
  # ?N, ?Length and ?Dashes are all derived from ?Ngram, so they do not split groups.
  GROUP BY ?Ngram ?N ?Length ?Dashes
  # Keep n-grams found in more than one title. The aggregate is spelled out
  # because HAVING is evaluated before the SELECT aliases are assigned.
  HAVING (COUNT(DISTINCT ?Title) > 1)
} AS %ngrams
WHERE {
  INCLUDE %ngrams

  # Exclude n-grams that start or end with any of a set of blacklisted words.
  BIND("(a|and|between|during|for|from|in|of|on|or|the|to|with)" AS ?blacklist)
  # Blacklisted word at the very start, followed by at least one space.
  BIND( CONCAT( "(^", ?blacklist ,")+( )+") AS ?RegexBlackStart)
  # At least one space followed by blacklisted word(s) at the very end.
  # NOTE(review): the trailing "+" also matches concatenations of blacklisted
  # words such as "into" or "onto" — confirm whether that is intended.
  BIND( CONCAT( "( )+(", ?blacklist ,")+$") AS ?RegexBlackEnd)
  # Both patterns above require a space next to the blacklisted word, so an
  # n-gram that is exactly one blacklisted word (e.g. "from", "between")
  # would pass them; this pattern catches that case.
  BIND( CONCAT( "^", ?blacklist, "$" ) AS ?RegexBlackOnly)
  FILTER (!REGEX(?Ngram, ?RegexBlackStart))
  FILTER (!REGEX(?Ngram, ?RegexBlackEnd))
  FILTER (!REGEX(?Ngram, ?RegexBlackOnly))

  # Fetch the English title of the example publication for display.
  ?ExamplePub wdt:P1476 ?ExamplePubTitle .
  FILTER(LANG(?ExamplePubTitle) = "en")
}
# No aggregates are computed at this level, so this GROUP BY only collapses
# duplicate rows.
GROUP BY ?Ngram ?N ?Count ?Length ?Dashes ?Score ?ExamplePub ?ExamplePubTitle
# Highest score first; ties are broken by occurrence count, then by n-gram length.
ORDER BY DESC(?Score) DESC(?Count) DESC(?Length)
LIMIT 200
The resulting list is meant to assist with the curation of related content, e.g. as demoed via https://www.wikidata.org/wiki/Wikidata:University_of_Virginia/Listeria/UVa_people/Common_words_in_titles_of_UVA-coauthored_publications_without_P921_(main_subject)_statement .