I tested and refined the SPARQL query for weeks and thought that putting it into Scholia would be quick, but I could not get it to work there.
Progress so far is in [link missing], which also has a debug section in topic_missing.html that I am using to try to identify what the problem is.
Some intermediate results from the debugging are shown in the screenshots for /topic/Q1659584/missing (ImageJ) and /topic/Q84263196/missing (COVID-19).
Note that the focus of the debugging queries is on seeing whether the results render, with some of the missing bits hardcoded, which explains why ImageJ is showing up under COVID-19.
Here is a more refined query for this — using climate change (Q125928) as an example, it generates a list of strings commonly found in titles of publications tagged as being about that topic:
# Topic whose publications are analysed. Climate change (Q125928) is the worked
# example; substitute another Wikidata entity IRI to analyse a different topic.
PREFIX target: <http://www.wikidata.org/entity/Q125928>
# Most frequent n-grams (for n <= 8) from a set of up to 1000 publications on a topic
# Output columns: the n-gram, its word count (?N), the number of distinct titles
# containing it (?Count), its character length, its number of dashes, the ranking
# score, and one example publication with its title.
SELECT DISTINCT ?Ngram ?N ?Count ?Length ?Dashes ?Score ?ExamplePub ?ExamplePubTitle
WITH
{ # Generating a list of entities to be analyzed
  # Samples up to 1000 publications that are linked to the target topic via
  # wdt:P921, using the bd:sample service so the later string processing
  # works on a bounded set.
  SELECT ?Publication
  WHERE {
    SERVICE bd:sample { ?Publication wdt:P921 target: . bd:serviceParam bd:sample.limit 1000 }
  }
} AS %items
WITH
{ # Preprocessing the titles
  # For every sampled publication, produces:
  #   ?ClearTitleLength - length of the title after punctuation removal
  #   ?Seeds            - lower-cased, punctuation-free title padded with
  #                       start/stop codons, ready for word-window extraction
  SELECT ?Title ?Publication ?Seeds ?ClearTitleLength
  WHERE {
    INCLUDE %items
    ?Publication wdt:P1476 ?Title.
    BIND (REPLACE(STR(?Title),"[\\.:,;\\[\\]\\?()$]","") AS ?ClearTitle) # remove some frequent special characters, including colons and semicolons
    BIND(STRLEN(?ClearTitle) AS ?ClearTitleLength)
    # Basic processing of the titles
    # Colons and semicolons were stripped from the title above, so the codons
    # below cannot be confused with title content.
    BIND ("::: ::: ::: ::: ::: ::: ::: ::: " AS ?StartCodon)
    BIND (" ;;; ;;; ;;; ;;; ;;; ;;; ;;; ;;;" AS ?StopCodon)
    BIND (LCASE(CONCAT(?StartCodon , # add start codon of colons to assist with processing of n-grams at beginning of title
                       ?ClearTitle , # the cleaned title itself, between the two codons
                       ?StopCodon)) # add stop codon of semicolons to assist with processing of n-grams at end of title
          AS ?Seeds )
  }
} AS %titles
WITH
{ # Generating a list of regexes to look for the NumericValue-th word in a string
  # Based on a query by Jura1 (the link to it is not preserved here)
  #
  # Each regex has the shape  ^([^ ]+ ){k}([^ ]+) .*  : it consumes k
  # space-terminated words, captures the following word as group 2 and
  # swallows the rest of the string. For a given ?NumericValue v the four
  # regexes use k = v-1, v+1, v+3 and v+5, i.e. they are staggered by two words.
  # NOTE(review): group 1 presumably holds the last of the k repeated words
  # (Java regex semantics) - confirm on the target endpoint.
  SELECT ?Regex1 ?Regex2 ?Regex3 ?Regex4 ?NumericValue
  WHERE {
    # Items carrying any P5176 statement and a P1181 value supply the integers.
    ?NumberItem wdt:P5176 []; wdt:P1181 ?NumericValue .
    # Restrict to 1..150 to bound the number of word positions examined.
    FILTER( ?NumericValue > 0 )
    FILTER( ?NumericValue < 151)
    BIND("^([^ ]+ ){" AS ?RegexStart)
    BIND("}([^ ]+) .*" AS ?RegexEnd)
    BIND( CONCAT( ?RegexStart , STR( ?NumericValue - 1 ), ?RegexEnd ) AS ?Regex1)
    BIND( CONCAT( ?RegexStart , STR( ?NumericValue + 1 ), ?RegexEnd ) AS ?Regex2)
    BIND( CONCAT( ?RegexStart , STR( ?NumericValue + 3 ), ?RegexEnd ) AS ?Regex3)
    BIND( CONCAT( ?RegexStart , STR( ?NumericValue + 5 ), ?RegexEnd ) AS ?Regex4)
  }
} AS %regexes
{ # Applying the regexes to the titles to extract ngrams, and counting occurrences of the ngrams across titles
# NOTE(review): this subquery looks truncated. The leading WITH, the SELECT keyword
# with its plain projections, the WHERE { ... } wrapper and the opening of two BIND
# expressions are not visible here - restore them from the original query before running.
# ?Count: number of distinct titles in which the n-gram occurs.
(COUNT(DISTINCT ?Title) AS ?Count)
# ?Score: grows with ?Count, with the n-gram's character length and with its number
# of dashes, and shrinks as the word count ?N grows.
(( ?Count * ?Length * ( (?Dashes +1) / ?N)
) AS ?Score)
# One arbitrary publication whose title yielded the n-gram.
(SAMPLE(DISTINCT ?Publication) AS ?ExamplePub)
INCLUDE %regexes
INCLUDE %titles
# Every regex is applied twice to ?Seeds, once keeping capture group 1 and once keeping
# capture group 2, so the candidate is built from eight pieces separated by spaces.
# NOTE(review): the head of this expression (presumably BIND( CONCAT( ) is missing.
REPLACE(?Seeds, ?Regex1, "$1"), " ",
REPLACE(?Seeds, ?Regex1, "$2"), " ",
REPLACE(?Seeds, ?Regex2, "$1"), " ",
REPLACE(?Seeds, ?Regex2, "$2"), " ",
REPLACE(?Seeds, ?Regex3, "$1"), " ",
REPLACE(?Seeds, ?Regex3, "$2"), " ",
REPLACE(?Seeds, ?Regex4, "$1"), " ",
REPLACE(?Seeds, ?Regex4, "$2")
) AS ?NgramCandidate)
# Tail of an expression that collapses runs of two or more spaces into a single space.
# NOTE(review): its beginning is missing; presumably it derives ?Ngram from
# ?NgramCandidate and also strips the ":::" / ";;;" codons - confirm.
"([ ]{2,})"," ")
) AS ?Ngram)
BIND(STRLEN(?Ngram) AS ?Length)
# Drop very short n-grams (3 characters or fewer).
FILTER (?Length > 3 )
# Drop candidates that are longer than the cleaned title they were taken from.
FILTER (?Length <= ?ClearTitleLength )
# Removing every non-whitespace character leaves only the whitespace, so the remaining
# length + 1 is the word count when words are separated by single spaces.
BIND(STRLEN(REPLACE(?Ngram, "\\S", "")) + 1 as ?N)
# Number of "-" characters: length difference after deleting all dashes.
BIND((STRLEN(?Ngram) - STRLEN(REPLACE(?Ngram, "-", ""))) as ?Dashes)
# NOTE(review): ?Count, ?Score and ?ExamplePub are aggregate results yet also appear as
# grouping keys; verify this is accepted and behaves as intended on the target endpoint.
GROUP BY ?Ngram ?N ?Count ?Length ?Dashes ?Score ?ExamplePub
# Keep only n-grams that occur in more than one title.
HAVING(?Count > 1)
} AS %ngrams
WHERE {
  INCLUDE %ngrams
  # Exclude Ngrams starting or ending with any of a set of blacklisted words
  BIND("(a|and|between|during|for|from|in|of|on|or|the|to|with)" AS ?blacklist)
  # Matches a blacklisted word at the very start, followed by at least one space.
  BIND( CONCAT( "(^", ?blacklist ,")+( )+") AS ?RegexBlackStart)
  # Matches at least one space followed by a blacklisted word at the very end.
  BIND( CONCAT( "( )+(", ?blacklist ,")+$") AS ?RegexBlackEnd)
  FILTER (!REGEX(?Ngram, ?RegexBlackStart))
  FILTER (!REGEX(?Ngram, ?RegexBlackEnd))
  # Fetch the title of the example publication for display next to the n-gram.
  ?ExamplePub wdt:P1476 ?ExamplePubTitle.
}
GROUP BY ?Ngram ?N ?Count ?Length ?Dashes ?Score ?ExamplePub ?ExamplePubTitle
# Highest score first; ties broken by frequency, then by n-gram length.
ORDER BY DESC(?Score) DESC(?Count) DESC(?Length)
The resulting list is meant to assist with the curation of related content, e.g. as demoed via [link missing].