WDscholia / scholia

Wikidata-based scholarly profiles
https://scholia.toolforge.org
Other
225 stars 81 forks source link

On missing pages, add queries to highlight lexemes contained in the titles of relevant publications #1129

Open Daniel-Mietchen opened 4 years ago

Daniel-Mietchen commented 4 years ago

to assist with the curation of related content, e.g. as demoed via https://www.wikidata.org/wiki/Wikidata:University_of_Virginia/Listeria/UVa_people/Common_words_in_titles_of_UVA-coauthored_publications_without_P921_(main_subject)_statement .

Daniel-Mietchen commented 4 years ago

I have tested and refined the SPARQL query for weeks and thought that putting it into Scholia would be quick, but I could not get it to work there.

Progress so far is in https://github.com/fnielsen/scholia/tree/feat/ngrams-for-topics-1129 , which also has a debug section in topic_missing.html that I am using to try to identify what the problem is.

Daniel-Mietchen commented 4 years ago

Some intermediate results from the debugging, as per screenshots for topic/Q1659584/missing (ImageJ) and /topic/Q84263196/missing (COVID-19).

Note that the focus of the debugging queries is on seeing whether the results render, with some of the missing bits hardcoded, which explains why ImageJ is showing up under COVID-19.

Screenshot_2020-05-14 Scholia(1)

Screenshot_2020-05-14 Scholia(2)

Daniel-Mietchen commented 2 years ago

Here is a more refined query for this — using climate change (Q125928) as an example, it generates a list of strings commonly found in titles of publications tagged as being about that topic:

PREFIX target: <http://www.wikidata.org/entity/Q125928>
# Most frequent n-grams (for n <= 8) from a set of up to 1000 publications on a topic.
# The prefix above maps `target:` (with an empty local name) onto the full IRI of the
# topic item, so changing the topic only requires editing that one line.
SELECT DISTINCT ?Ngram ?N ?Count ?Length ?Dashes ?Score ?ExamplePub ?ExamplePubTitle

# Step 1 (%items): the publications to be analysed, i.e. at most 1000 entities
# that have the target topic as their P921 (main subject) value.
WITH {
  SELECT ?Publication
  WHERE {
    SERVICE bd:sample {
      ?Publication wdt:P921 target: .
      bd:serviceParam bd:sample.limit 1000 .
    }
  }
} AS %items
WITH
{ # Step 2 (%titles): preprocess the English titles of the sampled publications.
  # Output per title:
  #   ?Seeds            - lower-cased title padded with sentinel "codons" on both sides
  #   ?ClearTitleLength - length of the cleaned title, used downstream as an upper
  #                       bound on the length of any n-gram taken from it
  SELECT ?Title ?Publication ?Seeds ?ClearTitleLength
  WHERE {
    INCLUDE %items
    ?Publication wdt:P1476 ?Title .
    FILTER(LANG(?Title) = "en")

    # Remove some frequent special characters, including colons and semicolons
    # (those two are reserved for the codons below).
    BIND(REPLACE(STR(?Title), "[\\.:,;\\[\\]\\?()$]", "") AS ?StrippedTitle)

    # Removing punctuation can leave double, leading or trailing spaces
    # (e.g. "foo : bar" -> "foo  bar"). The word-position regexes applied downstream
    # expect exactly one space between words, so collapse whitespace runs and trim.
    BIND(REPLACE(REPLACE(?StrippedTitle, "\\s+", " "), "^ | $", "") AS ?ClearTitle)
    BIND(STRLEN(?ClearTitle) AS ?ClearTitleLength)

    # Eight dummy words on either side, so that a fixed 8-word window can also be
    # placed over the first and last words of the title.
    BIND("::: ::: ::: ::: ::: ::: ::: ::: " AS ?StartCodon)
    BIND(" ;;; ;;; ;;; ;;; ;;; ;;; ;;; ;;;" AS ?StopCodon)
    BIND(LCASE(CONCAT(?StartCodon, # start codon of colons: helps with n-grams at the beginning of the title
                      ?ClearTitle,
                      ?StopCodon)) # stop codon of semicolons: helps with n-grams at the end of the title
         AS ?Seeds)
  }
} AS %titles
WITH {
  # Step 3 (%regexes): for each integer N in 1..150, build four regexes that address
  # consecutive word positions in a space-separated string.
  # Based on https://w.wiki/KG$ by Jura1
  #
  # Each regex has the shape  ^([^ ]+ ){k}([^ ]+) .*  : it skips k space-terminated
  # words and captures the next one. In a replacement, "$1" is the last skipped word
  # (with its trailing space; empty when k = 0) and "$2" is the word after it.
  #   ?Regex1: k = N - 1      ?Regex2: k = N + 1
  #   ?Regex3: k = N + 3      ?Regex4: k = N + 5
  SELECT ?Regex1 ?Regex2 ?Regex3 ?Regex4 ?NumericValue
  WHERE {
    # Wikidata items carrying a P1181 value serve as the source of integers
    # (presumably P1181 is "numeric value" - confirm against the property page).
    ?NumberItem wdt:P5176 [] ;
                wdt:P1181 ?NumericValue .
    FILTER(?NumericValue > 0 && ?NumericValue < 151)

    BIND(CONCAT("^([^ ]+ ){", STR(?NumericValue - 1), "}([^ ]+) .*") AS ?Regex1)
    BIND(CONCAT("^([^ ]+ ){", STR(?NumericValue + 1), "}([^ ]+) .*") AS ?Regex2)
    BIND(CONCAT("^([^ ]+ ){", STR(?NumericValue + 3), "}([^ ]+) .*") AS ?Regex3)
    BIND(CONCAT("^([^ ]+ ){", STR(?NumericValue + 5), "}([^ ]+) .*") AS ?Regex4)
  }
} AS %regexes
WITH
{ # Step 4 (%ngrams): apply the regexes to the titles to extract n-grams, and count
  # the occurrences of each n-gram across titles.
  # Output per n-gram:
  #   ?N          - number of words in the n-gram
  #   ?Count      - number of distinct titles containing it (only n-grams with ?Count > 1 are kept)
  #   ?Length     - length of the n-gram in characters
  #   ?Dashes     - number of "-" characters in the n-gram
  #   ?Score      - ?Count * ?Length * (?Dashes + 1) / ?N, i.e. higher for frequent,
  #                 long and hyphenated n-grams made of few words
  #   ?ExamplePub - one arbitrary publication whose title contains the n-gram
  SELECT
    ?Ngram
    ?N
    (COUNT(DISTINCT ?Title) AS ?Count)
    ?Length
    ?Dashes
    ((?Count * ?Length * ((?Dashes + 1) / ?N)) AS ?Score)
    (SAMPLE(?Publication) AS ?ExamplePub)
  WHERE {
    INCLUDE %regexes
    INCLUDE %titles

    # Each regex contributes two neighbouring words ("$1" and "$2"), so the four
    # regexes together cut an 8-word window out of ?Seeds. Windows overlapping the
    # start/stop codons yield shorter n-grams once the codons are stripped below.
    BIND(CONCAT(
           REPLACE(?Seeds, ?Regex1, "$1"), " ",
           REPLACE(?Seeds, ?Regex1, "$2"), " ",
           REPLACE(?Seeds, ?Regex2, "$1"), " ",
           REPLACE(?Seeds, ?Regex2, "$2"), " ",
           REPLACE(?Seeds, ?Regex3, "$1"), " ",
           REPLACE(?Seeds, ?Regex3, "$2"), " ",
           REPLACE(?Seeds, ?Regex4, "$1"), " ",
           REPLACE(?Seeds, ?Regex4, "$2")
         ) AS ?NgramCandidate)

    # Clean-up, innermost first: drop codon characters, trim leading and trailing
    # whitespace, then collapse runs of spaces into a single space.
    BIND(REPLACE(
           REPLACE(
             REPLACE(
               REPLACE(STR(?NgramCandidate), "([;:])", ""),
               "(^\\s+)", ""),
             "(\\s+$)", ""),
           "([ ]{2,})", " ")
         AS ?Ngram)

    BIND(STRLEN(?Ngram) AS ?Length)
    FILTER(?Length > 3)
    # A regex that does not match leaves ?Seeds unchanged, which makes the candidate
    # longer than the title itself; this bound appears intended to discard those.
    FILTER(?Length <= ?ClearTitleLength)

    # Word count = number of whitespace characters + 1
    BIND(STRLEN(REPLACE(?Ngram, "\\S", "")) + 1 AS ?N)
    BIND((STRLEN(?Ngram) - STRLEN(REPLACE(?Ngram, "-", ""))) AS ?Dashes)
  }
  # Group only by the real keys: ?Count, ?Score and ?ExamplePub are aggregate
  # results and cannot meaningfully act as grouping keys.
  GROUP BY ?Ngram ?N ?Length ?Dashes
  HAVING(?Count > 1)
} AS %ngrams
WHERE {
  INCLUDE %ngrams

  # Exclude n-grams that start or end with any of a set of blacklisted words.
  # The anchors "( |$)" and "(^| )" also catch n-grams that consist of nothing but
  # a blacklisted word (e.g. "from", "with"), which a mandatory space would miss.
  BIND("(a|and|between|during|for|from|in|of|on|or|the|to|with)" AS ?blacklist)
  BIND(CONCAT("^", ?blacklist, "( |$)") AS ?RegexBlackStart)
  BIND(CONCAT("(^| )", ?blacklist, "$") AS ?RegexBlackEnd)
  FILTER(!REGEX(?Ngram, ?RegexBlackStart))
  FILTER(!REGEX(?Ngram, ?RegexBlackEnd))

  # Fetch the English title of the example publication for display
  ?ExamplePub wdt:P1476 ?ExamplePubTitle .
  FILTER(LANG(?ExamplePubTitle) = "en")
}
# No GROUP BY needed: nothing is aggregated here and the SELECT DISTINCT at the top
# of the query already removes duplicate rows.
# ?Ngram is the final sort key so that ties do not make the LIMIT cut-off arbitrary.
ORDER BY DESC(?Score) DESC(?Count) DESC(?Length) ?Ngram
LIMIT 200