Closed WolfgangFahl closed 2 years ago
# Run the kxp query below through catmandu, convert the result to "pp",
# pass it through picadata -p 003@,036E,017G and store the output in ceurws.pica
catmandu convert kxp --query 'pica.1001="b" and pica.1045="fam" and pica.1049="669696374"' to pp | picadata -p 003@,036E,017G > ceurws.pica
https://github.com/WolfgangFahl/pyCEURmake/blob/main/scripts/k10plus
#
# get CEUR-WS Proceedings records by Volume with K10 Plus PPN Id
#
# WF 2022-08-13
#
# the Volume number P478 is sometimes available with the proceedings item and sometimes as a qualifier
# of the "part of the series" (P179) statement; this query reads the qualifier form
#
# ps: and xsd: are declared explicitly so the query does not depend on
# prefixes being predefined by the endpoint
PREFIX pq: <http://www.wikidata.org/prop/qualifier/>
PREFIX ps: <http://www.wikidata.org/prop/statement/>
PREFIX p: <http://www.wikidata.org/prop/>
PREFIX schema: <http://schema.org/>
PREFIX wd: <http://www.wikidata.org/entity/>
PREFIX wdt: <http://www.wikidata.org/prop/direct/>
PREFIX wikibase: <http://wikiba.se/ontology#>
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
PREFIX xsd: <http://www.w3.org/2001/XMLSchema#>
SELECT ?sVolume ?ppnId ?item ?itemLabel
WHERE {
  ?item rdfs:label ?itemLabel.
  FILTER(LANG(?itemLabel) = "en")
  # Instance of Proceedings
  ?item wdt:P31 wd:Q1143604.
  # Part of the series
  ?item p:P179 ?partOfTheSeries.
  # CEUR Workshop proceedings
  ?partOfTheSeries ps:P179 wd:Q27230297.
  # Volume via a qualifier of the part of the series relation
  ?partOfTheSeries pq:P478 ?sVolume.
  # K10plus PPN ID
  ?item wdt:P6721 ?ppnId.
} ORDER BY DESC (xsd:integer(?sVolume))
wb add-claim <entity> <property> <value>
example
wd add-claim Q113545796 P6721 1741843944
#!/bin/bash
# WF 2022-08-18
# k10plus PPN Id matching
#
# picafile:  PICA dump that must already exist (see the issue linked in the error message)
# jsonmatch: file that receives the JSON result of the SPARQL match
picafile="$HOME/.ceurws/ceurws.pica"
jsonmatch="$HOME/.ceurws/ceurws-ppn.json"
# quoted so that a HOME containing blanks does not break the test
if [ ! -f "$picafile" ]
then
  # diagnostics go to stderr; stdout is used for the generated command list
  echo "$picafile missing see https://github.com/WolfgangFahl/pyCEURmake/issues/26" >&2
  exit 1
fi
#######################################
# Build a SPARQL query whose VALUES clause pairs each CEUR-WS volume number
# found in the PICA dump with its K10plus PPN, run it with sparqlquery and
# store the JSON result.
# Globals:   picafile (read) - PICA dump with 003@ (PPN) and 036E (series) lines
# Arguments: $1 - path of the JSON file to write
# Outputs:   the sparqlquery result is written to $1
# Returns:   exit status of sparqlquery; 1 if no temporary file could be created
#######################################
createJson() {
  local l_jsonmatch="$1"
  local sparql
  local rc=0
  # unpredictable temp file instead of /tmp/<name>.$$; removed before returning
  sparql=$(mktemp "${TMPDIR:-/tmp}/ceurws-sparql.XXXXXX") || return 1
  # quoted delimiter: the query text is taken literally by the shell
  cat << 'EOF' > "$sparql"
#
# get CEUR-WS Proceedings records by Volume together with the K10plus PPN
# taken from the PICA dump (?ppnToSet) and the PPN already stored (?ppnId)
#
# WF 2022-08-13
#
# the Volume number P478 is sometimes available with the proceedings item and sometimes as a qualifier
# of the "part of the series" (P179) statement; this query reads the qualifier form
#
PREFIX pq: <http://www.wikidata.org/prop/qualifier/>
PREFIX ps: <http://www.wikidata.org/prop/statement/>
PREFIX p: <http://www.wikidata.org/prop/>
PREFIX schema: <http://schema.org/>
PREFIX wd: <http://www.wikidata.org/entity/>
PREFIX wdt: <http://www.wikidata.org/prop/direct/>
PREFIX wikibase: <http://wikiba.se/ontology#>
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
PREFIX xsd: <http://www.w3.org/2001/XMLSchema#>
SELECT ?sVolume ?ppnToSet ?ppnId ?item ?itemLabel
WHERE {
  VALUES (?sVolume ?ppnToSet) {
EOF
  # get ppns
  # 003@ $01663037973
  # 036E $aCEUR workshop proceedings$lvol-2314
  awk '
# the 003@ line carries the PPN behind the $0 subfield code; keep it for the
# 036E line that follows in the same record
/003@/ {
  gsub("[$]0","",$2)
  ppn=$2
}
# series line of the CEUR workshop proceedings; the tag is compared here
# (the former $1="036E" was an assignment that matched every aCEUR line)
$1 ~ /^036E/ && /aCEUR/ {
  #.l([vV]ol(ume).)
  # quirks
  # ("proceedikngs$lvol-1185" "1023022214")
  # ("1493$pAI*IA" "848525124")
  # ("1492,2" "848521706")
  # ("1492,1" "848521617")
  gsub("[Pp]roceedi[k]?ngs.l([vV]ol(ume)?[-.]?)?","",$4)
  # the star is a literal character of the quirk, so it must not act as a quantifier
  gsub("[$]pAI[*]IA","",$4)
  # when nothing is left in field 4 the volume is taken from field 5
  if (length($4)==0) { volf=$5 } else { volf=$4 }
  #("3185" "P")
  printf(" (\"%s\" \"%s\")\n",volf,ppn)
}
' "$picafile" >> "$sparql"
  cat << 'EOF' >> "$sparql"
  }
  ?item rdfs:label ?itemLabel.
  FILTER(LANG(?itemLabel) = "en")
  # Instance of Proceedings
  ?item wdt:P31 wd:Q1143604.
  # Part of the series
  ?item p:P179 ?partOfTheSeries.
  # CEUR Workshop proceedings
  ?partOfTheSeries ps:P179 wd:Q27230297.
  # Volume via a qualifier of the part of the series relation
  ?partOfTheSeries pq:P478 ?sVolume.
  # K10plus PPN ID
  OPTIONAL {
    ?item wdt:P6721 ?ppnId.
  }
} ORDER BY DESC (xsd:integer(?sVolume))
EOF
  #atom $sparql
  sparqlquery -qf "$sparql" -f json > "$l_jsonmatch" || rc=$?
  rm -f -- "$sparql"
  if [ "$rc" -ne 0 ]
  then
    # do not leave a truncated result behind that a later run would reuse
    rm -f -- "$l_jsonmatch"
  fi
  return "$rc"
}
# run the match only once; an existing result file is reused
if [ ! -f "$jsonmatch" ]
then
  createJson "$jsonmatch"
else
  # status goes to stderr so that stdout only carries the generated wd commands
  echo "$jsonmatch already exists" >&2
fi
#######################################
# Turn the JSON match result into wd add-claim commands.
# jq pretty-prints the file so that every key/value pair sits on its own
# line; awk splits each line at the double quotes, which makes field 2 the
# key and field 4 the value. A record is emitted when its sVolume line is
# read, so sVolume has to be the last key of each record.
# Arguments: $1 - JSON file to read
# Outputs:   per record a "# Volume" comment followed by either a
#            wd add-claim command or a note about the PPN already present
#######################################
printPpnClaims() {
  jq . "$1" | awk '
BEGIN { FS = "\"" }
# keys are compared exactly, so itemLabel lines or values that merely
# contain a key name are ignored
$2 == "item" {
  gsub("http://www.wikidata.org/entity/","",$4)
  item=$4
  next
}
$2 == "ppnToSet" {
  ppnToSet=$4
  next
}
$2 == "ppnId" {
  ppnId=$4
  next
}
$2 == "sVolume" {
  volume=$4
  printf("# Volume %s\n",volume)
  if (length(ppnId)==0) {
    printf("wd add-claim %s P6721 %s\n",item,ppnToSet)
  } else {
    printf("# has ppnId %s\n",ppnId)
  }
  # reset the state so that an absent optional ppnId is not inherited
  item=""
  volume=""
  ppnToSet=""
  ppnId=""
  next
}
'
}
printPpnClaims "$jsonmatch"
A search via lobid in GND (http://lobid.org/gnd/search?q=ceur-ws.org) only returns a single result so linking more GND to CEUR is not needed. This issue is more about linking K10plus to CEUR.
@nichtich down to Volume 1806 things seem to be working fine. Then my script creates:
# Volume 1806
wd add-claim Q113544095 P6721 883741318
# Volume 1806
wd add-claim Q113544095 P6721 1654592021
and we end up with two K10plus links: https://www.wikidata.org/wiki/Q113544095
which I'd love to avoid.
Things are fine again until Volume 855, and then I get quite a few duplicates:
# Volume 852
wd add-claim Q113545222 P6721 729443280
# Volume 852
wd add-claim Q113545222 P6721 729443671
# Volume 851
wd add-claim Q113545223 P6721 786779489
# Volume 850
wd add-claim Q113545224 P6721 724900071
# Volume 850
wd add-claim Q113545224 P6721 724900578
# Volume 850
wd add-claim Q113545224 P6721 731774108
# Volume 801
wd add-claim Q113545278 P6721 682988383
# Volume 750
wd add-claim Q113545335 P6721 780588444
# Volume 733
wd add-claim Q113545351 P6721 66936729X
# Volume 733
wd add-claim Q113545351 P6721 669367656
# Volume 723
wd add-claim Q113547325 P6721 724861106
# Volume 700
# has ppnId 647186519
# Volume 581
wd add-claim Q113545532 P6721 631965378
# Volume 581
wd add-claim Q113545532 P6721 632051620
# Volume 581
wd add-claim Q113545532 P6721 631965750
# Volume 481
# has ppnId 1741855497
# Volume 366
# has ppnId 1741843944
These seem to be actual duplicates in K10plus, so passing the duplicates to Wikidata is fine IMHO (unless there are hundreds of them).
Installation catmandu https://librecat.org/Catmandu/
sudo apt install libcatmandu-perl
catmandu --version
catmandu (Catmandu::CLI) version 1.2016 (/usr/bin/catmandu)
picadata https://github.com/pro4bib/pica/blob/master/picadata.md
sudo apt-get install libcatmandu-sru-perl cpanminus
sudo cpanm Catmandu::PICA
Example: http://lobid.org/gnd/1250028426