Closed tobiasschweizer closed 2 months ago
# Worked example of inferring a missing ORCID with jq.
# Keeps the entries of results.json whose author list (.[2]) contains both
#   - an author carrying ORCID 0000-0001-6118-4550, and
#   - "Gunter Bombaerts" with a null ORCID (.[2]) and a null fourth field
#     (presumably the ORCID origin; verify against the AuthorInfo layout),
# then rewrites that author's record to carry ORCID 0000-0002-8006-1617,
# leaving all other authors as they are.
jq '
  [ .[]
    | select(.[2][] | select(.[2] == "0000-0001-6118-4550"))
    | select(.[2][] | select(.[0] == "Gunter" and .[1] == "Bombaerts" and .[3] == null))
  ]
  | map([
      .[0],
      .[1],
      (.[2] | map(
        if .[0] == "Gunter" and .[1] == "Bombaerts"
        then [.[0], .[1], "0000-0002-8006-1617", null, null]
        else .
        end
      ))
    ])
' results.json | less
import json
from typing import Dict, List, Any
import jq
from pid_resolver_lib import parse_resolved_dois_from_json, PublicationInfo, AuthorInfo
#def contains_unknown_author(authors: List[AuthorInfo]) -> bool:
# return len(list(filter(lambda auth: auth.orcid is None, authors))) > 0
def search_author(given_name: str, family_name: str, with_ctx: List[List[List[AuthorInfo | list[AuthorInfo] | str]]]):
    """Find, per publication, the context entries of a named author that has an ORCID.

    ``with_ctx`` holds one list per publication; every entry in such a list is
    a sequence whose first element is the author profile, indexed as
    ``[0]`` given name, ``[1]`` family name, ``[2]`` ORCID.

    Returns a list with one element per publication (same order as
    ``with_ctx``). Each element is the (possibly empty) list of that
    publication's entries whose author matches both names and whose ORCID is
    not ``None``.
    """
    def is_known_namesake(entry) -> bool:
        # entry[0] is the author profile of this context entry
        profile = entry[0]
        return profile[0] == given_name and profile[1] == family_name and profile[2] is not None

    return [[entry for entry in pub if is_known_namesake(entry)] for pub in with_ctx]
# TODO: document return type
def make_context(pub: PublicationInfo):
    """Pair every author of ``pub`` with the co-authors of the same publication.

    Returns a list with one three-element list per author, in author order:
    ``[author, co_authors, doi]`` where ``co_authors`` is ``pub.authors``
    without the author at that position and ``doi`` is ``pub.doi``.
    """
    return [
        [author, pub.authors[:pos] + pub.authors[pos + 1:], pub.doi]
        for pos, author in enumerate(pub.authors)
    ]
# Exploratory script: for every author without an ORCID, look for an author
# with the same name that has one and print the co-author ORCIDs they share.
results: Dict[str, PublicationInfo] = parse_resolved_dois_from_json('testset.json')
pubs: List[PublicationInfo] = list(results.values())
# preserve their context, i.e. their co-authors
with_context: list[list[list[AuthorInfo | list[AuthorInfo] | str]]] = list(map(make_context, pubs))
#print(json.dumps(with_context))
for pub in with_context:
    for auth_ctx in pub:
        # each entry is [author, co_authors, doi] as built by make_context
        author, ctx, doi = auth_ctx
        # only authors without an ORCID are candidates for inference
        if author[2] is not None:
            continue
        # ORCIDs of this author's co-authors; None stands for a co-author
        # without ORCID and must not count as a shared identity
        co_author_orcid = {co_auth.orcid for co_auth in ctx} - {None}
        # search for an author with the same name but with an ORCID,
        # dropping publications that yielded no hit
        orcid_matches = [hits for hits in search_author(author[0], author[1], with_context) if hits]
        print(orcid_matches)
        if not orcid_matches:
            # no namesake with an ORCID: nothing to compare against
            # (indexing the empty list below would raise IndexError)
            continue
        # TODO: iterate over all matches
        first_match_co_authors = orcid_matches[0][0][1]
        orcid_matches_co_author = {co_auth[2] for co_auth in first_match_co_authors} - {None}
        # check if there is an intersect in the context
        common_co_authors = co_author_orcid.intersection(orcid_matches_co_author)
        print(common_co_authors)
#print(author, ctx, doi)
#print(json.dumps(no_orcid))
# find authors without an ORCID
#no_orcid: List[PublicationInfo] = list(filter(lambda pub: contains_unknown_author(pub.authors), pubs))
# find authors with the same name that have an ORCID, sharing some of the same context
simplified version
from typing import Dict, List, Any, NamedTuple, Union
import json
from pid_resolver_lib import parse_resolved_dois_from_json, PublicationInfo, AuthorInfo
class ContextInfo(NamedTuple):
    """An author of a publication together with the context they appear in."""

    # the author this entry describes
    author: AuthorInfo
    # the remaining authors of the same publication
    co_authors: List[AuthorInfo]
    # DOI of the publication
    doi: str
    # position of ``author`` in the publication's author list
    idx: int
def make_context(pub: PublicationInfo) -> List[ContextInfo]:
    """Build one ``ContextInfo`` per author of ``pub``.

    Each entry holds the author at a given position, all other authors of
    the publication (in their original order), the publication's DOI and the
    author's position in ``pub.authors``.
    """
    contexts = []
    for position, author in enumerate(pub.authors):
        # everyone except the author at ``position``
        others = pub.authors[:position] + pub.authors[position + 1:]
        contexts.append(ContextInfo(author, others, pub.doi, position))
    return contexts
def search_author(given_name: str, family_name: str, with_ctx: List[ContextInfo]):
    """Return the context entries of a named author that carries an ORCID.

    Scans ``with_ctx`` in order and keeps every entry whose author has the
    given ``given_name`` and ``family_name`` and an ORCID that is not
    ``None``. Returns a (possibly empty) list.
    """
    hits = []
    for entry in with_ctx:
        person = entry.author
        if person.given_name != given_name:
            continue
        if person.family_name != family_name:
            continue
        if person.orcid is None:
            # a namesake without ORCID cannot be used to infer one
            continue
        hits.append(entry)
    return hits
# Infer missing ORCIDs from co-authorship and write the amended publications
# to updated.json.
results: Dict[str, PublicationInfo] = parse_resolved_dois_from_json('results.json')
pubs: List[PublicationInfo] = list(results.values())
#print(pubs)
# preserve their context, i.e. their co-authors
with_context: List[List[ContextInfo]] = list(map(make_context, pubs))
flattened_context: List[ContextInfo] = [item for sublist in with_context for item in sublist]
#print(flattened_context)
for auth_ctx in flattened_context:
    # only authors without an ORCID are candidates for inference
    if auth_ctx.author.orcid is not None:
        continue
    # compare co-authors (ignore co-authors without ORCID)
    co_author_orcid = {co_author.orcid for co_author in auth_ctx.co_authors} - {None}
    if not co_author_orcid:
        # without any identified co-author there is nothing to base an inference on
        continue
    # Try every same-name author that has an ORCID, not just the first one:
    # the first hit may stem from an unrelated publication while a later one
    # shares co-authors.
    for candidate in search_author(auth_ctx.author.given_name, auth_ctx.author.family_name, flattened_context):
        common_co_authors = co_author_orcid.intersection(co_author.orcid for co_author in candidate.co_authors)
        #print(common_co_authors)
        if not common_co_authors:
            continue
        # infer author's ORCID
        print(f'{auth_ctx.author.given_name}, {auth_ctx.author.family_name}, {auth_ctx.author.orcid}, {auth_ctx.doi}, {auth_ctx.idx}, {candidate.author.orcid}, {candidate.author.given_name}, {candidate.author.family_name}, {candidate.doi}, {common_co_authors}')
        # add missing ORCID: rebuild the publication with the author at
        # auth_ctx.idx replaced by a copy that carries the inferred ORCID
        current = results[auth_ctx.doi]
        inferred_author = AuthorInfo(given_name=auth_ctx.author.given_name, family_name=auth_ctx.author.family_name, orcid=candidate.author.orcid, origin_orcid='inferred', ror=None)
        results[auth_ctx.doi] = PublicationInfo(
            doi=current.doi,
            title=current.title,
            authors=current.authors[:auth_ctx.idx] + [inferred_author] + current.authors[auth_ctx.idx+1:]
        )
        # the first candidate with shared co-authors decides
        break
with open('updated.json', 'w') as f:
    f.write(json.dumps(results))
The results of the analysis of resolved DOIs and ORCIDs are written to `results.json`. The "same" person can still occur with and without an ORCID, depending on the available structured metadata at the level of the DOI RAs and the completeness of the person's ORCID profile. Still, missing ORCIDs can be inferred by relying on co-authorship. Let's look at the following example:
I guess the rule could be something like this: