Connectome-Implementation-Team / pid_resolver

Apache License 2.0
1 stars 0 forks source link

Infer Missing ORCIDs From Co-Authorship #10

Closed tobiasschweizer closed 2 months ago

tobiasschweizer commented 3 months ago

The results of the analysis of resolved DOIs and ORCIDs are written to results.json. The "same" person can still occur both with and without an ORCID, depending on the structured metadata available at the level of the DOI RAs and on the completeness of the person's ORCID profile. Still, missing ORCIDs can be inferred by relying on co-authorship.

Let's look at the following example:

  [
    "10.1016/j.fusengdes.2007.07.016",
    "Comparison and analysis of expert and student views on the use of energy scenarios in communication on fusion research",
    [
      [
        "Gunter",
        "Bombaerts",
        null, -> no ORCID given, but can be inferred
        null,
        null
      ],
      [
        "Erik",
        "Laes",
        "0000-0001-6118-4550", -> same co-author
        "orcid",
        null
      ]
    ]
  ]
[
    "10.1016/j.erss.2023.103244",
    "Structuring values and normative frameworks using Schwartz's value theory to map the three tenets of energy justice",
    [
      [
        "Andreas",
        "Spahn",
        "0000-0003-2796-0776",
        "doi",
        null
      ],
      [
        "Gunter",
        "Bombaerts",
        "0000-0002-8006-1617", -> ORCID given
        "doi",
        null
      ],
      [
        "Erik",
        "Laes",
        "0000-0001-6118-4550", -> same co-author
        "doi",
        null
      ]
    ]
  ]

I guess the rule could be something like this:

Search for person profiles without an ORCID whose name matches that of a person profile with an ORCID, where both profiles share one or more co-authors identified by an ORCID.

tobiasschweizer commented 3 months ago

jq '[.[] | select(.[2][] | select(.[2] == "0000-0001-6118-4550")) | select(.[2][] | select(.[0] == "Gunter" and .[1] == "Bombaerts" and .[3] == null))] | map([.[0], .[1], (.[2] | map((if .[0] == "Gunter" and.[1] == "Bombaerts" then [.[0], .[1], "0000-0002-8006-1617", null, null] else . end)))])' results.json | less

tobiasschweizer commented 3 months ago

testset.json

import json
from typing import Dict, List, Any
import jq
from pid_resolver_lib import parse_resolved_dois_from_json, PublicationInfo, AuthorInfo

#def contains_unknown_author(authors: List[AuthorInfo]) -> bool:
#    return len(list(filter(lambda auth: auth.orcid is None, authors))) > 0

def search_author(given_name: str, family_name: str, with_ctx: List[List[List[AuthorInfo | list[AuthorInfo] | str]]]):
    """Collect, per publication, the author entries that match a name and carry an ORCID.

    Each publication in ``with_ctx`` is a list of ``[author, co_authors, doi]``
    entries. The author profile is read by position: 0 is the given name,
    1 the family name and 2 the ORCID.

    Returns a list with one element per publication, in the order of
    ``with_ctx``; each element is the (possibly empty) list of that
    publication's matching entries.
    """
    def is_identified_namesake(entry) -> bool:
        # entry[0] is the author profile of this context entry
        profile = entry[0]
        return profile[0] == given_name and profile[1] == family_name and profile[2] is not None

    return [[entry for entry in pub if is_identified_namesake(entry)] for pub in with_ctx]

def make_context(pub: PublicationInfo):
    """Pair every author of a publication with the remaining authors.

    Returns a list with one ``[author, co_authors, doi]`` entry per author, in
    author order: ``author`` is the author's profile, ``co_authors`` is
    ``pub.authors`` without that author, and ``doi`` is ``pub.doi``.
    """
    authors = pub.authors
    return [
        [author, authors[:pos] + authors[pos + 1:], pub.doi]
        for pos, author in enumerate(authors)
    ]

# Load the resolved publications from the test set.
results: Dict[str, PublicationInfo] = parse_resolved_dois_from_json('testset.json')

pubs: List[PublicationInfo] = list(results.values())

# preserve their context, i.e. their co-authors
# (one list per publication, each holding [author, co_authors, doi] entries)
with_context: list[list[list[AuthorInfo | list[AuthorInfo] | str]]] = list(map(make_context, pubs))

#print(json.dumps(with_context))

for pub in with_context:
    # search for an author without ORCID
    for auth_ctx in pub:
        author = auth_ctx[0]
        ctx = auth_ctx[1]
        doi = auth_ctx[2]

        if author[2] is None:
            # ORCIDs of the co-authors; co-authors without an ORCID are dropped,
            # otherwise None would count as a "shared co-author"
            co_author_orcid = {co_auth.orcid for co_auth in ctx} - {None}

            # search for an author with the same name but with an ORCID
            # (keep only the publications that actually contain such an author)
            orcid_matches = [hits for hits in search_author(author[0], author[1], with_context) if hits]

            print(orcid_matches)

            # without any same-name author carrying an ORCID there is nothing to
            # compare against; indexing orcid_matches[0] would raise IndexError
            if orcid_matches:
                # TODO: iterate over all matches
                # first matching entry of the first publication -> its co-authors' ORCIDs
                orcid_matches_co_author = {match[2] for match in orcid_matches[0][0][1]}

                # check if there is an intersect in the context
                common_co_authors = co_author_orcid.intersection(orcid_matches_co_author)
                print(common_co_authors)

        #print(author, ctx, doi)

#print(json.dumps(no_orcid))

# find authors without an ORCID
#no_orcid: List[PublicationInfo] = list(filter(lambda pub: contains_unknown_author(pub.authors), pubs))

# find authors with the same name that have an ORCID, sharing some of the same context

context.json

tobiasschweizer commented 2 months ago

simplified version

from typing import Dict, List, Any, NamedTuple, Union
import json
from pid_resolver_lib import parse_resolved_dois_from_json, PublicationInfo, AuthorInfo

class ContextInfo(NamedTuple):
    """One author of a publication together with the context they appear in.

    Instances are created by ``make_context``, one per author of a publication.
    """
    # the author this entry is about
    author: AuthorInfo
    # the remaining authors of the same publication (the author itself excluded)
    co_authors: List[AuthorInfo]
    # DOI of the publication the author appears in
    doi: str
    # position of the author in the publication's author list
    idx: int

def make_context(pub: PublicationInfo) -> List[ContextInfo]:
    """Build one ``ContextInfo`` per author of a publication.

    Every entry holds the author's profile, the publication's remaining
    authors (the co-authors), the publication's DOI and the author's position
    in ``pub.authors``. Entries are returned in author order.
    """
    authors = pub.authors
    return [
        ContextInfo(author, authors[:pos] + authors[pos + 1:], pub.doi, pos)
        for pos, author in enumerate(authors)
    ]

def search_author(given_name: str, family_name: str, with_ctx: List[ContextInfo]):
    """Return the context entries whose author has the given name and an ORCID.

    Entries are compared on ``author.given_name`` and ``author.family_name``
    (exact equality); entries whose ``author.orcid`` is ``None`` are skipped.
    The order of ``with_ctx`` is preserved in the returned list.
    """
    hits = []
    for entry in with_ctx:
        person = entry.author
        same_name = person.given_name == given_name and person.family_name == family_name
        if same_name and person.orcid is not None:
            hits.append(entry)
    return hits

# Load the resolved publications; the dict is indexed by DOI further down.
results: Dict[str, PublicationInfo] = parse_resolved_dois_from_json('results.json')

pubs: List[PublicationInfo] = list(results.values())

#print(pubs)

# preserve their context, i.e. their co-authors
with_context: List[List[ContextInfo]] = list(map(make_context, pubs))

# one flat list of author contexts across all publications
flattened_context: List[ContextInfo] = [item for sublist in with_context for item in sublist]

#print(flattened_context)

for auth_ctx in flattened_context:

    if auth_ctx.author.orcid is None:
        # all entries of authors with the same name that do carry an ORCID
        matches = search_author(auth_ctx.author.given_name, auth_ctx.author.family_name, flattened_context)

        # compare co-authors (ignore co-authors without ORCID)
        co_author_orcid = {co_author.orcid for co_author in auth_ctx.co_authors} - {None}

        # Check every same-name entry, not only the first one: the first entry
        # may come from a publication without shared co-authors although a
        # later one does share some. The first entry with a shared co-author wins.
        for match in matches:
            common_co_authors = co_author_orcid.intersection(co_author.orcid for co_author in match.co_authors)
            #print(common_co_authors)

            if len(common_co_authors) > 0:
                # infer author's ORCID
                print(f'{auth_ctx.author.given_name}, {auth_ctx.author.family_name}, {auth_ctx.author.orcid}, {auth_ctx.doi}, {auth_ctx.idx}, {match.author.orcid}, {match.author.given_name}, {match.author.family_name}, {match.doi}, {common_co_authors}')
                # add missing ORCID: rebuild the publication with the author at
                # auth_ctx.idx replaced by a copy that carries the inferred ORCID
                # (re-read from results so earlier replacements for the same DOI are kept)
                current = results[auth_ctx.doi]
                inferred_author = AuthorInfo(
                    given_name=auth_ctx.author.given_name,
                    family_name=auth_ctx.author.family_name,
                    orcid=match.author.orcid,
                    origin_orcid='inferred',
                    ror=None,
                )
                results[auth_ctx.doi] = PublicationInfo(
                    doi=current.doi,
                    title=current.title,
                    authors=current.authors[:auth_ctx.idx] + [inferred_author] + current.authors[auth_ctx.idx+1:]
                )
                # one inference per author entry is enough
                break

# write the publications, including the inferred ORCIDs, to a new file
with open('updated.json', 'w') as f:
    f.write(json.dumps(results))