cldf / cldfzenodo

Access CLDF datasets archived with ZENODO
Apache License 2.0
2 stars 0 forks source link

Datasets to BibTeX #2

Closed xrotwang closed 3 years ago

xrotwang commented 5 years ago

Something along these lines:

from collections import defaultdict
from xml.etree import ElementTree
import re
import io

import requests
from clldutils.source import Source

# OAI-PMH ListRecords request for the Zenodo set "user-clics", asking for
# oai_dc (Dublin Core) metadata.
URL = "https://zenodo.org/oai2d?verb=ListRecords&set=user-clics&metadataPrefix=oai_dc"

# Matches a GitHub tree URL and captures "<owner>/<repo>" as `repos` and the
# remainder after "/tree/" as `version`.
REPOS_PATTERN = re.compile(
    r'https://github\.com/(?P<repos>[^/]+/[^/]+)/tree/(?P<version>.+)')

# Matches a pip-style "…github.com/<owner>/<repo>.git@<version>#egg=" URL.
# The dots are escaped so that "." only matches a literal dot (an unescaped
# "." before "git" would also accept e.g. "repoXgit@...").
REQ_PATTERN = re.compile(
    r'https://github\.com/(?P<repos>[^/]+/[^.]+)\.git@(?P<version>[^#]+)#egg=')

def record(rec):
    """Collect the Dublin Core values of one record element into a dict.

    :param rec: an ``ElementTree`` element; all descendants in the
        ``http://purl.org/dc/elements/1.1/`` namespace are inspected.
    :return: a dict with the keys

        - ``creator``, ``date``, ``description``, ``identifier``, ``title``:
          lists with the text of every matching element (empty list if the
          element does not occur),
        - ``relation``: a ``defaultdict(list)`` mapping the part of each
          relation before the first ``:`` to the list of the parts after it,
        - ``doi``: the first identifier starting with ``10.5281``, or ``None``.
    """
    dc_ns = '{http://purl.org/dc/elements/1.1/}'
    res = {}
    for term in ['creator', 'date', 'description', 'identifier', 'relation', 'title']:
        res[term] = [e.text for e in rec.findall('.//' + dc_ns + term)]

    rel = defaultdict(list)
    for r in res['relation']:
        if not r:
            # An empty element has ``text is None``; there is nothing to split.
            continue
        type_, _, id_ = r.partition(':')
        rel[type_].append(id_)
    res['relation'] = rel

    # Guard against ``None`` text before calling ``startswith``.
    res['doi'] = next(
        (id_ for id_ in res['identifier'] if id_ and id_.startswith('10.5281')),
        None)
    return res

def main(datasets):
    """Print BibTeX entries for requirement lines that have a record on Zenodo.

    The records returned for ``URL`` are indexed by the ``(repos, version)``
    pairs that ``REPOS_PATTERN`` extracts from their ``url`` relations. Each
    line of the file ``datasets`` is then searched with ``REQ_PATTERN``; for
    every line whose ``(repos, version)`` pair is in the index, a ``misc``
    ``Source`` is built and its BibTeX printed. Finally the number of lines
    that matched ``REQ_PATTERN`` (with or without a Zenodo record) is printed.

    :param datasets: path of a text file, read line by line.
    :raises requests.HTTPError: if the request for ``URL`` returns an error status.

    NOTE(review): only a single request is made; if the endpoint splits its
    answer over several pages, later pages are not fetched — confirm.
    """
    on_zenodo = {}
    response = requests.get(URL)
    # Fail with a clear HTTP error rather than an XML parse error on an error page.
    response.raise_for_status()
    # Parse the raw bytes so the XML parser picks the encoding itself instead
    # of relying on the text encoding guessed by requests.
    res = ElementTree.fromstring(response.content)
    for rec in res.findall('.//{http://www.openarchives.org/OAI/2.0/}record'):
        rec = record(rec)
        for rel in rec['relation']['url']:
            m = REPOS_PATTERN.match(rel)
            if m:
                on_zenodo[(m.group('repos'), m.group('version'))] = rec

    i = 0
    # The context manager closes the file even if building a Source fails.
    with io.open(datasets, encoding='utf-8') as lines:
        for line in lines:
            m = REQ_PATTERN.search(line)
            if not m:
                continue
            r, v = m.group('repos'), m.group('version')
            rec = on_zenodo.get((r, v))
            if rec:
                src = Source(
                    'misc',
                    r.split('/')[-1],  # repository name without the owner
                    title=rec['title'][0],
                    doi=rec['doi'],
                    author=' and '.join(rec['creator']),
                    note=rec['description'][0],
                    howpublished='lexibank dataset on ZENODO',
                )
                print(src.bibtex())
            # Counts every requirement line, not only those found on Zenodo.
            i += 1
    print(i)

if __name__ == "__main__":
    import sys

    # Take the requirements file from the command line when given; otherwise
    # fall back to the original hard-coded path.
    main(sys.argv[1] if len(sys.argv) > 1 else 'clics3/datasets.txt')
xrotwang commented 3 years ago

doi2bib is probably the better option:

$ curl https://doi2bib.org/2/doi2bib?id=10.5281/zenodo.4762034
@misc{https://doi.org/10.5281/zenodo.4762034,
  doi = {10.5281/ZENODO.4762034},
  url = {https://zenodo.org/record/4762034},
  author = {Hammarström, Harald and Forkel, Robert and Haspelmath, Martin and Bank, Sebastian},
  keywords = {cldf:StructureDataset, linguistics},
  title = {glottolog/glottolog: Glottolog database 4.4 as CLDF},
  publisher = {Zenodo},
  year = {2021},
  copyright = {Creative Commons Attribution 4.0 International}
}

By contrast, Zenodo's own BibTeX export gives:

@dataset{harald_hammarstrom_2021_4762034,
  author       = {Harald Hammarström and
                  Robert Forkel and
                  Martin Haspelmath and
                  Sebastian Bank},
  title        = {{glottolog/glottolog: Glottolog database 4.4 as 
                   CLDF}},
  month        = may,
  year         = 2021,
  publisher    = {Zenodo},
  version      = {v4.4},
  doi          = {10.5281/zenodo.4762034},
  url          = {https://doi.org/10.5281/zenodo.4762034}
}