I've tested this script, and I believe it should work to get the Manubot responses into the format I need:
python script
```python
"""
adapted from: https://github.com/greenelab/lab-website-template/blob/main/_cite/util.py
"""
import subprocess
import json
def _clean_text(record, key, default=""):
    """
    read one text field from a Manubot CSL item via get_safe

    returns the stripped string when the field holds a string; returns
    `default` when the field is missing, null, or any other non-string value
    """
    value = get_safe(record, key, default)
    # a field that is present but null/non-string would make a bare .strip()
    # raise and abort the whole batch, so fall back to the default instead
    if not isinstance(value, str):
        return default
    return value.strip()


def cite_with_manubot(ids):
    """
    generate full citations from ids with Manubot

    ids: iterable of citation ids passed one at a time to `manubot cite`
         (e.g. "doi:..." or "pmid:...")

    returns: list of dicts, one per id, in input order. each dict always has
    "id"; when Manubot ran and its output parsed, it also has "title", "type",
    "doi", "authors" (list of [given, family] pairs), "publisher", "issn",
    "issued_date", "link", "abstract", "language" and "note". when running
    Manubot or parsing its output fails, a warning is printed and the dict
    holds only "id".
    """
    # output list of full citation details
    citations = []

    for _id in ids:
        # new citation, starting from the original id
        citation = {"id": _id}

        # run Manubot
        try:
            commands = ["manubot", "cite", _id, "--log-level=WARNING"]
            # check=False: a failed run yields empty/invalid stdout, which is
            # reported by the parse step below
            output = subprocess.run(
                commands, stdout=subprocess.PIPE, check=False
            ).stdout
        except Exception as e:
            print("WARNING: Manubot could not generate citation")
            print(e)
            citations.append(citation)
            continue

        # parse results as json (Manubot prints a list; take the first item)
        try:
            manubot = json.loads(output)[0]
        except Exception as e:
            print("WARNING: Couldn't parse Manubot response")
            print(e)
            citations.append(citation)
            continue

        # title
        citation["title"] = _clean_text(manubot, "title")

        # type
        citation["type"] = _clean_text(manubot, "type")

        # doi
        citation["doi"] = _clean_text(manubot, "DOI")

        # authors, skipping entries with neither a given nor a family name
        authors = get_safe(manubot, "author", [])
        if not isinstance(authors, list):
            authors = []
        citation["authors"] = []
        for author in authors:
            given = _clean_text(author, "given")
            family = _clean_text(author, "family")
            if given or family:
                citation["authors"].append([given, family])

        # publisher: first non-empty of container, publisher, collection, source
        container = _clean_text(manubot, "container-title")
        collection = _clean_text(manubot, "collection-title")
        publisher = _clean_text(manubot, "publisher")
        source = _clean_text(manubot, "source")
        citation["publisher"] = container or publisher or collection or source or ""

        # issn is passed through as-is (not stripped)
        citation["issn"] = get_safe(manubot, "ISSN")

        # dates
        # citation["accessed_date"] = get_date(get_safe(manubot, "accessed", {}))
        citation["issued_date"] = get_date(get_safe(manubot, "issued", {}))

        # link
        citation["link"] = _clean_text(manubot, "URL")

        # abstract
        citation["abstract"] = _clean_text(manubot, "abstract")

        # language, defaulting to "en" when absent
        citation["language"] = _clean_text(manubot, "language", "en")

        # note
        citation["note"] = _clean_text(manubot, "note")

        # add citation to list
        citations.append(citation)

    return citations
def get_date(date_parts):
    """
    get YYYY-MM-DD date string from wonky Manubot split-date format

    date_parts: mapping shaped like {"date-parts": [[year, month, day]]};
                month and day may be absent

    returns: "YYYY-MM-DD", with a missing (or zero) month or day replaced
    by 01; returns "" when no non-zero year can be read
    """

    # extract one date part as an int, or 0 if it is missing or unreadable
    def get_part(index):
        try:
            return int(date_parts["date-parts"][0][index])
        except (KeyError, IndexError, TypeError, ValueError, OverflowError):
            return 0

    year = get_part(0)

    # if no year, consider date missing data
    if not year:
        return ""

    # fallbacks for month and day; kept as ints because the 02 format spec
    # raises ValueError on str values before Python 3.10
    month = get_part(1) or 1
    day = get_part(2) or 1
    return f"{year:04}-{month:02}-{day:02}"
def get_safe(item, path, default=""):
    """
    walk into nested lists/dicts along a dot-separated path

    each path segment that parses as an int is used as an integer key/index,
    every other segment is used as a string key. returns the value reached,
    or `default` as soon as any lookup step fails.
    """
    current = item
    for token in str(path).split("."):
        # numeric-looking segments become ints so they can index lists
        try:
            key = int(token)
        except ValueError:
            key = token

        try:
            current = current[key]
        except (KeyError, IndexError, AttributeError, TypeError):
            return default

    return current
# demo run: cite one DOI and one PubMed id, then pretty-print the cleaned result
test = [
    "doi:10.1101/2023.10.09.23296582",
    "pmid:11246464",
]
print(json.dumps(cite_with_manubot(test), indent=4))
```
I.e., from something like...
this (raw Manubot output)
```json
{
"publisher": "Cold Spring Harbor Laboratory",
"abstract": "ABSTRACTIndividuals affected by inherited neuromuscular diseases often present with a specific pattern of muscle weakness, which can guide clinicians in genetic investigations and variant interpretation. Nonetheless, more than 50% of cases do not receive a genetic diagnosis. Oculopharyngodistal myopathy (OPDM) is an inherited myopathy manifesting with a particular combination of ptosis, dysphagia and distal weakness. Pathologically it is characterised by rimmed vacuoles and intranuclear inclusions on muscle biopsy. In recent years GCC \u2022 CCG repeat expansion in four different genes have been identified in individuals affected by OPDM in Asian populations. None of these have been identified in affected individuals of non-Asian ancestry.In this study we describe the identification of CCG expansions inABCD3in affected individuals across eight unrelated OPDM families of European ancestry. In two large Australian OPDM families, using a combination of linkage studies, short-read WGS and targeted ONT sequencing, we identified CCG expansions in the 5\u2019UTR ofABCD3. Independently, theABCD3CCG expansion was identified through the 100,000 Genomics England Genome Project in three individuals from two unrelated UK families diagnosed with OPDM. Targeted ONT sequencing confirmed the presence of mono-allelic CCG repeat expansions ranging from 118 to 694 repeats in all tested cases (n=19). The expansions were on average 1.9 times longer in affected females than affected males, and children of affected males were \u223c2.3 times more likely to have the disease than those of affected females, suggesting inheritance of an expanded allele from an affected mother may have reduced penetrance.ABCD3transcripts appeared upregulated in skeletal muscle and cells derived from affected OPDM individuals, suggesting a potential role of over-expression of CCG repeat containingABCD3transcript in progressive skeletal muscle degeneration. 
The study provides further evidence of the role of non-coding repeat expansions in unsolved neuromuscular diseases and strengthens the association between the GCC \u2022 CCG repeat motif and a specific pattern of muscle weakness with prominent cranial involvement across different populations.",
"DOI": "10.1101/2023.10.09.23296582",
"type": "manuscript",
"source": "Crossref",
"title": "A CCG expansion inABCD3causes oculopharyngodistal myopathy in individuals of European ancestry",
"author": [
{
"given": "Andrea",
"family": "Cortese"
},
{
"given": "Sarah J",
"family": "Beecroft"
},
{
"given": "Stefano",
"family": "Facchini"
}
],
"issued": {
"date-parts": [
[
2023,
10,
10
]
]
},
"URL": "https://doi.org/g8r5w2",
"id": "CofylC62",
"note": "This CSL Item was generated by Manubot v0.6.1 from its persistent identifier (standard_id).\nstandard_id: doi:10.1101/2023.10.09.23296582"
}
```
...to something like...
this (cleaned data, ready to be integrated into website)
```json
{
"id": "doi:10.1101/2023.10.09.23296582",
"title": "A CCG expansion inABCD3causes oculopharyngodistal myopathy in individuals of European ancestry",
"type": "manuscript",
"doi": "10.1101/2023.10.09.23296582",
"authors": [
["Andrea", "Cortese"],
["Sarah J", "Beecroft"],
["Stefano", "Facchini"]
],
"publisher": "Cold Spring Harbor Laboratory",
"issn": "",
"issued_date": "2023-10-10",
"link": "https://doi.org/g8r5w2",
"abstract": "ABSTRACTIndividuals affected by inherited neuromuscular diseases often present with a specific pattern of muscle weakness, which can guide clinicians in genetic investigations and variant interpretation. Nonetheless, more than 50% of cases do not receive a genetic diagnosis. Oculopharyngodistal myopathy (OPDM) is an inherited myopathy manifesting with a particular combination of ptosis, dysphagia and distal weakness. Pathologically it is characterised by rimmed vacuoles and intranuclear inclusions on muscle biopsy. In recent years GCC \u2022 CCG repeat expansion in four different genes have been identified in individuals affected by OPDM in Asian populations. None of these have been identified in affected individuals of non-Asian ancestry.In this study we describe the identification of CCG expansions inABCD3in affected individuals across eight unrelated OPDM families of European ancestry. In two large Australian OPDM families, using a combination of linkage studies, short-read WGS and targeted ONT sequencing, we identified CCG expansions in the 5\u2019UTR ofABCD3. Independently, theABCD3CCG expansion was identified through the 100,000 Genomics England Genome Project in three individuals from two unrelated UK families diagnosed with OPDM. Targeted ONT sequencing confirmed the presence of mono-allelic CCG repeat expansions ranging from 118 to 694 repeats in all tested cases (n=19). The expansions were on average 1.9 times longer in affected females than affected males, and children of affected males were \u223c2.3 times more likely to have the disease than those of affected females, suggesting inheritance of an expanded allele from an affected mother may have reduced penetrance.ABCD3transcripts appeared upregulated in skeletal muscle and cells derived from affected OPDM individuals, suggesting a potential role of over-expression of CCG repeat containingABCD3transcript in progressive skeletal muscle degeneration. 
The study provides further evidence of the role of non-coding repeat expansions in unsolved neuromuscular diseases and strengthens the association between the GCC \u2022 CCG repeat motif and a specific pattern of muscle weakness with prominent cranial involvement across different populations.",
"language": "en",
"note": "This CSL Item was generated by Manubot v0.6.1 from its persistent identifier (standard_id).\nstandard_id: doi:10.1101/2023.10.09.23296582"
}
```
I've tested this script, and I believe it should work to get the Manubot responses into the format I need:
python script
```python
"""
adapted from: https://github.com/greenelab/lab-website-template/blob/main/_cite/util.py
"""
import subprocess
import json


def cite_with_manubot(ids):
    """
    generate full citations from ids with Manubot
    """
    # output list of full citation details
    citations = []
    for _id in ids:
        # new citation
        citation = {}
        # original id
        citation["id"] = _id
        # run Manubot
        try:
            commands = ["manubot", "cite", _id, "--log-level=WARNING"]
            output = subprocess.Popen(commands, stdout=subprocess.PIPE).communicate()
        except Exception as e:
            print("WARNING: Manubot could not generate citation")
            print(e)
            citations.append(citation)
            continue
        # parse results as json
        try:
            manubot = json.loads(output[0])[0]
        except Exception as e:
            print("WARNING: Couldn't parse Manubot response")
            print(e)
            citations.append(citation)
            continue
        # title
        citation["title"] = get_safe(manubot, "title").strip()
        # type
        citation["type"] = get_safe(manubot, "type").strip()
        # doi
        citation["doi"] = get_safe(manubot, "DOI").strip()
        # authors
        citation["authors"] = []
        for author in get_safe(manubot, "author", {}):
            given = get_safe(author, "given").strip()
            family = get_safe(author, "family").strip()
            if given or family:
                citation["authors"].append([given, family])
        # publisher
        container = get_safe(manubot, "container-title").strip()
        collection = get_safe(manubot, "collection-title").strip()
        publisher = get_safe(manubot, "publisher").strip()
        source = get_safe(manubot, "source").strip()
        citation["publisher"] = container or publisher or collection or source or ""
        citation["issn"] = get_safe(manubot, "ISSN")
        # dates
        # citation["accessed_date"] = get_date(get_safe(manubot, "accessed", {}))
        citation["issued_date"] = get_date(get_safe(manubot, "issued", {}))
        # link
        citation["link"] = get_safe(manubot, "URL").strip()
        # abstract
        citation["abstract"] = get_safe(manubot, "abstract").strip()
        # language
        citation["language"] = get_safe(manubot, "language", "en").strip()
        # note
        citation["note"] = get_safe(manubot, "note").strip()
        # add citation to list
        citations.append(citation)
    return citations


# get YYYY-MM-DD date string from wonky Manubot split-date format
def get_date(date_parts):
    # extract date part
    def get_part(date_parts, index):
        try:
            return int(date_parts["date-parts"][0][index])
        except Exception:
            return 0

    # date
    year = get_part(date_parts, 0)
    if year:
        # fallbacks for month and day
        month = get_part(date_parts, 1) or "01"
        day = get_part(date_parts, 2) or "01"
        return f"{year:04}-{month:02}-{day:02}"
    else:
        # if no year, consider date missing data
        return ""


def get_safe(item, path, default=""):
    """
    safely access value in nested lists/dicts
    """
    for part in str(path).split("."):
        try:
            part = int(part)
        except ValueError:
            part = part
        try:
            item = item[part]
        except (KeyError, IndexError, AttributeError, TypeError):
            return default
    return item


test = ["doi:10.1101/2023.10.09.23296582", "pmid:11246464"]
print(json.dumps(cite_with_manubot(test), indent=4))
```
I.e., from something like...
this (raw Manubot output)
```json { "publisher": "Cold Spring Harbor Laboratory", "abstract": "...to something like...
this (cleaned data, ready to be integrated into website)
```json { "id": "doi:10.1101/2023.10.09.23296582", "title": "A CCG expansion inABCD3causes oculopharyngodistal myopathy in individuals of European ancestry", "type": "manuscript", "doi": "10.1101/2023.10.09.23296582", "authors": [ ["Andrea", "Cortese"], ["Sarah J", "Beecroft"], ["Stefano", "Facchini"], ], "publisher": "Cold Spring Harbor Laboratory", "issn": "", "issued_date": "2023-10-10", "link": "https://doi.org/g8r5w2", "abstract": "