Closed benearnthof closed 11 months ago
Update: I've resolved this problem by adjusting the function to directly use the newly generated token, which avoids the "illegal" error.
import gzip
import json
import time
import xml.etree.ElementTree as ET
from os import getcwd
from pathlib import Path
from warnings import warn

import requests
from omegaconf import OmegaConf

from arxiv_public_data.config import LOGGER, DIR_BASE
from arxiv_public_data.oai_metadata import (
    all_of_arxiv,
    get_list_record_chunk,
    check_xml_errors,
    parse_xml_listrecords,
    URL_ARXIV_OAI,
)
# Child of the package-wide logger, used for harvest progress messages.
log = LOGGER.getChild('metadata')

# Secrets file, resolved relative to the current working directory.
cfg = Path(getcwd()) / "./sciencenow/config/secrets.yaml"
config = OmegaConf.load(cfg)

# Gzipped snapshot file that new records are appended to, and the sidecar
# file holding the last resumptionToken that was saved for it.
outfile = config.ARXIV_SNAPSHOT + ".gz"
tokenfile = '{}-resumptionToken.txt'.format(outfile)

if Path(tokenfile).exists():
    # Context manager so the file handle is closed deterministically.
    with open(tokenfile, 'r') as token_fh:
        old_token = token_fh.read()
else:
    warn("No resumption token found.")
    # Keep the name bound so later code can test for a missing token
    # instead of failing with a NameError.
    old_token = None
def update_arxiv_snapshot(
        old_token,
        harvest_url=URL_ARXIV_OAI,
        metadataPrefix='arXivRaw',
        outfile=outfile):
    """
    Resume an arXiv OAI metadata download whose resumptionToken has expired.

    Queries the OAI API for the metadata of one chunk of articles to obtain
    a new resumption token, then combines the first part of that new token
    with the index of the old resumption token to continue the download of
    new chunks without having to redownload the entire dataset from scratch.
    Every downloaded chunk is appended to `outfile` as one JSON document per
    line, and the latest resumptionToken is saved next to it after each
    chunk.

    Parameters
    ----------
    old_token : str
        expired resumptionToken; must contain a "|" followed by the index
        at which the previous download stopped
    harvest_url : str
        URL of the OAI endpoint used to request the fresh token
    metadataPrefix : str
        value of the `metadataPrefix` query parameter of that request
    outfile : str
        path of the gzip file the records are appended to

    Raises
    ------
    ValueError
        if `old_token` is not a string of the form "<id>|<index>"
    requests.HTTPError
        if the request for the fresh token returns an HTTP error status
    RuntimeError
        if the first response carries no resumptionToken
        (NOTE(review): `check_xml_errors` presumably also raises
        RuntimeError on OAI-level errors -- confirm against the library)
    """
    # Validate the old token before touching the network.
    if not isinstance(old_token, str):
        raise ValueError(
            'old_token must be a string of the form "<id>|<index>", '
            'got {!r}'.format(old_token)
        )
    token_parts = old_token.strip().split("|")
    if len(token_parts) < 2 or not token_parts[1]:
        raise ValueError(
            'old_token {!r} does not contain an index after "|"'.format(
                old_token)
        )
    index = token_parts[1]

    # One plain ListRecords request, issued only to be handed a fresh token;
    # the records contained in this first response are discarded.
    parameters = {'verb': 'ListRecords', 'metadataPrefix': metadataPrefix}
    response = requests.get(harvest_url, params=parameters)
    response.raise_for_status()
    xml_root = ET.fromstring(response.text)
    # Same error check as for every later chunk.
    check_xml_errors(xml_root)
    _, new_token = parse_xml_listrecords(xml_root)
    if not new_token:
        raise RuntimeError(
            'OAI response did not contain a resumptionToken to resume from'
        )

    # Fresh token prefix + old index = token that continues where we stopped.
    resumptionToken = new_token.split("|")[0] + "|" + index

    # Derived from the `outfile` argument so that a caller-supplied path gets
    # its own token file; identical to the module-level path by default.
    token_path = '{}-resumptionToken.txt'.format(outfile)

    chunk_index = 0
    total_records = 0
    while True:
        log.info('Index {:4d} | Records {:7d} | resumptionToken "{}"'.format(
            chunk_index, total_records, resumptionToken)
        )
        xml_root = ET.fromstring(get_list_record_chunk(resumptionToken))
        check_xml_errors(xml_root)
        records, resumptionToken = parse_xml_listrecords(xml_root)
        chunk_index += 1
        total_records += len(records)

        # Append mode: the existing snapshot is extended, never rewritten.
        with gzip.open(outfile, 'at', encoding='utf-8') as fout:
            for rec in records:
                fout.write(json.dumps(rec) + '\n')

        if not resumptionToken:
            log.info('No resumption token, query finished')
            return

        # Persist the token after every chunk so an interrupted run can be
        # picked up again from the last completed chunk.
        with open(token_path, 'w') as fout:
            fout.write(resumptionToken)

        time.sleep(12)  # OAI server usually requires a 10s wait
# Example of a previously saved (now expired) token: "6854384|2363001"
# Only resume when a token file exists; without one there is no usable
# `old_token` and the call would fail before downloading anything.
if Path(tokenfile).exists():
    update_arxiv_snapshot(old_token)
I'll write a pull request in the coming days once I've cleaned this up and tested everything thoroughly. In the meantime, I hope these code snippets prove useful for other readers.
Hello and thank you for this very useful repository!
I've used the `all_of_arxiv` function to successfully download the JSON snapshot from OAI last month and wanted to update the data now, so I tried to use `all_of_arxiv` with the respective resumption token and correct file paths from the previous download. Unfortunately, this returns the following error:
RuntimeError: OAI service returned error: Error from database: Database error (noDataForResultSet - may have expired)
I tried hacking together a new resumption token with the resumption id from a new session, and the index of the old token like so:
With `outfile` being the file path to the json.gz file where the initial download was saved. Unfortunately, this results in the error:
RuntimeError: OAI service returned error: The resumptionToken supplied is illegal for this repository
Is there a way to proceed with downloads after a longer time has passed and the resumptionToken has expired? I've looked through the other issues and http://www.openarchives.org/OAI/openarchivesprotocol.html#FlowControl but did not find a way to do this.