How to resume downloads after resumption token has expired

mattbierbaum / arxiv-public-datasets

A set of scripts to grab public datasets from resources related to arXiv

MIT License

409 stars 64 forks source link

Hello and thank you for this very useful repository!

I've used the all_of_arxiv function to successfully download the json snapshot from OAI last month and wanted to update the data now, so I tried to use all_of_arxiv with the respective resumption token and correct file paths from the previous download. Unfortunately this returns the following error:
RuntimeError: OAI service returned error: Error from database: Database error (noDataForResultSet - may have expired)

I tried hacking together a new resumption token with the resumption id from a new session, and the index of the old token like so:

from pathlib import Path
from os import getcwd
from omegaconf import OmegaConf
from arxiv_public_data.oai_metadata import (
    all_of_arxiv, 
    get_list_record_chunk, 
    check_xml_errors, 
    parse_xml_listrecords,
    URL_ARXIV_OAI,
)
from warnings import warn
import requests
import xml.etree.ElementTree as ET

def refresh_resumption_token(
    old_token,
    harvest_url=URL_ARXIV_OAI,
    metadataPrefix='arXivRaw'):
    """
    Combine a fresh OAI session id with the index of an expired resumptionToken.

    Issues one ListRecords request (without a resumptionToken) to obtain a new
    resumptionToken, then replaces the part after the first "|" of that token
    with the index taken from ``old_token``. The goal is to continue a download
    without having to redownload the entire dataset.

    NOTE(review): a token spliced together this way is not guaranteed to be
    accepted by the server -- confirm against the OAI service before relying on it.

    Parameters
    ----------
        old_token : str
            expired resumptionToken of the form "<id>|<index>"
        harvest_url : str
            URL of the OAI endpoint that is queried for a fresh token
        metadataPrefix : str
            value sent as the ``metadataPrefix`` request parameter

    Returns
    -------
        new_token : str
            fresh resumptionToken with index of old_token

    Raises
    ------
        ValueError
            if ``old_token`` does not contain a non-empty index after "|"
        requests.HTTPError
            if the server answers with an HTTP error status
        RuntimeError
            if the response carries no resumptionToken
            (check_xml_errors may also raise on an OAI-level error)
    """
    # A token read back from a token file may carry trailing whitespace or a
    # newline, which must not end up inside the rebuilt token.
    old_parts = old_token.strip().split("|")
    if len(old_parts) < 2 or not old_parts[1]:
        raise ValueError(
            'old_token must look like "<id>|<index>", got {!r}'.format(old_token)
        )
    index = old_parts[1]

    parameters = {'verb': 'ListRecords', 'metadataPrefix': metadataPrefix}
    response = requests.get(harvest_url, params=parameters)
    # Fail with a clear HTTP error instead of an XML parse error on a non-XML body.
    response.raise_for_status()

    xml_root = ET.fromstring(response.text)
    # Surface errors reported inside the OAI response before reading the token.
    check_xml_errors(xml_root)
    _, new_token = parse_xml_listrecords(xml_root)
    if not new_token:
        raise RuntimeError('OAI response did not contain a resumptionToken')

    return new_token.split("|")[0] + "|" + index

# NOTE: `outfile` (path of the json.gz snapshot) and `tokenfile` (path of its
# resumption-token text file) must be defined before running this part.
old_token = "6854384|2363001"

new_token = refresh_resumption_token(old_token)

# because all_of_arxiv overwrites resumptionToken with None for some reason we need to store in
# a text file
with open(tokenfile, "w") as textfile:
    # Write the bare token: the file content is presumably read back verbatim,
    # so trailing spaces or newlines would become part of the token.
    textfile.write(new_token)

all_of_arxiv(outfile=outfile, autoresume=True)

Here, outfile is the file path to the json.gz file where the initial download was saved.

Unfortunately this results in the error: RuntimeError: OAI service returned error: The resumptionToken supplied is illegal for this repository

Is there a way to proceed with downloads after a longer time has passed and the resumptionToken has expired? I've looked through the other issues and http://www.openarchives.org/OAI/openarchivesprotocol.html#FlowControl but did not find a way to do this.

import gzip
import json
import time
import xml.etree.ElementTree as ET
from os import getcwd
from pathlib import Path
from warnings import warn

import requests
from omegaconf import OmegaConf

from arxiv_public_data.config import LOGGER, DIR_BASE
from arxiv_public_data.oai_metadata import (
    all_of_arxiv,
    get_list_record_chunk,
    check_xml_errors,
    parse_xml_listrecords,
    URL_ARXIV_OAI,
)

log = LOGGER.getChild('metadata')

cfg = Path(getcwd()) / "./sciencenow/config/secrets.yaml"
config = OmegaConf.load(cfg)
outfile = config.ARXIV_SNAPSHOT + ".gz"
tokenfile = '{}-resumptionToken.txt'.format(outfile)

if Path(tokenfile).exists():
    with open(tokenfile, 'r') as fin:
        # Strip so that a trailing newline does not become part of the token.
        old_token = fin.read().strip()
else:
    warn("No resumption token found.")
    # Keep the name defined so the call at the bottom can be skipped cleanly.
    old_token = None


def update_arxiv_snapshot(
    old_token,
    harvest_url=URL_ARXIV_OAI,
    metadataPrefix='arXivRaw',
    outfile=outfile):
    """
    Continue an interrupted metadata download using a rebuilt resumptionToken.

    Queries the OAI API for one chunk of records to obtain a new resumption
    token, then uses the index of the old resumption token to continue the
    download of new chunks without having to redownload the entire dataset
    from scratch. Each chunk of records is appended to ``outfile`` as one JSON
    document per line, and the latest resumptionToken is written to the
    module-level ``tokenfile``.

    NOTE(review): ``tokenfile`` is derived from the module-level ``outfile``,
    not from the ``outfile`` argument -- confirm this is intended when passing
    a different ``outfile``.

    Parameters
    ----------
        old_token : str
            expired resumptionToken of the form "<id>|<index>"
        harvest_url : str
            URL of the OAI endpoint queried for the fresh token
        metadataPrefix : str
            value sent as the ``metadataPrefix`` request parameter
        outfile : str
            gzip file the downloaded records are appended to
    """
    parameters = {'verb': 'ListRecords'}
    parameters['metadataPrefix'] = metadataPrefix
    response = requests.get(harvest_url, params=parameters)
    xml_root = ET.fromstring(response.text)
    # Surface errors reported inside the OAI response before reading the token.
    check_xml_errors(xml_root)
    _, new_token = parse_xml_listrecords(xml_root)

    # Splice the fresh session id together with the old position.
    index = old_token.split("|")[1]
    resumptionToken = new_token.split("|")[0] + "|" + index

    chunk_index = 0
    total_records = 0
    while True:
        log.info('Index {:4d} | Records {:7d} | resumptionToken "{}"'.format(
            chunk_index, total_records, resumptionToken)
        )
        xml_root = ET.fromstring(get_list_record_chunk(resumptionToken))
        check_xml_errors(xml_root)
        records, resumptionToken = parse_xml_listrecords(xml_root)

        chunk_index = chunk_index + 1
        total_records = total_records + len(records)

        # Append mode: earlier chunks already stored in the file are kept.
        with gzip.open(outfile, 'at', encoding='utf-8') as fout:
            for rec in records:
                fout.write(json.dumps(rec) + '\n')
        if resumptionToken:
            # Persist the token so a later run can pick up from this point.
            with open(tokenfile, 'w') as fout:
                fout.write(resumptionToken)
        else:
            log.info('No resumption token, query finished')
            return

        time.sleep(12)  # OAI server usually requires a 10s wait


# Example of an expired token: old_token = "6854384|2363001"
if old_token:
    update_arxiv_snapshot(old_token)

mattbierbaum / arxiv-public-datasets

How to resume downloads after resumption token has expired #26