mattbierbaum / arxiv-public-datasets

A set of scripts to grab public datasets from resources related to arXiv
https://arxiv.org/abs/1905.00075
MIT License
409 stars 64 forks source link

How to resume downloads after resumption token has expired #26

Closed benearnthof closed 11 months ago

benearnthof commented 11 months ago

Hello and thank you for this very useful repository!

I used the all_of_arxiv function to successfully download the JSON snapshot from OAI last month and now want to update the data. To do so, I called all_of_arxiv with the resumption token and the file paths from the previous download. Unfortunately, this returns the following error:
RuntimeError: OAI service returned error: Error from database: Database error (noDataForResultSet - may have expired)

I tried hacking together a new resumption token with the resumption id from a new session, and the index of the old token like so:

from pathlib import Path
from os import getcwd
from omegaconf import OmegaConf
from arxiv_public_data.oai_metadata import (
    all_of_arxiv, 
    get_list_record_chunk, 
    check_xml_errors, 
    parse_xml_listrecords,
    URL_ARXIV_OAI,
)
from warnings import warn
import requests
import xml.etree.ElementTree as ET

def refresh_resumption_token(
    old_token,
    harvest_url=URL_ARXIV_OAI,
    metadataPrefix='arXivRaw'):
    """
    Build a fresh resumptionToken that points at the position of an expired one.

    Queries the OAI API for the first chunk of records to obtain a new
    resumption token, then combines the id part of that new token with the
    index part of the old token, so the download of new chunks can continue
    without having to redownload the entire dataset.

    Parameters
    ----------
        old_token : str
            expired resumptionToken of the form "<id>|<index>"
        harvest_url : str
            URL of the OAI endpoint that is queried for the new token
        metadataPrefix : str
            metadata format requested from the OAI endpoint

    Returns
    -------
        new_token : str
            fresh resumptionToken with index of old_token

    Raises
    ------
        ValueError
            if old_token has no index after a "|" separator
        requests.HTTPError
            if the HTTP request itself fails
        RuntimeError
            if the response contains no resumptionToken to take the id from
    """
    # Validate before hitting the network. Tokens read back from a file may
    # carry stray whitespace or a trailing newline, so strip them first.
    parts = (old_token or "").strip().split("|")
    if len(parts) < 2 or not parts[1].strip():
        raise ValueError(
            'old_token must look like "<id>|<index>", got {!r}'.format(old_token)
        )
    index = parts[1].strip()

    parameters = {'verb': 'ListRecords', 'metadataPrefix': metadataPrefix}
    response = requests.get(harvest_url, params=parameters)
    # Fail on HTTP errors here instead of trying to parse an error page as XML.
    response.raise_for_status()
    xml_root = ET.fromstring(response.text)
    # NOTE(review): check_xml_errors is expected to raise when the response
    # body reports an OAI-level error -- confirm against its implementation.
    check_xml_errors(xml_root)
    _, new_token = parse_xml_listrecords(xml_root)
    if not new_token:
        raise RuntimeError(
            'OAI response did not contain a resumptionToken to refresh from'
        )

    # Keep the id of the new session, but continue at the old position.
    return new_token.split("|")[0] + "|" + index

old_token = "6854384|2363001"

new_token = refresh_resumption_token(old_token)

# NOTE: outfile must already be defined; it is the path to the json.gz file
# where the initial download was saved.

# because all_of_arxiv overwrites resumptionToken with None for some reason,
# the token has to be handed over through the token file instead
tokenfile = '{}-resumptionToken.txt'.format(outfile)
with open(tokenfile, "w") as textfile:
    # Write the bare token only. Padding such as a trailing space or newline
    # would presumably be read back verbatim as part of the token, which the
    # OAI service can then reject.
    textfile.write(new_token)

all_of_arxiv(outfile=outfile, autoresume=True)

Here, outfile is the path to the json.gz file where the initial download was saved.

Unfortunately this results in the error: RuntimeError: OAI service returned error: The resumptionToken supplied is illegal for this repository

Is there a way to proceed with downloads after a longer time has passed and the resumptionToken has expired? I've looked through the other issues and http://www.openarchives.org/OAI/openarchivesprotocol.html#FlowControl but did not find a way to do this.

benearnthof commented 11 months ago

Update: I've resolved this problem by adjusting the function to directly use the newly generated token, which avoids the "illegal" error.

import gzip
import json
import time
import xml.etree.ElementTree as ET
from os import getcwd
from pathlib import Path
from warnings import warn

import requests
from omegaconf import OmegaConf

from arxiv_public_data.config import LOGGER, DIR_BASE
from arxiv_public_data.oai_metadata import (
    all_of_arxiv,
    get_list_record_chunk,
    check_xml_errors,
    parse_xml_listrecords,
    URL_ARXIV_OAI,
)

# Child logger used for the progress messages of the snapshot update.
log = LOGGER.getChild('metadata')

# The snapshot location is read from a project-local config file, resolved
# relative to the current working directory.
cfg = Path(getcwd()) / "./sciencenow/config/secrets.yaml"
config = OmegaConf.load(cfg)

outfile = config.ARXIV_SNAPSHOT + ".gz"

# The resumption token is kept in a text file next to the snapshot.
tokenfile = '{}-resumptionToken.txt'.format(outfile)

if Path(tokenfile).exists():
    # Close the file deterministically and drop stray whitespace/newlines so
    # they do not end up inside the token.
    with open(tokenfile, 'r') as token_fh:
        old_token = token_fh.read().strip()
else:
    # Keep the name defined so a missing token is visible as None rather than
    # surfacing later as a NameError.
    old_token = None
    warn("No resumption token found.")

def update_arxiv_snapshot(
    old_token,
    harvest_url=URL_ARXIV_OAI,
    metadataPrefix='arXivRaw',
    outfile=outfile):
    """
    Continue an interrupted snapshot download whose resumptionToken has expired.

    Queries the OAI API for the first chunk of records to obtain a new
    resumption token, then uses the index of the old resumption token together
    with the id of the new one to continue the download of new chunks without
    having to redownload the entire dataset from scratch. Every chunk is
    appended to `outfile` as one JSON document per line, and the latest
    resumption token is saved next to it after each chunk.

    Parameters
    ----------
        old_token : str
            expired resumptionToken of the form "<id>|<index>"
        harvest_url : str
            URL of the OAI endpoint queried for the new token
        metadataPrefix : str
            metadata format requested from the OAI endpoint
        outfile : str
            path of the gzip-compressed, line-delimited JSON snapshot to append to

    Raises
    ------
        ValueError
            if old_token is missing or has no index after a "|" separator
        requests.HTTPError
            if the initial HTTP request fails
        RuntimeError
            if the initial response contains no resumptionToken
    """
    # Validate before hitting the network. Tokens read back from a file may
    # carry stray whitespace or a trailing newline, so strip them first.
    parts = (old_token or "").strip().split("|")
    if len(parts) < 2 or not parts[1].strip():
        raise ValueError(
            'old_token must look like "<id>|<index>", got {!r}'.format(old_token)
        )
    index = parts[1].strip()

    # Derive the token file from the outfile actually written to, so that a
    # caller-supplied outfile does not update the token of another snapshot.
    tokenfile = '{}-resumptionToken.txt'.format(outfile)

    parameters = {'verb': 'ListRecords', 'metadataPrefix': metadataPrefix}
    response = requests.get(harvest_url, params=parameters)
    # Fail on HTTP errors here instead of trying to parse an error page as XML.
    response.raise_for_status()
    xml_root = ET.fromstring(response.text)
    check_xml_errors(xml_root)
    # The records of this first chunk are discarded; only its token is needed.
    _, new_token = parse_xml_listrecords(xml_root)
    if not new_token:
        raise RuntimeError(
            'OAI response did not contain a resumptionToken to refresh from'
        )

    # Keep the id of the new session, but continue at the old position.
    resumptionToken = new_token.split("|")[0] + "|" + index
    chunk_index = 0
    total_records = 0
    while True:
        log.info('Index {:4d} | Records {:7d} | resumptionToken "{}"'.format(
            chunk_index, total_records, resumptionToken)
        )
        # NOTE(review): harvest_url is only used for the initial request above;
        # the chunk requests rely on get_list_record_chunk's own defaults.
        xml_root = ET.fromstring(get_list_record_chunk(resumptionToken))
        check_xml_errors(xml_root)
        records, resumptionToken = parse_xml_listrecords(xml_root)
        chunk_index += 1
        total_records += len(records)

        # Append mode: the existing snapshot is extended, not overwritten.
        with gzip.open(outfile, 'at', encoding='utf-8') as fout:
            for rec in records:
                fout.write(json.dumps(rec) + '\n')

        if not resumptionToken:
            log.info('No resumption token, query finished')
            return

        # Persist the token after every chunk so a later run can pick up here.
        with open(tokenfile, 'w') as fout:
            fout.write(resumptionToken)
        time.sleep(12)  # OAI server usually requires a 10s wait

# To resume from a specific expired token instead of the one read from the
# token file, set it by hand, e.g.:
# old_token = "6854384|2363001"

# Continue the snapshot download from the position of the expired token.
update_arxiv_snapshot(old_token)

I'll write a pull request in the coming days, once I've cleaned this up and tested everything thoroughly. In the meantime, I hope these code snippets prove useful for other readers.