allenai / s2-folks

Public space for the user community of Semantic Scholar APIs to share scripts, report issues, and make suggestions.

Q: API rate limit #201

Closed: EricJoung1997 closed this issue 3 weeks ago

EricJoung1997 commented 3 weeks ago

I'm using the https://api.semanticscholar.org/graph/v1/paper/search API to search for papers. I have already applied for an API key, which should allow a rate of 100 requests per second. However, my code is still hitting 429 (Too Many Requests) errors. I am working with approximately 60,000 records. Below is my code:

import pandas as pd
import json
from tqdm import tqdm
import os
import asyncio
import aiohttp
from aiohttp import ClientSession
from aiohttp import ClientTimeout
import nest_asyncio

# Allow nested event loops
nest_asyncio.apply()

# Read API key from configuration file
with open('semantic_scholar_api_config.json') as config_file:
    config = json.load(config_file)
    semantic_scholar_api_key = config.get('semantic_scholar_api_key')

async def fetch_semantic_scholar_data(session, input_en_keywords):
    """
    Fetch paper data from the Semantic Scholar API.
    """
    url = "https://api.semanticscholar.org/graph/v1/paper/search/"
    params = {
        "query": input_en_keywords,
        "fields": "paperId,externalIds,title,year,authors,venue,publicationVenue,abstract,referenceCount,citationCount,fieldsOfStudy,s2FieldsOfStudy,journal,publicationTypes,publicationDate,openAccessPdf,references,citations",
        "limit": 1
    }
    headers = {
        "x-api-key": semantic_scholar_api_key,
        'Content-Type': 'application/json'
    }

    async with session.get(url, params=params, headers=headers) as response:
        if response.status == 200:
            response_data = await response.json()
            if response_data.get('data'):
                return response_data['data'][0]
            else:
                return None
        else:
            print({"error": f"Request failed with status code {response.status}"})
            return {"error": f"Request failed with status code {response.status}"}

async def process_title(session, row, title_col, label_col):
    """
    Process a single paper title to get detailed information.
    """
    title = row[title_col]
    label = row[label_col]
    wosId = row['UT (Unique ID)']
    result = await fetch_semantic_scholar_data(session, title)
    if result and "error" not in result:
        result['label'] = label
        result['wosId'] = wosId
        return result
    else:
        return {
            "title": title,
            "label": label,
            "wosId": wosId,
            "error": result.get("error", "Unknown error") if result else "No data returned"
        }

async def query_titles_and_save_to_json(df, title_col, label_col, output_file, temp_file='temp_results.json'):
    """
    Query all paper titles and save results to a JSON file.
    """
    # Try to read existing results from a temporary file
    try:
        with open(temp_file, 'r', encoding='utf-8') as f:
            results = json.load(f)
    except (FileNotFoundError, json.JSONDecodeError):
        results = []

    processed_titles = {result['title'] for result in results}
    remaining_data = len(df) - len(processed_titles)
    print(f'{remaining_data} records remaining')

    timeout = ClientTimeout(total=600)
    async with ClientSession(timeout=timeout) as session:
        tasks = []
        for index, row in df.iterrows():
            if row[title_col] not in processed_titles:
                tasks.append(process_title(session, row, title_col, label_col))
                if len(tasks) % 90 == 0:
                    # Execute tasks in batches of 90
                    for future in tqdm(asyncio.as_completed(tasks), total=len(tasks), desc="Processing"):
                        try:
                            result = await future
                            results.append(result)
                            # Save results every 100 records
                            if len(results) % 100 == 0:
                                with open(temp_file, 'w', encoding='utf-8') as f:
                                    json.dump(results, f, ensure_ascii=False, indent=4)
                            # Output remaining data every 1000 records
                            if len(results) % 1000 == 0:
                                remaining_data -= 1000
                                print(f'{remaining_data} records remaining')
                        except Exception as e:
                            print(f"Error processing row: {e}")
                    tasks = []
                    await asyncio.sleep(1)  # Wait 1 second before continuing

        # Process remaining tasks
        if tasks:
            for future in tqdm(asyncio.as_completed(tasks), total=len(tasks), desc="Processing"):
                try:
                    result = await future
                    results.append(result)
                    if len(results) % 100 == 0:
                        with open(temp_file, 'w', encoding='utf-8') as f:
                            json.dump(results, f, ensure_ascii=False, indent=4)
                    if len(results) % 1000 == 0:
                        remaining_data -= 1000
                        print(f'{remaining_data} records remaining')
                except Exception as e:
                    print(f"Error processing row: {e}")

    # Save final results to output file
    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(results, f, ensure_ascii=False, indent=4)

    # Delete temporary file
    if os.path.exists(temp_file):
        os.remove(temp_file)

def main():
    print('Reading data...')
    df = pd.read_excel('wos.xlsx', sheet_name='Sheet1')

    # Filter criteria
    publication_year_range = (2000, 2024)
    document_types = [
        'Article', 'Article; Early Access', 'Article; Proceedings Paper',
        'Journal Article', 'Proceedings Paper', 'Review', 'research-article'
    ]

    # Filter data
    filtered_df = df[
        (df['Publication Year'] >= publication_year_range[0]) &
        (df['Publication Year'] <= publication_year_range[1]) &
        (df['Document Type'].isin(document_types))
    ]

    # Remove duplicates based on 'UT (Unique ID)'
    deduplicated_df = filtered_df.drop_duplicates(subset=['UT (Unique ID)'])

    print('Data filtering completed')

    # Run query and save results
    asyncio.run(query_titles_and_save_to_json(deduplicated_df.iloc[50000:60000], 'Article Title', 'label', 'wos_data_with_SS.json'))

if __name__ == "__main__":
    main()
cfiorelli commented 3 weeks ago

@EricJoung1997 thanks for reaching out. I've found your key and discovered that it has been moved to a 1 RPS (one request per second) plan; this is due to our need to manage resources more carefully during 2024. Please adjust your code to keep its requests within the 1 RPS limit, and be sure to employ exponential backoff.
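
For reference, below is a minimal sketch of one way to stay within a 1 RPS budget using the same aiohttp setup as the script above. The helper throttled_get, the constants REQUESTS_PER_SECOND and MAX_RETRIES, and the rate_lock parameter are illustrative names, not part of the Semantic Scholar API or the original code: the idea is to serialize requests behind a shared lock so that roughly one request is issued per second, and to back off exponentially whenever a 429 comes back.

import asyncio
import aiohttp

REQUESTS_PER_SECOND = 1  # matches the 1 RPS plan described above
MAX_RETRIES = 5          # illustrative retry budget, not an API requirement

async def throttled_get(session: aiohttp.ClientSession, rate_lock: asyncio.Lock,
                        url, params=None, headers=None):
    """GET with a shared ~1 request/second throttle and exponential backoff on 429s."""
    for attempt in range(MAX_RETRIES):
        async with rate_lock:
            # Sleeping while holding the lock spaces all callers roughly 1 second apart.
            await asyncio.sleep(1.0 / REQUESTS_PER_SECOND)
            async with session.get(url, params=params, headers=headers) as response:
                if response.status == 200:
                    return await response.json()
                status = response.status
        if status == 429:
            # Back off 1s, 2s, 4s, ... before retrying the same request.
            await asyncio.sleep(2 ** attempt)
        else:
            return {"error": f"Request failed with status code {status}"}
    return {"error": f"Still rate limited after {MAX_RETRIES} attempts"}

In this sketch, fetch_semantic_scholar_data would call throttled_get(session, rate_lock, url, params=params, headers=headers) instead of session.get, with rate_lock created once inside query_titles_and_save_to_json (next to the ClientSession) and passed to every task. With a 1 RPS plan, firing batches of 90 concurrent requests will always trip the limiter, so it is also worth reducing the batch size rather than relying on retries alone.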