Open wdp-007 opened 11 months ago
This is not minimal code, but I hope it helps.
import os

import boto3
# Name of the S3 bucket that the download helper reads from.
bucket_name = 'arxiv'

# Build a boto3 session from explicit credentials; replace the 'XXX' / 'YYY'
# placeholders with your own access key pair before running.
session = boto3.session.Session(aws_access_key_id='XXX', aws_secret_access_key='YYY')

# S3 client bound to the session above; used for every download.
s3 = session.client('s3')
def download_arxiv_file(origin_path, target_path=None, verbose=True, *,
                        download_dir='./downloaded/'):
    """Download one object from the S3 bucket to local disk, skipping existing files.

    Args:
        origin_path: Key of the object inside the bucket named by the
            module-level ``bucket_name``.
        target_path: Location of the local copy, relative to ``download_dir``.
            Defaults to ``origin_path``.
        verbose: When true, print a message if the file is already present
            and after a successful download.
        download_dir: Local directory under which downloaded files are stored.

    Returns:
        None. Errors raised while downloading are printed, not re-raised.
    """
    if target_path is None:
        target_path = origin_path
    # Drop leading slashes so os.path.join cannot discard download_dir and
    # the file always lands underneath it.
    target_path = os.path.join(download_dir, target_path.lstrip('/'))

    # Ensure the directory exists. exist_ok avoids the check-then-create race
    # when several downloads start at the same time.
    directory = os.path.dirname(target_path)
    if directory:
        os.makedirs(directory, exist_ok=True)

    if os.path.exists(target_path):
        if verbose:
            print(f'🐤 {origin_path} already downloaded to {target_path}')
        return

    try:
        # The request is sent with RequestPayer='requester', so transfer is
        # billed to the caller's AWS account.
        s3.download_file(bucket_name, origin_path, target_path, ExtraArgs={'RequestPayer': 'requester'})
        if verbose:
            print(f'🥳 Download ({origin_path}) successfully.')
    except Exception as e:
        # Best-effort: report the failure and let the caller continue.
        print(f"❗️ An error occurred: {e}")
# Fetch the PDF manifest first, then the source manifest.
for manifest_key in ('pdf/arXiv_pdf_manifest.xml', 'src/arXiv_src_manifest.xml'):
    download_arxiv_file(manifest_key)
This is not minimal code, but I hope it helps.
import boto3

# Name of the S3 bucket that the download helper reads from.
bucket_name = 'arxiv'

# Build a boto3 session from explicit credentials; replace the 'XXX' / 'YYY'
# placeholders with your own access key pair before running.
session = boto3.session.Session(aws_access_key_id='XXX', aws_secret_access_key='YYY')

# S3 client bound to the session above; used for every download.
s3 = session.client('s3')
def download_arxiv_file(origin_path, target_path=None, verbose=True, *,
                        download_dir='./downloaded/'):
    """Download one object from the S3 bucket to local disk, skipping existing files.

    Args:
        origin_path: Key of the object inside the bucket named by the
            module-level ``bucket_name``.
        target_path: Location of the local copy, relative to ``download_dir``.
            Defaults to ``origin_path``.
        verbose: When true, print a message if the file is already present
            and after a successful download.
        download_dir: Local directory under which downloaded files are stored.

    Returns:
        None. Errors raised while downloading are printed, not re-raised.
    """
    if target_path is None:
        target_path = origin_path
    # Drop leading slashes so os.path.join cannot discard download_dir and
    # the file always lands underneath it.
    target_path = os.path.join(download_dir, target_path.lstrip('/'))

    # Ensure the directory exists. exist_ok avoids the check-then-create race
    # when several downloads start at the same time.
    directory = os.path.dirname(target_path)
    if directory:
        os.makedirs(directory, exist_ok=True)

    if os.path.exists(target_path):
        if verbose:
            print(f'🐤 {origin_path} already downloaded to {target_path}')
        return

    try:
        # The request is sent with RequestPayer='requester', so transfer is
        # billed to the caller's AWS account.
        s3.download_file(bucket_name, origin_path, target_path, ExtraArgs={'RequestPayer': 'requester'})
        if verbose:
            print(f'🥳 Download ({origin_path}) successfully.')
    except Exception as e:
        # Best-effort: report the failure and let the caller continue.
        print(f"❗️ An error occurred: {e}")
# Fetch the PDF manifest first, then the source manifest.
for manifest_key in ('pdf/arXiv_pdf_manifest.xml', 'src/arXiv_src_manifest.xml'):
    download_arxiv_file(manifest_key)
@lilingxi01 Thank you very much for this very useful and clear code snippet!
I'm trying to find out how much it would cost to download part or all of the bucket, but I can't find any information about costs for Requester Pays buckets in the AWS documentation (https://docs.aws.amazon.com/AmazonS3/latest/userguide/RequesterPaysBuckets.html). Do you have any idea?
@OrianeN It should be the same bandwidth cost as when you host an S3 bucket yourself. You can check the S3 pricing page. There is no dedicated Requester Pays pricing page because the prices are the same. The only difference is who pays the fee.
Hope it helps.
Thanks for your good work! Are you planning to release the datasets, or to provide the scripts to download the arXiv files?