mosaicml / streaming

A Data Streaming Library for Efficient Neural Network Training
https://streaming.docs.mosaicml.com
Apache License 2.0
1.09k stars 136 forks source link

Using minio with StreamingDataset #676

Closed abhijithneilabraham closed 4 months ago

abhijithneilabraham commented 4 months ago

I have been creating an S3 bucket with MinIO running on localhost, like this:

import os
import boto3
from streaming import StreamingDataset
from torch.utils.data import DataLoader

# FIX: StreamingDataset builds its OWN boto3 client internally — it never
# uses the `s3_client` constructed below. Streaming's internal client reads
# the endpoint from the `S3_ENDPOINT_URL` environment variable; without it,
# downloads go to AWS proper and fail with
# "Object s3://.../index.json not found".
os.environ['S3_ENDPOINT_URL'] = os.environ['AWS_S3_ENDPOINT_URL']

# Configure a boto3 client to interact with MinIO directly (handy for
# uploading shards or sanity-checking objects; NOT used by StreamingDataset).
s3_client = boto3.client(
    's3',
    endpoint_url=os.environ['AWS_S3_ENDPOINT_URL'],
    aws_access_key_id=os.environ['AWS_ACCESS_KEY_ID'],
    aws_secret_access_key=os.environ['AWS_SECRET_ACCESS_KEY'],
    region_name=os.environ['AWS_REGION'],
    use_ssl=os.environ['S3_USE_HTTPS'] == '1',
    verify=os.environ['S3_VERIFY_SSL'] == '1',
    config=boto3.session.Config(signature_version='s3v4'),
)

bucket_name = 'dudetest'
train_object_name = 'train_dataset'
eval_object_name = 'test_dataset'

# Each remote URL must point at the prefix that contains the dataset's
# index.json (written by streaming's MDSWriter).
remote_url_train = f's3://{bucket_name}/{train_object_name}'
remote_url_test = f's3://{bucket_name}/{eval_object_name}'

# Log URLs
print(f"Remote URL for training data: {remote_url_train}")
print(f"Remote URL for testing data: {remote_url_test}")

# Create streaming datasets; `local` is the on-disk shard cache. Each
# dataset needs its own distinct `local` directory.
train_dataset = StreamingDataset(
    remote=remote_url_train,
    local='./local_cache/train',
    shuffle=True,
    batch_size=64,
)

eval_dataset = StreamingDataset(
    remote=remote_url_test,
    local='./local_cache/eval',
    shuffle=False,
    batch_size=64,
)

# Create DataLoaders. NOTE(review): with StreamingDataset, shuffling is
# handled by the dataset itself (`shuffle=True` above); `shuffle=True` on
# the DataLoader is typically unnecessary — confirm against streaming docs.
train_dataloader = DataLoader(train_dataset, batch_size=64, shuffle=True)
eval_dataloader = DataLoader(eval_dataset, batch_size=64, shuffle=False)

I checked if the files are correctly present, and tried to provide the local minio s3 URL to StreamingDataset, but encountered an error:

 line 87, in <module>
    train_dataset = StreamingDataset(
  File "/Users/abhijithneilabraham/mambaforge/envs/py/lib/python3.9/site-packages/streaming/base/dataset.py", line 473, in __init__
    stream_shards = stream.get_shards(self._unique_rank_world, self.allow_unsafe_types)
  File "/Users/abhijithneilabraham/mambaforge/envs/py/lib/python3.9/site-packages/streaming/base/stream.py", line 446, in get_shards
    tmp_filename = self._download_file(basename, basename + '.tmp')
  File "/Users/abhijithneilabraham/mambaforge/envs/py/lib/python3.9/site-packages/streaming/base/stream.py", line 310, in _download_file
    retry(num_attempts=self.download_retry)(
  File "/Users/abhijithneilabraham/mambaforge/envs/py/lib/python3.9/site-packages/streaming/base/util.py", line 525, in new_func
    raise e
  File "/Users/abhijithneilabraham/mambaforge/envs/py/lib/python3.9/site-packages/streaming/base/util.py", line 521, in new_func
    return func(*args, **kwargs)
  File "/Users/abhijithneilabraham/mambaforge/envs/py/lib/python3.9/site-packages/streaming/base/stream.py", line 311, in <lambda>
    lambda: download_file(remote, local, self.download_timeout))()
  File "/Users/abhijithneilabraham/mambaforge/envs/py/lib/python3.9/site-packages/streaming/base/storage/download.py", line 507, in download_file
    download_from_s3(remote, local, timeout)
  File "/Users/abhijithneilabraham/mambaforge/envs/py/lib/python3.9/site-packages/streaming/base/storage/download.py", line 111, in download_from_s3
    raise e
  File "/Users/abhijithneilabraham/mambaforge/envs/py/lib/python3.9/site-packages/streaming/base/storage/download.py", line 101, in download_from_s3
    _download_file(extra_args=extra_args)
  File "/Users/abhijithneilabraham/mambaforge/envs/py/lib/python3.9/site-packages/streaming/base/storage/download.py", line 73, in _download_file
    s3.download_file(obj.netloc,
  File "/Users/abhijithneilabraham/mambaforge/envs/py/lib/python3.9/site-packages/boto3/s3/inject.py", line 190, in download_file
    return transfer.download_file(
  File "/Users/abhijithneilabraham/mambaforge/envs/py/lib/python3.9/site-packages/boto3/s3/transfer.py", line 326, in download_file
    future.result()
  File "/Users/abhijithneilabraham/mambaforge/envs/py/lib/python3.9/site-packages/s3transfer/futures.py", line 103, in result
    return self._coordinator.result()
  File "/Users/abhijithneilabraham/mambaforge/envs/py/lib/python3.9/site-packages/s3transfer/futures.py", line 266, in result
    raise self._exception
  File "/Users/abhijithneilabraham/mambaforge/envs/py/lib/python3.9/site-packages/s3transfer/tasks.py", line 269, in _main
    self._submit(transfer_future=transfer_future, **kwargs)
  File "/Users/abhijithneilabraham/mambaforge/envs/py/lib/python3.9/site-packages/s3transfer/download.py", line 354, in _submit
    response = client.head_object(
  File "/Users/abhijithneilabraham/mambaforge/envs/py/lib/python3.9/site-packages/botocore/client.py", line 530, in _api_call
    return self._make_api_call(operation_name, kwargs)
  File "/Users/abhijithneilabraham/mambaforge/envs/py/lib/python3.9/site-packages/botocore/client.py", line 964, in _make_api_call
    raise error_class(parsed_response, operation_name)

botocore.exceptions.ClientError: Object s3://dudetest/train_dataset/index.json not found! Either check the bucket path or the bucket permission. If the bucket is a requester pays bucket, then provide the bucket name to the environment variable `MOSAICML_STREAMING_AWS_REQUESTER_PAYS`.

Is this happening because using a MinIO S3 endpoint with StreamingDataset is not supported?

abhijithneilabraham commented 4 months ago

Fixed — I needed to set `os.environ['S3_ENDPOINT_URL']`. StreamingDataset creates its own internal S3 client and reads the endpoint from that environment variable, so configuring a separate boto3 client has no effect on it.