learnables / learn2learn

A PyTorch Library for Meta-learning Research
http://learn2learn.net
MIT License

Error downloading mini-imagenet data #310

Closed: brando90 closed this issue 1 year ago

brando90 commented 2 years ago

Somehow I get the following error:

Downloading: /home/miranda9/data/l2l_data/mini-imagenet-cache-test.pkl
Traceback (most recent call last):
  File "/home/miranda9/miniconda3/envs/meta_learning_a100/lib/python3.9/site-packages/learn2learn/vision/datasets/mini_imagenet.py", line 102, in __init__
    self.data = pickle.load(f)
_pickle.UnpicklingError: invalid load key, '<'.

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/home/miranda9/ultimate-utils/ultimate-utils-proj-src/uutils/torch_uu/dataloaders/meta_learning/l2l_mini_imagenet_mi.py", line 268, in <module>
    mi_test()
  File "/home/miranda9/ultimate-utils/ultimate-utils-proj-src/uutils/torch_uu/dataloaders/meta_learning/l2l_mini_imagenet_mi.py", line 255, in mi_test
    args.tasksets: BenchmarkTasksets = get_tasksets(
  File "/home/miranda9/ultimate-utils/ultimate-utils-proj-src/uutils/torch_uu/dataloaders/meta_learning/l2l_mini_imagenet_mi.py", line 212, in get_tasksets
    datasets, transforms = _TASKSETS[name](train_ways=train_ways,
  File "/home/miranda9/ultimate-utils/ultimate-utils-proj-src/uutils/torch_uu/dataloaders/meta_learning/l2l_mini_imagenet_mi.py", line 78, in mini_imagenet_tasksets
    test_dataset = l2l.vision.datasets.MiniImagenet(
  File "/home/miranda9/miniconda3/envs/meta_learning_a100/lib/python3.9/site-packages/learn2learn/vision/datasets/mini_imagenet.py", line 108, in __init__
    self.data = pickle.load(f)
_pickle.UnpicklingError: invalid load key, '<'.

but I only get it on the DGX machine, not locally... has anyone had this error before?

cross: https://stackoverflow.com/questions/71094907/how-does-one-download-data-for-a-dgx-a100-machine-that-returns-a-invalid-load-ke
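As far as I can tell, `invalid load key, '<'` means the first byte of the pickle file is `<`, i.e. the downloaded .pkl is actually an HTML page rather than pickled data. A quick sanity check (path taken from the log above):

from pathlib import Path

# Peek at the first bytes of the cached file; a real pickle does not start with b'<'.
pkl = Path('~/data/l2l_data/mini-imagenet-cache-test.pkl').expanduser()
with open(pkl, 'rb') as f:
    print(f.read(64))  # b'<!DOCTYPE html>...' would mean an HTML error page was saved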

brando90 commented 2 years ago

I've tried re-downloading the data...

brando90 commented 2 years ago

reproduction script:

import learn2learn as l2l

from learn2learn.data.transforms import NWays, KShots, LoadData, RemapLabels, ConsecutiveLabels
from torchvision.transforms import (Compose, ToPILImage, ToTensor, RandomCrop, RandomHorizontalFlip,
                                    ColorJitter, Normalize)

"""
The benchmark module provides a convenient interface to standardized benchmarks in the literature.
It provides train/validation/test TaskDatasets and TaskTransforms for pre-defined datasets.

This utility is useful for researchers to compare new algorithms against existing benchmarks.
For a more fine-grained control over tasks and data, we recommend directly using `l2l.data.TaskDataset` and `l2l.data.TaskTransforms`.
"""

import os
import learn2learn as l2l

from collections import namedtuple

# from .omniglot_benchmark import omniglot_tasksets
# from .mini_imagenet_benchmark import mini_imagenet_tasksets
# from .tiered_imagenet_benchmark import tiered_imagenet_tasksets
# from .fc100_benchmark import fc100_tasksets
# from .cifarfs_benchmark import cifarfs_tasksets

def mini_imagenet_tasksets(
        train_ways=5,
        train_samples=10,
        test_ways=5,
        test_samples=10,
        root='~/data',
        data_augmentation=None,
        device=None,
        **kwargs,
):
    """Tasksets for mini-ImageNet benchmarks."""
    if data_augmentation is None:
        train_data_transforms = None
        test_data_transforms = None
        assert False
    elif data_augmentation == 'normalize':
        train_data_transforms = Compose([
            lambda x: x / 255.0,
        ])
        test_data_transforms = train_data_transforms
        assert False
    elif data_augmentation == 'lee2019':
        normalize = Normalize(
            mean=[120.39586422 / 255.0, 115.59361427 / 255.0, 104.54012653 / 255.0],
            std=[70.68188272 / 255.0, 68.27635443 / 255.0, 72.54505529 / 255.0],
        )
        train_data_transforms = Compose([
            ToPILImage(),
            RandomCrop(84, padding=8),
            ColorJitter(brightness=0.4, contrast=0.4, saturation=0.4),
            RandomHorizontalFlip(),
            ToTensor(),
            normalize,
        ])
        test_data_transforms = Compose([
            normalize,
        ])
    else:
        raise ValueError('Invalid data_augmentation argument.')

    train_dataset = l2l.vision.datasets.MiniImagenet(
        root=root,
        mode='train',
        download=True,
    )
    valid_dataset = l2l.vision.datasets.MiniImagenet(
        root=root,
        mode='validation',
        download=True,
    )
    test_dataset = l2l.vision.datasets.MiniImagenet(
        root=root,
        mode='test',
        download=True,
    )
    if device is None:
        train_dataset.transform = train_data_transforms
        valid_dataset.transform = train_data_transforms
        test_dataset.transform = test_data_transforms
    else:
        train_dataset = l2l.data.OnDeviceDataset(
            dataset=train_dataset,
            transform=train_data_transforms,
            device=device,
        )
        valid_dataset = l2l.data.OnDeviceDataset(
            dataset=valid_dataset,
            transform=train_data_transforms,
            device=device,
        )
        test_dataset = l2l.data.OnDeviceDataset(
            dataset=test_dataset,
            transform=test_data_transforms,
            device=device,
        )
    train_dataset = l2l.data.MetaDataset(train_dataset)
    valid_dataset = l2l.data.MetaDataset(valid_dataset)
    test_dataset = l2l.data.MetaDataset(test_dataset)

    train_transforms = [
        NWays(train_dataset, train_ways),
        KShots(train_dataset, train_samples),
        LoadData(train_dataset),
        RemapLabels(train_dataset),
        ConsecutiveLabels(train_dataset),
    ]
    valid_transforms = [
        NWays(valid_dataset, test_ways),
        KShots(valid_dataset, test_samples),
        LoadData(valid_dataset),
        ConsecutiveLabels(valid_dataset),
        RemapLabels(valid_dataset),
    ]
    test_transforms = [
        NWays(test_dataset, test_ways),
        KShots(test_dataset, test_samples),
        LoadData(test_dataset),
        RemapLabels(test_dataset),
        ConsecutiveLabels(test_dataset),
    ]

    _datasets = (train_dataset, valid_dataset, test_dataset)
    _transforms = (train_transforms, valid_transforms, test_transforms)
    return _datasets, _transforms

__all__ = ['list_tasksets', 'get_tasksets']

BenchmarkTasksets = namedtuple('BenchmarkTasksets', ('train', 'validation', 'test'))

_TASKSETS = {
    # 'omniglot': omniglot_tasksets,
    'mini-imagenet': mini_imagenet_tasksets,
    # 'tiered-imagenet': tiered_imagenet_tasksets,
    # 'fc100': fc100_tasksets,
    # 'cifarfs': cifarfs_tasksets,
}

def list_tasksets():
    """
    [[Source]](https://github.com/learnables/learn2learn/blob/master/learn2learn/vision/benchmarks/)

    **Description**

    Returns a list of all available benchmarks.

    **Example**
    ~~~python
    for name in l2l.vision.benchmarks.list_tasksets():
        print(name)
        tasksets = l2l.vision.benchmarks.get_tasksets(name)
    ~~~
    """
    return _TASKSETS.keys()

def get_tasksets(
        name,
        train_ways=5,
        train_samples=10,
        test_ways=5,
        test_samples=10,
        num_tasks=-1,
        root='~/data',
        device=None,
        **kwargs,
):
    """
    [[Source]](https://github.com/learnables/learn2learn/blob/master/learn2learn/vision/benchmarks/)

    **Description**

    Returns the tasksets for a particular benchmark, using literature standard data and task transformations.

    The returned object is a namedtuple with attributes `train`, `validation`, `test` which
    correspond to their respective TaskDatasets.
    See `examples/vision/maml_miniimagenet.py` for an example.

    **Arguments**

    * **name** (str) - The name of the benchmark. Full list in `list_tasksets()`.
    * **train_ways** (int, *optional*, default=5) - The number of classes per train tasks.
    * **train_samples** (int, *optional*, default=10) - The number of samples per train tasks.
    * **test_ways** (int, *optional*, default=5) - The number of classes per test tasks. Also used for validation tasks.
    * **test_samples** (int, *optional*, default=10) - The number of samples per test tasks. Also used for validation tasks.
    * **num_tasks** (int, *optional*, default=-1) - The number of tasks in each TaskDataset.
    * **device** (torch.Device, *optional*, default=None) - If not None, tasksets are loaded as Tensors on `device`.
    * **root** (str, *optional*, default='~/data') - Where the data is stored.

    **Example**
    ~~~python
    train_tasks, validation_tasks, test_tasks = l2l.vision.benchmarks.get_tasksets('omniglot')
    batch = train_tasks.sample()

    or:

    tasksets = l2l.vision.benchmarks.get_tasksets('omniglot')
    batch = tasksets.train.sample()
    ~~~
    """
    root = os.path.expanduser(root)

    # Load task-specific data and transforms
    datasets, transforms = _TASKSETS[name](train_ways=train_ways,
                                           train_samples=train_samples,
                                           test_ways=test_ways,
                                           test_samples=test_samples,
                                           root=root,
                                           device=device,
                                           **kwargs)
    train_dataset, validation_dataset, test_dataset = datasets
    train_transforms, validation_transforms, test_transforms = transforms

    # Instantiate the tasksets
    train_tasks = l2l.data.TaskDataset(
        dataset=train_dataset,
        task_transforms=train_transforms,
        num_tasks=num_tasks,
    )
    validation_tasks = l2l.data.TaskDataset(
        dataset=validation_dataset,
        task_transforms=validation_transforms,
        num_tasks=num_tasks,
    )
    test_tasks = l2l.data.TaskDataset(
        dataset=test_dataset,
        task_transforms=test_transforms,
        num_tasks=num_tasks,
    )
    return BenchmarkTasksets(train_tasks, validation_tasks, test_tasks)

# --

def mi_test():
    """
python -u ~/ultimate-utils/ultimate-utils-proj-src/uutils/torch_uu/dataloaders/meta_learning/l2l_mini_imagenet_mi.py
    """
    from argparse import Namespace
    from pathlib import Path

    args = Namespace(k_shots=5, k_eval=15, n_classes=5)
    args.data_option = 'mini-imagenet'  # no name assumes l2l, make sure you're calling get_l2l_tasksets
    args.data_path = Path('~/data/l2l_data/').expanduser()
    args.data_augmentation = 'lee2019'

    args.tasksets: BenchmarkTasksets = get_tasksets(
        args.data_option,
        train_samples=args.k_shots + args.k_eval,
        train_ways=args.n_classes,
        test_samples=args.k_shots + args.k_eval,
        test_ways=args.n_classes,
        root=args.data_path,
        data_augmentation=args.data_augmentation,
    )
    print(args.tasksets)

if __name__ == '__main__':
    mi_test()
    print('Done\a\n')
brando90 commented 2 years ago

Run the file above to reproduce.

brando90 commented 2 years ago

Perhaps I can try downloading the file directly? Or rsyncing it from my local machine... other ideas? Try other hardware, e.g. the other cluster I have access to...

seba-1511 commented 2 years ago

@brando90, this happens when Google drive limits the number of downloads of a file. Rsyncing a local copy works, else wait a few hours for the limit to reset.
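For example, a rough sketch of the rsync route (hostname and paths are placeholders):

import os

# Placeholder host/paths: copy a known-good local cache to the cluster.
os.system('rsync -avz ~/data/l2l_data/ user@dgx:~/data/l2l_data/')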

nightlessbaron commented 2 years ago

@seba-1511, I am facing the same issue for:

for the past 3-4 days. The downloading issue seems to persist.

seba-1511 commented 2 years ago

Thanks for the notification @nightlessbaron.

I’ll re-open, and if the issue persists we can come up with a solution.

nazago commented 2 years ago

A temporary workaround I found is the following:

1. Go to /learn2learn/vision/datasets and open the .py file for your dataset (for example, mini_imagenet.py).
2. Look through the code comments; they contain the links the script uses to download the data.
3. Click the links and download the files with your web browser.
4. Move them to the folder where the code looks for the dataset (by default ~/data).

This is how I solved it :)
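For reference, a quick way to check which of the pickle files mentioned in this thread ended up in the dataset folder (default root shown; adjust as needed):

from pathlib import Path

root = Path('~/data').expanduser()
for split in ('train', 'validation', 'test'):
    for name in (f'mini-imagenet-cache-{split}.pkl', f'mini-imagenet-bookkeeping-{split}.pkl'):
        p = root / name
        print(p.name, 'OK' if p.exists() else 'MISSING')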

woreom commented 1 year ago

Just use gdown to download your data from google drive
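For example, a minimal sketch (requires `pip install gdown`; the file ID below is the test-split ID quoted elsewhere in this thread, and may still be rate-limited or permission-restricted):

import gdown

# Sketch only: download the test-split cache file by its Google Drive ID.
url = 'https://drive.google.com/uc?id=1wpmY-hmiJUUlRBkO9ZDCXAcIpHEFdOhD'
gdown.download(url, 'mini-imagenet-cache-test.pkl', quiet=False)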

brando90 commented 1 year ago

@seba-1511 has this been solved? It came up again for me:

Traceback (most recent call last):
  File "/dfs/scratch0/brando9/miniconda/envs/metalearning_gpu/lib/python3.9/site-packages/learn2learn/vision/datasets/mini_imagenet.py", line 105, in __init__
    self.data = pickle.load(f)
_pickle.UnpicklingError: invalid load key, '<'.

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/dfs/scratch0/brando9/diversity-for-predictive-success-of-meta-learning/div_src/diversity_src/experiment_mains/main_dist_maml_l2l.py", line 2135, in <module>
    main()
  File "/dfs/scratch0/brando9/diversity-for-predictive-success-of-meta-learning/div_src/diversity_src/experiment_mains/main_dist_maml_l2l.py", line 2073, in main
    train(args=args)
  File "/dfs/scratch0/brando9/diversity-for-predictive-success-of-meta-learning/div_src/diversity_src/experiment_mains/main_dist_maml_l2l.py", line 2097, in train
    args.tasksets: BenchmarkTasksets = get_l2l_tasksets(args)
  File "/afs/cs.stanford.edu/u/brando9/ultimate-utils/ultimate-utils-proj-src/uutils/torch_uu/dataloaders/meta_learning/l2l_ml_tasksets.py", line 107, in get_l2l_tasksets
    args.tasksets: BenchmarkTasksets = hdb1_mi_omniglot_tasksets(
  File "/afs/cs.stanford.edu/u/brando9/diversity-for-predictive-success-of-meta-learning/div_src/diversity_src/dataloaders/hdb1_mi_omniglot_l2l.py", line 394, in hdb1_mi_omniglot_tasksets
    _datasets: tuple[IndexableDataSet] = get_indexable_list_of_datasets_mi_and_omniglot(root)
  File "/afs/cs.stanford.edu/u/brando9/diversity-for-predictive-success-of-meta-learning/div_src/diversity_src/dataloaders/hdb1_mi_omniglot_l2l.py", line 352, in get_indexable_list_of_datasets_mi_and_omniglot
    dataset_list_train, dataset_list_validation, dataset_list_test = get_mi_and_omniglot_list_data_set_splits(root,
  File "/afs/cs.stanford.edu/u/brando9/diversity-for-predictive-success-of-meta-learning/div_src/diversity_src/dataloaders/hdb1_mi_omniglot_l2l.py", line 329, in get_mi_and_omniglot_list_data_set_splits
    train_dataset, validation_dataset, test_dataset = get_mi_datasets(root, data_augmentation, device)
  File "/afs/cs.stanford.edu/u/brando9/diversity-for-predictive-success-of-meta-learning/div_src/diversity_src/dataloaders/hdb1_mi_omniglot_l2l.py", line 271, in get_mi_datasets
    train_dataset = l2l.vision.datasets.MiniImagenet(
  File "/dfs/scratch0/brando9/miniconda/envs/metalearning_gpu/lib/python3.9/site-packages/learn2learn/vision/datasets/mini_imagenet.py", line 111, in __init__
    self.data = pickle.load(f)
_pickle.UnpicklingError: invalid load key, '<'.
brando90 commented 1 year ago

Self-contained code:

def download_mi_l2l_data_selfcontained():
    """Download all three mini-ImageNet splits through learn2learn (download=True)."""
    import learn2learn as l2l
    train_dataset = l2l.vision.datasets.MiniImagenet(root='~/data/l2l_data', mode='train', download=True)
    validation_dataset = l2l.vision.datasets.MiniImagenet(root='~/data/l2l_data', mode='validation', download=True)
    test_dataset = l2l.vision.datasets.MiniImagenet(root='~/data/l2l_data', mode='test', download=True)
brando90 commented 1 year ago

Regarding the manual-download workaround quoted above (go to /learn2learn/vision/datasets, find the download links in the code comments, download them with a browser, and move the files to the data folder):

@seba-1511 why does the url end with ?dl=1?

dropbox_file_link = 'https://www.dropbox.com/s/ye9jeb5tyz0x01b/mini-imagenet-cache-test.pkl?dl=1'

I do have a script that downloads and extracts files and works for zip files, but I'm not sure it works with yours:

def download_and_extract(url: str,
                         path_2_ziplike: Path = Path('~/data/'),
                         path_2_dataset: Path = Path('~/data/tmp/'),
                         rm_zip_file: bool = True
                         ):
    """
    Downloads data and tries to extract it according to different protocols/file types.

    Tested with:
    - zip files, yes!

    Later:
    - todo: tar, gz, gdrive
    """
    path_2_ziplike: Path = expanduser(path_2_ziplike)
    path_2_dataset: Path = expanduser(path_2_dataset)
    # - download data
    import ssl
    ctx = ssl.create_default_context()
    ctx.check_hostname = False
    ctx.verify_mode = ssl.CERT_NONE
    print("downloading dataset from ", url)
    import urllib
    import http
    response: http.client.HTTPResponse = urllib.request.urlopen(url, context=ctx)
    print(f'{type(response)=}')
    data = response
    # save zipfile like data to path given
    filename = url.rpartition('/')[2]
    print(f'{filename=}')
    # if gdrive_download:  todo, later
    #     from torchvision.datasets.utils import download_file_from_google_drive, extract_archive
    #     file_id = '1rV3aj_hgfNTfCakffpPm7Vhpr1in87CR'
    #     filename_zip = 'miniImagenet.tgz'
    #     # if zip not there re-download it
    #     path_2_zip = path / filename_zip
    #     if not path_2_zip.exists():
    #         download_file_from_google_drive(file_id, path, filename_zip)
    if filename.endswith('.zip'):
        path_2_ziplike.mkdir(parents=True, exist_ok=True)
        path_2_zip_with_filename = path_2_ziplike / filename
        print(f'about to save: {path_2_zip_with_filename=}')
        # wb+ is used since the zip file is in bytes, otherwise w+ is fine if the data is a string
        with open(path_2_zip_with_filename, 'wb+') as f:
            f.write(data.read())
        print(f'done saving: {path_2_zip_with_filename=}')
    elif filename.endswith('.gz'):
        # inspired from tinfer, idk why but they don't save the zip file anywhere...cool I suppose?
        # import tarfile
        # try:
        #     file = tarfile.open(fileobj=response, mode="r|gz")
        # except Exception as e:
        #     logging.warning(e)
        #     print('if this fails look at the file extension and try something else '
        #           'e.g. tar cmd or other options in tar module above')
        pass  # do all work in the extraction step
    # elif is_tar_file(filename):
    #     os.system(f'tar -xvzf {path_2_zip_with_filename} -C {path_2_dataset}/')
    else:
        raise ValueError(f'File type {filename=} not supported.')

    # - unzip
    extract_to = path_2_dataset
    print(f'about to extract: {path_2_zip_with_filename=}')
    print(f'extract to target: {extract_to=}')
    if filename.endswith('.zip'):
        import zipfile  # this one is for zip files, inspired from l2l
        zip_ref = zipfile.ZipFile(path_2_zip_with_filename, 'r')
        zip_ref.extractall(extract_to)
        zip_ref.close()
        if rm_zip_file:
            path_2_zip_with_filename.unlink()
            # path_2_zip_with_filename.unlink(missing_ok=True)
    elif filename.endswith('.gz'):
        import tarfile
        file = tarfile.open(fileobj=response, mode="r|gz")
        file.extractall(path=extract_to)
        file.close()
    else:
        raise ValueError(f'File type {filename=} not supported, edit code to support it.')

        # path_2_zip_with_filename = path_2_ziplike / filename
        # os.system(f'tar -xvzf {path_2_zip_with_filename} -C {path_2_dataset}/')
        # if rm_zip_file:
        #     path_2_zip_with_filename.unlink()
        #     # path_2_zip_with_filename.unlink(missing_ok=True)
        # # raise ValueError(f'File type {filename=} not supported.')
    print(f'done extracting: {path_2_zip_with_filename=}')
    print(f'extracted at location:{path_2_dataset=}')
brando90 commented 1 year ago

this still doesn't work:

def download_mini_imagenet_fix():
    from uutils import download_and_extract
    download_and_extract('https://www.dropbox.com/s/ye9jeb5tyz0x01b/mini-imagenet-cache-test.pkl?dl=1',
                         '~/data/tmp', '~/data/tmp')

argh

def download_and_extract(url: str,
                         path_2_ziplike: Path = Path('~/data/'),
                         path_2_dataset: Path = Path('~/data/tmp/'),
                         rm_zip_file: bool = True
                         ):
    """
    Downloads data and tries to extract it according to different protocols/file types.

    Tested with:
    - zip files, yes!

    Later:
    - todo: tar, gz, gdrive
    """
    path_2_ziplike: Path = expanduser(path_2_ziplike)
    path_2_ziplike.mkdir(parents=True, exist_ok=True)
    path_2_dataset: Path = expanduser(path_2_dataset)
    path_2_dataset.mkdir(parents=True, exist_ok=True)
    # - download data
    import ssl
    ctx = ssl.create_default_context()
    ctx.check_hostname = False
    ctx.verify_mode = ssl.CERT_NONE
    print("downloading dataset from ", url)
    import urllib
    import http
    response: http.client.HTTPResponse = urllib.request.urlopen(url, context=ctx)
    print(f'{type(response)=}')
    data = response
    # save zipfile like data to path given
    filename = url.rpartition('/')[2]
    print(f'{filename=}')
    # if gdrive_download:  todo, later
    #     from torchvision.datasets.utils import download_file_from_google_drive, extract_archive
    #     file_id = '1rV3aj_hgfNTfCakffpPm7Vhpr1in87CR'
    #     filename_zip = 'miniImagenet.tgz'
    #     # if zip not there re-download it
    #     path_2_zip = path / filename_zip
    #     if not path_2_zip.exists():
    #         download_file_from_google_drive(file_id, path, filename_zip)
    if filename.endswith('.zip'):
        path_2_zip_with_filename = path_2_ziplike / filename
        print(f'about to save: {path_2_zip_with_filename=}')
        # wb+ is used since the zip file is in bytes, otherwise w+ is fine if the data is a string
        with open(path_2_zip_with_filename, 'wb+') as f:
            f.write(data.read())
        print(f'done saving: {path_2_zip_with_filename=}')
    elif filename.endswith('.gz'):
        # inspired from tinfer, idk why but they don't save the zip file anywhere...cool I suppose?
        # import tarfile
        # try:
        #     file = tarfile.open(fileobj=response, mode="r|gz")
        # except Exception as e:
        #     logging.warning(e)
        #     print('if this fails look at the file extension and try something else '
        #           'e.g. tar cmd or other options in tar module above')
        pass  # do all work in the extraction step
    elif filename.endswith('.pkl?dl=1'):
        path_2_zip_with_filename = path_2_ziplike / filename
        print(f'about to save: {path_2_zip_with_filename=}')
        # wb+ is used since the zip file is in bytes, otherwise w+ is fine if the data is a string
        with open(path_2_zip_with_filename, 'wb+') as f:
            f.write(data.read())
        print(f'done saving: {path_2_zip_with_filename=}')
    # elif is_tar_file(filename):
    #     os.system(f'tar -xvzf {path_2_zip_with_filename} -C {path_2_dataset}/')
    else:
        raise ValueError(f'File type {filename=} not supported.')

    # - unzip
    extract_to = path_2_dataset
    print(f'about to extract: {path_2_zip_with_filename=}')
    print(f'extract to target: {extract_to=}')
    if filename.endswith('.zip'):
        import zipfile  # this one is for zip files, inspired from l2l
        zip_ref = zipfile.ZipFile(path_2_zip_with_filename, 'r')
        zip_ref.extractall(extract_to)
        zip_ref.close()
        if rm_zip_file:
            path_2_zip_with_filename.unlink()
            # path_2_zip_with_filename.unlink(missing_ok=True)
    elif filename.endswith('.gz'):
        import tarfile
        file = tarfile.open(fileobj=response, mode="r|gz")
        file.extractall(path=extract_to)
        file.close()
    elif filename.endswith('.pkl?dl=1'):
        # no need to extract it, but when you use the data make sure you torch.load it or pickle.load it.
        data = torch.load(path_2_zip_with_filename)  # just to test
        assert data is not None
        print(f'{data=}')
        pass
    else:
        raise ValueError(f'File type {filename=} not supported, edit code to support it.')

        # path_2_zip_with_filename = path_2_ziplike / filename
        # os.system(f'tar -xvzf {path_2_zip_with_filename} -C {path_2_dataset}/')
        # if rm_zip_file:
        #     path_2_zip_with_filename.unlink()
        #     # path_2_zip_with_filename.unlink(missing_ok=True)
        # # raise ValueError(f'File type {filename=} not supported.')
    print(f'done extracting: {path_2_zip_with_filename=}')
    print(f'extracted at location:{path_2_dataset=}')
brando90 commented 1 year ago

new error:

0it [00:00, ?it/s]
Traceback (most recent call last):
  File "/Users/brandomiranda/opt/anaconda3/envs/meta_learning/lib/python3.9/site-packages/torch/serialization.py", line 608, in load
    return _legacy_load(opened_file, map_location, pickle_module, **pickle_load_args)
  File "/Users/brandomiranda/opt/anaconda3/envs/meta_learning/lib/python3.9/site-packages/torch/serialization.py", line 777, in _legacy_load
    magic_number = pickle_module.load(f, **pickle_load_args)
EOFError: Ran out of input

code:

def download_mini_imagenet_fix_use_gdrive():
    from uutils import download_and_extract
    download_and_extract(None,
                         '~/data/tmp', '~/data/tmp',
                         True,
                         '1wpmY-hmiJUUlRBkO9ZDCXAcIpHEFdOhD', 'mini-imagenet-cache-test.pkl'
                         )

extractor:

def download_and_extract(url: str,
                         path_used_for_zip: Path = Path('~/data/'),
                         path_used_for_dataset: Path = Path('~/data/tmp/'),
                         rm_zip_file: bool = True,
                         gdrive_file_id: Optional[str] = None,
                         gdrive_filename: Optional[str] = None,
                         ):
    """
    Downloads data and tries to extract it according to different protocols/file types.

    Tested with:
    - zip files, yes!

    Later:
    - todo: tar, gz, gdrive
    """
    path_used_for_zip: Path = expanduser(path_used_for_zip)
    path_used_for_zip.mkdir(parents=True, exist_ok=True)
    path_used_for_dataset: Path = expanduser(path_used_for_dataset)
    path_used_for_dataset.mkdir(parents=True, exist_ok=True)
    # - download data
    if gdrive_filename is None:
        import ssl
        ctx = ssl.create_default_context()
        ctx.check_hostname = False
        ctx.verify_mode = ssl.CERT_NONE
        print("downloading dataset from ", url)
        import urllib
        import http
        response: http.client.HTTPResponse = urllib.request.urlopen(url, context=ctx)
        print(f'{type(response)=}')
        data = response
        # save zipfile like data to path given
        filename = url.rpartition('/')[2]
        path2file: Path = path_used_for_zip / filename
    else:
        from torchvision.datasets.utils import download_file_from_google_drive
        # if zip not there re-download it
        path2file: Path = path_used_for_zip / gdrive_filename
        if not path2file.exists():
            download_file_from_google_drive(gdrive_file_id, path_used_for_zip, gdrive_filename)
        filename = gdrive_filename
    print(f'{path2file=}')
    print(f'{filename=}')
    if filename.endswith('.zip') or filename.endswith('.pkl'):
        if not path2file.exists():
            print(f'about to download: {path2file=}')
            # wb+ is used since the zip file is in bytes, otherwise w+ is fine if the data is a string
            with open(path2file, 'wb+') as f:
                f.write(data.read())
            print(f'done downloading: {path2file=}')
    elif filename.endswith('.gz'):
        pass  # the download of the data doesn't seem to be explicitly handled by me, that is done in the extract step by a magic function tarfile.open
    # elif is_tar_file(filename):
    #     os.system(f'tar -xvzf {path_2_zip_with_filename} -C {path_2_dataset}/')
    else:
        raise ValueError(f'File type {filename=} not supported.')

    # - unzip
    extract_to = path_used_for_dataset
    print(f'about to extract: {path2file=}')
    print(f'extract to target: {extract_to=}')
    if filename.endswith('.zip'):
        import zipfile  # this one is for zip files, inspired from l2l
        zip_ref = zipfile.ZipFile(path2file, 'r')
        zip_ref.extractall(extract_to)
        zip_ref.close()
        if rm_zip_file:
            path2file.unlink()
            # path_2_zip_with_filename.unlink(missing_ok=True)
    elif filename.endswith('.gz'):
        import tarfile
        file = tarfile.open(fileobj=response, mode="r|gz")
        file.extractall(path=extract_to)
        file.close()
    elif filename.endswith('.pkl'):
        # no need to extract it, but when you use the data make sure you torch.load it or pickle.load it.
        print(f'about to test torch.load of: {path2file=}')
        data = torch.load(path2file)  # just to test
        assert data is not None
        print(f'{data=}')
        pass
    else:
        raise ValueError(f'File type {filename=} not supported, edit code to support it.')
        # path_2_zip_with_filename = path_2_ziplike / filename
        # os.system(f'tar -xvzf {path_2_zip_with_filename} -C {path_2_dataset}/')
        # if rm_zip_file:
        #     path_2_zip_with_filename.unlink()
        #     # path_2_zip_with_filename.unlink(missing_ok=True)
        # # raise ValueError(f'File type {filename=} not supported.')
    print(f'done extracting: {path2file=}')
    print(f'extracted at location:{path_used_for_dataset=}')
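Side note: `EOFError: Ran out of input` from pickle usually means the file is empty (zero bytes), which would be consistent with `download_file_from_google_drive` silently writing nothing. A quick check (path matches the call above):

from pathlib import Path

p = Path('~/data/tmp/mini-imagenet-cache-test.pkl').expanduser()
if p.exists():
    print('size (bytes):', p.stat().st_size)   # 0 bytes would explain the EOFError
    with open(p, 'rb') as f:
        print('first bytes:', f.read(16))       # b'<' would mean an HTML page instead
else:
    print('file missing:', p)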
brando90 commented 1 year ago

Why is the content of your pickle file a webpage, @seba-1511?

<!DOCTYPE html><html class="maestro global-header" lang="en" xml:lang="en" xmlns="http://www.w3.org/1999/xhtml"><head><script nonce="fkn3kxSC3JSGmt7VCCOe">
window._goch_ = {};
window.addEventListener('click', function(event) {
    'use strict';
    for (var elm = event.target; elm; elm = elm.parentElement) {
        if (elm.id &&
            window._goch_.hasOwnProperty(elm.id) &&
            window._goch_[elm.id].call(elm, event) === false) {
            event.preventDefault();
        }
    }
}, true);
window._csp_external_script_nonce = "zL3+lyYuZa8JCKWibZJF"</script><link href="https://cfl.dropboxstatic.com" rel="preconnect" /><link href="https://cfl.dropboxstatic.com/static/metaserver/static/images/favicon-vfl8lUR9B.ico" rel="shortcut icon" /><title>Dropbox - mini-imagenet-cache-test.pkl - Simplify your life</title><meta content="noindex, nofollow, noimageindex" name="robots" /><meta content="origin-when-cross-origin" name="referrer" /><script type="text/javascript" nonce="fkn3kxSC3JSGmt7VCCOe">if (window.performance && window.performance.mark) { window.performance.mark("requirejs_start"); }</script> <link crossorigin="anonymous" href="https://cfl.dropboxstatic.com/static/metaserver/static/js/alameda_bundle/alameda_bundle_ie_en-vflsCZVlq.js" as="script" nonce="zL3+lyYuZa8JCKWibZJF" rel="preload" type="text/javascript" /> <script type="text/javascript" nonce="fkn3kxSC3JSGmt7VCCOe">
            (()=>{"use strict";const e=window;let a;const r=()=>{a=[],e.addRequireLoadCallback=e=>a.push(e),e.configureRequire=function(){const a=arguments;e.addRequireLoadCallback(()=>e.configureRequire.apply(null,a))},e.define=function(){const a=arguments;e.addRequireLoadCallback(()=>e.define.apply(null,a))},e.preLoadFile=(...a)=>{e.addRequireLoadCallback(()=>e.preLoadFile.apply(null,a))}};r(),e._insertRequireShim=r,e.InitRequireJs=r=>{e.requireContexts={},e.performance&&null!=e.performance.now&&(requirejs.onResourceLoad=(a,r,i)=>{const l=e.performance.now(),n=e.requireContexts[a.id];if(n){r.id in n.module_callback_times||(n.module_callback_times[r.id]={});const e=n.module_callback_times[r.id];e.loadTime=l,r.url&&(e.url=r.url),r.id&&(e.name=r.id),r.parentMap&&r.parentMap.url&&(e.parent=r.parentMap.url)}}),e.configureRequire=r,e.addRequireLoadCallback=e=>e(),a.forEach(e=>e())}})();

            window.CSP_SCRIPT_NONCE = "fkn3kxSC3JSGmt7VCCOe";
            </script> <script async="async" crossorigin="anonymous" src="https://cfl.dropboxstatic.com/static/metaserver/static/js/alameda_bundle/alameda_bundle_ie_en-vflsCZVlq.js" type="text/javascript" nonce="zL3+lyYuZa8JCKWibZJF"></script> <link crossorigin="anonymous" href="https://cfl.dropboxstatic.com/static/metaserver/static/fonts/paper-atlasgrotesk/AtlasGrotesk-Regular-Web-vflk7bxjs.woff2" as="font" rel="preload" type="font/woff2" /><link cr
...
brando90 commented 1 year ago

https://stackoverflow.com/questions/74383046/why-is-this-code-downloading-a-webpage-instead-of-file-data-i-want

brando90 commented 1 year ago

ok I think this works:

def download_mini_imagenet_brandos_download_from_zenodo():
    """
    Zenodo link of the dataset: https://zenodo.org/record/7311663#.Y21EE-zMJUc
    """
    from uutils import download_and_extract
    download_and_extract(url='https://zenodo.org/record/7311663/files/brandoslearn2learnminiimagenet.zip',
                         path_used_for_zip='~/data/tmp',
                         path_used_for_dataset='~/data/tmp/l2l_data',
                         rm_zip_file=True,
                         )
    # download_and_extract('https://zenodo.org/record/7311663/files/brandoslearn2learnminiimagenet.zip?download=1',
    #                      '~/data/tmp', '~/data/tmp')
    train_dataset = l2l.vision.datasets.MiniImagenet(root='~/data/tmp/l2l_data', mode='train', download=False)
    [data for data in train_dataset]  # force a full pass to check every item loads
    validation_dataset = l2l.vision.datasets.MiniImagenet(root='~/data/tmp/l2l_data', mode='validation', download=False)
    [data for data in validation_dataset]
    test_dataset = l2l.vision.datasets.MiniImagenet(root='~/data/tmp/l2l_data', mode='test', download=False)
    [data for data in test_dataset]
    for data in test_dataset:
        # print(f'{data=}')
        print(f'{data[0].size()=}')
        print(f'{data[1]=}')
    print('success loop through local data')

def download_and_extract(url: str,
                         path_used_for_zip: Path = Path('~/data/'),
                         path_used_for_dataset: Path = Path('~/data/tmp/'),
                         rm_zip_file: bool = True,
                         gdrive_file_id: Optional[str] = None,
                         gdrive_filename: Optional[str] = None,
                         ):
    """
    Downloads data and tries to extract it according to different protocols/file types.

    Tested with:
    - zip files, yes!

    Later:
    - todo: tar, gz, gdrive
    """
    path_used_for_zip: Path = expanduser(path_used_for_zip)
    path_used_for_zip.mkdir(parents=True, exist_ok=True)
    path_used_for_dataset: Path = expanduser(path_used_for_dataset)
    path_used_for_dataset.mkdir(parents=True, exist_ok=True)
    # - download data
    if gdrive_filename is None:  # not a gdrive download
        import ssl
        ctx = ssl.create_default_context()
        ctx.check_hostname = False
        ctx.verify_mode = ssl.CERT_NONE
        print("downloading dataset from url: ", url)
        import urllib
        import http
        response: http.client.HTTPResponse = urllib.request.urlopen(url, context=ctx)
        print(f'{type(response)=}')
        data = response
        # save zipfile like data to path given
        filename = url.rpartition('/')[2]
        path2file: Path = path_used_for_zip / filename
    else:
        from torchvision.datasets.utils import download_file_from_google_drive
        # if zip not there re-download it
        path2file: Path = path_used_for_zip / gdrive_filename
        if not path2file.exists():
            download_file_from_google_drive(gdrive_file_id, path_used_for_zip, gdrive_filename)
        filename = gdrive_filename
    print(f'{path2file=}')
    print(f'{filename=}')
    if filename.endswith('.zip') or filename.endswith('.pkl'):
        if not path2file.exists():
            print(f'about to download data to: {path2file=}')
            # wb+ is used since the zip file is in bytes, otherwise w+ is fine if the data is a string
            with open(path2file, 'wb+') as f:
            # with open(path2file, 'w+') as f:
                f.write(data.read())
            print(f'done downloading data to: {path2file=}')
    elif filename.endswith('.gz'):
        pass  # the download of the data doesn't seem to be explicitly handled by me, that is done in the extract step by a magic function tarfile.open
    # elif is_tar_file(filename):
    #     os.system(f'tar -xvzf {path_2_zip_with_filename} -C {path_2_dataset}/')
    else:
        raise ValueError(f'File type {filename=} not supported.')

    # - unzip
    extract_to = path_used_for_dataset
    print(f'about to extract: {path2file=}')
    print(f'extract to target: {extract_to=}')
    if filename.endswith('.zip'):
        import zipfile  # this one is for zip files, inspired from l2l
        zip_ref = zipfile.ZipFile(path2file, 'r')
        zip_ref.extractall(extract_to)
        zip_ref.close()
        if rm_zip_file:
            path2file.unlink()
            # path_2_zip_with_filename.unlink(missing_ok=True)
    elif filename.endswith('.gz'):
        import tarfile
        file = tarfile.open(fileobj=response, mode="r|gz")
        file.extractall(path=extract_to)
        file.close()
    elif filename.endswith('.pkl'):
        # no need to extract it, but when you use the data make sure you torch.load it or pickle.load it.
        print(f'about to test torch.load of: {path2file=}')
        data = torch.load(path2file)  # just to test
        assert data is not None
        print(f'{data=}')
        pass
    else:
        raise ValueError(f'File type {filename=} not supported, edit code to support it.')
        # path_2_zip_with_filename = path_2_ziplike / filename
        # os.system(f'tar -xvzf {path_2_zip_with_filename} -C {path_2_dataset}/')
        # if rm_zip_file:
        #     path_2_zip_with_filename.unlink()
        #     # path_2_zip_with_filename.unlink(missing_ok=True)
        # # raise ValueError(f'File type {filename=} not supported.')
    print(f'done extracting: {path2file=}')
    print(f'extracted at location:{path_used_for_dataset=}')

You can go to my ultimate-utils library to get the most recent version; it's on PyPI too. Dataset link: https://zenodo.org/record/7311663#.Y21EE-zMJUc
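If you don't want to depend on uutils, the same Zenodo workaround can be sketched with the standard library only (same URL and target directory as the function above):

import urllib.request
import zipfile
from pathlib import Path

# Same Zenodo archive and target folder as in download_mini_imagenet_brandos_download_from_zenodo.
url = 'https://zenodo.org/record/7311663/files/brandoslearn2learnminiimagenet.zip'
zip_path = Path('~/data/tmp/brandoslearn2learnminiimagenet.zip').expanduser()
data_dir = Path('~/data/tmp/l2l_data').expanduser()
zip_path.parent.mkdir(parents=True, exist_ok=True)
data_dir.mkdir(parents=True, exist_ok=True)

urllib.request.urlretrieve(url, str(zip_path))
with zipfile.ZipFile(zip_path, 'r') as zf:
    zf.extractall(data_dir)
zip_path.unlink()  # remove the zip after extraction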

brando90 commented 1 year ago

@seba-1511 may I get help fixing this?

brando90 commented 1 year ago

Now, even though I re-download it from the link I made, it always has an issue loading the data. My script tries to torch.load the pkl files, fails, and ends up deciding to re-download them...

brando90 commented 1 year ago

Do I need to open both the bookkeeping file and the cache file?

brando90 commented 1 year ago

code:

def should_we_redownload_mi_data_set(root: Union[str, Path]) -> bool:
    """
    If any of the pickle files is missing, or loading its data returns an error,
    return True, i.e. the data needs to be re-downloaded because a file is missing or corrupted.

    mini-imagenet-bookkeeping-{split}.pkl
    mini-imagenet-cache-{split}.pkl
    """
    root: Path = expanduser(root)
    splits: list[str] = ['train', 'validation', 'test']
    # filenames: list[str] = [f'mini-imagenet-bookkeeping-{split}.pkl', f'mini-imagenet-cache-{split}.pkl']
    # for filename in filenames:
    for split in splits:
        # -
        filename1: str = f'mini-imagenet-bookkeeping-{split}.pkl'
        path2file: Path = root / filename1
        if not path2file.exists():
            print(f'This file does NOT exist :{path2file=}, so we are redownloading MI')
            return True
        if not succeeded_opening_pkl_data_mi(path2file):
            return True
        # -
        filename2: str = f'mini-imagenet-cache-{split}.pkl'
        path2file: Path = root / filename2
        if not path2file.exists():
            print(f'This file does NOT exist :{path2file=}, so we are redownloading MI')
            return True
        if not succeeded_opening_pkl_data_mi(path2file):
            return True
    return False

def succeeded_opening_pkl_data_mi(path2file: Union[str, Path]) -> bool:
    path2file: Path = expanduser(path2file)
    try:
        data = torch.load(path2file)
        assert data is not None, f'Err: {data=}'
    except Exception as e:
        import logging
        print(f'Was not able to open the l2l data with torch.load, got error: {e=}')
        logging.warning(e)
        return False
    return True
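Side note: since learn2learn itself reads these caches with pickle.load (see the tracebacks above), an alternative sketch of succeeded_opening_pkl_data_mi could validate them the same way:

import pickle
from pathlib import Path

def pkl_loads_ok(path) -> bool:
    """Return True if the pickle file opens and loads the way learn2learn reads it."""
    path = Path(path).expanduser()
    try:
        with open(path, 'rb') as f:
            return pickle.load(f) is not None
    except Exception as e:  # UnpicklingError, EOFError, FileNotFoundError, ...
        print(f'Could not load {path}: {e}')
        return False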
kuihao commented 1 year ago

When I use the official code train_dataset = l2l.vision.datasets.MiniImagenet(root='./data', mode='train'), I get the error FileNotFoundError: [Errno 2] No such file or directory: './data/mini-imagenet-cache-train.pkl'. Currently, I can resolve this issue by manually downloading the file from Google Drive or Dropbox.

However, upon checking the source code (https://github.com/learnables/learn2learn/blob/master/learn2learn/vision/datasets/mini_imagenet.py#L87), I noticed that the download_pkl() function in the class MiniImagenet(data.Dataset) takes the following Google Drive file IDs as input:

google_drive_file_id = '1wpmY-hmiJUUlRBkO9ZDCXAcIpHEFdOhD'
google_drive_file_id = '1I3itTXpXxGV68olxM5roceUMG8itH9Xj'
google_drive_file_id = '1KY5e491bkLFqJDp0-UWou3463Mo8AOco'

However, when I try to use gdown in Colab to download these files, I get the error "Access denied with the following error: Cannot retrieve the public link of the file. You may need to change the permission to 'Anyone with the link', or have had many accesses." Therefore, I believe that the issue may actually be with the permissions on Google Drive, which need to be set to "Anyone with the link" in order for download_pkl() to work properly.

The current bug causing the error is that the function download_file_from_google_drive() does not check for any error messages in the response, so the program assumes that the file has been successfully downloaded, when in fact nothing has been downloaded.
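One possible (unofficial) guard along these lines: after saving, check that the file is non-empty and is not an HTML error page before trusting it. A sketch, not the actual learn2learn code (the function name is hypothetical):

def downloaded_file_is_valid(destination: str) -> bool:
    """Sketch only: reject empty files and HTML error pages saved in place of a pickle."""
    with open(destination, 'rb') as f:
        head = f.read(16)
    # Drive/Dropbox error pages start with '<'; a refused download leaves an empty file.
    return len(head) > 0 and not head.lstrip().startswith(b'<')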

brando90 commented 1 year ago

Did you check my alternative way to download the data?

I wish l2l had an official solution.


kuihao commented 1 year ago

These links are from the official source code, so using them directly is a quick and effective way to solve the download problem. As I mentioned above, though, the official code itself is flawed and does not check the HTTP response:

def download_file_from_google_drive(id, destination):
    URL = "https://docs.google.com/uc?export=download"
    session = requests.Session()
    response = session.get(URL, params={'id': id}, stream=True)
    token = get_confirm_token(response)
    if token:
        params = {'id': id, 'confirm': token}
        response = session.get(URL, params=params, stream=True) # <--- bug is here
    save_response_content(response, destination)

This results in the error handling in the main source code never being triggered, so it does not work:

        pickle_file = os.path.join(self.root, 'mini-imagenet-cache-' + mode + '.pkl')
        try:
            if not self._check_exists() and download:
                print('Downloading mini-ImageNet --', mode)
                download_pkl(google_drive_file_id, self.root, mode)
            with open(pickle_file, 'rb') as f:
                self.data = pickle.load(f)
        except pickle.UnpicklingError: # <--- Exception handling cannot be triggered!! Can not detect the permission error (no http response) 
            if not self._check_exists() and download:
                print('Download failed. Re-trying mini-ImageNet --', mode)
                download_file(dropbox_file_link, pickle_file)
            with open(pickle_file, 'rb') as f:
                self.data = pickle.load(f)
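A sketch (unofficial, just to illustrate the point) of a broader guard that would at least let the Dropbox fallback run when the Drive download silently fails; variable names are reused from the excerpt above:

        try:
            if not self._check_exists() and download:
                print('Downloading mini-ImageNet --', mode)
                download_pkl(google_drive_file_id, self.root, mode)
            with open(pickle_file, 'rb') as f:
                self.data = pickle.load(f)
        except (pickle.UnpicklingError, FileNotFoundError, EOFError):  # also catch missing/empty files
            if not self._check_exists() and download:
                print('Download failed. Re-trying mini-ImageNet --', mode)
                download_file(dropbox_file_link, pickle_file)
            with open(pickle_file, 'rb') as f:
                self.data = pickle.load(f)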

Did you check my alternative way to download the data? I wish l2l had an official solution.

yzy1996 commented 1 year ago

The code will download a temporary .pkl file for you; I chose to download it manually instead.

seba-1511 commented 1 year ago

Closing: will be fixed as soon as we merge #400.