I've tried re-downloading the data...
Reproduction script:
import learn2learn as l2l
from learn2learn.data.transforms import NWays, KShots, LoadData, RemapLabels, ConsecutiveLabels
from torchvision.transforms import (Compose, ToPILImage, ToTensor, RandomCrop, RandomHorizontalFlip,
ColorJitter, Normalize)
"""
The benchmark modules provide a convenient interface to standardized benchmarks in the literature.
It provides train/validation/test TaskDatasets and TaskTransforms for pre-defined datasets.
This utility is useful for researchers to compare new algorithms against existing benchmarks.
For a more fine-grained control over tasks and data, we recommend directly using `l2l.data.TaskDataset` and `l2l.data.TaskTransforms`.
"""
import os
import learn2learn as l2l
from collections import namedtuple
# from .omniglot_benchmark import omniglot_tasksets
# from .mini_imagenet_benchmark import mini_imagenet_tasksets
# from .tiered_imagenet_benchmark import tiered_imagenet_tasksets
# from .fc100_benchmark import fc100_tasksets
# from .cifarfs_benchmark import cifarfs_tasksets
def mini_imagenet_tasksets(
train_ways=5,
train_samples=10,
test_ways=5,
test_samples=10,
root='~/data',
data_augmentation=None,
device=None,
**kwargs,
):
"""Tasksets for mini-ImageNet benchmarks."""
if data_augmentation is None:
train_data_transforms = None
test_data_transforms = None
assert False  # (debug assert in this reproduction script; not reached since data_augmentation='lee2019')
elif data_augmentation == 'normalize':
train_data_transforms = Compose([
lambda x: x / 255.0,
])
test_data_transforms = train_data_transforms
assert False  # (debug assert in this reproduction script; not reached since data_augmentation='lee2019')
elif data_augmentation == 'lee2019':
normalize = Normalize(
mean=[120.39586422 / 255.0, 115.59361427 / 255.0, 104.54012653 / 255.0],
std=[70.68188272 / 255.0, 68.27635443 / 255.0, 72.54505529 / 255.0],
)
train_data_transforms = Compose([
ToPILImage(),
RandomCrop(84, padding=8),
ColorJitter(brightness=0.4, contrast=0.4, saturation=0.4),
RandomHorizontalFlip(),
ToTensor(),
normalize,
])
test_data_transforms = Compose([
normalize,
])
else:
raise ValueError('Invalid data_augmentation argument.')
train_dataset = l2l.vision.datasets.MiniImagenet(
root=root,
mode='train',
download=True,
)
valid_dataset = l2l.vision.datasets.MiniImagenet(
root=root,
mode='validation',
download=True,
)
test_dataset = l2l.vision.datasets.MiniImagenet(
root=root,
mode='test',
download=True,
)
if device is None:
train_dataset.transform = train_data_transforms
valid_dataset.transform = train_data_transforms
test_dataset.transform = test_data_transforms
else:
train_dataset = l2l.data.OnDeviceDataset(
dataset=train_dataset,
transform=train_data_transforms,
device=device,
)
valid_dataset = l2l.data.OnDeviceDataset(
dataset=valid_dataset,
transform=train_data_transforms,
device=device,
)
test_dataset = l2l.data.OnDeviceDataset(
dataset=test_dataset,
transform=test_data_transforms,
device=device,
)
train_dataset = l2l.data.MetaDataset(train_dataset)
valid_dataset = l2l.data.MetaDataset(valid_dataset)
test_dataset = l2l.data.MetaDataset(test_dataset)
train_transforms = [
NWays(train_dataset, train_ways),
KShots(train_dataset, train_samples),
LoadData(train_dataset),
RemapLabels(train_dataset),
ConsecutiveLabels(train_dataset),
]
valid_transforms = [
NWays(valid_dataset, test_ways),
KShots(valid_dataset, test_samples),
LoadData(valid_dataset),
ConsecutiveLabels(valid_dataset),
RemapLabels(valid_dataset),
]
test_transforms = [
NWays(test_dataset, test_ways),
KShots(test_dataset, test_samples),
LoadData(test_dataset),
RemapLabels(test_dataset),
ConsecutiveLabels(test_dataset),
]
_datasets = (train_dataset, valid_dataset, test_dataset)
_transforms = (train_transforms, valid_transforms, test_transforms)
return _datasets, _transforms
__all__ = ['list_tasksets', 'get_tasksets']
BenchmarkTasksets = namedtuple('BenchmarkTasksets', ('train', 'validation', 'test'))
_TASKSETS = {
# 'omniglot': omniglot_tasksets,
'mini-imagenet': mini_imagenet_tasksets,
# 'tiered-imagenet': tiered_imagenet_tasksets,
# 'fc100': fc100_tasksets,
# 'cifarfs': cifarfs_tasksets,
}
def list_tasksets():
"""
[[Source]](https://github.com/learnables/learn2learn/blob/master/learn2learn/vision/benchmarks/)
**Description**
Returns a list of all available benchmarks.
**Example**
~~~python
for name in l2l.vision.benchmarks.list_tasksets():
print(name)
tasksets = l2l.vision.benchmarks.get_tasksets(name)
~~~
"""
return _TASKSETS.keys()
def get_tasksets(
name,
train_ways=5,
train_samples=10,
test_ways=5,
test_samples=10,
num_tasks=-1,
root='~/data',
device=None,
**kwargs,
):
"""
[[Source]](https://github.com/learnables/learn2learn/blob/master/learn2learn/vision/benchmarks/)
**Description**
Returns the tasksets for a particular benchmark, using literature standard data and task transformations.
The returned object is a namedtuple with attributes `train`, `validation`, `test` which
correspond to their respective TaskDatasets.
See `examples/vision/maml_miniimagenet.py` for an example.
**Arguments**
* **name** (str) - The name of the benchmark. Full list in `list_tasksets()`.
* **train_ways** (int, *optional*, default=5) - The number of classes per train task.
* **train_samples** (int, *optional*, default=10) - The number of samples per train task.
* **test_ways** (int, *optional*, default=5) - The number of classes per test task. Also used for validation tasks.
* **test_samples** (int, *optional*, default=10) - The number of samples per test task. Also used for validation tasks.
* **num_tasks** (int, *optional*, default=-1) - The number of tasks in each TaskDataset.
* **device** (torch.Device, *optional*, default=None) - If not None, tasksets are loaded as Tensors on `device`.
* **root** (str, *optional*, default='~/data') - Where the data is stored.
**Example**
~~~python
train_tasks, validation_tasks, test_tasks = l2l.vision.benchmarks.get_tasksets('omniglot')
batch = train_tasks.sample()
or:
tasksets = l2l.vision.benchmarks.get_tasksets('omniglot')
batch = tasksets.train.sample()
~~~
"""
root = os.path.expanduser(root)
# Load task-specific data and transforms
datasets, transforms = _TASKSETS[name](train_ways=train_ways,
train_samples=train_samples,
test_ways=test_ways,
test_samples=test_samples,
root=root,
device=device,
**kwargs)
train_dataset, validation_dataset, test_dataset = datasets
train_transforms, validation_transforms, test_transforms = transforms
# Instantiate the tasksets
train_tasks = l2l.data.TaskDataset(
dataset=train_dataset,
task_transforms=train_transforms,
num_tasks=num_tasks,
)
validation_tasks = l2l.data.TaskDataset(
dataset=validation_dataset,
task_transforms=validation_transforms,
num_tasks=num_tasks,
)
test_tasks = l2l.data.TaskDataset(
dataset=test_dataset,
task_transforms=test_transforms,
num_tasks=num_tasks,
)
return BenchmarkTasksets(train_tasks, validation_tasks, test_tasks)
# --
def mi_test():
"""
python -u ~/ultimate-utils/ultimate-utils-proj-src/uutils/torch_uu/dataloaders/meta_learning/l2l_mini_imagenet_mi.py
"""
from argparse import Namespace
from pathlib import Path
args = Namespace(k_shots=5, k_eval=15, n_classes=5)
args.data_option = 'mini-imagenet' # no name assumes l2l, make sure you're calling get_l2l_tasksets
args.data_path = Path('~/data/l2l_data/').expanduser()
args.data_augmentation = 'lee2019'
args.tasksets: BenchmarkTasksets = get_tasksets(
args.data_option,
train_samples=args.k_shots + args.k_eval,
train_ways=args.n_classes,
test_samples=args.k_shots + args.k_eval,
test_ways=args.n_classes,
root=args.data_path,
data_augmentation=args.data_augmentation,
)
print(args.tasksets)
if __name__ == '__main__':
mi_test()
print('Done\a\n')
Run the above file.
Perhaps I can try downloading the file directly? Or rsyncing it from my local machine... other ideas? Try other hardware, e.g. the other cluster I have access to...
@brando90, this happens when Google Drive limits the number of downloads of a file. Rsyncing a local copy works; otherwise, wait a few hours for the limit to reset.
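If you go the rsync route, it can be scripted; a minimal sketch (the remote host name and paths below are hypothetical):
import os
import subprocess

# copy the mini-ImageNet cache/bookkeeping pkl files from a machine that already has them
src = 'my-laptop:~/data/l2l_data/'            # hypothetical machine that already holds the pkl files
dst = os.path.expanduser('~/data/l2l_data/')  # destination on the cluster
subprocess.run(['rsync', '-avP', src, dst], check=True)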
@seba-1511, I have been facing the same issue for the past 3-4 days. The download issue seems to persist.
Thanks for the notification @nightlessbaron.
I’ll re-open, and if the issue persists we can come up with a solution.
A temporary solution I found is the following:
1. Go to /learn2learn/vision/datasets and choose your .py file (for example, mini_imagenet.py).
2. Look through the code comments; they contain the links from which the script downloads the data.
3. Click the links and download the files with your web browser.
4. Move them to the folder the algorithm looks for the dataset in (by default ~/data).
This is how I have solved it :)
Just use gdown to download your data from Google Drive.
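A minimal sketch of the gdown approach (the Google Drive file IDs are the ones quoted from the MiniImagenet source later in this thread; only the test-split mapping is confirmed there, the other two are my assumption):
import gdown  # pip install gdown

# IDs quoted from learn2learn/vision/datasets/mini_imagenet.py (see below in this thread);
# '1wpmY...' is paired with the test split elsewhere in the thread, verify the other two in the source
file_ids = {
    'test': '1wpmY-hmiJUUlRBkO9ZDCXAcIpHEFdOhD',
    'train': '1I3itTXpXxGV68olxM5roceUMG8itH9Xj',       # assumed split, check mini_imagenet.py
    'validation': '1KY5e491bkLFqJDp0-UWou3463Mo8AOco',  # assumed split, check mini_imagenet.py
}
for split, file_id in file_ids.items():
    gdown.download(f'https://drive.google.com/uc?id={file_id}',
                   f'mini-imagenet-cache-{split}.pkl', quiet=False)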
@seba-1511 has this been solved? It came up again for me:
Traceback (most recent call last):
File "/dfs/scratch0/brando9/miniconda/envs/metalearning_gpu/lib/python3.9/site-packages/learn2learn/vision/datasets/mini_imagenet.py", line 105, in __init__
self.data = pickle.load(f)
_pickle.UnpicklingError: invalid load key, '<'.
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/dfs/scratch0/brando9/diversity-for-predictive-success-of-meta-learning/div_src/diversity_src/experiment_mains/main_dist_maml_l2l.py", line 2135, in <module>
main()
File "/dfs/scratch0/brando9/diversity-for-predictive-success-of-meta-learning/div_src/diversity_src/experiment_mains/main_dist_maml_l2l.py", line 2073, in main
train(args=args)
File "/dfs/scratch0/brando9/diversity-for-predictive-success-of-meta-learning/div_src/diversity_src/experiment_mains/main_dist_maml_l2l.py", line 2097, in train
args.tasksets: BenchmarkTasksets = get_l2l_tasksets(args)
File "/afs/cs.stanford.edu/u/brando9/ultimate-utils/ultimate-utils-proj-src/uutils/torch_uu/dataloaders/meta_learning/l2l_ml_tasksets.py", line 107, in get_l2l_tasksets
args.tasksets: BenchmarkTasksets = hdb1_mi_omniglot_tasksets(
File "/afs/cs.stanford.edu/u/brando9/diversity-for-predictive-success-of-meta-learning/div_src/diversity_src/dataloaders/hdb1_mi_omniglot_l2l.py", line 394, in hdb1_mi_omniglot_tasksets
_datasets: tuple[IndexableDataSet] = get_indexable_list_of_datasets_mi_and_omniglot(root)
File "/afs/cs.stanford.edu/u/brando9/diversity-for-predictive-success-of-meta-learning/div_src/diversity_src/dataloaders/hdb1_mi_omniglot_l2l.py", line 352, in get_indexable_list_of_datasets_mi_and_omniglot
dataset_list_train, dataset_list_validation, dataset_list_test = get_mi_and_omniglot_list_data_set_splits(root,
File "/afs/cs.stanford.edu/u/brando9/diversity-for-predictive-success-of-meta-learning/div_src/diversity_src/dataloaders/hdb1_mi_omniglot_l2l.py", line 329, in get_mi_and_omniglot_list_data_set_splits
train_dataset, validation_dataset, test_dataset = get_mi_datasets(root, data_augmentation, device)
File "/afs/cs.stanford.edu/u/brando9/diversity-for-predictive-success-of-meta-learning/div_src/diversity_src/dataloaders/hdb1_mi_omniglot_l2l.py", line 271, in get_mi_datasets
train_dataset = l2l.vision.datasets.MiniImagenet(
File "/dfs/scratch0/brando9/miniconda/envs/metalearning_gpu/lib/python3.9/site-packages/learn2learn/vision/datasets/mini_imagenet.py", line 111, in __init__
self.data = pickle.load(f)
_pickle.UnpicklingError: invalid load key, '<'.
self contained code:
def download_mi_l2l_data_selfcontained():
"""
:return:
"""
import learn2learn as l2l
train_dataset = l2l.vision.datasets.MiniImagenet(root='~/data/l2l_data', mode='train', download=True)
train_dataset = l2l.vision.datasets.MiniImagenet(root='~/data/l2l_data', mode='validation', download=True)
train_dataset = l2l.vision.datasets.MiniImagenet(root='~/data/l2l_data', mode='test', download=True)
A temporary solution I found is the following:
1. Go to /learn2learn/vision/datasets and choose your .py file (for example, mini_imagenet.py).
2. Look through the code comments; they contain the links from which the script downloads the data.
3. Click the links and download the files with your web browser.
4. Move them to the folder the algorithm looks for the dataset in (by default ~/data).
This is how I have solved it :)
@seba-1511 why does the url have ?dl=1?
dropbox_file_link = 'https://www.dropbox.com/s/ye9jeb5tyz0x01b/mini-imagenet-cache-test.pkl?dl=1'
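(As far as I know, ?dl=1 just tells Dropbox to serve the raw file bytes instead of the HTML preview page, so the link can in principle be fetched directly; a quick sketch, though note that later in this thread the same link still came back as an HTML page, so this is not guaranteed to work:)
import urllib.request

url = 'https://www.dropbox.com/s/ye9jeb5tyz0x01b/mini-imagenet-cache-test.pkl?dl=1'
with urllib.request.urlopen(url) as response, open('mini-imagenet-cache-test.pkl', 'wb') as f:
    f.write(response.read())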
I do have a script that downloads and extracts a bunch of files and works, but I'm not sure if it works with yours:
def download_and_extract(url: str,
path_2_ziplike: Path = Path('~/data/'),
path_2_dataset: Path = Path('~/data/tmp/'),
rm_zip_file: bool = True
):
"""
Downloads data and tries to extract it according to different protocols/file types.
Tested with:
- zip files, yes!
Later:
- todo: tar, gz, gdrive
"""
path_2_ziplike: Path = expanduser(path_2_ziplike)
path_2_dataset: Path = expanduser(path_2_dataset)
# - download data
import ssl
ctx = ssl.create_default_context()
ctx.check_hostname = False
ctx.verify_mode = ssl.CERT_NONE
print("downloading dataset from ", url)
import urllib
import http
response: http.client.HTTPResponse = urllib.request.urlopen(url, context=ctx)
print(f'{type(response)=}')
data = response
# save zipfile like data to path given
filename = url.rpartition('/')[2]
print(f'{filename=}')
# if gdrive_download: todo, later
# from torchvision.datasets.utils import download_file_from_google_drive, extract_archive
# file_id = '1rV3aj_hgfNTfCakffpPm7Vhpr1in87CR'
# filename_zip = 'miniImagenet.tgz'
# # if zip not there re-download it
# path_2_zip = path / filename_zip
# if not path_2_zip.exists():
# download_file_from_google_drive(file_id, path, filename_zip)
if filename.endswith('.zip'):
path_2_ziplike.mkdir(parents=True, exist_ok=True)
path_2_zip_with_filename = path_2_ziplike / filename
print(f'about to save: {path_2_zip_with_filename=}')
# wb+ is used since the zip file was in bytes, otherwise w+ is fine if the data is a string
with open(path_2_zip_with_filename, 'wb+') as f:
f.write(data.read())
print(f'done saving: {path_2_zip_with_filename=}')
elif filename.endswith('.gz'):
# inspired from tinfer, idk why but they don't save the zip file anywhere...cool I suppose?
# import tarfile
# try:
# file = tarfile.open(fileobj=response, mode="r|gz")
# except Exception as e:
# logging.warning(e)
# print('if this fails look at the file extension and try something else '
# 'e.g. tar cmd or other options in tar module above')
pass # do all work in the extraction step
# elif is_tar_file(filename):
# os.system(f'tar -xvzf {path_2_zip_with_filename} -C {path_2_dataset}/')
else:
raise ValueError(f'File type {filename=} not supported.')
# - unzip
extract_to = path_2_dataset
print(f'about to extract: {path_2_zip_with_filename=}')
print(f'extract to target: {extract_to=}')
if filename.endswith('.zip'):
import zipfile # this one is for zip files, inspired from l2l
zip_ref = zipfile.ZipFile(path_2_zip_with_filename, 'r')
zip_ref.extractall(extract_to)
zip_ref.close()
if rm_zip_file:
path_2_zip_with_filename.unlink()
# path_2_zip_with_filename.unlink(missing_ok=True)
elif filename.endswith('.gz'):
import tarfile
file = tarfile.open(fileobj=response, mode="r|gz")
file.extractall(path=extract_to)
file.close()
else:
raise ValueError(f'File type {filename=} not supported, edit code to support it.')
# path_2_zip_with_filename = path_2_ziplike / filename
# os.system(f'tar -xvzf {path_2_zip_with_filename} -C {path_2_dataset}/')
# if rm_zip_file:
# path_2_zip_with_filename.unlink()
# # path_2_zip_with_filename.unlink(missing_ok=True)
# # raise ValueError(f'File type {filename=} not supported.')
print(f'done extracting: {path_2_zip_with_filename=}')
print(f'extracted at location:{path_2_dataset=}')
this still doesn't work:
def download_mini_imagenet_fix():
from uutils import download_and_extract
download_and_extract('https://www.dropbox.com/s/ye9jeb5tyz0x01b/mini-imagenet-cache-test.pkl?dl=1',
'~/data/tmp', '~/data/tmp')
argh
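The immediate problem seems to be that the last path component of the Dropbox URL is 'mini-imagenet-cache-test.pkl?dl=1', so none of the extension checks ('.zip', '.gz') in download_and_extract match and it raises the "File type ... not supported" error. One way to get a clean filename is to strip the query string first (a small sketch):
from urllib.parse import urlparse
from pathlib import PurePosixPath

url = 'https://www.dropbox.com/s/ye9jeb5tyz0x01b/mini-imagenet-cache-test.pkl?dl=1'
filename = PurePosixPath(urlparse(url).path).name
print(filename)  # mini-imagenet-cache-test.pkl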
def download_and_extract(url: str,
path_2_ziplike: Path = Path('~/data/'),
path_2_dataset: Path = Path('~/data/tmp/'),
rm_zip_file: bool = True
):
"""
Downloads data and tries to extract it according to different protocols/file types.
Tested with:
- zip files, yes!
Later:
- todo: tar, gz, gdrive
"""
path_2_ziplike: Path = expanduser(path_2_ziplike)
path_2_ziplike.mkdir(parents=True, exist_ok=True)
path_2_dataset: Path = expanduser(path_2_dataset)
path_2_dataset.mkdir(parents=True, exist_ok=True)
# - download data
import ssl
ctx = ssl.create_default_context()
ctx.check_hostname = False
ctx.verify_mode = ssl.CERT_NONE
print("downloading dataset from ", url)
import urllib
import http
response: http.client.HTTPResponse = urllib.request.urlopen(url, context=ctx)
print(f'{type(response)=}')
data = response
# save zipfile like data to path given
filename = url.rpartition('/')[2]
print(f'{filename=}')
# if gdrive_download: todo, later
# from torchvision.datasets.utils import download_file_from_google_drive, extract_archive
# file_id = '1rV3aj_hgfNTfCakffpPm7Vhpr1in87CR'
# filename_zip = 'miniImagenet.tgz'
# # if zip not there re-download it
# path_2_zip = path / filename_zip
# if not path_2_zip.exists():
# download_file_from_google_drive(file_id, path, filename_zip)
if filename.endswith('.zip'):
path_2_zip_with_filename = path_2_ziplike / filename
print(f'about to save: {path_2_zip_with_filename=}')
# wb+ is used since the zip file was in bytes, otherwise w+ is fine if the data is a string
with open(path_2_zip_with_filename, 'wb+') as f:
f.write(data.read())
print(f'done saving: {path_2_zip_with_filename=}')
elif filename.endswith('.gz'):
# inspired from tinfer, idk why but they don't save the zip file anywhere...cool I suppose?
# import tarfile
# try:
# file = tarfile.open(fileobj=response, mode="r|gz")
# except Exception as e:
# logging.warning(e)
# print('if this fails look at the file extension and try something else '
# 'e.g. tar cmd or other options in tar module above')
pass # do all work in the extraction step
elif filename.endswith('.pkl?dl=1'):
path_2_zip_with_filename = path_2_ziplike / filename
print(f'about to save: {path_2_zip_with_filename=}')
# wb+ is used since the zip file was in bytes, otherwise w+ is fine if the data is a string
with open(path_2_zip_with_filename, 'wb+') as f:
f.write(data.read())
print(f'done saving: {path_2_zip_with_filename=}')
# elif is_tar_file(filename):
# os.system(f'tar -xvzf {path_2_zip_with_filename} -C {path_2_dataset}/')
else:
raise ValueError(f'File type {filename=} not supported.')
# - unzip
extract_to = path_2_dataset
print(f'about to extract: {path_2_zip_with_filename=}')
print(f'extract to target: {extract_to=}')
if filename.endswith('.zip'):
import zipfile # this one is for zip files, inspired from l2l
zip_ref = zipfile.ZipFile(path_2_zip_with_filename, 'r')
zip_ref.extractall(extract_to)
zip_ref.close()
if rm_zip_file:
path_2_zip_with_filename.unlink()
# path_2_zip_with_filename.unlink(missing_ok=True)
elif filename.endswith('.gz'):
import tarfile
file = tarfile.open(fileobj=response, mode="r|gz")
file.extractall(path=extract_to)
file.close()
elif filename.endswith('.pkl?dl=1'):
# no need to extract it, but when you use the data make sure you torch.load it or pickle.load it.
data = torch.load(path_2_zip_with_filename) # just to test
assert data is not None
print(f'{data=}')
pass
else:
raise ValueError(f'File type {filename=} not supported, edit code to support it.')
# path_2_zip_with_filename = path_2_ziplike / filename
# os.system(f'tar -xvzf {path_2_zip_with_filename} -C {path_2_dataset}/')
# if rm_zip_file:
# path_2_zip_with_filename.unlink()
# # path_2_zip_with_filename.unlink(missing_ok=True)
# # raise ValueError(f'File type {filename=} not supported.')
print(f'done extracting: {path_2_zip_with_filename=}')
print(f'extracted at location:{path_2_dataset=}')
new error:
0it [00:00, ?it/s]
Traceback (most recent call last):
File "/Users/brandomiranda/opt/anaconda3/envs/meta_learning/lib/python3.9/site-packages/torch/serialization.py", line 608, in load
return _legacy_load(opened_file, map_location, pickle_module, **pickle_load_args)
File "/Users/brandomiranda/opt/anaconda3/envs/meta_learning/lib/python3.9/site-packages/torch/serialization.py", line 777, in _legacy_load
magic_number = pickle_module.load(f, **pickle_load_args)
EOFError: Ran out of input
code:
def download_mini_imagenet_fix_use_gdrive():
from uutils import download_and_extract
download_and_extract(None,
'~/data/tmp', '~/data/tmp',
True,
'1wpmY-hmiJUUlRBkO9ZDCXAcIpHEFdOhD', 'mini-imagenet-cache-test.pkl'
)
extractor:
def download_and_extract(url: str,
path_used_for_zip: Path = Path('~/data/'),
path_used_for_dataset: Path = Path('~/data/tmp/'),
rm_zip_file: bool = True,
gdrive_file_id: Optional[str] = None,
gdrive_filename: Optional[str] = None,
):
"""
Downloads data and tries to extract it according to different protocols/file types.
Tested with:
- zip files, yes!
Later:
- todo: tar, gz, gdrive
"""
path_used_for_zip: Path = expanduser(path_used_for_zip)
path_used_for_zip.mkdir(parents=True, exist_ok=True)
path_used_for_dataset: Path = expanduser(path_used_for_dataset)
path_used_for_dataset.mkdir(parents=True, exist_ok=True)
# - download data
if gdrive_filename is None:
import ssl
ctx = ssl.create_default_context()
ctx.check_hostname = False
ctx.verify_mode = ssl.CERT_NONE
print("downloading dataset from ", url)
import urllib
import http
response: http.client.HTTPResponse = urllib.request.urlopen(url, context=ctx)
print(f'{type(response)=}')
data = response
# save zipfile like data to path given
filename = url.rpartition('/')[2]
path2file: Path = path_used_for_zip / filename
else:
from torchvision.datasets.utils import download_file_from_google_drive
# if zip not there re-download it
path2file: Path = path_used_for_zip / gdrive_filename
if not path2file.exists():
download_file_from_google_drive(gdrive_file_id, path_used_for_zip, gdrive_filename)
filename = gdrive_filename
print(f'{path2file=}')
print(f'{filename=}')
if filename.endswith('.zip') or filename.endswith('.pkl'):
if not path2file.exists():
print(f'about to download: {path2file=}')
# wb+ is used since the zip file was in bytes, otherwise w+ is fine if the data is a string
with open(path2file, 'wb+') as f:
f.write(data.read())
print(f'done downloading: {path2file=}')
elif filename.endswith('.gz'):
pass # the download of the data doesn't seem to be explicitly handled by me, that is done in the extract step by a magic function tarfile.open
# elif is_tar_file(filename):
# os.system(f'tar -xvzf {path_2_zip_with_filename} -C {path_2_dataset}/')
else:
raise ValueError(f'File type {filename=} not supported.')
# - unzip
extract_to = path_used_for_dataset
print(f'about to extract: {path2file=}')
print(f'extract to target: {extract_to=}')
if filename.endswith('.zip'):
import zipfile # this one is for zip files, inspired from l2l
zip_ref = zipfile.ZipFile(path2file, 'r')
zip_ref.extractall(extract_to)
zip_ref.close()
if rm_zip_file:
path2file.unlink()
# path_2_zip_with_filename.unlink(missing_ok=True)
elif filename.endswith('.gz'):
import tarfile
file = tarfile.open(fileobj=response, mode="r|gz")
file.extractall(path=extract_to)
file.close()
elif filename.endswith('.pkl'):
# no need to extract it, but when you use the data make sure you torch.load it or pickle.load it.
print(f'about to test torch.load of: {path2file=}')
data = torch.load(path2file) # just to test
assert data is not None
print(f'{data=}')
pass
else:
raise ValueError(f'File type {filename=} not supported, edit code to support it.')
# path_2_zip_with_filename = path_2_ziplike / filename
# os.system(f'tar -xvzf {path_2_zip_with_filename} -C {path_2_dataset}/')
# if rm_zip_file:
# path_2_zip_with_filename.unlink()
# # path_2_zip_with_filename.unlink(missing_ok=True)
# # raise ValueError(f'File type {filename=} not supported.')
print(f'done extracting: {path2file=}')
print(f'extracted at location:{path_used_for_dataset=}')
why is the content of your pickle file a webpage @seba-1511 ?
<!DOCTYPE html><html class="maestro global-header" lang="en" xml:lang="en" xmlns="http://www.w3.org/1999/xhtml"><head>
...
<title>Dropbox - mini-imagenet-cache-test.pkl - Simplify your life</title>
... (rest of the Dropbox HTML/JS page omitted) ...
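(This is also exactly what produces the "invalid load key, '<'" errors above: '<' is simply the first byte of the HTML page that got saved where the pickle should be. A tiny demonstration:)
import io
import pickle

try:
    pickle.load(io.BytesIO(b'<!DOCTYPE html>'))
except pickle.UnpicklingError as e:
    print(e)  # invalid load key, '<'.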
ok I think this works:
def download_mini_imagenet_brandos_download_from_zenodo():
"""
zeneodo link of data set: https://zenodo.org/record/7311663#.Y21EE-zMJUc
"""
from uutils import download_and_extract
download_and_extract(url='https://zenodo.org/record/7311663/files/brandoslearn2learnminiimagenet.zip',
path_used_for_zip='~/data/tmp',
path_used_for_dataset='~/data/tmp/l2l_data',
rm_zip_file=True,
)
# download_and_extract('https://zenodo.org/record/7311663/files/brandoslearn2learnminiimagenet.zip?download=1',
# '~/data/tmp', '~/data/tmp')
train_dataset = l2l.vision.datasets.MiniImagenet(root='~/data/tmp/l2l_data', mode='train', download=False)
[data for data in train_dataset]
train_dataset = l2l.vision.datasets.MiniImagenet(root='~/data/tmp/l2l_data', mode='validation', download=False)
[data for data in train_dataset]
train_dataset = l2l.vision.datasets.MiniImagenet(root='~/data/tmp/l2l_data', mode='test', download=False)
[data for data in train_dataset]
for data in train_dataset:
# print(f'{data=}')
print(f'{data[0].size()=}')
print(f'{data[1]=}')
print('success loop through local data')
def download_and_extract(url: str,
path_used_for_zip: Path = Path('~/data/'),
path_used_for_dataset: Path = Path('~/data/tmp/'),
rm_zip_file: bool = True,
gdrive_file_id: Optional[str] = None,
gdrive_filename: Optional[str] = None,
):
"""
Downloads data and tries to extract it according to different protocols/file types.
Tested with:
- zip files, yes!
Later:
- todo: tar, gz, gdrive
"""
path_used_for_zip: Path = expanduser(path_used_for_zip)
path_used_for_zip.mkdir(parents=True, exist_ok=True)
path_used_for_dataset: Path = expanduser(path_used_for_dataset)
path_used_for_dataset.mkdir(parents=True, exist_ok=True)
# - download data
if gdrive_filename is None: # not a gdrive download
import ssl
ctx = ssl.create_default_context()
ctx.check_hostname = False
ctx.verify_mode = ssl.CERT_NONE
print("downloading dataset from url: ", url)
import urllib
import http
response: http.client.HTTPResponse = urllib.request.urlopen(url, context=ctx)
print(f'{type(response)=}')
data = response
# save zipfile like data to path given
filename = url.rpartition('/')[2]
path2file: Path = path_used_for_zip / filename
else:
from torchvision.datasets.utils import download_file_from_google_drive
# if zip not there re-download it
path2file: Path = path_used_for_zip / gdrive_filename
if not path2file.exists():
download_file_from_google_drive(gdrive_file_id, path_used_for_zip, gdrive_filename)
filename = gdrive_filename
print(f'{path2file=}')
print(f'{filename=}')
if filename.endswith('.zip') or filename.endswith('.pkl'):
if not path2file.exists():
print(f'about to download data to: {path2file=}')
# wb+ is used since the zip file was in bytes, otherwise w+ is fine if the data is a string
with open(path2file, 'wb+') as f:
# with open(path2file, 'w+') as f:
f.write(data.read())
print(f'done downloading data to: {path2file=}')
elif filename.endswith('.gz'):
pass # the download of the data doesn't seem to be explicitly handled by me, that is done in the extract step by a magic function tarfile.open
# elif is_tar_file(filename):
# os.system(f'tar -xvzf {path_2_zip_with_filename} -C {path_2_dataset}/')
else:
raise ValueError(f'File type {filename=} not supported.')
# - unzip
extract_to = path_used_for_dataset
print(f'about to extract: {path2file=}')
print(f'extract to target: {extract_to=}')
if filename.endswith('.zip'):
import zipfile # this one is for zip files, inspired from l2l
zip_ref = zipfile.ZipFile(path2file, 'r')
zip_ref.extractall(extract_to)
zip_ref.close()
if rm_zip_file:
path2file.unlink()
# path_2_zip_with_filename.unlink(missing_ok=True)
elif filename.endswith('.gz'):
import tarfile
file = tarfile.open(fileobj=response, mode="r|gz")
file.extractall(path=extract_to)
file.close()
elif filename.endswith('.pkl'):
# no need to extract it, but when you use the data make sure you torch.load it or pickle.load it.
print(f'about to test torch.load of: {path2file=}')
data = torch.load(path2file) # just to test
assert data is not None
print(f'{data=}')
pass
else:
raise ValueError(f'File type {filename=} not supported, edit code to support it.')
# path_2_zip_with_filename = path_2_ziplike / filename
# os.system(f'tar -xvzf {path_2_zip_with_filename} -C {path_2_dataset}/')
# if rm_zip_file:
# path_2_zip_with_filename.unlink()
# # path_2_zip_with_filename.unlink(missing_ok=True)
# # raise ValueError(f'File type {filename=} not supported.')
print(f'done extracting: {path2file=}')
print(f'extracted at location:{path_used_for_dataset=}')
You can go to my ultimate-utils library to get the most recent version; it's on PyPI too. Dataset link: https://zenodo.org/record/7311663#.Y21EE-zMJUc
@seba-1511 may I get help fixing this?
Now, even though I re-download it from the link I made, it always has an issue loading the data. So my script tries to torch.load the pkl files, fails, and ends up deciding to re-download them...
Do I need to open both the bookkeeping file and the cache file?
code:
def should_we_redownload_mi_data_set(root: Union[str, Path]) -> bool:
"""
If any of the pickle files is missing, or loading the pickle data returns an error,
return True, i.e. you need to re-download the data because a file is missing or corrupted.
mini-imagenet-bookkeeping-{split}.pkl
mini-imagenet-cache-{split}.pkl
"""
root: Path = expanduser(root)
splits: list[str] = ['train', 'validation', 'test']
# filenames: list[str] = [f'mini-imagenet-bookkeeping-{split}.pkl', f'mini-imagenet-cache-{split}.pkl']
# for filename in filenames:
for split in splits:
# -
filename1: str = f'mini-imagenet-bookkeeping-{split}.pkl'
path2file: Path = root / filename1
if not path2file.exists():
print(f'This file does NOT exist :{path2file=}, so we are redownloading MI')
return True
if not succeeded_opening_pkl_data_mi(path2file):
return True
# -
filename2: str = f'mini-imagenet-cache-{split}.pkl'
path2file: Path = root / filename2
if not path2file.exists():
print(f'This file does NOT exist :{path2file=}, so we are redownloading MI')
return True
if not succeeded_opening_pkl_data_mi(path2file):
return True
return False
def succeeded_opening_pkl_data_mi(path2file: Union[str, Path]) -> bool:
path2file: Path = expanduser(path2file)
try:
data = torch.load(path2file)
assert data is not None, f'Err: {data=}'
except Exception as e:
import logging
print(f'Was not able to open the l2l data with torch.load, got error: {e=}')
logging.warning(e)
return False
return True
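A rough sketch of how these helpers could be wired to the Zenodo re-download from earlier in this thread (hypothetical glue code, not part of any library; paths are examples):
from pathlib import Path

root = Path('~/data/l2l_data').expanduser()
if should_we_redownload_mi_data_set(root):
    # re-download the Zenodo mirror linked earlier in this thread and extract it into root
    download_and_extract(url='https://zenodo.org/record/7311663/files/brandoslearn2learnminiimagenet.zip',
                         path_used_for_zip='~/data/tmp',
                         path_used_for_dataset=root,
                         rm_zip_file=True)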
When I use the official code train_dataset = l2l.vision.datasets.MiniImagenet(root='./data', mode='train'), I get the error FileNotFoundError: [Errno 2] No such file or directory: './data/mini-imagenet-cache-train.pkl'. Currently, I can resolve this issue by manually downloading the file from Google Drive or Dropbox.
However, upon checking the source code (https://github.com/learnables/learn2learn/blob/master/learn2learn/vision/datasets/mini_imagenet.py#L87), I noticed that the download_pkl() function in the class MiniImagenet(data.Dataset) takes the following Google Drive file IDs as input: google_drive_file_id = '1wpmY-hmiJUUlRBkO9ZDCXAcIpHEFdOhD', google_drive_file_id = '1I3itTXpXxGV68olxM5roceUMG8itH9Xj', google_drive_file_id = '1KY5e491bkLFqJDp0-UWou3463Mo8AOco'. However, when I try to use gdown in Colab to download these files, I get the error "Access denied with the following error: Cannot retrieve the public link of the file. You may need to change the permission to 'Anyone with the link', or have had many accesses." Therefore, I believe that the issue may actually be with the permissions on Google Drive, which need to be set to "Anyone with the link" in order for the download_pkl() function to work properly.
The current bug causing the error is that the function download_file_from_google_drive() does not check for any error messages in the response, so the program assumes that the file has been successfully downloaded when in fact nothing has been downloaded.
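One cheap way to detect this case after the download (my suggestion, not the actual fix adopted by learn2learn): peek at the first bytes of the saved file and treat an HTML page as a failed download.
def looks_like_html_error_page(path: str) -> bool:
    """Google Drive quota/permission errors save an HTML page instead of the pickle,
    which is what later makes pickle.load fail with invalid load key '<'."""
    with open(path, 'rb') as f:
        head = f.read(64)
    return head.lstrip().startswith(b'<')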
Did you check my alternative way to download the data?
I wish l2l had an official solution.
These links are from the official source code, which is a quick and effective way to solve the file download problem. As I mentioned above, the official code itself is flawed and does not check the HTTP response:
def download_file_from_google_drive(id, destination):
URL = "https://docs.google.com/uc?export=download"
session = requests.Session()
response = session.get(URL, params={'id': id}, stream=True)
token = get_confirm_token(response)
if token:
params = {'id': id, 'confirm': token}
response = session.get(URL, params=params, stream=True) # <--- bug is here
save_response_content(response, destination)
This results in the error going undetected by the main source code, so the Dropbox fallback does not work:
pickle_file = os.path.join(self.root, 'mini-imagenet-cache-' + mode + '.pkl')
try:
if not self._check_exists() and download:
print('Downloading mini-ImageNet --', mode)
download_pkl(google_drive_file_id, self.root, mode)
with open(pickle_file, 'rb') as f:
self.data = pickle.load(f)
except pickle.UnpicklingError: # <--- Exception handling cannot be triggered!! Can not detect the permission error (no http response)
if not self._check_exists() and download:
print('Download failed. Re-trying mini-ImageNet --', mode)
download_file(dropbox_file_link, pickle_file)
with open(pickle_file, 'rb') as f:
self.data = pickle.load(f)
Did you check my alternative way to download the data? I wish l2l had an official solution
The code will download a temporary .pkl file for you. Then I chose to download it manually.
Closing: will be fixed as soon as we merge #400.
Somehow I get the following error:
but I only get it on the DGX machine, not locally... has anyone had this error before?
Cross-post: https://stackoverflow.com/questions/71094907/how-does-one-download-data-for-a-dgx-a100-machine-that-returns-a-invalid-load-ke