When trying to load the dataset using

```python
dset = load_dataset('astroclip/datasets/legacy_survey.py')
```

I am getting the error below. Can you guide me on what should be done to address it?
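For reference, a minimal, self-contained version of the call (a sketch; I'm assuming the `astroclip` checkout is the working directory so the relative script path resolves, and no config name is passed, hence the default noted in the log):

```python
from datasets import load_dataset

# Load via the repo's loading script; with no config name given,
# datasets falls back to the default config ("joint", per the log below).
dset = load_dataset('astroclip/datasets/legacy_survey.py')
```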
No config specified, defaulting to: legacy_survey/joint
Downloading and preparing dataset legacy_survey/joint to file:///home/anb5km/.cache/huggingface/datasets/legacy_survey/joint/1.1.5/23c8686e9273427201364d6e71df76a96e9741339dc3edb17509aaad8af14294...
Generating train split: 986 examples [00:26, 41.95 examples/s]
/home/anb5km/.conda/envs/fbr/lib/python3.9/site-packages/datasets/features/features.py:641: FutureWarning: pyarrow.PyExtensionType is deprecated and will refuse deserialization by default. Instead, please derive from pyarrow.ExtensionType and implement your own serialization mechanism.
pa.PyExtensionType.__init__(self, self.storage_dtype)
/home/anb5km/.conda/envs/fbr/lib/python3.9/site-packages/datasets/arrow_writer.py:405: RuntimeWarning: pickle-based deserialization of pyarrow.PyExtensionType subclasses is disabled by default; if you only ingest trusted data files, you may re-enable this using `pyarrow.PyExtensionType.set_auto_load(True)`.
In the future, Python-defined extension subclasses should derive from pyarrow.ExtensionType (not pyarrow.PyExtensionType) and implement their own serialization mechanism.
else (pa.schema(self._features.type) if self._features is not None else None)
/home/anb5km/.conda/envs/fbr/lib/python3.9/site-packages/datasets/arrow_writer.py:405: FutureWarning: pyarrow.PyExtensionType is deprecated and will refuse deserialization by default. Instead, please derive from pyarrow.ExtensionType and implement your own serialization mechanism.
else (pa.schema(self._features.type) if self._features is not None else None)
/home/anb5km/.conda/envs/fbr/lib/python3.9/site-packages/datasets/features/features.py:1599: RuntimeWarning: pickle-based deserialization of pyarrow.PyExtensionType subclasses is disabled by default; if you only ingest trusted data files, you may re-enable this using pyarrow.PyExtensionType.set_auto_load(True).
In the future, Python-defined extension subclasses should derive from pyarrow.ExtensionType (not pyarrow.PyExtensionType) and implement their own serialization mechanism.
return pa.schema(self.type).with_metadata({"huggingface": json.dumps(hf_metadata)})
/home/anb5km/.conda/envs/fbr/lib/python3.9/site-packages/datasets/features/features.py:1599: FutureWarning: pyarrow.PyExtensionType is deprecated and will refuse deserialization by default. Instead, please derive from pyarrow.ExtensionType and implement your own serialization mechanism.
return pa.schema(self.type).with_metadata({"huggingface": json.dumps(hf_metadata)})
/home/anb5km/.conda/envs/fbr/lib/python3.9/site-packages/datasets/arrow_writer.py:554: RuntimeWarning: pickle-based deserialization of pyarrow.PyExtensionType subclasses is disabled by default; if you only ingest trusted data files, you may re-enable this using pyarrow.PyExtensionType.set_auto_load(True).
In the future, Python-defined extension subclasses should derive from pyarrow.ExtensionType (not pyarrow.PyExtensionType) and implement their own serialization mechanism.
pa_table = pa.Table.from_arrays(arrays, schema=schema)
/home/anb5km/.conda/envs/fbr/lib/python3.9/site-packages/datasets/arrow_writer.py:554: FutureWarning: pyarrow.PyExtensionType is deprecated and will refuse deserialization by default. Instead, please derive from pyarrow.ExtensionType and implement your own serialization mechanism.
pa_table = pa.Table.from_arrays(arrays, schema=schema)
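(Side note on the warnings above: they point at an opt-in escape hatch. My reading of it, as a minimal sketch — assuming pyarrow ≥ 14, where pickle-based `PyExtensionType` deserialization was disabled by default, and that the data files are trusted:

```python
import pyarrow as pa

# Re-enable pickle-based deserialization of PyExtensionType subclasses,
# as the RuntimeWarning suggests; only appropriate for trusted data files.
pa.PyExtensionType.set_auto_load(True)
```

called before `load_dataset`. I'm not sure whether this is related to the final ArrowTypeError, but noting it here.)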
Generating train split: 999 examples [00:36, 4.31 examples/s]
Traceback (most recent call last):
File "/home/anb5km/.conda/envs/fbr/lib/python3.9/site-packages/datasets/builder.py", line 1589, in _prepare_split_single
writer.write(example, key)
File "/home/anb5km/.conda/envs/fbr/lib/python3.9/site-packages/datasets/arrow_writer.py", line 488, in write
self.write_examples_on_file()
File "/home/anb5km/.conda/envs/fbr/lib/python3.9/site-packages/datasets/arrow_writer.py", line 446, in write_examples_on_file
self.write_batch(batch_examples=batch_examples)
File "/home/anb5km/.conda/envs/fbr/lib/python3.9/site-packages/datasets/arrow_writer.py", line 554, in write_batch
pa_table = pa.Table.from_arrays(arrays, schema=schema)
File "pyarrow/table.pxi", line 3969, in pyarrow.lib.Table.from_arrays
File "pyarrow/table.pxi", line 1463, in pyarrow.lib._sanitize_arrays
File "pyarrow/array.pxi", line 371, in pyarrow.lib.asarray
File "pyarrow/array.pxi", line 997, in pyarrow.lib.Array.cast
File "/home/anb5km/.conda/envs/fbr/lib/python3.9/site-packages/pyarrow/compute.py", line 404, in cast
return call_function("cast", [arr], options, memory_pool)
File "pyarrow/_compute.pyx", line 590, in pyarrow._compute.call_function
File "pyarrow/_compute.pyx", line 385, in pyarrow._compute.Function.call
File "pyarrow/error.pxi", line 154, in pyarrow.lib.pyarrow_internal_check_status
File "pyarrow/error.pxi", line 91, in pyarrow.lib.check_status
pyarrow.lib.ArrowTypeError: Casting from 'extension<arrow.py_extension_type>' to different extension type 'extension<arrow.py_extension_type>' not permitted. One can first cast to the storage type, then to the extension type.
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/home/anb5km/.conda/envs/fbr/lib/python3.9/site-packages/datasets/builder.py", line 1598, in _prepare_split_single
num_examples, num_bytes = writer.finalize()
File "/home/anb5km/.conda/envs/fbr/lib/python3.9/site-packages/datasets/arrow_writer.py", line 581, in finalize
self.write_examples_on_file()
File "/home/anb5km/.conda/envs/fbr/lib/python3.9/site-packages/datasets/arrow_writer.py", line 446, in write_examples_on_file
self.write_batch(batch_examples=batch_examples)
File "/home/anb5km/.conda/envs/fbr/lib/python3.9/site-packages/datasets/arrow_writer.py", line 554, in write_batch
pa_table = pa.Table.from_arrays(arrays, schema=schema)
File "pyarrow/table.pxi", line 3969, in pyarrow.lib.Table.from_arrays
File "pyarrow/table.pxi", line 1463, in pyarrow.lib._sanitize_arrays
File "pyarrow/array.pxi", line 371, in pyarrow.lib.asarray
File "pyarrow/array.pxi", line 997, in pyarrow.lib.Array.cast
File "/home/anb5km/.conda/envs/fbr/lib/python3.9/site-packages/pyarrow/compute.py", line 404, in cast
return call_function("cast", [arr], options, memory_pool)
File "pyarrow/_compute.pyx", line 590, in pyarrow._compute.call_function
File "pyarrow/_compute.pyx", line 385, in pyarrow._compute.Function.call
File "pyarrow/error.pxi", line 154, in pyarrow.lib.pyarrow_internal_check_status
File "pyarrow/error.pxi", line 91, in pyarrow.lib.check_status
pyarrow.lib.ArrowTypeError: Casting from 'extension<arrow.py_extension_type>' to different extension type 'extension<arrow.py_extension_type>' not permitted. One can first cast to the storage type, then to the extension type.
The above exception was the direct cause of the following exception:
Traceback (most recent call last):
File "", line 1, in
File "/home/anb5km/.conda/envs/fbr/lib/python3.9/site-packages/datasets/load.py", line 1758, in load_dataset
builder_instance.download_and_prepare(
File "/home/anb5km/.conda/envs/fbr/lib/python3.9/site-packages/datasets/builder.py", line 860, in download_and_prepare
self._download_and_prepare(
File "/home/anb5km/.conda/envs/fbr/lib/python3.9/site-packages/datasets/builder.py", line 1612, in _download_and_prepare
super()._download_and_prepare(
File "/home/anb5km/.conda/envs/fbr/lib/python3.9/site-packages/datasets/builder.py", line 953, in _download_and_prepare
self._prepare_split(split_generator, **prepare_split_kwargs)
File "/home/anb5km/.conda/envs/fbr/lib/python3.9/site-packages/datasets/builder.py", line 1450, in _prepare_split
for job_id, done, content in self._prepare_split_single(
File "/home/anb5km/.conda/envs/fbr/lib/python3.9/site-packages/datasets/builder.py", line 1607, in _prepare_split_single
raise DatasetGenerationError("An error occurred while generating the dataset") from e
datasets.builder.DatasetGenerationError: An error occurred while generating the dataset
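The final ArrowTypeError says an extension-to-extension cast must go through the storage type. To illustrate what that rule means in pyarrow (a hypothetical extension type of my own, not the one `datasets` defines internally):

```python
import pyarrow as pa

# Hypothetical extension type, purely to illustrate the rule in the
# error message: extension -> extension casts are rejected, but
# extension -> storage -> extension is allowed.
class DemoListType(pa.ExtensionType):
    def __init__(self):
        super().__init__(pa.list_(pa.float32()), "demo.list_f32")

    def __arrow_ext_serialize__(self):
        return b""

    @classmethod
    def __arrow_ext_deserialize__(cls, storage_type, serialized):
        return cls()

typ = DemoListType()
pa.register_extension_type(typ)

storage = pa.array([[1.0, 2.0], [3.0]], type=pa.list_(pa.float32()))
ext_arr = pa.ExtensionArray.from_storage(typ, storage)

# First cast to the storage type, then rebuild the extension array:
round_trip = pa.ExtensionArray.from_storage(typ, ext_arr.cast(typ.storage_type))
```

Since the failing cast happens inside `datasets/arrow_writer.py` rather than in my own code, I can't apply that two-step cast directly. The warnings suggest the incompatibility appeared when pyarrow started refusing `PyExtensionType` deserialization, so pinning pyarrow below 14 (e.g. `pip install "pyarrow<14"`), where as far as I can tell this behavior didn't exist, is the other workaround I can think of — though I'm not certain that's the intended fix.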