Cannot load the dataset split (in streaming mode) to extract the first rows.
Error code: StreamingRowsError
Exception: KeyError
Message: 'jpeg'
Traceback: Traceback (most recent call last):
File "/src/services/worker/src/worker/job_runners/split/first_rows.py", line 322, in compute
compute_first_rows_from_parquet_response(
File "/src/services/worker/src/worker/job_runners/split/first_rows.py", line 88, in compute_first_rows_from_parquet_response
rows_index = indexer.get_rows_index(
File "/src/libs/libcommon/src/libcommon/parquet_utils.py", line 640, in get_rows_index
return RowsIndex(
File "/src/libs/libcommon/src/libcommon/parquet_utils.py", line 521, in __init__
self.parquet_index = self._init_parquet_index(
File "/src/libs/libcommon/src/libcommon/parquet_utils.py", line 538, in _init_parquet_index
response = get_previous_step_or_raise(
File "/src/libs/libcommon/src/libcommon/simple_cache.py", line 591, in get_previous_step_or_raise
raise CachedArtifactError(
libcommon.simple_cache.CachedArtifactError: The previous step failed.
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/src/services/worker/src/worker/utils.py", line 96, in get_rows_or_raise
return get_rows(
File "/src/libs/libcommon/src/libcommon/utils.py", line 197, in decorator
return func(*args, **kwargs)
File "/src/services/worker/src/worker/utils.py", line 73, in get_rows
rows_plus_one = list(itertools.islice(ds, rows_max_number + 1))
File "/src/services/worker/.venv/lib/python3.9/site-packages/datasets/iterable_dataset.py", line 1816, in __iter__
for key, example in ex_iterable:
File "/src/services/worker/.venv/lib/python3.9/site-packages/datasets/iterable_dataset.py", line 238, in __iter__
for key_example in islice(self.generate_examples_fn(**gen_kwags), shard_example_idx_start, None):
File "/src/services/worker/.venv/lib/python3.9/site-packages/datasets/packaged_modules/webdataset/webdataset.py", line 115, in _generate_examples
example[field_name] = {"path": example["__key__"] + "." + field_name, "bytes": example[field_name]}
KeyError: 'jpeg'
For example, this error currently occurs on https://huggingface.co/datasets/ProGamerGov/synthetic-dataset-1m-dalle3-high-quality-captions