huggingface / dataset-viewer

Backend that powers the dataset viewer on Hugging Face dataset pages through a public API.
https://huggingface.co/docs/dataset-viewer
Apache License 2.0

Better error code/message for badly formatted Parquet files #2625

Open · severo opened this issue 8 months ago

severo commented 8 months ago

See https://huggingface.co/datasets/PleIAs/Dutch-PD/discussions/1

Indeed, it seems that the Parquet files don't share the same type for the publication_date field: it is either int64 or string. Some files, like dutch_pd_8.parquet, even contain only empty strings as publication_date. This causes an error in the Dataset Viewer, because it requires all the files to have the same types.
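
For illustration, here is a minimal sketch (not the Dataset Viewer's actual validation code) that detects this kind of mismatch by comparing the Arrow schemas stored in each file's footer; dutch_pd_1.parquet is a hypothetical local copy alongside dutch_pd_8.parquet:

```python
# Minimal sketch: compare Arrow schemas across Parquet files and report
# fields whose declared types differ. Only file footers are read, not the data.
import pyarrow.parquet as pq

def find_type_mismatches(paths):
    """Map each field name to the set of types seen across the files."""
    types_seen = {}
    for path in paths:
        for field in pq.read_schema(path):
            types_seen.setdefault(field.name, set()).add(str(field.type))
    return {name: types for name, types in types_seen.items() if len(types) > 1}

# Hypothetical local copies of two of the dataset's files:
for name, types in find_type_mismatches(["dutch_pd_1.parquet", "dutch_pd_8.parquet"]).items():
    print(f"field {name!r} has inconsistent types: {sorted(types)}")
# e.g. field 'publication_date' has inconsistent types: ['int64', 'string']
```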

The current error is:

```
{
    "error": "An error occurred while generating the dataset",
    "cause_exception": "DatasetGenerationError",
    "cause_message": "An error occurred while generating the dataset",
    "cause_traceback": [
        "Traceback (most recent call last):\n",
        ' File "/src/services/worker/src/worker/job_runners/config/parquet_and_info.py", line 1264, in compute_config_parquet_and_info_response\n fill_builder_info(builder, hf_endpoint=hf_endpoint, hf_token=hf_token, validate=validate)\n',
        ' File "/src/services/worker/src/worker/job_runners/config/parquet_and_info.py", line 696, in fill_builder_info\n ) = retry_validate_get_features_num_examples_size_and_compression_ratio(\n',
        ' File "/src/services/worker/src/worker/job_runners/config/parquet_and_info.py", line 618, in retry_validate_get_features_num_examples_size_and_compression_ratio\n validate(pf)\n',
        ' File "/src/services/worker/src/worker/job_runners/config/parquet_and_info.py", line 656, in validate\n raise TooBigRowGroupsError(\n',
        "worker.job_runners.config.parquet_and_info.TooBigRowGroupsError: Parquet file has too big row groups. First row group has 1474264638 which exceeds the limit of 300000000\n",
        "\nDuring handling of the above exception, another exception occurred:\n\n",
        "Traceback (most recent call last):\n",
        ' File "/src/services/worker/.venv/lib/python3.9/site-packages/datasets/builder.py", line 1973, in _prepare_split_single\n for _, table in generator:\n',
        ' File "/src/services/worker/src/worker/job_runners/config/parquet_and_info.py", line 805, in wrapped\n for item in generator(*args, **kwargs):\n',
        ' File "/src/services/worker/.venv/lib/python3.9/site-packages/datasets/packaged_modules/parquet/parquet.py", line 96, in _generate_tables\n yield f"{file_idx}_{batch_idx}", self._cast_table(pa_table)\n',
        ' File "/src/services/worker/.venv/lib/python3.9/site-packages/datasets/packaged_modules/parquet/parquet.py", line 74, in _cast_table\n pa_table = table_cast(pa_table, self.info.features.arrow_schema)\n',
        ' File "/src/services/worker/.venv/lib/python3.9/site-packages/datasets/table.py", line 2240, in table_cast\n return cast_table_to_schema(table, schema)\n',
        ' File "/src/services/worker/.venv/lib/python3.9/site-packages/datasets/table.py", line 2199, in cast_table_to_schema\n arrays = [cast_array_to_feature(table[name], feature) for name, feature in features.items()]\n',
        ' File "/src/services/worker/.venv/lib/python3.9/site-packages/datasets/table.py", line 2199, in <listcomp>\n arrays = [cast_array_to_feature(table[name], feature) for name, feature in features.items()]\n',
        ' File "/src/services/worker/.venv/lib/python3.9/site-packages/datasets/table.py", line 1793, in wrapper\n return pa.chunked_array([func(chunk, *args, **kwargs) for chunk in array.chunks])\n',
        ' File "/src/services/worker/.venv/lib/python3.9/site-packages/datasets/table.py", line 1793, in <listcomp>\n return pa.chunked_array([func(chunk, *args, **kwargs) for chunk in array.chunks])\n',
        ' File "/src/services/worker/.venv/lib/python3.9/site-packages/datasets/table.py", line 2065, in cast_array_to_feature\n return array_cast(array, feature(), allow_number_to_str=allow_number_to_str)\n',
        ' File "/src/services/worker/.venv/lib/python3.9/site-packages/datasets/table.py", line 1795, in wrapper\n return func(array, *args, **kwargs)\n',
        ' File "/src/services/worker/.venv/lib/python3.9/site-packages/datasets/table.py", line 1937, in array_cast\n return array.cast(pa_type)\n',
        ' File "pyarrow/array.pxi", line 997, in pyarrow.lib.Array.cast\n',
        ' File "/src/services/worker/.venv/lib/python3.9/site-packages/pyarrow/compute.py", line 404, in cast\n return call_function("cast", [arr], options, memory_pool)\n',
        ' File "pyarrow/_compute.pyx", line 590, in pyarrow._compute.call_function\n',
        ' File "pyarrow/_compute.pyx", line 385, in pyarrow._compute.Function.call\n',
        ' File "pyarrow/error.pxi", line 154, in pyarrow.lib.pyarrow_internal_check_status\n',
        ' File "pyarrow/error.pxi", line 91, in pyarrow.lib.check_status\n',
        "pyarrow.lib.ArrowInvalid: Failed to parse string: '' as a scalar of type int64\n",
        "\nThe above exception was the direct cause of the following exception:\n\n",
        "Traceback (most recent call last):\n",
        ' File "/src/services/worker/src/worker/job_manager.py", line 125, in process\n job_result = self.job_runner.compute()\n',
        ' File "/src/services/worker/src/worker/job_runners/config/parquet_and_info.py", line 1393, in compute\n compute_config_parquet_and_info_response(\n',
        ' File "/src/services/worker/src/worker/job_runners/config/parquet_and_info.py", line 1277, in compute_config_parquet_and_info_response\n parquet_operations, partial = stream_convert_to_parquet(\n',
        ' File "/src/services/worker/src/worker/job_runners/config/parquet_and_info.py", line 896, in stream_convert_to_parquet\n builder._prepare_split(\n',
        ' File "/src/services/worker/.venv/lib/python3.9/site-packages/datasets/builder.py", line 1860, in _prepare_split\n for job_id, done, content in self._prepare_split_single(\n',
        ' File "/src/services/worker/.venv/lib/python3.9/site-packages/datasets/builder.py", line 2016, in _prepare_split_single\n raise DatasetGenerationError("An error occurred while generating the dataset") from e\n',
        "datasets.exceptions.DatasetGenerationError: An error occurred while generating the dataset\n",
    ],
}
```
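
What the issue title asks for could look roughly like the sketch below: catch the pyarrow cast failure where it happens and re-raise it as a dedicated error whose message names the real problem. The SchemaMismatchError class and the wrapper are hypothetical, not the worker's actual error taxonomy; only pyarrow.lib.ArrowInvalid comes from the traceback above.

```python
# Hedged sketch: map pyarrow's low-level cast failure to a dedicated,
# user-readable error. SchemaMismatchError is hypothetical.
import pyarrow as pa
import pyarrow.lib

class SchemaMismatchError(Exception):
    """Hypothetical dedicated error: files disagree on a column's type."""

def cast_with_clear_error(array: pa.Array, target_type: pa.DataType) -> pa.Array:
    try:
        return array.cast(target_type)
    except pyarrow.lib.ArrowInvalid as err:
        raise SchemaMismatchError(
            "The Parquet files do not all use the same type for this column, "
            f"so the values cannot be cast consistently ({err})."
        ) from err

# Reproduces the failure from the traceback: empty strings cast to int64.
try:
    cast_with_clear_error(pa.array(["", "123"]), pa.int64())
except SchemaMismatchError as err:
    print(err)
```

The API could then return a stable error code for this case instead of the generic DatasetGenerationError.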
severo commented 8 months ago

We don't show any of this information to the user:

(Screenshot from 2024-03-22 at 18:52:57 showing the error as displayed to the user)
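
As a stopgap on the display side, a sketch like the following (a hypothetical heuristic helper, assuming the JSON payload shown above) could surface the direct cause from cause_traceback instead of the generic top-level message:

```python
# Hedged sketch: extract the innermost meaningful exception line from the
# "cause_traceback" field of the payload above. A heuristic, not dataset-viewer code.
def direct_cause(payload: dict) -> str:
    skip = ("Traceback", "File", "During handling", "The above exception")
    exception_lines = [
        line.strip()
        for line in payload.get("cause_traceback", [])
        if line.strip() and not line.strip().startswith(skip)
    ]
    if len(exception_lines) >= 2:
        # The last line is the generic wrapper (DatasetGenerationError);
        # the one before it is its direct cause.
        return exception_lines[-2]
    return exception_lines[0] if exception_lines else payload.get("error", "")

# With the payload above, this returns:
# "pyarrow.lib.ArrowInvalid: Failed to parse string: '' as a scalar of type int64"
```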