epfLLM / meditron

Meditron is a suite of open-source medical Large Language Models (LLMs).
https://huggingface.co/epfl-llm
Apache License 2.0
1.77k stars · 159 forks

Loading the guidelines with huggingface datasets fails #6

Closed · paulhager closed 7 months ago

paulhager commented 7 months ago

Running the following code

from datasets import load_dataset

dataset = load_dataset("epfl-llm/guidelines")

gives me this error:

---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
File ~/miniconda3/envs/llm/lib/python3.10/site-packages/datasets/builder.py:1932, in ArrowBasedBuilder._prepare_split_single(self, gen_kwargs, fpath, file_format, max_shard_size, job_id)
   1925     writer = writer_class(
   1926         features=writer._features,
   1927         path=fpath.replace("SSSSS", f"{shard_id:05d}").replace("JJJJJ", f"{job_id:05d}"),
   (...)
   1930         embed_local_files=embed_local_files,
   1931     )
-> 1932 writer.write_table(table)
   1933 num_examples_progress_update += len(table)

File ~/miniconda3/envs/llm/lib/python3.10/site-packages/datasets/arrow_writer.py:573, in ArrowWriter.write_table(self, pa_table, writer_batch_size)
    572 pa_table = pa_table.combine_chunks()
--> 573 pa_table = table_cast(pa_table, self._schema)
    574 if self.embed_local_files:

File ~/miniconda3/envs/llm/lib/python3.10/site-packages/datasets/table.py:2332, in table_cast(table, schema)
   2331 if table.schema != schema:
-> 2332     return cast_table_to_schema(table, schema)
   2333 elif table.schema.metadata != schema.metadata:

File ~/miniconda3/envs/llm/lib/python3.10/site-packages/datasets/table.py:2291, in cast_table_to_schema(table, schema)
   2290     raise ValueError(f"Couldn't cast\n{table.schema}\nto\n{features}\nbecause column names don't match")
-> 2291 arrays = [cast_array_to_feature(table[name], feature) for name, feature in features.items()]
   2292 return pa.Table.from_arrays(arrays, schema=schema)

File ~/miniconda3/envs/llm/lib/python3.10/site-packages/datasets/table.py:2291, in <listcomp>(.0)
   2290     raise ValueError(f"Couldn't cast\n{table.schema}\nto\n{features}\nbecause column names don't match")
-> 2291 arrays = [cast_array_to_feature(table[name], feature) for name, feature in features.items()]
   2292 return pa.Table.from_arrays(arrays, schema=schema)

File ~/miniconda3/envs/llm/lib/python3.10/site-packages/datasets/table.py:1834, in _wrap_for_chunked_arrays.<locals>.wrapper(array, *args, **kwargs)
   1833 if isinstance(array, pa.ChunkedArray):
-> 1834     return pa.chunked_array([func(chunk, *args, **kwargs) for chunk in array.chunks])
   1835 else:

File ~/miniconda3/envs/llm/lib/python3.10/site-packages/datasets/table.py:1834, in <listcomp>(.0)
   1833 if isinstance(array, pa.ChunkedArray):
-> 1834     return pa.chunked_array([func(chunk, *args, **kwargs) for chunk in array.chunks])
   1835 else:

File ~/miniconda3/envs/llm/lib/python3.10/site-packages/datasets/table.py:2147, in cast_array_to_feature(array, feature, allow_number_to_str)
   2146 elif not isinstance(feature, (Sequence, dict, list, tuple)):
-> 2147     return array_cast(array, feature(), allow_number_to_str=allow_number_to_str)
   2148 raise TypeError(f"Couldn't cast array of type\n{array.type}\nto\n{feature}")

File ~/miniconda3/envs/llm/lib/python3.10/site-packages/datasets/table.py:1836, in _wrap_for_chunked_arrays.<locals>.wrapper(array, *args, **kwargs)
   1835 else:
-> 1836     return func(array, *args, **kwargs)

File ~/miniconda3/envs/llm/lib/python3.10/site-packages/datasets/table.py:2029, in array_cast(array, pa_type, allow_number_to_str)
   2028 if pa.types.is_null(pa_type) and not pa.types.is_null(array.type):
-> 2029     raise TypeError(f"Couldn't cast array of type {array.type} to {pa_type}")
   2030 return array.cast(pa_type)

TypeError: Couldn't cast array of type string to null

The above exception was the direct cause of the following exception:

DatasetGenerationError                    Traceback (most recent call last)
model_playground.ipynb Cell 6 line 3
----> 3 dataset = load_dataset("epfl-llm/guidelines")

File ~/miniconda3/envs/llm/lib/python3.10/site-packages/datasets/load.py:2152, in load_dataset(path, name, data_dir, data_files, split, cache_dir, features, download_config, download_mode, verification_mode, ignore_verifications, keep_in_memory, save_infos, revision, token, use_auth_token, task, streaming, num_proc, storage_options, **config_kwargs)
   2149 try_from_hf_gcs = path not in _PACKAGED_DATASETS_MODULES
   2151 # Download and prepare data
-> 2152 builder_instance.download_and_prepare(
   2153     download_config=download_config,
   2154     download_mode=download_mode,
   2155     verification_mode=verification_mode,
   2156     try_from_hf_gcs=try_from_hf_gcs,
   2157     num_proc=num_proc,
   2158     storage_options=storage_options,
   2159 )
   2161 # Build dataset for splits
   2162 keep_in_memory = (
   2163     keep_in_memory if keep_in_memory is not None else is_small_dataset(builder_instance.info.dataset_size)
   2164 )

File ~/miniconda3/envs/llm/lib/python3.10/site-packages/datasets/builder.py:948, in DatasetBuilder.download_and_prepare(self, output_dir, download_config, download_mode, verification_mode, ignore_verifications, try_from_hf_gcs, dl_manager, base_path, use_auth_token, file_format, max_shard_size, num_proc, storage_options, **download_and_prepare_kwargs)
    946     if num_proc is not None:
    947         prepare_split_kwargs["num_proc"] = num_proc
--> 948     self._download_and_prepare(
    949         dl_manager=dl_manager,
    950         verification_mode=verification_mode,
    951         **prepare_split_kwargs,
    952         **download_and_prepare_kwargs,
    953     )
    954 # Sync info
    955 self.info.dataset_size = sum(split.num_bytes for split in self.info.splits.values())

File ~/miniconda3/envs/llm/lib/python3.10/site-packages/datasets/builder.py:1043, in DatasetBuilder._download_and_prepare(self, dl_manager, verification_mode, **prepare_split_kwargs)
   1039 split_dict.add(split_generator.split_info)
   1041 try:
   1042     # Prepare split will record examples associated to the split
-> 1043     self._prepare_split(split_generator, **prepare_split_kwargs)
   1044 except OSError as e:
   1045     raise OSError(
   1046         "Cannot find data file. "
   1047         + (self.manual_download_instructions or "")
   1048         + "\nOriginal error:\n"
   1049         + str(e)
   1050     ) from None

File ~/miniconda3/envs/llm/lib/python3.10/site-packages/datasets/builder.py:1805, in ArrowBasedBuilder._prepare_split(self, split_generator, file_format, num_proc, max_shard_size)
   1803 job_id = 0
   1804 with pbar:
-> 1805     for job_id, done, content in self._prepare_split_single(
   1806         gen_kwargs=gen_kwargs, job_id=job_id, **_prepare_split_args
   1807     ):
   1808         if done:
   1809             result = content

File ~/miniconda3/envs/llm/lib/python3.10/site-packages/datasets/builder.py:1950, in ArrowBasedBuilder._prepare_split_single(self, gen_kwargs, fpath, file_format, max_shard_size, job_id)
   1948     if isinstance(e, SchemaInferenceError) and e.__context__ is not None:
   1949         e = e.__context__
-> 1950     raise DatasetGenerationError("An error occurred while generating the dataset") from e
   1952 yield job_id, True, (total_num_examples, total_num_bytes, writer._features, num_shards, shard_lengths)

DatasetGenerationError: An error occurred while generating the dataset
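For context, the failure mode here appears to be Arrow's type inference on sparse columns: if a column contains only missing values in the first chunk Arrow sees, it is typed as null, and later chunks with real strings cannot be cast to that type. A minimal sketch of the same failure in plain pyarrow (the column name "overview" is hypothetical, not necessarily the offending column in the dataset):

import pyarrow as pa

# Shard 1: the column is entirely empty, so Arrow infers the "null" type.
first = pa.table({"overview": pa.array([None, None])})
print(first.schema)  # overview: null

# A later shard carries real strings in the same column.
later = pa.table({"overview": pa.array(["guideline text"])})

# Forcing the later shard into the inferred schema fails: a string array
# cannot be cast to null. datasets surfaces this as the TypeError above.
try:
    later.cast(first.schema)
except Exception as err:
    print(err)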
AGBonnet commented 7 months ago

Hello Paul,

Thank you so much for finding this bug.

I just added feature types for each column manually and it seems to have fixed the issue (and the dataset viewer).
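For anyone hitting this on an older cache, the same fix can also be applied from the user side by declaring the feature types explicitly when loading (the features parameter is visible in the load_dataset signature in the traceback above). A rough sketch, with illustrative column names rather than the dataset's actual schema:

from datasets import Features, Value, load_dataset

# Illustrative column names only; the authoritative schema is the one
# now declared on the Hub for epfl-llm/guidelines.
features = Features({
    "clean_text": Value("string"),
    "raw_text": Value("string"),
    "source": Value("string"),
})

# Passing explicit features skips type inference, so columns that are
# empty in early shards can no longer be inferred as null.
dataset = load_dataset("epfl-llm/guidelines", features=features)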

Let me know if it works on your end. If so, I'll close the issue.

Cheers!

paulhager commented 7 months ago

Works now, thanks for the quick fix!