The problem arises in GitHub code for chapter 2, Text Classification.
Describe the bug
TypeError: read_csv() got an unexpected keyword argument 'mangle_dupe_cols'
To Reproduce
I ran 02_classification.ipynb in SageMaker Studio Lab and got an error in cell 15.
I've copied the traceback below at the end of this issue.
I think the crux is this:
TypeError: read_csv() got an unexpected keyword argument 'mangle_dupe_cols'
This issue seems relevant:
"[BUG] With Pandas 2.0.0, load_dataset raises TypeError: read_csv() got an
unexpected keyword argument 'mangle_dupe_cols'"
https://github.com/huggingface/datasets/issues/5744
lhoestq commented on Apr 21
"This has been fixed in datasets 2.11."
I found that if I change line 42 in my copy of install.py to
transformers_cmd = "python -m pip install transformers==4.13.0 datasets==2.11.0".split()
then I don't get the error, though I don't know whether it would be better to use a
higher version of datasets or whether other notebooks may get the same error.
Cheers,
Ken Hoffman
TypeError Traceback (most recent call last)
~/.conda/envs/default/lib/python3.9/site-packages/datasets/builder.py in _prepare_split_single(self, gen_kwargs, fpath, file_format, max_shard_size, job_id)
1816             _time = time.time()
-> 1817             for _, table in generator:
1818 if max_shard_size is not None and writer._num_bytes > max_shard_size:
~/.conda/envs/default/lib/python3.9/site-packages/datasets/packaged_modules/csv/csv.py in _generate_tables(self, files)
176 for file_idx, file in enumerate(itertools.chain.from_iterable(files)):
--> 177 csv_file_reader = pd.read_csv(file, iterator=True, dtype=dtype, **self.config.pd_read_csv_kwargs)
178 try:
~/.conda/envs/default/lib/python3.9/site-packages/datasets/builder.py in download_and_prepare(self, output_dir, download_config, download_mode, ignore_verifications, try_from_hf_gcs, dl_manager, base_path, use_auth_token, file_format, max_shard_size, num_proc, storage_options, **download_and_prepare_kwargs)
858 if num_proc is not None:
859 prepare_split_kwargs["num_proc"] = num_proc
--> 860 self._download_and_prepare(
861 dl_manager=dl_manager,
862 verify_infos=verify_infos,
~/.conda/envs/default/lib/python3.9/site-packages/datasets/builder.py in _download_and_prepare(self, dl_manager, verify_infos, prepare_split_kwargs)
951 try:
952 # Prepare split will record examples associated to the split
--> 953 self._prepare_split(split_generator, prepare_split_kwargs)
954 except OSError as e:
955 raise OSError(
~/.conda/envs/default/lib/python3.9/site-packages/datasets/builder.py in _prepare_split(self, split_generator, file_format, num_proc, max_shard_size)
1704 gen_kwargs = split_generator.gen_kwargs
1705 job_id = 0
-> 1706 for job_id, done, content in self._prepare_split_single(
1707 gen_kwargs=gen_kwargs, job_id=job_id, **_prepare_split_args
1708 ):
~/.conda/envs/default/lib/python3.9/site-packages/datasets/builder.py in _prepare_split_single(self, gen_kwargs, fpath, file_format, max_shard_size, job_id)
1847 if isinstance(e, SchemaInferenceError) and e.context is not None:
1848 e = e.context
-> 1849 raise DatasetGenerationError("An error occurred while generating the dataset") from e
1850
1851 yield job_id, True, (total_num_examples, total_num_bytes, writer._features, num_shards, shard_lengths)
DatasetGenerationError: An error occurred while generating the dataset
Information
The problem arises in GitHub code for chapter 2, Text Classification.
Describe the bug
TypeError: read_csv() got an unexpected keyword argument 'mangle_dupe_cols'
To Reproduce
I ran 02_classification.ipynb in SageMaker Studio Lab and got an error in cell 15. I've copied the traceback below at the end of this issue. I think the crux is this: TypeError: read_csv() got an unexpected keyword argument 'mangle_dupe_cols'
This issue seems relevant: "[BUG] With Pandas 2.0.0, load_dataset raises TypeError: read_csv() got an unexpected keyword argument 'mangle_dupe_cols'" https://github.com/huggingface/datasets/issues/5744 lhoestq commented on Apr 21 "This has been fixed in datasets 2.11."
In https://github.com/nlp-with-transformers/notebooks/blob/main/install.py, lines 41–47 are:
I found that if I change line 42 in my copy of install.py to transformers_cmd = "python -m pip install transformers==4.13.0 datasets==2.11.0".split() then I don't get the error, though I don't know whether it would be better to use a higher version of datasets or whether other notebooks may get the same error.
Cheers, Ken Hoffman
TypeError Traceback (most recent call last) ~/.conda/envs/default/lib/python3.9/site-packages/datasets/builder.py in _prepare_split_single(self, gen_kwargs, fpath, file_format, max_shard_size, job_id) 1816 _time = time.time() -> 1817 for _, table in generator: 1818 if max_shard_size is not None and writer._num_bytes > max_shard_size:
~/.conda/envs/default/lib/python3.9/site-packages/datasets/packaged_modules/csv/csv.py in _generate_tables(self, files) 176 for file_idx, file in enumerate(itertools.chain.from_iterable(files)): --> 177 csv_file_reader = pd.read_csv(file, iterator=True, dtype=dtype, **self.config.pd_read_csv_kwargs) 178 try:
~/.conda/envs/default/lib/python3.9/site-packages/datasets/streaming.py in wrapper(*args, **kwargs) 68 def wrapper(*args, **kwargs): ---> 69 return function(*args, use_auth_token=use_auth_token, **kwargs) 70
~/.conda/envs/default/lib/python3.9/site-packages/datasets/download/streaming_download_manager.py in xpandas_read_csv(filepath_or_buffer, use_auth_token, **kwargs) 726 kwargs["compression"] = _get_extraction_protocol(filepath_or_buffer, use_auth_token=use_auth_token) --> 727 return pd.read_csv(xopen(filepath_or_buffer, "rb", use_auth_token=use_auth_token), **kwargs) 728
TypeError: read_csv() got an unexpected keyword argument 'mangle_dupe_cols'
The above exception was the direct cause of the following exception:
DatasetGenerationError Traceback (most recent call last) /tmp/ipykernel_173/3318475519.py in <cell line: 2>() 1 #hide_output ----> 2 emotions_local = load_dataset("csv", data_files="train.txt", sep=";", 3 names=["text", "label"])
~/.conda/envs/default/lib/python3.9/site-packages/datasets/load.py in load_dataset(path, name, data_dir, data_files, split, cache_dir, features, download_config, download_mode, ignore_verifications, keep_in_memory, save_infos, revision, use_auth_token, task, streaming, num_proc, **config_kwargs) 1755 1756 # Download and prepare data -> 1757 builder_instance.download_and_prepare( 1758 download_config=download_config, 1759 download_mode=download_mode,
~/.conda/envs/default/lib/python3.9/site-packages/datasets/builder.py in download_and_prepare(self, output_dir, download_config, download_mode, ignore_verifications, try_from_hf_gcs, dl_manager, base_path, use_auth_token, file_format, max_shard_size, num_proc, storage_options, **download_and_prepare_kwargs) 858 if num_proc is not None: 859 prepare_split_kwargs["num_proc"] = num_proc --> 860 self._download_and_prepare( 861 dl_manager=dl_manager, 862 verify_infos=verify_infos,
~/.conda/envs/default/lib/python3.9/site-packages/datasets/builder.py in _download_and_prepare(self, dl_manager, verify_infos, prepare_split_kwargs) 951 try: 952 # Prepare split will record examples associated to the split --> 953 self._prepare_split(split_generator, prepare_split_kwargs) 954 except OSError as e: 955 raise OSError(
~/.conda/envs/default/lib/python3.9/site-packages/datasets/builder.py in _prepare_split(self, split_generator, file_format, num_proc, max_shard_size) 1704 gen_kwargs = split_generator.gen_kwargs 1705 job_id = 0 -> 1706 for job_id, done, content in self._prepare_split_single( 1707 gen_kwargs=gen_kwargs, job_id=job_id, **_prepare_split_args 1708 ):
~/.conda/envs/default/lib/python3.9/site-packages/datasets/builder.py in _prepare_split_single(self, gen_kwargs, fpath, file_format, max_shard_size, job_id) 1847 if isinstance(e, SchemaInferenceError) and e.context is not None: 1848 e = e.context -> 1849 raise DatasetGenerationError("An error occurred while generating the dataset") from e 1850 1851 yield job_id, True, (total_num_examples, total_num_bytes, writer._features, num_shards, shard_lengths)
DatasetGenerationError: An error occurred while generating the dataset
Expected behavior
Cell 15 should run without error.