huggingface / transformers

🤗 Transformers: State-of-the-art Machine Learning for Pytorch, TensorFlow, and JAX.
https://huggingface.co/transformers

Problem with the masked language modeling tutorial #31545

Open YaBoyBigPat opened 1 week ago

YaBoyBigPat commented 1 week ago

I keep getting an error when following the masked language modeling tutorial, using the code below exactly as instructed.

from huggingface_hub import notebook_login
notebook_login()

from datasets import load_dataset

eli5 = load_dataset("eli5_category", split="train[:5000]")
eli5 = eli5.train_test_split(test_size=0.2)

eli5["train"][0]

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("distilbert/distilroberta-base")

eli5 = eli5.flatten()
eli5["train"][0]

def preprocess_function(examples):
    return tokenizer([" ".join(x) for x in examples["answers.text"]])

tokenized_eli5 = eli5.map(
    preprocess_function,
    batched=True,
    num_proc=4,
    remove_columns=eli5["train"].column_names,
)

Error:

{ "name": "NameError", "message": "name 'tokenizer' is not defined", "stack": "--------------------------------------------------------------------------- RemoteTraceback Traceback (most recent call last) RemoteTraceback: \"\"\" Traceback (most recent call last): File \"c:\Users\ljjx\HFModels\.venv\Lib\site-packages\multiprocess\pool.py\", line 125, in worker result = (True, func(args, kwds)) ^^^^^^^^^^^^^^^^^^^ File \"c:\Users\ljjx\HFModels\.venv\Lib\site-packages\datasets\utils\py_utils.py\", line 678, in _write_generator_to_queue for i, result in enumerate(func(kwargs)): File \"c:\Users\ljjx\HFModels\.venv\Lib\site-packages\datasets\arrow_dataset.py\", line 3552, in _map_single batch = apply_function_on_filtered_inputs( ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ File \"c:\Users\ljjx\HFModels\.venv\Lib\site-packages\datasets\arrow_dataset.py\", line 3421, in apply_function_on_filtered_inputs processed_inputs = function(fn_args, *additional_args, **fn_kwargs) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ File \"C:\Users\ljjx\AppData\Local\Temp\ipykernel_36716\2763327562.py\", line 2, in preprocess_function NameError: name 'tokenizer' is not defined \"\"\"

The above exception was the direct cause of the following exception:

NameError Traceback (most recent call last) Cell In[10], line 1 ----> 1 tokenized_eli5 = eli5.map( 2 preprocess_function, 3 batched=True, 4 num_proc=4, 5 remove_columns=eli5[\"train\"].column_names, 6 )

File c:\Users\ljjx\HFModels\.venv\Lib\site-packages\datasets\dataset_dict.py:869, in DatasetDict.map(self, function, with_indices, with_rank, input_columns, batched, batch_size, drop_last_batch, remove_columns, keep_in_memory, load_from_cache_file, cache_file_names, writer_batch_size, features, disable_nullable, fn_kwargs, num_proc, desc) 866 if cache_file_names is None: 867 cache_file_names = {k: None for k in self} 868 return DatasetDict( --> 869 { 870 k: dataset.map( 871 function=function, 872 with_indices=with_indices, 873 with_rank=with_rank, 874 input_columns=input_columns, 875 batched=batched, 876 batch_size=batch_size, 877 drop_last_batch=drop_last_batch, 878 remove_columns=remove_columns, 879 keep_in_memory=keep_in_memory, 880 load_from_cache_file=load_from_cache_file, 881 cache_file_name=cache_file_names[k], 882 writer_batch_size=writer_batch_size, 883 features=features, 884 disable_nullable=disable_nullable, 885 fn_kwargs=fn_kwargs, 886 num_proc=num_proc, 887 desc=desc, 888 ) 889 for k, dataset in self.items() 890 } 891 )

File c:\Users\ljjx\HFModels\.venv\Lib\site-packages\datasets\dataset_dict.py:870, in (.0) 866 if cache_file_names is None: 867 cache_file_names = {k: None for k in self} 868 return DatasetDict( 869 { --> 870 k: dataset.map( 871 function=function, 872 with_indices=with_indices, 873 with_rank=with_rank, 874 input_columns=input_columns, 875 batched=batched, 876 batch_size=batch_size, 877 drop_last_batch=drop_last_batch, 878 remove_columns=remove_columns, 879 keep_in_memory=keep_in_memory, 880 load_from_cache_file=load_from_cache_file, 881 cache_file_name=cache_file_names[k], 882 writer_batch_size=writer_batch_size, 883 features=features, 884 disable_nullable=disable_nullable, 885 fn_kwargs=fn_kwargs, 886 num_proc=num_proc, 887 desc=desc, 888 ) 889 for k, dataset in self.items() 890 } 891 )

File c:\Users\ljjx\HFModels\.venv\Lib\site-packages\datasets\arrow_dataset.py:602, in transmit_tasks..wrapper(*args, *kwargs) 600 self: \"Dataset\" = kwargs.pop(\"self\") 601 # apply actual function --> 602 out: Union[\"Dataset\", \"DatasetDict\"] = func(self, args, **kwargs) 603 datasets: List[\"Dataset\"] = list(out.values()) if isinstance(out, dict) else [out] 604 for dataset in datasets: 605 # Remove task templates if a column mapping of the template is no longer valid

File c:\Users\ljjx\HFModels\.venv\Lib\site-packages\datasets\arrow_dataset.py:567, in transmit_format..wrapper(*args, *kwargs) 560 self_format = { 561 \"type\": self._format_type, 562 \"format_kwargs\": self._format_kwargs, 563 \"columns\": self._format_columns, 564 \"output_all_columns\": self._output_all_columns, 565 } 566 # apply actual function --> 567 out: Union[\"Dataset\", \"DatasetDict\"] = func(self, args, **kwargs) 568 datasets: List[\"Dataset\"] = list(out.values()) if isinstance(out, dict) else [out] 569 # re-apply format to the output

File c:\Users\ljjx\HFModels\.venv\Lib\site-packages\datasets\arrow_dataset.py:3253, in Dataset.map(self, function, with_indices, with_rank, input_columns, batched, batch_size, drop_last_batch, remove_columns, keep_in_memory, load_from_cache_file, cache_file_name, writer_batch_size, features, disable_nullable, fn_kwargs, num_proc, suffix_template, new_fingerprint, desc) 3247 logger.info(f\"Spawning {num_proc} processes\") 3248 with hf_tqdm( 3249 unit=\" examples\", 3250 total=pbar_total, 3251 desc=(desc or \"Map\") + f\" (num_proc={num_proc})\", 3252 ) as pbar: -> 3253 for rank, done, content in iflatmap_unordered( 3254 pool, Dataset._map_single, kwargs_iterable=kwargs_per_job 3255 ): 3256 if done: 3257 shards_done += 1

File c:\Users\ljjx\HFModels\.venv\Lib\site-packages\datasets\utils\py_utils.py:718, in iflatmap_unordered(pool, func, kwargs_iterable) 715 finally: 716 if not pool_changed: 717 # we get the result in case there's an error to raise --> 718 [async_result.get(timeout=0.05) for async_result in async_results]

File c:\Users\ljjx\HFModels\.venv\Lib\site-packages\datasets\utils\py_utils.py:718, in (.0) 715 finally: 716 if not pool_changed: 717 # we get the result in case there's an error to raise --> 718 [async_result.get(timeout=0.05) for async_result in async_results]

File c:\Users\ljjx\HFModels\.venv\Lib\site-packages\multiprocess\pool.py:774, in ApplyResult.get(self, timeout) 772 return self._value 773 else: --> 774 raise self._value

NameError: name 'tokenizer' is not defined" }

But when I define a tokenizer and pass it to map(), it says map() doesn't take a tokenizer.

Am I doing something wrong here? I've tried typing everything myself and also copying the code directly from the page.

https://huggingface.co/docs/transformers/en/tasks/masked_language_modeling
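
For reference, map() won't accept a tokenizer as a direct argument, but it does take an fn_kwargs dict whose entries are forwarded to the mapped function as keyword arguments. A minimal sketch of that pattern against the tutorial snippet above (untested here; only the fn_kwargs wiring is new relative to the tutorial):

def preprocess_function(examples, tokenizer):
    return tokenizer([" ".join(x) for x in examples["answers.text"]])

tokenized_eli5 = eli5.map(
    preprocess_function,
    batched=True,
    num_proc=4,
    remove_columns=eli5["train"].column_names,
    fn_kwargs={"tokenizer": tokenizer},  # forwarded to preprocess_function
)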

YaBoyBigPat commented 1 week ago

I think I solved it. I had to import the tokenizer inside of preprocess_function():

def preprocess_function(examples):
    # Import AutoTokenizer inside the function to ensure it's recognized
    # in the multiprocessing context
    from transformers import AutoTokenizer
    tokenizer = AutoTokenizer.from_pretrained("distilroberta-base")
    return tokenizer([" ".join(x) for x in examples["answers.text"]])

Odd, but it worked.
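
The likely explanation for why this works: with num_proc=4, datasets runs preprocess_function in separate worker processes, and on Windows those workers are spawned fresh rather than forked, so globals defined in notebook cells (like the top-level tokenizer) never exist inside them, hence the NameError. Creating the tokenizer inside the function means each worker builds its own copy. Assuming that diagnosis is right, another workaround sketch is to drop multiprocessing for this step so the function runs in the main process, where the global tokenizer is defined:

tokenized_eli5 = eli5.map(
    preprocess_function,
    batched=True,
    # num_proc omitted: map in the main process, where tokenizer exists
    remove_columns=eli5["train"].column_names,
)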