Please provide a clear and concise description of what the question is.
Running tokenizer on dataset (num_proc=10): 0%| | 0/17 [00:13<?, ? examples/s]
multiprocess.pool.RemoteTraceback:
"""
Traceback (most recent call last):
  File "/data/anaconda3/envs/medical_gpt/lib/python3.10/site-packages/multiprocess/pool.py", line 125, in worker
    result = (True, func(*args, **kwds))
  File "/data/anaconda3/envs/medical_gpt/lib/python3.10/site-packages/datasets/utils/py_utils.py", line 614, in _write_generator_to_queue
    for i, result in enumerate(func(**kwargs)):
  File "/data/anaconda3/envs/medical_gpt/lib/python3.10/site-packages/datasets/arrow_dataset.py", line 3470, in _map_single
    batch = apply_function_on_filtered_inputs(
  File "/data/anaconda3/envs/medical_gpt/lib/python3.10/site-packages/datasets/arrow_dataset.py", line 3349, in apply_function_on_filtered_inputs
    processed_inputs = function(*fn_args, *additional_args, **fn_kwargs)
  File "/data/zxd/MedicalGPT/MedicalGPT-1.7.0/pretraining.py", line 416, in tokenize_function
    tokenized_inputs = tokenizer(
  File "/data/anaconda3/envs/medical_gpt/lib/python3.10/site-packages/transformers/tokenization_utils_base.py", line 2803, in __call__
    encodings = self._call_one(text=text, text_pair=text_pair, **all_kwargs)
  File "/data/anaconda3/envs/medical_gpt/lib/python3.10/site-packages/transformers/tokenization_utils_base.py", line 2889, in _call_one
    return self.batch_encode_plus(
  File "/data/anaconda3/envs/medical_gpt/lib/python3.10/site-packages/transformers/tokenization_utils_base.py", line 3071, in batch_encode_plus
    padding_strategy, truncation_strategy, max_length, kwargs = self._get_padding_truncation_strategies(
  File "/data/anaconda3/envs/medical_gpt/lib/python3.10/site-packages/transformers/tokenization_utils_base.py", line 2708, in _get_padding_truncation_strategies
    raise ValueError(
ValueError: Asking to pad but the tokenizer does not have a padding token. Please select a token to use as `pad_token` `(tokenizer.pad_token = tokenizer.eos_token e.g.)` or add a new pad token via `tokenizer.add_special_tokens({'pad_token': '[PAD]'})`.
"""
The above exception was the direct cause of the following exception:
Traceback (most recent call last):
  File "/data/zxd/MedicalGPT/MedicalGPT-1.7.0/pretraining.py", line 767, in <module>
    main()
  File "/data/zxd/MedicalGPT/MedicalGPT-1.7.0/pretraining.py", line 556, in main
    lm_datasets = raw_datasets.map(
  File "/data/anaconda3/envs/medical_gpt/lib/python3.10/site-packages/datasets/dataset_dict.py", line 868, in map
    {
  File "/data/anaconda3/envs/medical_gpt/lib/python3.10/site-packages/datasets/dataset_dict.py", line 869, in <dictcomp>
    k: dataset.map(
  File "/data/anaconda3/envs/medical_gpt/lib/python3.10/site-packages/datasets/arrow_dataset.py", line 592, in wrapper
    out: Union["Dataset", "DatasetDict"] = func(self, *args, **kwargs)
  File "/data/anaconda3/envs/medical_gpt/lib/python3.10/site-packages/datasets/arrow_dataset.py", line 557, in wrapper
    out: Union["Dataset", "DatasetDict"] = func(self, *args, **kwargs)
  File "/data/anaconda3/envs/medical_gpt/lib/python3.10/site-packages/datasets/arrow_dataset.py", line 3185, in map
    for rank, done, content in iflatmap_unordered(
  File "/data/anaconda3/envs/medical_gpt/lib/python3.10/site-packages/datasets/utils/py_utils.py", line 654, in iflatmap_unordered
    [async_result.get(timeout=0.05) for async_result in async_results]
  File "/data/anaconda3/envs/medical_gpt/lib/python3.10/site-packages/datasets/utils/py_utils.py", line 654, in <listcomp>
    [async_result.get(timeout=0.05) for async_result in async_results]
  File "/data/anaconda3/envs/medical_gpt/lib/python3.10/site-packages/multiprocess/pool.py", line 774, in get
    raise self._value
ValueError: Asking to pad but the tokenizer does not have a padding token. Please select a token to use as `pad_token` `(tokenizer.pad_token = tokenizer.eos_token e.g.)` or add a new pad token via `tokenizer.add_special_tokens({'pad_token': '[PAD]'})`.
Running tokenizer on dataset (num_proc=10): 0%| | 0/17 [00:00<?, ? examples/s] [2024-02-02 17:18:31,232] torch.distributed.elastic.multiprocessing.api: [WARNING] Sending process 10026 closing signal SIGTERM
[2024-02-02 17:18:31,296] torch.distributed.elastic.multiprocessing.api: [ERROR] failed (exitcode: 1) local_rank: 0 (pid: 10025) of binary: /data/anaconda3/envs/medical_gpt/bin/python
Traceback (most recent call last):
  File "/data/anaconda3/envs/medical_gpt/bin/torchrun", line 8, in <module>
    sys.exit(main())
  File "/data/anaconda3/envs/medical_gpt/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 346, in wrapper
    return f(*args, **kwargs)
  File "/data/anaconda3/envs/medical_gpt/lib/python3.10/site-packages/torch/distributed/run.py", line 806, in main
    run(args)
  File "/data/anaconda3/envs/medical_gpt/lib/python3.10/site-packages/torch/distributed/run.py", line 797, in run
    elastic_launch(
  File "/data/anaconda3/envs/medical_gpt/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 134, in __call__
    return launch_agent(self._config, self._entrypoint, list(args))
  File "/data/anaconda3/envs/medical_gpt/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 264, in launch_agent
    raise ChildFailedError(
torch.distributed.elastic.multiprocessing.errors.ChildFailedError:
看报错是缺少 pad_token 的问题（可以 `tokenizer.pad_token = tokenizer.eos_token`，或者通过 `tokenizer.add_special_tokens({'pad_token': '[PAD]'})` 新增一个 pad token）。自己设置一个 pad token 就可以，根据自己使用的 base model 改一下。
Describe the Question
Please provide a clear and concise description of what the question is. Running tokenizer on dataset (num_proc=10): 0%| | 0/17 [00:13<?, ? examples/s] multiprocess.pool.RemoteTraceback: """ Traceback (most recent call last): File "/data/anaconda3/envs/medical_gpt/lib/python3.10/site-packages/multiprocess/pool.py", line 125, in worker result = (True, func(*args, **kwds)) File "/data/anaconda3/envs/medical_gpt/lib/python3.10/site-packages/datasets/utils/py_utils.py", line 614, in _write_generator_to_queue for i, result in enumerate(func(**kwargs)): File "/data/anaconda3/envs/medical_gpt/lib/python3.10/site-packages/datasets/arrow_dataset.py", line 3470, in _map_single batch = apply_function_on_filtered_inputs( File "/data/anaconda3/envs/medical_gpt/lib/python3.10/site-packages/datasets/arrow_dataset.py", line 3349, in apply_function_on_filtered_inputs processed_inputs = function(*fn_args, *additional_args, **fn_kwargs) File "/data/zxd/MedicalGPT/MedicalGPT-1.7.0/pretraining.py", line 416, in tokenize_function tokenized_inputs = tokenizer( File "/data/anaconda3/envs/medical_gpt/lib/python3.10/site-packages/transformers/tokenization_utils_base.py", line 2803, in __call__ encodings = self._call_one(text=text, text_pair=text_pair, **all_kwargs) File "/data/anaconda3/envs/medical_gpt/lib/python3.10/site-packages/transformers/tokenization_utils_base.py", line 2889, in _call_one return self.batch_encode_plus( File "/data/anaconda3/envs/medical_gpt/lib/python3.10/site-packages/transformers/tokenization_utils_base.py", line 3071, in batch_encode_plus padding_strategy, truncation_strategy, max_length, kwargs = self._get_padding_truncation_strategies( File "/data/anaconda3/envs/medical_gpt/lib/python3.10/site-packages/transformers/tokenization_utils_base.py", line 2708, in _get_padding_truncation_strategies raise ValueError( ValueError: Asking to pad but the tokenizer does not have a padding token. Please select a token to use as
`pad_token`
`(tokenizer.pad_token = tokenizer.eos_token e.g.)`
or add a new pad token via `tokenizer.add_special_tokens({'pad_token': '[PAD]'})`.
"""
The above exception was the direct cause of the following exception:
Traceback (most recent call last):
  File "/data/zxd/MedicalGPT/MedicalGPT-1.7.0/pretraining.py", line 767, in <module>
    main()
  File "/data/zxd/MedicalGPT/MedicalGPT-1.7.0/pretraining.py", line 556, in main
    lm_datasets = raw_datasets.map(
  File "/data/anaconda3/envs/medical_gpt/lib/python3.10/site-packages/datasets/dataset_dict.py", line 868, in map
    {
  File "/data/anaconda3/envs/medical_gpt/lib/python3.10/site-packages/datasets/dataset_dict.py", line 869, in <dictcomp>
    k: dataset.map(
  File "/data/anaconda3/envs/medical_gpt/lib/python3.10/site-packages/datasets/arrow_dataset.py", line 592, in wrapper
    out: Union["Dataset", "DatasetDict"] = func(self, *args, **kwargs)
  File "/data/anaconda3/envs/medical_gpt/lib/python3.10/site-packages/datasets/arrow_dataset.py", line 557, in wrapper
    out: Union["Dataset", "DatasetDict"] = func(self, *args, **kwargs)
  File "/data/anaconda3/envs/medical_gpt/lib/python3.10/site-packages/datasets/arrow_dataset.py", line 3185, in map
    for rank, done, content in iflatmap_unordered(
  File "/data/anaconda3/envs/medical_gpt/lib/python3.10/site-packages/datasets/utils/py_utils.py", line 654, in iflatmap_unordered
    [async_result.get(timeout=0.05) for async_result in async_results]
  File "/data/anaconda3/envs/medical_gpt/lib/python3.10/site-packages/datasets/utils/py_utils.py", line 654, in <listcomp>
    [async_result.get(timeout=0.05) for async_result in async_results]
  File "/data/anaconda3/envs/medical_gpt/lib/python3.10/site-packages/multiprocess/pool.py", line 774, in get
    raise self._value
ValueError: Asking to pad but the tokenizer does not have a padding token. Please select a token to use as
sys.exit(main())
  File "/data/anaconda3/envs/medical_gpt/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 346, in wrapper
    return f(*args, **kwargs)
File "/data/anaconda3/envs/medical_gpt/lib/python3.10/site-packages/torch/distributed/run.py", line 806, in main
run(args)
File "/data/anaconda3/envs/medical_gpt/lib/python3.10/site-packages/torch/distributed/run.py", line 797, in run
elastic_launch(
  File "/data/anaconda3/envs/medical_gpt/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 134, in __call__
return launch_agent(self._config, self._entrypoint, list(args))
File "/data/anaconda3/envs/medical_gpt/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 264, in launch_agent
raise ChildFailedError(
torch.distributed.elastic.multiprocessing.errors.ChildFailedError:
`pad_token`
`(tokenizer.pad_token = tokenizer.eos_token e.g.)`
or add a new pad token via `tokenizer.add_special_tokens({'pad_token': '[PAD]'})`.
Running tokenizer on dataset (num_proc=10): 0%| | 0/17 [00:00<?, ? examples/s] [2024-02-02 17:18:31,232] torch.distributed.elastic.multiprocessing.api: [WARNING] Sending process 10026 closing signal SIGTERM [2024-02-02 17:18:31,296] torch.distributed.elastic.multiprocessing.api: [ERROR] failed (exitcode: 1) local_rank: 0 (pid: 10025) of binary: /data/anaconda3/envs/medical_gpt/bin/python Traceback (most recent call last): File "/data/anaconda3/envs/medical_gpt/bin/torchrun", line 8, in <module> ... pretraining.py FAILED
Failures: