Map (num_proc=16): 94%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌ | 5470/5834 [00:01<00:00, 5084.06 examples/s]
RemoteTraceback:
"""
Traceback (most recent call last):
File "/home/user/anaconda3/envs/ChatGML3-6b_finetunning_test/lib/python3.11/site-packages/multiprocess/pool.py", line 125, in worker
result = (True, func(args, kwds))
^^^^^^^^^^^^^^^^^^^
File "/home/user/anaconda3/envs/ChatGML3-6b_finetunning_test/lib/python3.11/site-packages/datasets/utils/py_utils.py", line 625, in _write_generator_to_queue
for i, result in enumerate(func(kwargs)):
File "/home/user/anaconda3/envs/ChatGML3-6b_finetunning_test/lib/python3.11/site-packages/datasets/arrow_dataset.py", line 3482, in _map_single
batch = apply_function_on_filtered_inputs(
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/user/anaconda3/envs/ChatGML3-6b_finetunning_test/lib/python3.11/site-packages/datasets/arrow_dataset.py", line 3361, in apply_function_on_filtered_inputs
processed_inputs = function(fn_args, *additional_args, **fn_kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/chatglm3/chatglm3_6b/finetune_demo/finetune_hf.py", line 296, in process_batch
new_input_ids = tokenizer.build_single_message(
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/root/.cache/huggingface/modules/transformers_modules/chatglm3-6b/tokenization_chatglm.py", line 225, in build_single_message
message_tokens = self.tokenizer.encode(message)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/root/.cache/huggingface/modules/transformers_modules/chatglm3-6b/tokenization_chatglm.py", line 53, in encode
assert type(s) is str
^^^^^^^^^^^^^^
AssertionError
"""
The above exception was the direct cause of the following exception:
╭─────────────────────────────── Traceback (most recent call last) ────────────────────────────────╮
│ /home/chatglm3/chatglm3_6b/finetune_demo/finetune_hf.py:450 in main │
│ │
│ 447 │ tokenizer, model = load_tokenizer_and_model(model_dir, peft_config=ft_config.peft_co │
│ 448 │ data_manager = DataManager(data_dir, ft_config.data_config) │
│ 449 │ │
│ ❱ 450 │ train_dataset = data_manager.get_dataset( │
│ 451 │ │ Split.TRAIN, │
│ 452 │ │ functools.partial( │
│ 453 │ │ │ process_batch, │
│ │
│ /home/chatglm3/chatglm3_6b/finetune_demo/finetune_hf.py:250 in get_dataset │
│ │
│ 247 │ │ │ remove_columns = orig_dataset.column_names │
│ 248 │ │ else: │
│ 249 │ │ │ remove_columns = None │
│ ❱ 250 │ │ return orig_dataset.map( │
│ 251 │ │ │ process_fn, │
│ 252 │ │ │ batched=batched, │
│ 253 │ │ │ remove_columns=remove_columns, │
│ │
│ /home/user/anaconda3/envs/ChatGML3-6b_finetunning_test/lib/python3.11/site-packages/datasets/arr │
│ ow_dataset.py:593 in wrapper │
│ │
│ 590 │ │ else: │
│ 591 │ │ │ self: "Dataset" = kwargs.pop("self") │
│ 592 │ │ # apply actual function │
│ ❱ 593 │ │ out: Union["Dataset", "DatasetDict"] = func(self, *args, *kwargs) │
│ 594 │ │ datasets: List["Dataset"] = list(out.values()) if isinstance(out, dict) else [ou │
│ 595 │ │ for dataset in datasets: │
│ 596 │ │ │ # Remove task templates if a column mapping of the template is no longer val │
│ │
│ /home/user/anaconda3/envs/ChatGML3-6b_finetunning_test/lib/python3.11/site-packages/datasets/arr │
│ ow_dataset.py:558 in wrapper │
│ │
│ 555 │ │ │ "output_all_columns": self._output_all_columns, │
│ 556 │ │ } │
│ 557 │ │ # apply actual function │
│ ❱ 558 │ │ out: Union["Dataset", "DatasetDict"] = func(self, args, **kwargs) │
│ 559 │ │ datasets: List["Dataset"] = list(out.values()) if isinstance(out, dict) else [ou │
│ 560 │ │ # re-apply format to the output │
│ 561 │ │ for dataset in datasets: │
│ │
│ /home/user/anaconda3/envs/ChatGML3-6b_finetunning_test/lib/python3.11/site-packages/datasets/arr │
│ ow_dataset.py:3197 in map │
│ │
│ 3194 │ │ │ │ │ │ total=pbar_total, │
│ 3195 │ │ │ │ │ │ desc=(desc or "Map") + f" (num_proc={num_proc})", │
│ 3196 │ │ │ │ │ ) as pbar: │
│ ❱ 3197 │ │ │ │ │ │ for rank, done, content in iflatmap_unordered( │
│ 3198 │ │ │ │ │ │ │ pool, Dataset._map_single, kwargs_iterable=kwargs_per_job │
│ 3199 │ │ │ │ │ │ ): │
│ 3200 │ │ │ │ │ │ │ if done: │
│ │
│ /home/user/anaconda3/envs/ChatGML3-6b_finetunning_test/lib/python3.11/site-packages/datasets/uti │
│ ls/py_utils.py:665 in iflatmap_unordered │
│ │
│ 662 │ │ finally: │
│ 663 │ │ │ if not pool_changed: │
│ 664 │ │ │ │ # we get the result in case there's an error to raise │
│ ❱ 665 │ │ │ │ [async_result.get(timeout=0.05) for async_result in async_results] │
│ 666 │
│ │
│ /home/user/anaconda3/envs/ChatGML3-6b_finetunning_test/lib/python3.11/site-packages/datasets/uti │
│ ls/py_utils.py:665 in │
│ │
│ 662 │ │ finally: │
│ 663 │ │ │ if not pool_changed: │
│ 664 │ │ │ │ # we get the result in case there's an error to raise │
│ ❱ 665 │ │ │ │ [async_result.get(timeout=0.05) for async_result in async_results] │
│ 666 │
│ │
│ /home/user/anaconda3/envs/ChatGML3-6b_finetunning_test/lib/python3.11/site-packages/multiprocess │
│ /pool.py:774 in get │
│ │
│ 771 │ │ if self._success: │
│ 772 │ │ │ return self._value │
│ 773 │ │ else: │
│ ❱ 774 │ │ │ raise self._value │
│ 775 │ │
│ 776 │ def _set(self, i, obj): │
│ 777 │ │ self._success, self._value = obj │
╰──────────────────────────────────────────────────────────────────────────────────────────────────╯
AssertionError
System Info / 系統信息
centos7.9
Who can help? / 谁可以帮助到您?
No response
Information / 问题信息
Reproduction / 复现过程
Map (num_proc=16): 94%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌ | 5470/5834 [00:01<00:00, 5084.06 examples/s] RemoteTraceback: """ Traceback (most recent call last): File "/home/user/anaconda3/envs/ChatGML3-6b_finetunning_test/lib/python3.11/site-packages/multiprocess/pool.py", line 125, in worker result = (True, func(args, kwds)) ^^^^^^^^^^^^^^^^^^^ File "/home/user/anaconda3/envs/ChatGML3-6b_finetunning_test/lib/python3.11/site-packages/datasets/utils/py_utils.py", line 625, in _write_generator_to_queue for i, result in enumerate(func(kwargs)): File "/home/user/anaconda3/envs/ChatGML3-6b_finetunning_test/lib/python3.11/site-packages/datasets/arrow_dataset.py", line 3482, in _map_single batch = apply_function_on_filtered_inputs( ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ File "/home/user/anaconda3/envs/ChatGML3-6b_finetunning_test/lib/python3.11/site-packages/datasets/arrow_dataset.py", line 3361, in apply_function_on_filtered_inputs processed_inputs = function(fn_args, *additional_args, **fn_kwargs) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ File "/home/chatglm3/chatglm3_6b/finetune_demo/finetune_hf.py", line 296, in process_batch new_input_ids = tokenizer.build_single_message( ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ File "/root/.cache/huggingface/modules/transformers_modules/chatglm3-6b/tokenization_chatglm.py", line 225, in build_single_message message_tokens = self.tokenizer.encode(message) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ File "/root/.cache/huggingface/modules/transformers_modules/chatglm3-6b/tokenization_chatglm.py", line 53, in encode assert type(s) is str ^^^^^^^^^^^^^^ AssertionError """
The above exception was the direct cause of the following exception:
╭─────────────────────────────── Traceback (most recent call last) ────────────────────────────────╮ │ /home/chatglm3/chatglm3_6b/finetune_demo/finetune_hf.py:450 in main │ │ │ │ 447 │ tokenizer, model = load_tokenizer_and_model(model_dir, peft_config=ft_config.peft_co │ │ 448 │ data_manager = DataManager(data_dir, ft_config.data_config) │ │ 449 │ │ │ ❱ 450 │ train_dataset = data_manager.get_dataset( │ │ 451 │ │ Split.TRAIN, │ │ 452 │ │ functools.partial( │ │ 453 │ │ │ process_batch, │ │ │ │ /home/chatglm3/chatglm3_6b/finetune_demo/finetune_hf.py:250 in get_dataset │ │ │ │ 247 │ │ │ remove_columns = orig_dataset.column_names │ │ 248 │ │ else: │ │ 249 │ │ │ remove_columns = None │ │ ❱ 250 │ │ return orig_dataset.map( │ │ 251 │ │ │ process_fn, │ │ 252 │ │ │ batched=batched, │ │ 253 │ │ │ remove_columns=remove_columns, │ │ │ │ /home/user/anaconda3/envs/ChatGML3-6b_finetunning_test/lib/python3.11/site-packages/datasets/arr │ │ ow_dataset.py:593 in wrapper │ │ │ │ 590 │ │ else: │ │ 591 │ │ │ self: "Dataset" = kwargs.pop("self") │ │ 592 │ │ # apply actual function │ │ ❱ 593 │ │ out: Union["Dataset", "DatasetDict"] = func(self, *args, *kwargs) │ │ 594 │ │ datasets: List["Dataset"] = list(out.values()) if isinstance(out, dict) else [ou │ │ 595 │ │ for dataset in datasets: │ │ 596 │ │ │ # Remove task templates if a column mapping of the template is no longer val │ │ │ │ /home/user/anaconda3/envs/ChatGML3-6b_finetunning_test/lib/python3.11/site-packages/datasets/arr │ │ ow_dataset.py:558 in wrapper │ │ │ │ 555 │ │ │ "output_all_columns": self._output_all_columns, │ │ 556 │ │ } │ │ 557 │ │ # apply actual function │ │ ❱ 558 │ │ out: Union["Dataset", "DatasetDict"] = func(self, args, **kwargs) │ │ 559 │ │ datasets: List["Dataset"] = list(out.values()) if isinstance(out, dict) else [ou │ │ 560 │ │ # re-apply format to the output │ │ 561 │ │ for dataset in datasets: │ │ │ │ /home/user/anaconda3/envs/ChatGML3-6b_finetunning_test/lib/python3.11/site-packages/datasets/arr │ │ 
ow_dataset.py:3197 in map │ │ │ │ 3194 │ │ │ │ │ │ total=pbar_total, │ │ 3195 │ │ │ │ │ │ desc=(desc or "Map") + f" (num_proc={num_proc})", │ │ 3196 │ │ │ │ │ ) as pbar: │ │ ❱ 3197 │ │ │ │ │ │ for rank, done, content in iflatmap_unordered( │ │ 3198 │ │ │ │ │ │ │ pool, Dataset._map_single, kwargs_iterable=kwargs_per_job │ │ 3199 │ │ │ │ │ │ ): │ │ 3200 │ │ │ │ │ │ │ if done: │ │ │ │ /home/user/anaconda3/envs/ChatGML3-6b_finetunning_test/lib/python3.11/site-packages/datasets/uti │ │ ls/py_utils.py:665 in iflatmap_unordered │ │ │ │ 662 │ │ finally: │ │ 663 │ │ │ if not pool_changed: │ │ 664 │ │ │ │ # we get the result in case there's an error to raise │ │ ❱ 665 │ │ │ │ [async_result.get(timeout=0.05) for async_result in async_results] │ │ 666 │ │ │ │ /home/user/anaconda3/envs/ChatGML3-6b_finetunning_test/lib/python3.11/site-packages/datasets/uti │ │ ls/py_utils.py:665 in │
│ │
│ 662 │ │ finally: │
│ 663 │ │ │ if not pool_changed: │
│ 664 │ │ │ │ # we get the result in case there's an error to raise │
│ ❱ 665 │ │ │ │ [async_result.get(timeout=0.05) for async_result in async_results] │
│ 666 │
│ │
│ /home/user/anaconda3/envs/ChatGML3-6b_finetunning_test/lib/python3.11/site-packages/multiprocess │
│ /pool.py:774 in get │
│ │
│ 771 │ │ if self._success: │
│ 772 │ │ │ return self._value │
│ 773 │ │ else: │
│ ❱ 774 │ │ │ raise self._value │
│ 775 │ │
│ 776 │ def _set(self, i, obj): │
│ 777 │ │ self._success, self._value = obj │
╰──────────────────────────────────────────────────────────────────────────────────────────────────╯
AssertionError
Expected behavior / 期待表现
微调数据条数5000+时报如上错误,用少量数据时是没有问题的,请问是跟如下lora参数相关吗? lora参数: