Traceback (most recent call last):
File "/workspace/LlamaFactory/src/llamafactory/launcher.py", line 23, in <module>
launch()
File "/workspace/LlamaFactory/src/llamafactory/launcher.py", line 19, in launch
run_exp()
File "/workspace/LlamaFactory/src/llamafactory/train/tuner.py", line 54, in run_exp
run_ppo(model_args, data_args, training_args, finetuning_args, generating_args, callbacks)
File "/workspace/LlamaFactory/src/llamafactory/train/ppo/workflow.py", line 73, in run_ppo
ppo_trainer.ppo_train(resume_from_checkpoint=training_args.resume_from_checkpoint)
File "/workspace/LlamaFactory/src/llamafactory/train/ppo/trainer.py", line 250, in ppo_train
mini_batch_queries, mini_batch_responses = self.get_inputs(
File "/data/miniconda3/envs/env-3.9.16/lib/python3.9/site-packages/torch/utils/_contextlib.py", line 115, in decorate_context
return func(*args, **kwargs)
File "/workspace/LlamaFactory/src/llamafactory/train/ppo/trainer.py", line 354, in get_inputs
generate_output: "torch.Tensor" = unwrapped_model.generate(
File "/data/miniconda3/envs/env-3.9.16/lib/python3.9/site-packages/trl/models/modeling_value_head.py", line 209, in generate
return self.pretrained_model.generate(*args, **kwargs)
File "/data/miniconda3/envs/env-3.9.16/lib/python3.9/site-packages/peft/peft_model.py", line 1638, in generate
outputs = self.base_model.generate(*args, **kwargs)
File "/data/miniconda3/envs/env-3.9.16/lib/python3.9/site-packages/torch/utils/_contextlib.py", line 115, in decorate_context
return func(*args, **kwargs)
File "/data/miniconda3/envs/env-3.9.16/lib/python3.9/site-packages/transformers/generation/utils.py", line 1989, in generate
result = self._sample(
File "/data/miniconda3/envs/env-3.9.16/lib/python3.9/site-packages/transformers/generation/utils.py", line 2932, in _sample
outputs = self(**model_inputs, return_dict=True)
File "/data/miniconda3/envs/env-3.9.16/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1518, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
File "/data/miniconda3/envs/env-3.9.16/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1527, in _call_impl
return forward_call(*args, **kwargs)
File "/data/miniconda3/envs/env-3.9.16/lib/python3.9/site-packages/transformers/models/llama/modeling_llama.py", line 1145, in forward
outputs = self.model(
File "/data/miniconda3/envs/env-3.9.16/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1518, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
File "/data/miniconda3/envs/env-3.9.16/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1527, in _call_impl
return forward_call(*args, **kwargs)
File "/data/miniconda3/envs/env-3.9.16/lib/python3.9/site-packages/transformers/models/llama/modeling_llama.py", line 948, in forward
layer_outputs = decoder_layer(
File "/data/miniconda3/envs/env-3.9.16/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1518, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
File "/data/miniconda3/envs/env-3.9.16/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1527, in _call_impl
return forward_call(*args, **kwargs)
File "/data/miniconda3/envs/env-3.9.16/lib/python3.9/site-packages/transformers/models/llama/modeling_llama.py", line 681, in forward
hidden_states, self_attn_weights, present_key_value = self.self_attn(
File "/data/miniconda3/envs/env-3.9.16/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1518, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
File "/data/miniconda3/envs/env-3.9.16/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1527, in _call_impl
return forward_call(*args, **kwargs)
File "/data/miniconda3/envs/env-3.9.16/lib/python3.9/site-packages/transformers/models/llama/modeling_llama.py", line 372, in forward
attn_weights = attn_weights + causal_mask
RuntimeError: The size of tensor a (139) must match the size of tensor b (138) at non-singleton dimension 3
Reminder
System Info
llamafactory
version: 0.8.4.dev0

Reproduction
Model: Llama 3 8B. PPO fine-tuning with the following configuration on 2x Ascend 910B:
The error raised is shown above (traceback at the top of this report):
Switching from ZeRO-3 to ZeRO-2 or ZeRO-0, or disabling DeepSpeed entirely, eliminates the error and fine-tuning runs normally. The same configuration also runs without error on 2x A100, so the failure appears specific to ZeRO-3 on Ascend 910B.
Expected behavior
No response
Others
No response