[X] I have read the README and searched the existing issues.
Reproduction
Traceback (most recent call last):
  File "/mlx_devbox/users/xiao.gao/repo/5012/LLaMA-Efficient-Tuning/src/train_bash.py", line 14, in <module>
    main()
  File "/mlx_devbox/users/xiao.gao/repo/5012/LLaMA-Efficient-Tuning/src/train_bash.py", line 5, in main
    run_exp()
  File "/mlx_devbox/users/xiao.gao/repo/5012/LLaMA-Efficient-Tuning/src/llmtuner/train/tuner.py", line 31, in run_exp
    run_sft(model_args, data_args, training_args, finetuning_args, generating_args, callbacks)
  File "/mlx_devbox/users/xiao.gao/repo/5012/LLaMA-Efficient-Tuning/src/llmtuner/train/sft/workflow.py", line 75, in run_sft
    train_result = trainer.train(resume_from_checkpoint=training_args.resume_from_checkpoint)
  File "/root/.pyenv/versions/train/lib/python3.10/site-packages/transformers/trainer.py", line 1624, in train
    return inner_training_loop(
  File "/root/.pyenv/versions/train/lib/python3.10/site-packages/transformers/trainer.py", line 1961, in _inner_training_loop
    tr_loss_step = self.training_step(model, inputs)
  File "/root/.pyenv/versions/train/lib/python3.10/site-packages/transformers/trainer.py", line 2902, in training_step
    loss = self.compute_loss(model, inputs)
  File "/root/.pyenv/versions/train/lib/python3.10/site-packages/transformers/trainer.py", line 2925, in compute_loss
    outputs = model(**inputs)
  File "/root/.pyenv/versions/train/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1501, in _call_impl
    return forward_call(*args, **kwargs)
  File "/root/.pyenv/versions/train/lib/python3.10/site-packages/accelerate/utils/operations.py", line 817, in forward
    return model_forward(*args, **kwargs)
  File "/root/.pyenv/versions/train/lib/python3.10/site-packages/accelerate/utils/operations.py", line 805, in __call__
    return convert_to_fp32(self.model_forward(*args, **kwargs))
  File "/root/.pyenv/versions/train/lib/python3.10/site-packages/torch/amp/autocast_mode.py", line 14, in decorate_autocast
    return func(*args, **kwargs)
  File "/root/.pyenv/versions/train/lib/python3.10/site-packages/peft/peft_model.py", line 1083, in forward
    return self.base_model(
  File "/root/.pyenv/versions/train/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1501, in _call_impl
    return forward_call(*args, **kwargs)
  File "/root/.pyenv/versions/train/lib/python3.10/site-packages/peft/tuners/tuners_utils.py", line 161, in forward
    return self.model.forward(*args, **kwargs)
  File "/root/.pyenv/versions/train/lib/python3.10/site-packages/transformers/models/gemma/modeling_gemma.py", line 1067, in forward
    outputs = self.model(
  File "/root/.pyenv/versions/train/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1501, in _call_impl
    return forward_call(*args, **kwargs)
  File "/root/.pyenv/versions/train/lib/python3.10/site-packages/transformers/models/gemma/modeling_gemma.py", line 894, in forward
    layer_outputs = self._gradient_checkpointing_func(
  File "/root/.pyenv/versions/train/lib/python3.10/site-packages/torch/utils/checkpoint.py", line 249, in checkpoint
    return CheckpointFunction.apply(function, preserve, *args)
  File "/root/.pyenv/versions/train/lib/python3.10/site-packages/torch/autograd/function.py", line 506, in apply
    return super().apply(*args, **kwargs)  # type: ignore[misc]
  File "/root/.pyenv/versions/train/lib/python3.10/site-packages/torch/utils/checkpoint.py", line 107, in forward
    outputs = run_function(*args)
  File "/root/.pyenv/versions/train/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1501, in _call_impl
    return forward_call(*args, **kwargs)
  File "/root/.pyenv/versions/train/lib/python3.10/site-packages/transformers/models/gemma/modeling_gemma.py", line 625, in forward
    hidden_states, self_attn_weights, present_key_value = self.self_attn(
  File "/root/.pyenv/versions/train/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1501, in _call_impl
    return forward_call(*args, **kwargs)
  File "/root/.pyenv/versions/train/lib/python3.10/site-packages/transformers/models/gemma/modeling_gemma.py", line 280, in forward
    attn_output = attn_output.reshape(bsz, q_len, self.hidden_size)
RuntimeError: shape '[8, 768, 3072]' is invalid for input of size 25165824
Reminder
Reproduction
Traceback (most recent call last):
  File "/mlx_devbox/users/xiao.gao/repo/5012/LLaMA-Efficient-Tuning/src/train_bash.py", line 14, in <module>
    main()
  File "/mlx_devbox/users/xiao.gao/repo/5012/LLaMA-Efficient-Tuning/src/train_bash.py", line 5, in main
    run_exp()
  File "/mlx_devbox/users/xiao.gao/repo/5012/LLaMA-Efficient-Tuning/src/llmtuner/train/tuner.py", line 31, in run_exp
    run_sft(model_args, data_args, training_args, finetuning_args, generating_args, callbacks)
  File "/mlx_devbox/users/xiao.gao/repo/5012/LLaMA-Efficient-Tuning/src/llmtuner/train/sft/workflow.py", line 75, in run_sft
    train_result = trainer.train(resume_from_checkpoint=training_args.resume_from_checkpoint)
  File "/root/.pyenv/versions/train/lib/python3.10/site-packages/transformers/trainer.py", line 1624, in train
    return inner_training_loop(
  File "/root/.pyenv/versions/train/lib/python3.10/site-packages/transformers/trainer.py", line 1961, in _inner_training_loop
    tr_loss_step = self.training_step(model, inputs)
  File "/root/.pyenv/versions/train/lib/python3.10/site-packages/transformers/trainer.py", line 2902, in training_step
    loss = self.compute_loss(model, inputs)
  File "/root/.pyenv/versions/train/lib/python3.10/site-packages/transformers/trainer.py", line 2925, in compute_loss
    outputs = model(**inputs)
  File "/root/.pyenv/versions/train/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1501, in _call_impl
    return forward_call(*args, **kwargs)
  File "/root/.pyenv/versions/train/lib/python3.10/site-packages/accelerate/utils/operations.py", line 817, in forward
    return model_forward(*args, **kwargs)
  File "/root/.pyenv/versions/train/lib/python3.10/site-packages/accelerate/utils/operations.py", line 805, in __call__
    return convert_to_fp32(self.model_forward(*args, **kwargs))
  File "/root/.pyenv/versions/train/lib/python3.10/site-packages/torch/amp/autocast_mode.py", line 14, in decorate_autocast
    return func(*args, **kwargs)
  File "/root/.pyenv/versions/train/lib/python3.10/site-packages/peft/peft_model.py", line 1083, in forward
    return self.base_model(
  File "/root/.pyenv/versions/train/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1501, in _call_impl
    return forward_call(*args, **kwargs)
  File "/root/.pyenv/versions/train/lib/python3.10/site-packages/peft/tuners/tuners_utils.py", line 161, in forward
    return self.model.forward(*args, **kwargs)
  File "/root/.pyenv/versions/train/lib/python3.10/site-packages/transformers/models/gemma/modeling_gemma.py", line 1067, in forward
    outputs = self.model(
  File "/root/.pyenv/versions/train/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1501, in _call_impl
    return forward_call(*args, **kwargs)
  File "/root/.pyenv/versions/train/lib/python3.10/site-packages/transformers/models/gemma/modeling_gemma.py", line 894, in forward
    layer_outputs = self._gradient_checkpointing_func(
  File "/root/.pyenv/versions/train/lib/python3.10/site-packages/torch/utils/checkpoint.py", line 249, in checkpoint
    return CheckpointFunction.apply(function, preserve, *args)
  File "/root/.pyenv/versions/train/lib/python3.10/site-packages/torch/autograd/function.py", line 506, in apply
    return super().apply(*args, **kwargs)  # type: ignore[misc]
  File "/root/.pyenv/versions/train/lib/python3.10/site-packages/torch/utils/checkpoint.py", line 107, in forward
    outputs = run_function(*args)
  File "/root/.pyenv/versions/train/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1501, in _call_impl
    return forward_call(*args, **kwargs)
  File "/root/.pyenv/versions/train/lib/python3.10/site-packages/transformers/models/gemma/modeling_gemma.py", line 625, in forward
    hidden_states, self_attn_weights, present_key_value = self.self_attn(
  File "/root/.pyenv/versions/train/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1501, in _call_impl
    return forward_call(*args, **kwargs)
  File "/root/.pyenv/versions/train/lib/python3.10/site-packages/transformers/models/gemma/modeling_gemma.py", line 280, in forward
    attn_output = attn_output.reshape(bsz, q_len, self.hidden_size)
RuntimeError: shape '[8, 768, 3072]' is invalid for input of size 25165824
Expected behavior
No response
System Info
No response
Others
No response