huyiwen opened this issue 3 months ago
Traceback (most recent call last):
  File "train.py", line 395, in <module>
    train()
  File "train.py", line 389, in train
    trainer.train(resume_from_checkpoint=training_args.resume_from_checkpoint)
  File ".../site-packages/transformers/trainer.py", line 1938, in train
    return inner_training_loop(
  File ".../site-packages/transformers/trainer.py", line 2279, in _inner_training_loop
    tr_loss_step = self.training_step(model, inputs)
  File ".../site-packages/transformers/trainer.py", line 3318, in training_step
    loss = self.compute_loss(model, inputs)
  File ".../site-packages/transformers/trainer.py", line 3363, in compute_loss
    outputs = model(**inputs)
  File ".../site-packages/torch/nn/modules/module.py", line 1518, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
  File ".../site-packages/torch/nn/modules/module.py", line 1527, in _call_impl
    return forward_call(*args, **kwargs)
  File ".../site-packages/deepspeed/utils/nvtx.py", line 15, in wrapped_fn
    ret_val = func(*args, **kwargs)
  File ".../site-packages/deepspeed/runtime/engine.py", line 1822, in forward
    loss = self.module(*inputs, **kwargs)
  File ".../site-packages/torch/nn/modules/module.py", line 1518, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
  File ".../site-packages/torch/nn/modules/module.py", line 1527, in _call_impl
    return forward_call(*args, **kwargs)
  File "model/modeling_miniyulan.py", line 1255, in forward
    outputs = self.model(
  File ".../site-packages/torch/nn/modules/module.py", line 1518, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
  File ".../site-packages/torch/nn/modules/module.py", line 1527, in _call_impl
    return forward_call(*args, **kwargs)
  File "model/modeling_miniyulan.py", line 1057, in forward
    layer_outputs = self._gradient_checkpointing_func(
  File ".../site-packages/deepspeed/runtime/activation_checkpointing/checkpointing.py", line 985, in checkpoint
    CheckpointFunction.apply(function, all_outputs, *args)
  File ".../site-packages/torch/autograd/function.py", line 539, in apply
    return super().apply(*args, **kwargs) # type: ignore[misc]
  File ".../site-packages/deepspeed/runtime/activation_checkpointing/checkpointing.py", line 562, in forward
    outputs = run_function(*inputs_cuda)
  File ".../site-packages/torch/nn/modules/module.py", line 1518, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
  File ".../site-packages/torch/nn/modules/module.py", line 1527, in _call_impl
    return forward_call(*args, **kwargs)
  File "model/modeling_miniyulan.py", line 801, in forward
    hidden_states, self_attn_weights, present_key_value = self.self_attn(
  File ".../site-packages/torch/nn/modules/module.py", line 1518, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
  File ".../site-packages/torch/nn/modules/module.py", line 1527, in _call_impl
    return forward_call(*args, **kwargs)
  File "model/modeling_miniyulan.py", line 517, in forward
    bsz, q_len, _ = hidden_states.size()
ValueError: not enough values to unpack (expected 3, got 2)
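
For context: the ValueError at the bottom just means that hidden_states reached the attention forward as a 2-D tensor, while modeling_miniyulan.py expects the usual (batch, seq_len, hidden_size) shape, so the three-way unpack fails. A minimal illustration of that failure (the shapes here are made up):

    import torch

    hidden_states = torch.randn(4, 128, 1024)   # (batch, seq_len, hidden_size)
    bsz, q_len, _ = hidden_states.size()        # fine: three dimensions to unpack

    flattened = hidden_states.view(-1, 1024)    # 2-D, e.g. (batch * seq_len, hidden_size)
    bsz, q_len, _ = flattened.size()            # ValueError: not enough values to unpack (expected 3, got 2)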
cc @muellerzr and @SunMarc
I'm interested in trying out partition_activations from DeepSpeed. Has anybody found a way to use it with Transformers?
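
For reference, this is roughly how I understand partition_activations is meant to be enabled. The call follows the parameters of deepspeed.checkpointing.configure, and the commented JSON shows the equivalent "activation_checkpointing" block of the DeepSpeed config file; I haven't verified this end to end with Trainer (which is exactly the question here), and mpu_=None assumes no model parallelism:

    import deepspeed

    # Configure DeepSpeed's activation checkpointing directly. mpu_ is None on
    # the assumption that no model/tensor parallelism is involved.
    deepspeed.checkpointing.configure(
        None,                          # mpu_
        partition_activations=True,    # shard checkpointed activations across ranks
        contiguous_checkpointing=False,
        checkpoint_in_cpu=False,
    )

    # Equivalent block in the JSON config passed via --deepspeed:
    # "activation_checkpointing": {
    #     "partition_activations": true,
    #     "contiguous_memory_optimization": false,
    #     "cpu_checkpointing": false
    # }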
Has anyone found a solution to this problem?
Feature request
Is there a tutorial for using DeepSpeed's activation checkpointing instead of PyTorch's?
I'm using Trainer with ZeRO integration to train my model, and with FlashAttention2 I get the error shown in the traceback at the top of this issue. Here's my code, reduced to a sketch of the relevant part:
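
(This is a simplified sketch, not my exact script; the model name is a placeholder. The key step is pointing Transformers' private _gradient_checkpointing_func hook at DeepSpeed's checkpoint function, which is the code path visible in the traceback.)

    import deepspeed
    from transformers import AutoModelForCausalLM

    # Placeholder model name; the real model is a custom miniyulan architecture.
    model = AutoModelForCausalLM.from_pretrained(
        "my-org/my-model",
        attn_implementation="flash_attention_2",
    )

    # Turn on HF gradient checkpointing, then point the checkpoint hook that
    # Transformers stores on each checkpointable module at DeepSpeed's
    # implementation instead of torch.utils.checkpoint. Note that
    # _gradient_checkpointing_func is a private Transformers attribute.
    model.gradient_checkpointing_enable()
    for module in model.modules():
        if hasattr(module, "_gradient_checkpointing_func"):
            module._gradient_checkpointing_func = deepspeed.checkpointing.checkpoint

    # Training then runs through Trainer with a ZeRO DeepSpeed config as usual.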
Motivation
There doesn't seem to be such a tutorial at the moment in either the DeepSpeed tutorials or the Hugging Face documentation.
Your contribution
I can provide my results.