Hello, after I integrated Adam-mini into the trainer, training with DeepSpeed runs out of GPU memory. The loading code is as follows:
The training script is as follows:
The ds_config is as follows:
The error message is as follows:

Traceback (most recent call last):
File "src/train_bash.py", line 14, in <module>
main()
File "src/train_bash.py", line 5, in main
run_exp()
File "/data/qiucehao/project/code/huarong_code/adam-mini-code/LLaMA-Factory/src/llmtuner/train/tuner.py", line 32, in run_exp
run_sft(model_args, data_args, training_args, finetuning_args, generating_args, callbacks)
File "/data/qiucehao/project/code/huarong_code/adam-mini-code/LLaMA-Factory/src/llmtuner/train/sft/workflow.py", line 73, in run_sft
train_result = trainer.train(resume_from_checkpoint=training_args.resume_from_checkpoint)
File "/data/qiucehao/project/venv38/lib/python3.8/site-packages/transformers/trainer.py", line 1780, in train
return inner_training_loop(
File "/data/qiucehao/project/venv38/lib/python3.8/site-packages/transformers/trainer.py", line 2118, in _inner_training_loop
tr_loss_step = self.training_step(model, inputs)
File "/data/qiucehao/project/venv38/lib/python3.8/site-packages/transformers/trainer.py", line 3045, in training_step
self.accelerator.backward(loss)
File "/data/qiucehao/project/venv38/lib/python3.8/site-packages/accelerate/accelerator.py", line 2121, in backward
self.deepspeed_engine_wrapped.backward(loss, **kwargs)
File "/data/qiucehao/project/venv38/lib/python3.8/site-packages/accelerate/utils/deepspeed.py", line 175, in backward
self.engine.step()
File "/data/qiucehao/project/venv38/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2181, in step
self._take_model_step(lr_kwargs)
File "/data/qiucehao/project/venv38/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2087, in _take_model_step
self.optimizer.step()
File "/data/qiucehao/project/venv38/lib/python3.8/site-packages/deepspeed/runtime/zero/stage_1_and_2.py", line 1920, in step
self._optimizer_step(i)
File "/data/qiucehao/project/venv38/lib/python3.8/site-packages/deepspeed/runtime/zero/stage_1_and_2.py", line 1827, in _optimizer_step
self.optimizer.step()
File "/data/qiucehao/project/venv38/lib/python3.8/site-packages/torch/optim/optimizer.py", line 373, in wrapper
out = func(*args, **kwargs)
File "/data/qiucehao/project/code/huarong_code/adam-mini-code/LLaMA-Factory/src/llmtuner/Adam_mini/Adam_mini_transformer.py", line 200, in step
h = (state["v"].sqrt() / bias_correction_2_sqrt).add_(eps)
torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 8.80 GiB. GPU 2 has a total capacty of 79.33 GiB of which 423.81 MiB is free. Including non-PyTorch memory, this process has 78.90 GiB memory in use. Of the allocated memory 75.56 GiB is allocated by PyTorch, and 1.72 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting max_split_size_mb to avoid fragmentation. See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF
0%| | 0/2157 [00:24<?, ?it/s]
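
From the traceback, the allocation that fails is the temporary built for `h`: `state["v"].sqrt()` materializes a fresh tensor as large as the flat fp32 state itself, and the division by `bias_correction_2_sqrt` allocates another before `.add_(eps)` runs in place, which would match the ~8.8 GiB request. As a rough sketch of what I mean (the function name `chunked_adam_update`, `chunk_numel`, and the argument names below are my own illustration, not Adam-mini's actual code), slicing the update over the flat partition would cap the temporary at one chunk:

```python
import torch

# Illustrative sketch only, assuming p, m, v are contiguous flat fp32
# tensors (as ZeRO stage 1/2 partitions are). Instead of building a
# partition-sized h = (v.sqrt() / bias_correction_2_sqrt).add_(eps),
# process the update in slices so the peak extra memory is one chunk.
def chunked_adam_update(p: torch.Tensor, m: torch.Tensor, v: torch.Tensor,
                        step_size: float, bias_correction_2_sqrt: float,
                        eps: float, chunk_numel: int = 64 * 1024 * 1024):
    p_flat, m_flat, v_flat = p.view(-1), m.view(-1), v.view(-1)
    for start in range(0, p_flat.numel(), chunk_numel):
        sl = slice(start, min(start + chunk_numel, p_flat.numel()))
        # Small, short-lived temporary instead of a full-size `h`.
        h = (v_flat[sl].sqrt() / bias_correction_2_sqrt).add_(eps)
        # In-place Adam-style update through the view: p -= step_size * m / h.
        p_flat[sl].addcdiv_(m_flat[sl], h, value=-step_size)
```

Setting `PYTORCH_CUDA_ALLOC_CONF=max_split_size_mb:...`, as the error message itself suggests, might also help if fragmentation is part of the problem, but I would expect only a chunked (or preallocated-buffer) update to remove the 8.8 GiB spike itself.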