File "/home/ps/miniconda3/envs/baichuan2/lib/python3.10/site-packages/transformers/trainer.py", line 1835, in _inner_training_loop
tr_loss_step = self.training_step(model, inputs)
File "/home/ps/miniconda3/envs/baichuan2/lib/python3.10/site-packages/transformers/trainer.py", line 2690, in training_step
self.accelerator.backward(loss)
File "/home/ps/miniconda3/envs/baichuan2/lib/python3.10/site-packages/accelerate/accelerator.py", line 1899, in backward
self.deepspeed_engine_wrapped.backward(loss, **kwargs)
File "/home/ps/miniconda3/envs/baichuan2/lib/python3.10/site-packages/accelerate/utils/deepspeed.py", line 176, in backward
self.engine.step()
File "/home/ps/miniconda3/envs/baichuan2/lib/python3.10/site-packages/deepspeed/runtime/engine.py", line 2144, in step
self._take_model_step(lr_kwargs)
File "/home/ps/miniconda3/envs/baichuan2/lib/python3.10/site-packages/deepspeed/runtime/engine.py", line 2050, in _take_model_step
self.optimizer.step()
File "/home/ps/miniconda3/envs/baichuan2/lib/python3.10/site-packages/deepspeed/utils/nvtx.py", line 15, in wrapped_fn
ret_val = func(*args, **kwargs)
File "/home/ps/miniconda3/envs/baichuan2/lib/python3.10/site-packages/deepspeed/runtime/zero/stage3.py", line 1972, in step
self._optimizer_step(sub_group_id)
File "/home/ps/miniconda3/envs/baichuan2/lib/python3.10/site-packages/deepspeed/runtime/zero/stage3.py", line 934, in _optimizer_step
self.optimizer.step()
File "/home/ps/miniconda3/envs/baichuan2/lib/python3.10/site-packages/torch/optim/lr_scheduler.py", line 68, in wrapper
return wrapped(*args, **kwargs)
File "/home/ps/miniconda3/envs/baichuan2/lib/python3.10/site-packages/torch/optim/optimizer.py", line 373, in wrapper
out = func(*args, **kwargs)
File "/home/ps/miniconda3/envs/baichuan2/lib/python3.10/site-packages/torch/optim/optimizer.py", line 76, in _use_grad
ret = func(self, *args, **kwargs)
File "/home/ps/miniconda3/envs/baichuan2/lib/python3.10/site-packages/torch/optim/adamw.py", line 184, in step
adamw(
File "/home/ps/miniconda3/envs/baichuan2/lib/python3.10/site-packages/torch/optim/adamw.py", line 335, in adamw
func(
File "/home/ps/miniconda3/envs/baichuan2/lib/python3.10/site-packages/torch/optim/adamw.py", line 599, in _multi_tensor_adamw
exp_avg_sq_sqrt = torch._foreach_sqrt(device_exp_avg_sqs)
torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 3.73 GiB. GPU 0 has a total capacty of 44.55 GiB of which 3.53 GiB is free. Including non-PyTorch memory, this process has 41.01 GiB memory in use. Of the allocated memory 32.77 GiB is allocated by PyTorch, and 7.71 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting max_split_size_mb to avoid fragmentation. See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF
0%| | 0/2500 [00:05<?, ?it/s]
[2023-12-21 12:59:30,117] [INFO] [launch.py:315:sigkill_handler] Killing subprocess 23662
[2023-12-21 12:59:30,117] [INFO] [launch.py:315:sigkill_handler] Killing subprocess 23663
[2023-12-21 12:59:30,545] [INFO] [launch.py:315:sigkill_handler] Killing subprocess 23664
[2023-12-21 12:59:30,972] [INFO] [launch.py:315:sigkill_handler] Killing subprocess 23665
[2023-12-21 12:59:31,358] [ERROR] [launch.py:321:sigkill_handler] ['/home/ps/miniconda3/envs/baichuan2/bin/python3.10', '-u', 'fine-tune.py', '--local_rank=3', '--report_to', 'none', '--data_path', 'data/belle_chat_ramdon_10k.json', '--model_name_or_path', 'baichuan-inc/Baichuan2-7B-Chat', '--output_dir', 'output', '--model_max_length', '512', '--num_train_epochs', '1', '--per_device_train_batch_size', '1', '--gradient_accumulation_steps', '1', '--save_strategy', 'epoch', '--learning_rate', '2e-5', '--lr_scheduler_type', 'constant', '--adam_beta1', '0.9', '--adam_beta2', '0.98', '--adam_epsilon', '1e-8', '--max_grad_norm', '1.0', '--weight_decay', '1e-4', '--warmup_ratio', '0.0', '--logging_steps', '1', '--gradient_checkpointing', 'True', '--deepspeed', 'ds_config.json', '--bf16', 'True', '--tf32', 'True'] exits with return code = 1
4卡 RTX A6000,nvlink,batch_size=1
+---------------------------------------------------------------------------------------+ | NVIDIA-SMI 535.98 Driver Version: 535.98 CUDA Version: 12.2 | |-----------------------------------------+----------------------+----------------------+ | GPU Name Persistence-M | Bus-Id Disp.A | Volatile Uncorr. ECC | | Fan Temp Perf Pwr:Usage/Cap | Memory-Usage | GPU-Util Compute M. | | | | MIG M. | |=========================================+======================+======================| | 0 NVIDIA RTX A6000 Off | 00000000:18:00.0 Off | 0 | | 59% 69C P8 22W / 300W | 12MiB / 46068MiB | 0% Default | | | | N/A | +-----------------------------------------+----------------------+----------------------+ | 1 NVIDIA RTX A6000 Off | 00000000:3B:00.0 Off | 0 | | 73% 74C P8 26W / 300W | 12MiB / 46068MiB | 0% Default | | | | N/A | +-----------------------------------------+----------------------+----------------------+ | 2 NVIDIA RTX A6000 Off | 00000000:86:00.0 Off | Off | | 73% 74C P8 31W / 300W | 12MiB / 49140MiB | 0% Default | | | | N/A | +-----------------------------------------+----------------------+----------------------+ | 3 NVIDIA RTX A6000 Off | 00000000:AF:00.0 Off | 0 | | 59% 69C P8 30W / 300W | 12MiB / 46068MiB | 0% Default | | | | N/A | +-----------------------------------------+----------------------+----------------------+
+---------------------------------------------------------------------------------------+ | Processes: | | GPU GI CI PID Type Process name GPU Memory | | ID ID Usage | |=======================================================================================| | 0 N/A N/A 1771 G /usr/lib/xorg/Xorg 4MiB | | 1 N/A N/A 1771 G /usr/lib/xorg/Xorg 4MiB | | 2 N/A N/A 1771 G /usr/lib/xorg/Xorg 4MiB | | 3 N/A N/A 1771 G /usr/lib/xorg/Xorg 4MiB | +---------------------------------------------------------------------------------------+
hostfile=""
deepspeed --hostfile=$hostfile fine-tune.py \
    --report_to "none" \
    --data_path "data/belle_chat_ramdon_10k.json" \
    --model_name_or_path "baichuan-inc/Baichuan2-7B-Chat" \
    --output_dir "output" \
    --model_max_length 512 \
    --num_train_epochs 1 \
    --per_device_train_batch_size 1 \
    --gradient_accumulation_steps 1 \
    --save_strategy epoch \
    --learning_rate 2e-5 \
    --lr_scheduler_type constant \
    --adam_beta1 0.9 \
    --adam_beta2 0.98 \
    --adam_epsilon 1e-8 \
    --max_grad_norm 1.0 \
    --weight_decay 1e-4 \
    --warmup_ratio 0.0 \
    --logging_steps 1 \
    --gradient_checkpointing True \
    --deepspeed ds_config.json \
    --bf16 True \
    --tf32 True
报错:
File "/home/ps/miniconda3/envs/baichuan2/lib/python3.10/site-packages/transformers/trainer.py", line 1835, in _inner_training_loop tr_loss_step = self.training_step(model, inputs) File "/home/ps/miniconda3/envs/baichuan2/lib/python3.10/site-packages/transformers/trainer.py", line 2690, in training_step self.accelerator.backward(loss) File "/home/ps/miniconda3/envs/baichuan2/lib/python3.10/site-packages/accelerate/accelerator.py", line 1899, in backward self.deepspeed_engine_wrapped.backward(loss, kwargs) File "/home/ps/miniconda3/envs/baichuan2/lib/python3.10/site-packages/accelerate/utils/deepspeed.py", line 176, in backward self.engine.step() File "/home/ps/miniconda3/envs/baichuan2/lib/python3.10/site-packages/deepspeed/runtime/engine.py", line 2144, in step self._take_model_step(lr_kwargs) File "/home/ps/miniconda3/envs/baichuan2/lib/python3.10/site-packages/deepspeed/runtime/engine.py", line 2050, in _take_model_step self.optimizer.step() File "/home/ps/miniconda3/envs/baichuan2/lib/python3.10/site-packages/deepspeed/utils/nvtx.py", line 15, in wrapped_fn ret_val = func(*args, *kwargs) File "/home/ps/miniconda3/envs/baichuan2/lib/python3.10/site-packages/deepspeed/runtime/zero/stage3.py", line 1972, in step self._optimizer_step(sub_group_id) File "/home/ps/miniconda3/envs/baichuan2/lib/python3.10/site-packages/deepspeed/runtime/zero/stage3.py", line 934, in _optimizer_step self.optimizer.step() File "/home/ps/miniconda3/envs/baichuan2/lib/python3.10/site-packages/torch/optim/lr_scheduler.py", line 68, in wrapper return wrapped(args, kwargs) File "/home/ps/miniconda3/envs/baichuan2/lib/python3.10/site-packages/torch/optim/optimizer.py", line 373, in wrapper out = func(*args, *kwargs) File "/home/ps/miniconda3/envs/baichuan2/lib/python3.10/site-packages/torch/optim/optimizer.py", line 76, in _use_grad ret = func(self, args, **kwargs) File "/home/ps/miniconda3/envs/baichuan2/lib/python3.10/site-packages/torch/optim/adamw.py", line 184, in step adamw( File 
"/home/ps/miniconda3/envs/baichuan2/lib/python3.10/site-packages/torch/optim/adamw.py", line 335, in adamw func( File "/home/ps/miniconda3/envs/baichuan2/lib/python3.10/site-packages/torch/optim/adamw.py", line 599, in _multi_tensor_adamw exp_avg_sq_sqrt = torch._foreach_sqrt(device_exp_avg_sqs) torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 3.73 GiB. GPU 0 has a total capacty of 44.55 GiB of which 3.53 GiB is free. Including non-PyTorch memory, this process has 41.01 GiB memory in use. Of the allocated memory 32.77 GiB is allocated by PyTorch, and 7.71 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting max_split_size_mb to avoid fragmentation. See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF 0%| | 0/2500 [00:05<?, ?it/s] [2023-12-21 12:59:30,117] [INFO] [launch.py:315:sigkill_handler] Killing subprocess 23662 [2023-12-21 12:59:30,117] [INFO] [launch.py:315:sigkill_handler] Killing subprocess 23663 [2023-12-21 12:59:30,545] [INFO] [launch.py:315:sigkill_handler] Killing subprocess 23664 [2023-12-21 12:59:30,972] [INFO] [launch.py:315:sigkill_handler] Killing subprocess 23665 [2023-12-21 12:59:31,358] [ERROR] [launch.py:321:sigkill_handler] ['/home/ps/miniconda3/envs/baichuan2/bin/python3.10', '-u', 'fine-tune.py', '--local_rank=3', '--report_to', 'none', '--data_path', 'data/belle_chat_ramdon_10k.json', '--model_name_or_path', 'baichuan-inc/Baichuan2-7B-Chat', '--output_dir', 'output', '--model_max_length', '512', '--num_train_epochs', '1', '--per_device_train_batch_size', '1', '--gradient_accumulation_steps', '1', '--save_strategy', 'epoch', '--learning_rate', '2e-5', '--lr_scheduler_type', 'constant', '--adam_beta1', '0.9', '--adam_beta2', '0.98', '--adam_epsilon', '1e-8', '--max_grad_norm', '1.0', '--weight_decay', '1e-4', '--warmup_ratio', '0.0', '--logging_steps', '1', '--gradient_checkpointing', 'True', '--deepspeed', 'ds_config.json', '--bf16', 'True', '--tf32', 'True'] 
exits with return code = 1
请问是什么问题?可以通过修改参数跑全量微调吗?