microsoft / DeepSpeedExamples

Example models using DeepSpeed
Apache License 2.0
5.97k stars 1.01k forks source link

Step 3 failed (actor opt_1.3b, critic opt_350m) with Exception: Current loss scale already at minimum - cannot decrease scale anymore. Exiting run. #419

Open BaiStone2017 opened 1 year ago

BaiStone2017 commented 1 year ago

My settings are as follows:

```bash
deepspeed --master_port 12346 main.py \
   --data_path yitingxie/rlhf-reward-datasets \
   --data_split 2,4,4 \
   --actor_model_name_or_path $ACTOR_MODEL_PATH \
   --critic_model_name_or_path $CRITIC_MODEL_PATH \
   --num_padding_at_beginning 1 \
   --per_device_train_batch_size 4 \
   --per_device_mini_train_batch_size 4 \
   --generation_batch_numbers 1 \
   --ppo_epochs 1 \
   --max_answer_seq_len 128 \
   --max_prompt_seq_len 128 \
   --actor_learning_rate ${Actor_Lr} \
   --critic_learning_rate ${Critic_Lr} \
   --actor_weight_decay 0.1 \
   --critic_weight_decay 0.1 \
   --num_train_epochs 1 \
   --lr_scheduler_type cosine \
   --gradient_accumulation_steps 1 \
   --num_warmup_steps 100 \
   --deepspeed --seed 1234 \
   --actor_zero_stage $ACTOR_ZERO_STAGE \
   --critic_zero_stage $CRITIC_ZERO_STAGE \
   --output_dir $OUTPUT \
   &> $OUTPUT/training.log
```

running log: `|E2E latency=6.48s |Gather latency=0.19s (2.87%) |Generate time=2.96s (45.67%) |Training time=1.90s (29.33%) |Others=1.62 (25.00%)|CurSamplesPerSec=2.47 |AvgSamplesPerSec=2.35 Traceback (most recent call last): Traceback (most recent call last): Traceback (most recent call last): File "main.py", line 526, in File "main.py", line 526, in File "main.py", line 526, in Traceback (most recent call last): File "main.py", line 526, in main()main()main()

File "main.py", line 449, in main File "main.py", line 449, in main File "main.py", line 449, in main main() File "main.py", line 449, in main actor_loss, critic_loss = trainer.train_rlhf(exp_data)actor_loss, critic_loss = trainer.train_rlhf(exp_data)

  File "/home/vca1/bl/projects/chatGPT/tmp/DeepSpeedExamples-master/applications/DeepSpeed-Chat/training/step3_rlhf_finetuning/ppo_trainer.py", line 173, in train_rlhf

actor_loss, critic_loss = trainer.train_rlhf(exp_data) File "/home/vca1/bl/projects/chatGPT/tmp/DeepSpeedExamples-master/applications/DeepSpeed-Chat/training/step3_rlhf_finetuning/ppo_trainer.py", line 173, in train_rlhf

File "/home/vca1/bl/projects/chatGPT/tmp/DeepSpeedExamples-master/applications/DeepSpeed-Chat/training/step3_rlhf_finetuning/ppo_trainer.py", line 173, in train_rlhf self.actor_model.step()self.actor_model.step()

  File "/opt/conda/lib/python3.8/site-packages/deepspeed/runtime/hybrid_engine.py", line 397, in step

File "/opt/conda/lib/python3.8/site-packages/deepspeed/runtime/hybrid_engine.py", line 397, in step self.actor_model.step()
actor_loss, critic_loss = trainer.train_rlhf(exp_data) File "/opt/conda/lib/python3.8/site-packages/deepspeed/runtime/hybrid_engine.py", line 397, in step File "/home/vca1/bl/projects/chatGPT/tmp/DeepSpeedExamples-master/applications/DeepSpeed-Chat/training/step3_rlhf_finetuning/ppo_trainer.py", line 173, in train_rlhf self.actor_model.step()super().step(lr_kwargs=lr_kwargs)super().step(lr_kwargs=lr_kwargs)

super().step(lr_kwargs=lr_kwargs) File "/opt/conda/lib/python3.8/site-packages/deepspeed/runtime/hybrid_engine.py", line 397, in step File "/opt/conda/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 1988, in step File "/opt/conda/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 1988, in step

File "/opt/conda/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 1988, in step super().step(lr_kwargs=lr_kwargs) File "/opt/conda/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 1988, in step self._take_model_step(lr_kwargs)self._take_model_step(lr_kwargs)self._take_model_step(lr_kwargs)

File "/opt/conda/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 1895, in _take_model_step File "/opt/conda/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 1895, in _take_model_step File "/opt/conda/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 1895, in _take_model_step self._take_model_step(lr_kwargs) File "/opt/conda/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 1895, in _take_model_step self.optimizer.step() File "/opt/conda/lib/python3.8/site-packages/deepspeed/utils/nvtx.py", line 15, in wrapped_fn self.optimizer.step() File "/opt/conda/lib/python3.8/site-packages/deepspeed/utils/nvtx.py", line 15, in wrapped_fn self.optimizer.step()ret_val = func(*args, **kwargs)

File "/opt/conda/lib/python3.8/site-packages/deepspeed/utils/nvtx.py", line 15, in wrapped_fn File "/opt/conda/lib/python3.8/site-packages/deepspeed/runtime/zero/stage3.py", line 1744, in step ret_val = func(*args, kwargs) File "/opt/conda/lib/python3.8/site-packages/deepspeed/runtime/zero/stage3.py", line 1744, in step ret_val = func(*args, *kwargs) File "/opt/conda/lib/python3.8/site-packages/deepspeed/runtime/zero/stage3.py", line 1744, in step self.optimizer.step() File "/opt/conda/lib/python3.8/site-packages/deepspeed/utils/nvtx.py", line 15, in wrapped_fn ret_val = func(args, kwargs) File "/opt/conda/lib/python3.8/site-packages/deepspeed/runtime/zero/stage3.py", line 1744, in step if self._overflow_check_and_loss_scale_update():
if self._overflow_check_and_loss_scale_update(): File "/opt/conda/lib/python3.8/site-packages/deepspeed/utils/nvtx.py", line 15, in wrapped_fn

  File "/opt/conda/lib/python3.8/site-packages/deepspeed/utils/nvtx.py", line 15, in wrapped_fn

if self._overflow_check_and_loss_scale_update(): File "/opt/conda/lib/python3.8/site-packages/deepspeed/utils/nvtx.py", line 15, in wrapped_fn ret_val = func(*args, kwargs) File "/opt/conda/lib/python3.8/site-packages/deepspeed/runtime/zero/stage3.py", line 1694, in _overflow_check_and_loss_scale_update ret_val = func(*args, *kwargs) File "/opt/conda/lib/python3.8/site-packages/deepspeed/runtime/zero/stage3.py", line 1694, in _overflow_check_and_loss_scale_update ret_val = func(args, kwargs)ret_val = func(*args, **kwargs)

File "/opt/conda/lib/python3.8/site-packages/deepspeed/runtime/zero/stage3.py", line 1694, in _overflow_check_and_loss_scale_update File "/opt/conda/lib/python3.8/site-packages/deepspeed/runtime/zero/stage3.py", line 1694, in _overflow_check_and_loss_scale_update if self._overflow_check_and_loss_scale_update(): File "/opt/conda/lib/python3.8/site-packages/deepspeed/utils/nvtx.py", line 15, in wrapped_fn ret_val = func(*args, **kwargs) File "/opt/conda/lib/python3.8/site-packages/deepspeed/runtime/zero/stage3.py", line 1694, in _overflow_check_and_loss_scale_update self._update_scale(self.overflow) File "/opt/conda/lib/python3.8/site-packages/deepspeed/runtime/zero/stage3.py", line 2009, in _update_scale self._update_scale(self.overflow) self._update_scale(self.overflow) File "/opt/conda/lib/python3.8/site-packages/deepspeed/runtime/zero/stage3.py", line 2009, in _update_scale

File "/opt/conda/lib/python3.8/site-packages/deepspeed/runtime/zero/stage3.py", line 2009, in _update_scale self._update_scale(self.overflow) File "/opt/conda/lib/python3.8/site-packages/deepspeed/runtime/zero/stage3.py", line 2009, in _update_scale self.loss_scaler.update_scale(has_overflow) File "/opt/conda/lib/python3.8/site-packages/deepspeed/runtime/fp16/loss_scaler.py", line 173, in update_scale self.loss_scaler.update_scale(has_overflow) File "/opt/conda/lib/python3.8/site-packages/deepspeed/runtime/fp16/loss_scaler.py", line 173, in update_scale self.loss_scaler.update_scale(has_overflow) File "/opt/conda/lib/python3.8/site-packages/deepspeed/runtime/fp16/loss_scaler.py", line 173, in update_scale raise Exception( Exception : raise Exception(Current loss scale already at minimum - cannot decrease scale anymore. Exiting run.

self.loss_scaler.update_scale(has_overflow)

File "/opt/conda/lib/python3.8/site-packages/deepspeed/runtime/fp16/loss_scaler.py", line 173, in update_scale Exception : raise Exception(Current loss scale already at minimum - cannot decrease scale anymore. Exiting run.

Exception: Current loss scale already at minimum - cannot decrease scale anymore. Exiting run. raise Exception( Exception: Current loss scale already at minimum - cannot decrease scale anymore. Exiting run.`

How can I solve this problem?

EikeKohl commented 1 year ago

Maybe this helps? #335