DUOMO / TransGPT
Error while debugging model training: AssertionError: No inf checks were recorded for this optimizer.
#13
Open
kkcondy opened this issue 1 year ago
kkcondy commented 1 year ago
/data/anaconda3/lib/python3.10/site-packages/transformers/optimization.py:411: FutureWarning: This implementation of AdamW is deprecated and will be removed in a future version. Use the PyTorch implementation torch.optim.AdamW instead, or set no_deprecation_warning=True to disable this warning
  warnings.warn(
  0%| | 0/1 [00:00<?, ?it/s]
/data/anaconda3/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:318: UserWarning: MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization
  warnings.warn(f"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization")
╭─────────────────────────────── Traceback (most recent call last) ────────────────────────────────╮
│ /data/kk/TransGPT-main/supervised_finetuning.py:877 in <module> │
│ │
│ 874 │
│ 875 │
│ 876 if __name__ == "__main__": │
│ ❱ 877 │ main() │
│ 878 │
│ │
│ /data/kk/TransGPT-main/supervised_finetuning.py:848 in main │
│ │
│ 845 │ │ checkpoint = None │
│ 846 │ │ if training_args.resume_from_checkpoint is not None: │
│ 847 │ │ │ checkpoint = training_args.resume_from_checkpoint │
│ ❱ 848 │ │ train_result = trainer.train(resume_from_checkpoint=checkpoint) │
│ 849 │ │ │
│ 850 │ │ metrics = train_result.metrics │
│ 851 │ │ metrics["train_samples"] = max_train_samples │
│ │
│ /data/anaconda3/lib/python3.10/site-packages/transformers/trainer.py:1645 in train │
│ │
│ 1642 │ │ inner_training_loop = find_executable_batch_size( │
│ 1643 │ │ │ self._inner_training_loop, self._train_batch_size, args.auto_find_batch_size │
│ 1644 │ │ ) │
│ ❱ 1645 │ │ return inner_training_loop( │
│ 1646 │ │ │ args=args, │
│ 1647 │ │ │ resume_from_checkpoint=resume_from_checkpoint, │
│ 1648 │ │ │ trial=trial, │
│ │
│ /data/anaconda3/lib/python3.10/site-packages/transformers/trainer.py:2007 in │
│ _inner_training_loop │
│ │
│ 2004 │ │ │ │ │ │ scale_after = self.scaler.get_scale() │
│ 2005 │ │ │ │ │ │ optimizer_was_run = scale_before <= scale_after │
│ 2006 │ │ │ │ │ else: │
│ ❱ 2007 │ │ │ │ │ │ self.optimizer.step() │
│ 2008 │ │ │ │ │ │ optimizer_was_run = not self.accelerator.optimizer_step_was_skip │
│ 2009 │ │ │ │ │ │
│ 2010 │ │ │ │ │ if optimizer_was_run: │
│ │
│ /data/anaconda3/lib/python3.10/site-packages/accelerate/optimizer.py:134 in step │
│ │
│ 131 │ │ │ │ xm.optimizer_step(self.optimizer, optimizer_args=optimizer_args) │
│ 132 │ │ │ elif self.scaler is not None: │
│ 133 │ │ │ │ scale_before = self.scaler.get_scale() │
│ ❱ 134 │ │ │ │ self.scaler.step(self.optimizer, closure) │
│ 135 │ │ │ │ self.scaler.update() │
│ 136 │ │ │ │ scale_after = self.scaler.get_scale() │
│ 137 │ │ │ │ # If we reduced the loss scale, it means the optimizer step was skipped │
│ │
│ /data/anaconda3/lib/python3.10/site-packages/torch/cuda/amp/grad_scaler.py:372 in step │
│ │
│ 369 │ │ if optimizer_state["stage"] is OptState.READY: │
│ 370 │ │ │ self.unscale_(optimizer) │
│ 371 │ │ │
│ ❱ 372 │ │ assert len(optimizer_state["found_inf_per_device"]) > 0, "No inf checks were rec │
│ 373 │ │ │
│ 374 │ │ retval = self._maybe_opt_step(optimizer, optimizer_state, *args, **kwargs) │
│ 375 │
╰──────────────────────────────────────────────────────────────────────────────────────────────────╯
AssertionError: No inf checks were recorded for this optimizer.
  0%| | 0/1 [00:06<?, ?it/s]
ERROR:torch.distributed.elastic.multiprocessing.api:failed (exitcode: 1) local_rank: 0 (pid: 49028) of binary: /data/anaconda3/bin/python
Traceback (most recent call last):
  File "/data/anaconda3/bin/torchrun", line 8, in <module>
    sys.exit(main())
  File "/data/anaconda3/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 346, in wrapper
    return f(*args, **kwargs)
  File "/data/anaconda3/lib/python3.10/site-packages/torch/distributed/run.py", line 794, in main
    run(args)
  File "/data/anaconda3/lib/python3.10/site-packages/torch/distributed/run.py", line 785, in run
    elastic_launch(
  File "/data/anaconda3/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 134, in __call__
    return launch_agent(self._config, self._entrypoint, list(args))
  File "/data/anaconda3/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 250, in launch_agent
    raise ChildFailedError(
torch.distributed.elastic.multiprocessing.errors.ChildFailedError:
supervised_finetuning.py FAILED
Failures:
------------------------------------------------------------
Root Cause (first observed failure):
[0]:
  time      : 2023-08-17_15:35:31
  host      : njxg-its-gpu01.njxg.baidu.com
  rank      : 0 (local_rank: 0)
  exitcode  : 1 (pid: 49028)
  error_file:
  traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
============================================================

My run_sft is:

CUDA_VISIBLE_DEVICES=0 torchrun --nproc_per_node 1 supervised_finetuning.py \
    --model_type llama \
    --model_name_or_path ./DUOMO-Lab/TransGPT-v0 \
    --train_file_dir ./data/finetune \
    --validation_file_dir ./data/finetune \
    --per_device_train_batch_size 4 \
    --per_device_eval_batch_size 4 \
    --do_train \
    --do_eval \
    --use_peft True \
    --fp16 \
    --max_train_samples 2 \
    --max_eval_samples 2 \
    --load_in_8bit True \
    --num_train_epochs 1 \
    --learning_rate 2e-5 \
    --warmup_ratio 0.05 \
    --weight_decay 0.05 \
    --logging_strategy steps \
    --logging_steps 10 \
    --eval_steps 50 \
    --evaluation_strategy steps \
    --save_steps 500 \
    --save_strategy steps \
    --save_total_limit 3 \
    --gradient_accumulation_steps 1 \
    --preprocessing_num_workers 4 \
    --output_dir ./outputs-sft-v1 \
    --overwrite_output_dir \
    --ddp_timeout 30000 \
    --logging_first_step True \
    --target_modules all \
    --lora_rank 8 \
    --lora_alpha 16 \
    --lora_dropout 0.05 \
    --torch_dtype float16 \
    --device_map auto \
    --report_to tensorboard \
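
For what it's worth, the assertion itself comes from torch/cuda/amp/grad_scaler.py (line 372 in the trace above): GradScaler.step() only passes if unscale_() recorded at least one inf check, and an inf check is only recorded for parameters owned by that optimizer that actually received a gradient from scaler.scale(loss).backward(). Below is a minimal standalone sketch that reproduces the same message; it is my own illustration, not code from supervised_finetuning.py, it assumes a CUDA device, and the trained/frozen module names are made up:

    import torch

    # Hypothetical repro: the optimizer only owns parameters that never get a
    # gradient, so unscale_() records no inf checks and step() hits the assert.
    trained = torch.nn.Linear(4, 4).cuda()   # the loss flows through this module only
    frozen = torch.nn.Linear(4, 4).cuda()    # the optimizer owns only these parameters

    optimizer = torch.optim.AdamW(frozen.parameters(), lr=1e-3)
    scaler = torch.cuda.amp.GradScaler()

    with torch.cuda.amp.autocast():
        loss = trained(torch.randn(2, 4, device="cuda")).sum()

    scaler.scale(loss).backward()   # gradients land on `trained`, never on `frozen`
    scaler.step(optimizer)          # AssertionError: No inf checks were recorded for this optimizer.
    scaler.update()                 # not reached; shown only to complete the usual AMP loop

Whether that is exactly what happens inside supervised_finetuning.py with --fp16 plus --load_in_8bit, I cannot tell from this log alone; the sketch only shows the condition the assert is checking for.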
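
The elastic error summary only points at https://pytorch.org/docs/stable/elastic/errors.html for the child traceback. As far as I know, the suggestion on that page is to wrap the launched script's entrypoint with the record decorator so that torchrun surfaces the failing process's actual traceback; a sketch of what that could look like, assuming supervised_finetuning.py exposes a main() as in the trace above:

    from torch.distributed.elastic.multiprocessing.errors import record

    @record                      # writes the child's full traceback to the elastic error file
    def main():
        ...                      # training setup as in supervised_finetuning.py

    if __name__ == "__main__":
        main()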