DUOMO / TransGPT


AssertionError while debugging/training the model: No inf checks were recorded for this optimizer. #13

Open kkcondy opened 1 year ago

kkcondy commented 1 year ago

/data/anaconda3/lib/python3.10/site-packages/transformers/optimization.py:411: FutureWarning: This implementation of AdamW is deprecated and will be removed in a future version. Use the PyTorch implementation torch.optim.AdamW instead, or set no_deprecation_warning=True to disable this warning
  warnings.warn(
  0%|          | 0/1 [00:00<?, ?it/s]
/data/anaconda3/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:318: UserWarning: MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization
  warnings.warn(f"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization")

Traceback (most recent call last):
  File "/data/kk/TransGPT-main/supervised_finetuning.py", line 877, in <module>
    main()
  File "/data/kk/TransGPT-main/supervised_finetuning.py", line 848, in main
    train_result = trainer.train(resume_from_checkpoint=checkpoint)
  File "/data/anaconda3/lib/python3.10/site-packages/transformers/trainer.py", line 1645, in train
    return inner_training_loop(
  File "/data/anaconda3/lib/python3.10/site-packages/transformers/trainer.py", line 2007, in _inner_training_loop
    self.optimizer.step()
  File "/data/anaconda3/lib/python3.10/site-packages/accelerate/optimizer.py", line 134, in step
    self.scaler.step(self.optimizer, closure)
  File "/data/anaconda3/lib/python3.10/site-packages/torch/cuda/amp/grad_scaler.py", line 372, in step
    assert len(optimizer_state["found_inf_per_device"]) > 0, "No inf checks were recorded for this optimizer."
AssertionError: No inf checks were recorded for this optimizer.
  0%|          | 0/1 [00:06<?, ?it/s]
ERROR:torch.distributed.elastic.multiprocessing.api:failed (exitcode: 1) local_rank: 0 (pid: 49028) of binary: /data/anaconda3/bin/python
Traceback (most recent call last):
  File "/data/anaconda3/bin/torchrun", line 8, in <module>
    sys.exit(main())
  File "/data/anaconda3/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 346, in wrapper
    return f(*args, **kwargs)
  File "/data/anaconda3/lib/python3.10/site-packages/torch/distributed/run.py", line 794, in main
    run(args)
  File "/data/anaconda3/lib/python3.10/site-packages/torch/distributed/run.py", line 785, in run
    elastic_launch(
  File "/data/anaconda3/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 134, in __call__
    return launch_agent(self._config, self._entrypoint, list(args))
  File "/data/anaconda3/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 250, in launch_agent
    raise ChildFailedError(
torch.distributed.elastic.multiprocessing.errors.ChildFailedError:

supervised_finetuning.py FAILED

Failures:

------------------------------------------------------------
Root Cause (first observed failure):
[0]:
  time       : 2023-08-17_15:35:31
  host       : njxg-its-gpu01.njxg.baidu.com
  rank       : 0 (local_rank: 0)
  exitcode   : 1 (pid: 49028)
  error_file :
  traceback  : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
============================================================

My run_sft command is:

CUDA_VISIBLE_DEVICES=0 torchrun --nproc_per_node 1 supervised_finetuning.py \
  --model_type llama \
  --model_name_or_path ./DUOMO-Lab/TransGPT-v0 \
  --train_file_dir ./data/finetune \
  --validation_file_dir ./data/finetune \
  --per_device_train_batch_size 4 \
  --per_device_eval_batch_size 4 \
  --do_train \
  --do_eval \
  --use_peft True \
  --fp16 \
  --max_train_samples 2 \
  --max_eval_samples 2 \
  --load_in_8bit True \
  --num_train_epochs 1 \
  --learning_rate 2e-5 \
  --warmup_ratio 0.05 \
  --weight_decay 0.05 \
  --logging_strategy steps \
  --logging_steps 10 \
  --eval_steps 50 \
  --evaluation_strategy steps \
  --save_steps 500 \
  --save_strategy steps \
  --save_total_limit 3 \
  --gradient_accumulation_steps 1 \
  --preprocessing_num_workers 4 \
  --output_dir ./outputs-sft-v1 \
  --overwrite_output_dir \
  --ddp_timeout 30000 \
  --logging_first_step True \
  --target_modules all \
  --lora_rank 8 \
  --lora_alpha 16 \
  --lora_dropout 0.05 \
  --torch_dtype float16 \
  --device_map auto \
  --report_to tensorboard
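
For context on the assertion itself: `GradScaler.step()` in `torch/cuda/amp/grad_scaler.py:372` fails this way when `unscale_()` finds no parameter in the optimizer with a non-None `.grad` after the scaled backward pass, so no inf checks get recorded. A minimal sketch that reproduces the same error, not taken from this repo and assuming a CUDA device is available:

```python
# Minimal repro sketch (not the repo's code): the optimizer holds only
# parameters that never receive gradients, so GradScaler.step() raises
# "No inf checks were recorded for this optimizer."
import torch

device = "cuda"
model = torch.nn.Linear(4, 4, device=device)    # produces the loss
frozen = torch.nn.Linear(4, 4, device=device)   # never part of the graph

# Optimizer built over parameters whose .grad stays None.
optimizer = torch.optim.AdamW(frozen.parameters(), lr=2e-5)
scaler = torch.cuda.amp.GradScaler()

with torch.cuda.amp.autocast(dtype=torch.float16):
    loss = model(torch.randn(2, 4, device=device)).sum()

scaler.scale(loss).backward()   # grads land on `model`, not on `frozen`
scaler.step(optimizer)          # AssertionError: No inf checks were recorded for this optimizer.
scaler.update()
```

In the command above, `--fp16` enables exactly this GradScaler while `--load_in_8bit True` keeps the base weights quantized and frozen, so if the LoRA adapters do not end up as trainable parameters in the optimizer, training hits the same state as this sketch.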
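A quick way to check is to build the model the way the flags above suggest and confirm that the LoRA wrapper leaves some parameters trainable. This is a hypothetical diagnostic sketch, separate from `supervised_finetuning.py`; whether the script wires peft/transformers this way is an assumption:

```python
# Hypothetical diagnostic sketch (not part of supervised_finetuning.py):
# load the 8-bit base model, apply LoRA, and confirm a non-zero number of
# trainable parameters. If it reports 0, the fp16 GradScaler has nothing
# to unscale and fails with this assertion.
from transformers import AutoModelForCausalLM
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training

model = AutoModelForCausalLM.from_pretrained(
    "./DUOMO-Lab/TransGPT-v0",
    load_in_8bit=True,
    device_map="auto",
)
model = prepare_model_for_kbit_training(model)  # peft >= 0.4; older releases use prepare_model_for_int8_training
model = get_peft_model(
    model,
    LoraConfig(r=8, lora_alpha=16, lora_dropout=0.05, task_type="CAUSAL_LM"),
)
model.print_trainable_parameters()  # should report a trainable count > 0
```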