Closed taofennanhai closed 1 year ago
使用 2 块 V100 GPU 训练 LLaMA-13B 模型时报错：ValueError: Attempting to unscale FP16 gradients.
Traceback (most recent call last): File "finetune.py", line 271, in <module> trainer.train(resume_from_checkpoint=args.resume_from_checkpoint) File "/home/whzhu_st/mypython/lib/python3.8/site-packages/transformers/trainer.py", line 1664, in train return inner_training_loop( File "/home/whzhu_st/mypython/lib/python3.8/site-packages/transformers/trainer.py", line 1973, in _inner_training_loop self.scaler.unscale_(self.optimizer) File "/home/whzhu_st/mypython/lib/python3.8/site-packages/torch/cuda/amp/grad_scaler.py", line 282, in unscale_ optimizer_state["found_inf_per_device"] = self._unscale_grads_(optimizer, inv_scale, found_inf, False) File "/home/whzhu_st/mypython/lib/python3.8/site-packages/torch/cuda/amp/grad_scaler.py", line 210, in _unscale_grads_ raise ValueError("Attempting to unscale FP16 gradients.") ValueError: Attempting to unscale FP16 gradients. ╭─────────────────────────────── Traceback (most recent call last) ────────────────────────────────╮ │ /home/whzhu_st/Pytorch/GPT/Chinese-Vicuna/finetune.py:271 in <module> │ │ │ │ 268 │ │ 269 print("\n If there's a warning about missing keys above, please disregard :)") │ │ 270 │ │ ❱ 271 trainer.train(resume_from_checkpoint=args.resume_from_checkpoint) │ │ 272 │ │ 273 model.save_pretrained(OUTPUT_DIR) │ │ 274 │ │ │ │ /home/whzhu_st/mypython/lib/python3.8/site-packages/transformers/trainer.py:1664 in train │ │ │ │ 1661 │ │ inner_training_loop = find_executable_batch_size( │ │ 1662 │ │ │ self._inner_training_loop, self._train_batch_size, args.auto_find_batch_size │ │ 1663 │ │ ) │ │ ❱ 1664 │ │ return inner_training_loop( │ │ 1665 │ │ │ args=args, │ │ 1666 │ │ │ resume_from_checkpoint=resume_from_checkpoint, │ │ 1667 │ │ │ trial=trial, │ │ │ │ /home/whzhu_st/mypython/lib/python3.8/site-packages/transformers/trainer.py:1973 in │ │ _inner_training_loop │ │ │ │ 1970 │ │ │ │ │ │ │ │ gradients = xm._fetch_gradients(self.optimizer) │ │ 1971 │ │ │ │ │ │ │ │ xm.all_reduce("sum", gradients, scale=1.0 / xm.xrt_world │ │ 
1972 │ │ │ │ │ │ │ # AMP: gradients need unscaling │ │ ❱ 1973 │ │ │ │ │ │ │ self.scaler.unscale_(self.optimizer) │ │ 1974 │ │ │ │ │ │ │ │ 1975 │ │ │ │ │ │ if is_sagemaker_mp_enabled() and args.fp16: │ │ 1976 │ │ │ │ │ │ │ self.optimizer.clip_master_grads(args.max_grad_norm) │ │ │ │ /home/whzhu_st/mypython/lib/python3.8/site-packages/torch/cuda/amp/grad_scaler.py:282 in │ │ unscale_ │ │ │ │ 279 │ │ inv_scale = self._scale.double().reciprocal().float() │ │ 280 │ │ found_inf = torch.full((1,), 0.0, dtype=torch.float32, device=self._scale.device │ │ 281 │ │ │ │ ❱ 282 │ │ optimizer_state["found_inf_per_device"] = self._unscale_grads_(optimizer, inv_sc │ │ 283 │ │ optimizer_state["stage"] = OptState.UNSCALED │ │ 284 │ │ │ 285 │ def _maybe_opt_step(self, optimizer, optimizer_state, *args, **kwargs): │ │ │ │ /home/whzhu_st/mypython/lib/python3.8/site-packages/torch/cuda/amp/grad_scaler.py:210 in │ │ _unscale_grads_ │ │ │ │ 207 │ │ │ │ │ if param.grad is None: │ │ 208 │ │ │ │ │ │ continue │ │ 209 │ │ │ │ │ if (not allow_fp16) and param.grad.dtype == torch.float16: │ │ ❱ 210 │ │ │ │ │ │ raise ValueError("Attempting to unscale FP16 gradients.") │ │ 211 │ │ │ │ │ if param.grad.is_sparse: │ │ 212 │ │ │ │ │ │ # is_coalesced() == False means the sparse grad has values with │ │ 213 │ │ │ │ │ │ # coalesce() deduplicates indices and adds all values that have │ ╰──────────────────────────────────────────────────────────────────────────────────────────────────╯ ValueError: Attempting to unscale FP16 gradients.
我运行命令 python finetune.py --data_path sample/instruct/travel.jsonl --test_size 395 --output_path finetuned_model --wandb ，并且修改了 finetune.py 里面的内容。
python finetune.py --data_path sample/instruct/travel.jsonl --test_size 395 --output_path finetuned_model --wandb
请问我这里还需要如何修改
可以用finetune_deepspeed.py,不开deepspeed就是对应的fp16版本
使用 2 块 V100 GPU 训练 LLaMA-13B 模型时报错：ValueError: Attempting to unscale FP16 gradients.
我运行命令
python finetune.py --data_path sample/instruct/travel.jsonl --test_size 395 --output_path finetuned_model --wandb
并且修改了 finetune.py 里面的内容，请问我这里还需要如何修改？