Val: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████| 98/98 [00:39<00:00, 2.51it/s]
/home/aistudio/swiftenv/lib/python3.11/site-packages/peft/utils/save_and_load.py:202: UserWarning: Setting `save_embedding_layers` to `True` as embedding layers found in `target_modules`.
warnings.warn("Setting `save_embedding_layers` to `True` as embedding layers found in `target_modules`.")
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Traceback (most recent call last):
File "/home/aistudio/swiftenv/lib/python3.11/site-packages/swift/cli/sft.py", line 5, in <module>
sft_main()
File "/home/aistudio/swiftenv/lib/python3.11/site-packages/swift/utils/run_utils.py", line 32, in x_main
result = llm_x(args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^
File "/home/aistudio/swiftenv/lib/python3.11/site-packages/swift/llm/sft.py", line 541, in llm_sft
return trainer_train(args, model, template, train_dataset, val_dataset, callbacks=callbacks, msg=msg)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/aistudio/swiftenv/lib/python3.11/site-packages/swift/llm/sft.py", line 491, in trainer_train
trainer.train(training_args.resume_from_checkpoint)
File "/home/aistudio/swiftenv/lib/python3.11/site-packages/swift/trainers/mixin.py", line 488, in train
res = super().train(resume_from_checkpoint, *args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/aistudio/swiftenv/lib/python3.11/site-packages/transformers/trainer.py", line 2122, in train
return inner_training_loop(
^^^^^^^^^^^^^^^^^^^^
File "/home/aistudio/swiftenv/lib/python3.11/site-packages/transformers/trainer.py", line 2541, in _inner_training_loop
self._maybe_log_save_evaluate(tr_loss, grad_norm, model, trial, epoch, ignore_keys_for_eval)
File "/home/aistudio/swiftenv/lib/python3.11/site-packages/swift/trainers/mixin.py", line 564, in _maybe_log_save_evaluate
super()._maybe_log_save_evaluate(tr_loss, *args, **kwargs)
File "/home/aistudio/swiftenv/lib/python3.11/site-packages/transformers/trainer.py", line 3000, in _maybe_log_save_evaluate
self._save_checkpoint(model, trial, metrics=metrics)
File "/home/aistudio/swiftenv/lib/python3.11/site-packages/swift/trainers/mixin.py", line 386, in _save_checkpoint
result = super()._save_checkpoint(model, trial, metrics)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/aistudio/swiftenv/lib/python3.11/site-packages/transformers/trainer.py", line 3090, in _save_checkpoint
self.save_model(output_dir, _internal_call=True)
File "/home/aistudio/swiftenv/lib/python3.11/site-packages/transformers/trainer.py", line 3719, in save_model
self._save(output_dir)
File "/home/aistudio/swiftenv/lib/python3.11/site-packages/swift/trainers/mixin.py", line 364, in _save
self._save_converted_model(output_dir)
File "/home/aistudio/swiftenv/lib/python3.11/site-packages/swift/trainers/mixin.py", line 200, in _save_converted_model
model.save_pretrained(
File "/home/aistudio/swiftenv/lib/python3.11/site-packages/peft/peft_model.py", line 340, in save_pretrained
output_state_dict = save_mutated_as_lora(
^^^^^^^^^^^^^^^^^^^^^
File "/home/aistudio/swiftenv/lib/python3.11/site-packages/peft/peft_model.py", line 279, in save_mutated_as_lora
self.load_adapter(
File "/home/aistudio/swiftenv/lib/python3.11/site-packages/peft/peft_model.py", line 1167, in load_adapter
self.base_model._cast_adapter_dtype(
File "/home/aistudio/swiftenv/lib/python3.11/site-packages/peft/tuners/tuners_utils.py", line 357, in _cast_adapter_dtype
param.data = param.data.to(torch.float32)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^
torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 20.00 MiB. GPU 0 has a total capacity of 39.39 GiB of which 10.06 MiB is free. Process 957932 has 39.38 GiB memory in use. Of the allocated memory 36.46 GiB is allocated by PyTorch, and 820.76 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
Train: 25%|██████████████████████▊ | 150/611 [48:25<2:28:49, 19.37s/it]
环境信息
训练脚本
错误日志
补充信息
模型可以正常进行训练,训练过程中显存占用为 31–34 GB;但在保存检查点时,程序似乎尝试将部分参数转换为 fp32(见上方 traceback 中的 `_cast_adapter_dtype` → `param.data.to(torch.float32)`),并在此过程中发生 OOM(显存不足)。