# Initialize our Trainer with the model, the parsed training arguments,
# both datasets, the tokenizer and the batch collator.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
)
# The callback class itself (not an instance) is registered here.
# NOTE(review): presumably this is what writes the `sft_lora_model`
# directory at the end of training -- confirm against the callback's code.
trainer.add_callback(SavePeftModelCallback)

# Training
if training_args.do_train:
    # Pick the checkpoint to resume from: an explicit
    # `resume_from_checkpoint` wins over an auto-detected `last_checkpoint`;
    # with neither set, training starts from scratch (checkpoint stays None).
    checkpoint = None
    if training_args.resume_from_checkpoint is not None:
        checkpoint = training_args.resume_from_checkpoint
    elif last_checkpoint is not None:
        checkpoint = last_checkpoint
    train_result = trainer.train(resume_from_checkpoint=checkpoint)

    # Record the training-set size next to the metrics returned by train().
    metrics = train_result.metrics
    metrics["train_samples"] = len(train_dataset)

    trainer.log_metrics("train", metrics)
    trainer.save_metrics("train", metrics)
    trainer.save_state()
    # Added manually (not in the upstream script) so the final model is
    # written to the output directory after training.
    # NOTE(review): the on-disk format follows `training_args.save_safetensors`;
    # when it is enabled the weights are written as `*.safetensors` shards.
    # Pass `--save_safetensors False` to get `pytorch_model*.bin` instead.
    trainer.save_model()
依赖情况(代码类问题务必提供)
# 请在此处粘贴依赖情况(请粘贴在本代码块里)
运行日志或截图
[INFO|trainer.py:1723] 2023-11-27 09:31:47,971 >> ***** Running training *****
[INFO|trainer.py:1724] 2023-11-27 09:31:47,971 >> Num examples = 2,187
[INFO|trainer.py:1725] 2023-11-27 09:31:47,971 >> Num Epochs = 1
[INFO|trainer.py:1726] 2023-11-27 09:31:47,971 >> Instantaneous batch size per device = 2
[INFO|trainer.py:1729] 2023-11-27 09:31:47,971 >> Total train batch size (w. parallel, distributed & accumulation) = 128
[INFO|trainer.py:1730] 2023-11-27 09:31:47,971 >> Gradient Accumulation steps = 8
[INFO|trainer.py:1731] 2023-11-27 09:31:47,971 >> Total optimization steps = 17
[INFO|trainer.py:1732] 2023-11-27 09:31:47,972 >> Number of trainable parameters = 6,929,256,448
0%| | 0/17 [00:00<?, ?it/s]/home/appuser/.local/lib/python3.8/site-packages/torch/utils/checkpoint.py:429: UserWarning: torch.utils.checkpoint: please pass in use_reentrant=True or use_reentrant=False explicitly. The default value of use_reentrant will be updated to be False in the future. To maintain current behavior, pass use_reentrant=True. It is recommended that you use use_reentrant=False. Refer to docs for more details on the differences between the two variants.
warnings.warn(
/home/appuser/.local/lib/python3.8/site-packages/deepspeed/runtime/zero/stage_1_and_2.py:1881: UserWarning: The torch.cuda.*DtypeTensor constructors are no longer recommended. It's best to use methods such as torch.tensor(data, dtype=*, device='cuda') to create tensors. (Triggered internally at ../torch/csrc/tensor/python_tensor.cpp:83.)
overflow_gpu = get_accelerator().ByteTensor([overflow])
/home/appuser/.local/lib/python3.8/site-packages/deepspeed/runtime/zero/stage_1_and_2.py:1881: UserWarning: The torch.cuda.*DtypeTensor constructors are no longer recommended. It's best to use methods such as torch.tensor(data, dtype=*, device='cuda') to create tensors. (Triggered internally at ../torch/csrc/tensor/python_tensor.cpp:83.)
overflow_gpu = get_accelerator().ByteTensor([overflow])
/home/appuser/.local/lib/python3.8/site-packages/deepspeed/runtime/zero/stage_1_and_2.py:1881: UserWarning: The torch.cuda.*DtypeTensor constructors are no longer recommended. It's best to use methods such as torch.tensor(data, dtype=*, device='cuda') to create tensors. (Triggered internally at ../torch/csrc/tensor/python_tensor.cpp:83.)
overflow_gpu = get_accelerator().ByteTensor([overflow])
/home/appuser/.local/lib/python3.8/site-packages/deepspeed/runtime/zero/stage_1_and_2.py:1881: UserWarning: The torch.cuda.*DtypeTensor constructors are no longer recommended. It's best to use methods such as torch.tensor(data, dtype=*, device='cuda') to create tensors. (Triggered internally at ../torch/csrc/tensor/python_tensor.cpp:83.)
overflow_gpu = get_accelerator().ByteTensor([overflow])
/home/appuser/.local/lib/python3.8/site-packages/deepspeed/runtime/zero/stage_1_and_2.py:1881: UserWarning: The torch.cuda.*DtypeTensor constructors are no longer recommended. It's best to use methods such as torch.tensor(data, dtype=*, device='cuda') to create tensors. (Triggered internally at ../torch/csrc/tensor/python_tensor.cpp:83.)
overflow_gpu = get_accelerator().ByteTensor([overflow])
/home/appuser/.local/lib/python3.8/site-packages/deepspeed/runtime/zero/stage_1_and_2.py:1881: UserWarning: The torch.cuda.*DtypeTensor constructors are no longer recommended. It's best to use methods such as torch.tensor(data, dtype=*, device='cuda') to create tensors. (Triggered internally at ../torch/csrc/tensor/python_tensor.cpp:83.)
overflow_gpu = get_accelerator().ByteTensor([overflow])
/home/appuser/.local/lib/python3.8/site-packages/deepspeed/runtime/zero/stage_1_and_2.py:1881: UserWarning: The torch.cuda.*DtypeTensor constructors are no longer recommended. It's best to use methods such as torch.tensor(data, dtype=*, device='cuda') to create tensors. (Triggered internally at ../torch/csrc/tensor/python_tensor.cpp:83.)
overflow_gpu = get_accelerator().ByteTensor([overflow])
/home/appuser/.local/lib/python3.8/site-packages/deepspeed/runtime/zero/stage_1_and_2.py:1881: UserWarning: The torch.cuda.*DtypeTensor constructors are no longer recommended. It's best to use methods such as torch.tensor(data, dtype=*, device='cuda') to create tensors. (Triggered internally at ../torch/csrc/tensor/python_tensor.cpp:83.)
overflow_gpu = get_accelerator().ByteTensor([overflow])
[2023-11-27 09:31:50,512] [INFO] [loss_scaler.py:190:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 65536, but hysteresis is 2. Reducing hysteresis to 1
{'loss': 2.8529, 'learning_rate': 0.0, 'epoch': 0.06}
6%|███████████▉ | 1/17 [00:02<00:40, 2.54s/it][2023-11-27 09:31:52,334] [INFO] [loss_scaler.py:183:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 65536, reducing to 32768
12%|███████████████████████▊ | 2/17 [00:04<00:31, 2.12s/it][2023-11-27 09:31:53,997] [INFO] [loss_scaler.py:183:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 32768, reducing to 16384
18%|███████████████████████████████████▋ | 3/17 [00:06<00:26, 1.91s/it][2023-11-27 09:31:55,567] [INFO] [loss_scaler.py:183:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 16384, reducing to 8192
41%|███████████████████████████████████████████████████████████████████████████████████▏ | 7/17 [00:12<00:16, 1.70s/it][2023-11-27 09:32:02,210] [INFO] [loss_scaler.py:183:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 8192, reducing to 4096
47%|███████████████████████████████████████████████████████████████████████████████████████████████ | 8/17 [00:14<00:14, 1.65s/it][2023-11-27 09:32:03,747] [INFO] [loss_scaler.py:183:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 4096, reducing to 2048
{'loss': 2.6678, 'learning_rate': 9.157348061512727e-05, 'epoch': 0.58}
82%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌ | 14/17 [00:23<00:04, 1.60s/it][2023-11-27 09:32:13,249] [INFO] [loss_scaler.py:183:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 2048, reducing to 1024
100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 17/17 [00:28<00:00, 1.58s/it][INFO|trainer.py:1955] 2023-11-27 09:32:16,422 >>
Training completed. Do not forget to share your model on huggingface.co/models =)
{'train_runtime': 28.4502, 'train_samples_per_second': 76.871, 'train_steps_per_second': 0.598, 'train_loss': 2.150617375093348, 'epoch': 0.99}
100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 17/17 [00:28<00:00, 1.67s/it]
[INFO|configuration_utils.py:461] 2023-11-27 09:32:16,426 >> Configuration saved in ../../../sft_model/task_delegation_1127/sft_lora_model/config.json
[INFO|configuration_utils.py:564] 2023-11-27 09:32:16,426 >> Configuration saved in ../../../sft_model/task_delegation_1127/sft_lora_model/generation_config.json
[INFO|modeling_utils.py:2193] 2023-11-27 09:32:19,163 >> Model weights saved in ../../../sft_model/task_delegation_1127/sft_lora_model/pytorch_model.bin
[INFO|tokenization_utils_base.py:2428] 2023-11-27 09:32:19,164 >> tokenizer config file saved in ../../../sft_model/task_delegation_1127/sft_lora_model/tokenizer_config.json
[INFO|tokenization_utils_base.py:2437] 2023-11-27 09:32:19,164 >> Special tokens file saved in ../../../sft_model/task_delegation_1127/sft_lora_model/special_tokens_map.json
***** train metrics *****
epoch = 0.99
train_loss = 2.1506
train_runtime = 0:00:28.45
train_samples = 2187
train_samples_per_second = 76.871
train_steps_per_second = 0.598
[INFO|trainer.py:2881] 2023-11-27 09:32:26,261 >> Saving model checkpoint to ../../../sft_model/task_delegation_1127
[INFO|configuration_utils.py:461] 2023-11-27 09:32:26,262 >> Configuration saved in ../../../sft_model/task_delegation_1127/config.json
[INFO|configuration_utils.py:564] 2023-11-27 09:32:26,262 >> Configuration saved in ../../../sft_model/task_delegation_1127/generation_config.json
[INFO|modeling_utils.py:2201] 2023-11-27 09:32:41,363 >> The model is bigger than the maximum size per checkpoint (5GB) and is going to be split in 3 checkpoint shards. You can find where each parameters has been saved in the index located at ../../../sft_model/task_delegation_1127/model.safetensors.index.json.
[INFO|tokenization_utils_base.py:2428] 2023-11-27 09:32:41,364 >> tokenizer config file saved in ../../../sft_model/task_delegation_1127/tokenizer_config.json
[INFO|tokenization_utils_base.py:2437] 2023-11-27 09:32:41,364 >> Special tokens file saved in ../../../sft_model/task_delegation_1127/special_tokens_map.json
11/27/2023 09:32:41 - INFO - __main__ - *** Evaluate ***
[INFO|trainer.py:3158] 2023-11-27 09:32:41,868 >> ***** Running Evaluation *****
[INFO|trainer.py:3160] 2023-11-27 09:32:41,868 >> Num examples = 3
[INFO|trainer.py:3163] 2023-11-27 09:32:41,868 >> Batch size = 1
100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 132.25it/s]
***** eval metrics *****
epoch = 0.99
eval_loss = 1.4848
eval_runtime = 0:00:00.02
eval_samples = 3
eval_samples_per_second = 110.788
eval_steps_per_second = 36.929
perplexity = 4.4143
提交前必须检查以下项目
问题类型
None
基础模型
None
操作系统
None
详细描述问题
大大你們好，使用你們最新的代碼來全量微調 7B 模型。看了代碼後，發現應該只要在 `run_sft.sh` 檔最後面加入 `--full_finetuning True` 即可；但加入後發現並沒有儲存最終訓練的模型檔，所以在 `run_clm_sft_with_peft.py` 的 training 步驟加入 `trainer.save_model()`。但發現最後儲存的是 safetensors 格式的檔案，而不是以往的 `.bin` 檔案，想請問該怎麼解決？謝謝您們！！
依赖情况(代码类问题务必提供)
运行日志或截图