Open · Lihwnlp opened 3 months ago
I ran into a similar situation. After adding resume_from_checkpoint and running, the log printed `Checkpoint /home/a/Desktop/Llama3-Chinese-8B-Lora-train2/checkpoint-40/adapter_model.bin not found`, and indeed there is no pytorch_model.bin or adapter_model.bin in the checkpoint folder, so training simply started over from scratch.

```python
if training_args.do_train:
    checkpoint = None
    if training_args.resume_from_checkpoint is not None:
        resume_from_checkpoint = training_args.resume_from_checkpoint
        checkpoint_name = os.path.join(resume_from_checkpoint, "pytorch_model.bin")
        if not os.path.exists(checkpoint_name):
            checkpoint_name = os.path.join(
                resume_from_checkpoint, "adapter_model.bin"
            )  # only LoRA model - LoRA config above has to fit
            resume_from_checkpoint = (
                False  # So the trainer won't try loading its state
            )
        if os.path.exists(checkpoint_name):
            print(f"Restarting from {checkpoint_name}")
            adapters_weights = torch.load(checkpoint_name)
            set_peft_model_state_dict(model, adapters_weights)
        else:
            print(f"Checkpoint {checkpoint_name} not found")
            # checkpoint = Fa
    elif last_checkpoint is not None:
        checkpoint = last_checkpoint
    if torch.__version__ >= "2" and sys.platform != "win32":
        model = torch.compile(model)
    train_result = trainer.train(resume_from_checkpoint=checkpoint)
```

finetune_clm_lora.py[627:653]. This looks like a bug: when neither weight file exists, `checkpoint` stays `None`, so `trainer.train(resume_from_checkpoint=None)` silently starts from step 0.
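One plausible cause of the missing file (my assumption, not confirmed in this thread): newer peft versions save adapters as adapter_model.safetensors by default, so a check that only looks for the two .bin names always falls through. A minimal sketch that also tries the safetensors name; `load_adapter_if_present` is a hypothetical helper, not part of the repo:

```python
import os

import torch
from peft import set_peft_model_state_dict
from safetensors.torch import load_file


def load_adapter_if_present(model, checkpoint_dir):
    """Try each name a LoRA checkpoint may use; which one exists depends
    on the peft version and whether safe serialization was enabled."""
    for name in ("adapter_model.safetensors", "adapter_model.bin", "pytorch_model.bin"):
        path = os.path.join(checkpoint_dir, name)
        if os.path.exists(path):
            print(f"Restarting from {path}")
            if name.endswith(".safetensors"):
                weights = load_file(path)
            else:
                weights = torch.load(path, map_location="cpu")
            set_peft_model_state_dict(model, weights)
            return True
    print(f"No adapter weights found in {checkpoint_dir}")
    return False
```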
I gave it a try, borrowing the resume logic from finetune_clm.py, and it seems to work. Give it a shot:

```python
if training_args.do_train:
    checkpoint = None
    '''if training_args.resume_from_checkpoint is not None:
        resume_from_checkpoint = training_args.resume_from_checkpoint
        checkpoint_name = os.path.join(resume_from_checkpoint, "pytorch_model.bin")
        if not os.path.exists(checkpoint_name):
            checkpoint_name = os.path.join(
                resume_from_checkpoint, "adapter_model.bin"
            )  # only LoRA model - LoRA config above has to fit
            resume_from_checkpoint = (
                False  # So the trainer won't try loading its state
            )
        # The two files above have a different name depending on how they were saved, but are actually the same.
        if os.path.exists(checkpoint_name):
            print(f"Restarting from {checkpoint_name}")
            adapters_weights = torch.load(checkpoint_name)
            set_peft_model_state_dict(model, adapters_weights)
        else:
            print(f"Checkpoint {checkpoint_name} not found")
            # checkpoint = Fa'''
    if training_args.resume_from_checkpoint is not None:
        checkpoint = training_args.resume_from_checkpoint
    elif last_checkpoint is not None:
        checkpoint = last_checkpoint
```
How do I resume training from the last checkpoint? My last checkpoint is checkpoint-585, but even after adding `--resume_from_checkpoint ${output_model}/checkpoint-585`, training still starts from scratch (it starts over whether or not I pass the flag) 😵. Here is my script:

```bash
deepspeed --include localhost:0,1,2,3 finetune_clm_lora.py \
    --model_name_or_path /HOME/ \
    --train_files /HOME/ \
    --validation_files /HOME/ \
    --output_dir /HOME/ \
    --per_device_train_batch_size 16 \
    --per_device_eval_batch_size 16 \
    --do_train \
    --do_eval \
    --use_fast_tokenizer false \
    --output_dir ${output_model} \
    --evaluation_strategy steps \
    --max_eval_samples 800 \
    --learning_rate 2.0e-4 \
    --gradient_accumulation_steps 8 \
    --num_train_epochs 5 \
    --warmup_steps 0 \
    --load_in_bits 4 \
    --lora_r 8 \
    --lora_alpha 32 \
    --target_modules q_proj,k_proj,v_proj,o_proj,down_proj,gate_proj,up_proj \
    --logging_dir ${output_model}/logs \
    --logging_strategy steps \
    --logging_steps 500 \
    --save_strategy epoch \
    --preprocessing_num_workers 16 \
    --save_steps 500 \
    --eval_steps 500 \
    --save_total_limit 2000 \
    --seed 42 \
    --disable_tqdm false \
    --ddp_find_unused_parameters false \
    --block_size 2048 \
    --report_to tensorboard \
    --overwrite_output_dir \
    --deepspeed ds_config_zero2.json \
    --ignore_data_skip true \
    --fp16 \
    --gradient_checkpointing \
    --fp16_full_eval \
    --ddp_timeout 18000000 \
    --resume_from_checkpoint ${output_model}/checkpoint-585
```
Below is the log after adding resume_from_checkpoint:

```
[INFO|trainer.py:1969] 2024-07-22 09:33:14,086 >> Running training
[INFO|trainer.py:1970] 2024-07-22 09:33:14,086 >> Num examples = 100,000
[INFO|trainer.py:1971] 2024-07-22 09:33:14,086 >> Num Epochs = 3
[INFO|trainer.py:1972] 2024-07-22 09:33:14,086 >> Instantaneous batch size per device = 16
[INFO|trainer.py:1975] 2024-07-22 09:33:14,086 >> Total train batch size (w. parallel, distributed & accumulation) = 512
[INFO|trainer.py:1976] 2024-07-22 09:33:14,086 >> Gradient Accumulation steps = 8
[INFO|trainer.py:1977] 2024-07-22 09:33:14,086 >> Total optimization steps = 585
[INFO|trainer.py:1978] 2024-07-22 09:33:14,090 >> Number of trainable parameters = 20,971,520
  0%|          | 0/585 [00:00<?, ?it/s]
```
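If it helps with debugging: tracing the snippet quoted at the top of the thread reproduces exactly this behavior. A sketch, assuming a deepspeed/LoRA checkpoint directory that contains neither pytorch_model.bin nor adapter_model.bin:

```python
import os

resume_from_checkpoint = "checkpoint-585"  # what --resume_from_checkpoint passes in
checkpoint = None

name = os.path.join(resume_from_checkpoint, "pytorch_model.bin")
if not os.path.exists(name):
    name = os.path.join(resume_from_checkpoint, "adapter_model.bin")
if not os.path.exists(name):
    print(f"Checkpoint {name} not found")  # the message reported earlier in the thread
# `checkpoint` is still None at this point, so trainer.train(resume_from_checkpoint=None)
# starts at step 0, which matches the 0/585 progress bar in the log above. A real
# resume would pick up from the global_step stored in checkpoint-585/trainer_state.json.
```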