当运行 `sh train.sh pre_train.py` 时,我使用 8 张 GPU 运行脚本,但出现以下错误:
Saving model checkpoint to ./model_save/pre/tmp-checkpoint-50
Configuration saved in ./model_save/pre/tmp-checkpoint-50/config.json
Configuration saved in ./model_save/pre/tmp-checkpoint-50/generation_config.json
Traceback (most recent call last):
File "/home/ubuntu/XZT/LLM/LLM_Project/MINI_LLM-main/pre_train.py", line 281, in <module>
Traceback (most recent call last):
File "/home/ubuntu/XZT/LLM/LLM_Project/MINI_LLM-main/pre_train.py", line 281, in <module>
trainer.train( #'model_save/pre/checkpoint-3400'
File "/home/ubuntu/anaconda3/envs/Fun_LLM/lib/python3.9/site-packages/transformers/trainer.py", line 1537, in train
trainer.train( #'model_save/pre/checkpoint-3400'
File "/home/ubuntu/anaconda3/envs/Fun_LLM/lib/python3.9/site-packages/transformers/trainer.py", line 1537, in train
return inner_training_loop(
File "/home/ubuntu/anaconda3/envs/Fun_LLM/lib/python3.9/site-packages/transformers/trainer.py", line 1914, in _inner_training_loop
return inner_training_loop(
File "/home/ubuntu/anaconda3/envs/Fun_LLM/lib/python3.9/site-packages/transformers/trainer.py", line 1914, in _inner_training_loop
self._maybe_log_save_evaluate(tr_loss, model, trial, epoch, ignore_keys_for_eval)
File "/home/ubuntu/anaconda3/envs/Fun_LLM/lib/python3.9/site-packages/transformers/trainer.py", line 2274, in _maybe_log_save_evaluate
self._maybe_log_save_evaluate(tr_loss, model, trial, epoch, ignore_keys_for_eval)
File "/home/ubuntu/anaconda3/envs/Fun_LLM/lib/python3.9/site-packages/transformers/trainer.py", line 2274, in _maybe_log_save_evaluate
self._save_checkpoint(model, trial, metrics=metrics)
File "/home/ubuntu/XZT/LLM/LLM_Project/MINI_LLM-main/pre_train.py", line 259, in _save_checkpoint
super()._save_checkpoint(model, trial, metrics)
File "/home/ubuntu/anaconda3/envs/Fun_LLM/lib/python3.9/site-packages/transformers/trainer.py", line 2383, in _save_checkpoint
self._save_checkpoint(model, trial, metrics=metrics)
File "/home/ubuntu/XZT/LLM/LLM_Project/MINI_LLM-main/pre_train.py", line 259, in _save_checkpoint
super()._save_checkpoint(model, trial, metrics)
File "/home/ubuntu/anaconda3/envs/Fun_LLM/lib/python3.9/site-packages/transformers/trainer.py", line 2383, in _save_checkpoint
os.rename(staging_output_dir, output_dir)
FileNotFoundError: [Errno 2] No such file or directory: './model_save/pre/tmp-checkpoint-50' -> './model_save/pre/checkpoint-50'
os.rename(staging_output_dir, output_dir)
FileNotFoundError: [Errno 2] No such file or directory: './model_save/pre/tmp-checkpoint-50' -> './model_save/pre/checkpoint-50'
Traceback (most recent call last):
File "/home/ubuntu/XZT/LLM/LLM_Project/MINI_LLM-main/pre_train.py", line 281, in <module>
trainer.train( #'model_save/pre/checkpoint-3400'
File "/home/ubuntu/anaconda3/envs/Fun_LLM/lib/python3.9/site-packages/transformers/trainer.py", line 1537, in train
return inner_training_loop(
File "/home/ubuntu/anaconda3/envs/Fun_LLM/lib/python3.9/site-packages/transformers/trainer.py", line 1914, in _inner_training_loop
self._maybe_log_save_evaluate(tr_loss, model, trial, epoch, ignore_keys_for_eval)
File "/home/ubuntu/anaconda3/envs/Fun_LLM/lib/python3.9/site-packages/transformers/trainer.py", line 2274, in _maybe_log_save_evaluate
self._save_checkpoint(model, trial, metrics=metrics)
File "/home/ubuntu/XZT/LLM/LLM_Project/MINI_LLM-main/pre_train.py", line 259, in _save_checkpoint
super()._save_checkpoint(model, trial, metrics)
File "/home/ubuntu/anaconda3/envs/Fun_LLM/lib/python3.9/site-packages/transformers/trainer.py", line 2350, in _save_checkpoint
self.save_model(staging_output_dir, _internal_call=True)
File "/home/ubuntu/anaconda3/envs/Fun_LLM/lib/python3.9/site-packages/transformers/trainer.py", line 2837, in save_model
self._save(output_dir)
File "/home/ubuntu/anaconda3/envs/Fun_LLM/lib/python3.9/site-packages/transformers/trainer.py", line 2897, in _save
Traceback (most recent call last):
File "/home/ubuntu/XZT/LLM/LLM_Project/MINI_LLM-main/pre_train.py", line 281, in <module>
self.model.save_pretrained(
File "/home/ubuntu/anaconda3/envs/Fun_LLM/lib/python3.9/site-packages/transformers/modeling_utils.py", line 2352, in save_pretrained
trainer.train( #'model_save/pre/checkpoint-3400'
File "/home/ubuntu/anaconda3/envs/Fun_LLM/lib/python3.9/site-packages/transformers/trainer.py", line 1537, in train
for filename in os.listdir(save_directory):
FileNotFoundError: [Errno 2] No such file or directory: './model_save/pre/tmp-checkpoint-50'
return inner_training_loop(
File "/home/ubuntu/anaconda3/envs/Fun_LLM/lib/python3.9/site-packages/transformers/trainer.py", line 1914, in _inner_training_loop
self._maybe_log_save_evaluate(tr_loss, model, trial, epoch, ignore_keys_for_eval)
File "/home/ubuntu/anaconda3/envs/Fun_LLM/lib/python3.9/site-packages/transformers/trainer.py", line 2274, in _maybe_log_save_evaluate
self._save_checkpoint(model, trial, metrics=metrics)
File "/home/ubuntu/XZT/LLM/LLM_Project/MINI_LLM-main/pre_train.py", line 259, in _save_checkpoint
super()._save_checkpoint(model, trial, metrics)
File "/home/ubuntu/anaconda3/envs/Fun_LLM/lib/python3.9/site-packages/transformers/trainer.py", line 2383, in _save_checkpoint
os.rename(staging_output_dir, output_dir)
FileNotFoundError: [Errno 2] No such file or directory: './model_save/pre/tmp-checkpoint-50' -> './model_save/pre/checkpoint-50'
Traceback (most recent call last):
File "/home/ubuntu/XZT/LLM/LLM_Project/MINI_LLM-main/pre_train.py", line 281, in <module>
trainer.train( #'model_save/pre/checkpoint-3400'
File "/home/ubuntu/anaconda3/envs/Fun_LLM/lib/python3.9/site-packages/transformers/trainer.py", line 1537, in train
return inner_training_loop(
File "/home/ubuntu/anaconda3/envs/Fun_LLM/lib/python3.9/site-packages/transformers/trainer.py", line 1914, in _inner_training_loop
self._maybe_log_save_evaluate(tr_loss, model, trial, epoch, ignore_keys_for_eval)
File "/home/ubuntu/anaconda3/envs/Fun_LLM/lib/python3.9/site-packages/transformers/trainer.py", line 2274, in _maybe_log_save_evaluate
self._save_checkpoint(model, trial, metrics=metrics)
File "/home/ubuntu/XZT/LLM/LLM_Project/MINI_LLM-main/pre_train.py", line 259, in _save_checkpoint
super()._save_checkpoint(model, trial, metrics)
File "/home/ubuntu/anaconda3/envs/Fun_LLM/lib/python3.9/site-packages/transformers/trainer.py", line 2383, in _save_checkpoint
os.rename(staging_output_dir, output_dir)
当运行 `sh train.sh pre_train.py` 时,我使用 8 张 GPU 运行脚本,但出现以下错误:
Saving model checkpoint to ./model_save/pre/tmp-checkpoint-50
Configuration saved in ./model_save/pre/tmp-checkpoint-50/config.json
Configuration saved in ./model_save/pre/tmp-checkpoint-50/generation_config.json
Traceback (most recent call last):
File "/home/ubuntu/XZT/LLM/LLM_Project/MINI_LLM-main/pre_train.py", line 281, in <module>
Traceback (most recent call last):
File "/home/ubuntu/XZT/LLM/LLM_Project/MINI_LLM-main/pre_train.py", line 281, in <module>
trainer.train( #'model_save/pre/checkpoint-3400'
File "/home/ubuntu/anaconda3/envs/Fun_LLM/lib/python3.9/site-packages/transformers/trainer.py", line 1537, in train
trainer.train( #'model_save/pre/checkpoint-3400'
File "/home/ubuntu/anaconda3/envs/Fun_LLM/lib/python3.9/site-packages/transformers/trainer.py", line 1537, in train
return inner_training_loop(
File "/home/ubuntu/anaconda3/envs/Fun_LLM/lib/python3.9/site-packages/transformers/trainer.py", line 1914, in _inner_training_loop
return inner_training_loop(
File "/home/ubuntu/anaconda3/envs/Fun_LLM/lib/python3.9/site-packages/transformers/trainer.py", line 1914, in _inner_training_loop
self._maybe_log_save_evaluate(tr_loss, model, trial, epoch, ignore_keys_for_eval)
File "/home/ubuntu/anaconda3/envs/Fun_LLM/lib/python3.9/site-packages/transformers/trainer.py", line 2274, in _maybe_log_save_evaluate
self._maybe_log_save_evaluate(tr_loss, model, trial, epoch, ignore_keys_for_eval)
File "/home/ubuntu/anaconda3/envs/Fun_LLM/lib/python3.9/site-packages/transformers/trainer.py", line 2274, in _maybe_log_save_evaluate
self._save_checkpoint(model, trial, metrics=metrics)
File "/home/ubuntu/XZT/LLM/LLM_Project/MINI_LLM-main/pre_train.py", line 259, in _save_checkpoint
super()._save_checkpoint(model, trial, metrics)
File "/home/ubuntu/anaconda3/envs/Fun_LLM/lib/python3.9/site-packages/transformers/trainer.py", line 2383, in _save_checkpoint
self._save_checkpoint(model, trial, metrics=metrics)
File "/home/ubuntu/XZT/LLM/LLM_Project/MINI_LLM-main/pre_train.py", line 259, in _save_checkpoint
super()._save_checkpoint(model, trial, metrics)
File "/home/ubuntu/anaconda3/envs/Fun_LLM/lib/python3.9/site-packages/transformers/trainer.py", line 2383, in _save_checkpoint
os.rename(staging_output_dir, output_dir)
FileNotFoundError: [Errno 2] No such file or directory: './model_save/pre/tmp-checkpoint-50' -> './model_save/pre/checkpoint-50'
os.rename(staging_output_dir, output_dir)
FileNotFoundError: [Errno 2] No such file or directory: './model_save/pre/tmp-checkpoint-50' -> './model_save/pre/checkpoint-50'
Traceback (most recent call last):
File "/home/ubuntu/XZT/LLM/LLM_Project/MINI_LLM-main/pre_train.py", line 281, in <module>
trainer.train( #'model_save/pre/checkpoint-3400'
File "/home/ubuntu/anaconda3/envs/Fun_LLM/lib/python3.9/site-packages/transformers/trainer.py", line 1537, in train
return inner_training_loop(
File "/home/ubuntu/anaconda3/envs/Fun_LLM/lib/python3.9/site-packages/transformers/trainer.py", line 1914, in _inner_training_loop
self._maybe_log_save_evaluate(tr_loss, model, trial, epoch, ignore_keys_for_eval)
File "/home/ubuntu/anaconda3/envs/Fun_LLM/lib/python3.9/site-packages/transformers/trainer.py", line 2274, in _maybe_log_save_evaluate
self._save_checkpoint(model, trial, metrics=metrics)
File "/home/ubuntu/XZT/LLM/LLM_Project/MINI_LLM-main/pre_train.py", line 259, in _save_checkpoint
super()._save_checkpoint(model, trial, metrics)
File "/home/ubuntu/anaconda3/envs/Fun_LLM/lib/python3.9/site-packages/transformers/trainer.py", line 2350, in _save_checkpoint
self.save_model(staging_output_dir, _internal_call=True)
File "/home/ubuntu/anaconda3/envs/Fun_LLM/lib/python3.9/site-packages/transformers/trainer.py", line 2837, in save_model
self._save(output_dir)
File "/home/ubuntu/anaconda3/envs/Fun_LLM/lib/python3.9/site-packages/transformers/trainer.py", line 2897, in _save
Traceback (most recent call last):
File "/home/ubuntu/XZT/LLM/LLM_Project/MINI_LLM-main/pre_train.py", line 281, in <module>
self.model.save_pretrained(
File "/home/ubuntu/anaconda3/envs/Fun_LLM/lib/python3.9/site-packages/transformers/modeling_utils.py", line 2352, in save_pretrained
trainer.train( #'model_save/pre/checkpoint-3400'
File "/home/ubuntu/anaconda3/envs/Fun_LLM/lib/python3.9/site-packages/transformers/trainer.py", line 1537, in train
for filename in os.listdir(save_directory):
FileNotFoundError: [Errno 2] No such file or directory: './model_save/pre/tmp-checkpoint-50'
return inner_training_loop(
File "/home/ubuntu/anaconda3/envs/Fun_LLM/lib/python3.9/site-packages/transformers/trainer.py", line 1914, in _inner_training_loop
self._maybe_log_save_evaluate(tr_loss, model, trial, epoch, ignore_keys_for_eval)
File "/home/ubuntu/anaconda3/envs/Fun_LLM/lib/python3.9/site-packages/transformers/trainer.py", line 2274, in _maybe_log_save_evaluate
self._save_checkpoint(model, trial, metrics=metrics)
File "/home/ubuntu/XZT/LLM/LLM_Project/MINI_LLM-main/pre_train.py", line 259, in _save_checkpoint
super()._save_checkpoint(model, trial, metrics)
File "/home/ubuntu/anaconda3/envs/Fun_LLM/lib/python3.9/site-packages/transformers/trainer.py", line 2383, in _save_checkpoint
os.rename(staging_output_dir, output_dir)
FileNotFoundError: [Errno 2] No such file or directory: './model_save/pre/tmp-checkpoint-50' -> './model_save/pre/checkpoint-50'
Traceback (most recent call last):
File "/home/ubuntu/XZT/LLM/LLM_Project/MINI_LLM-main/pre_train.py", line 281, in <module>
trainer.train( #'model_save/pre/checkpoint-3400'
File "/home/ubuntu/anaconda3/envs/Fun_LLM/lib/python3.9/site-packages/transformers/trainer.py", line 1537, in train
return inner_training_loop(
File "/home/ubuntu/anaconda3/envs/Fun_LLM/lib/python3.9/site-packages/transformers/trainer.py", line 1914, in _inner_training_loop
self._maybe_log_save_evaluate(tr_loss, model, trial, epoch, ignore_keys_for_eval)
File "/home/ubuntu/anaconda3/envs/Fun_LLM/lib/python3.9/site-packages/transformers/trainer.py", line 2274, in _maybe_log_save_evaluate
self._save_checkpoint(model, trial, metrics=metrics)
File "/home/ubuntu/XZT/LLM/LLM_Project/MINI_LLM-main/pre_train.py", line 259, in _save_checkpoint
super()._save_checkpoint(model, trial, metrics)
File "/home/ubuntu/anaconda3/envs/Fun_LLM/lib/python3.9/site-packages/transformers/trainer.py", line 2383, in _save_checkpoint
os.rename(staging_output_dir, output_dir)