huggingface / transformers

🤗 Transformers: State-of-the-art Machine Learning for Pytorch, TensorFlow, and JAX.
https://huggingface.co/transformers
Apache License 2.0

ValueError: No valid checkpoint found in output directory #15341

Closed · xiaolang01 closed this issue 2 years ago

xiaolang01 commented 2 years ago

```python
import sys
sys.path.append("./NeZha_Chinese_PyTorch-main/")

from transformers import BertTokenizer, WEIGHTS_NAME, TrainingArguments
from model.modeling_nezha import NeZhaForMaskedLM
from model.configuration_nezha import NeZhaConfig
from transformers import (
    DataCollatorForLanguageModeling,
    Trainer,
    TrainingArguments,
    LineByLineTextDataset,
)

tokenizer = BertTokenizer(vocab_file='./vocab.txt', do_lower_case=False, do_basic_tokenize=False)
tokenizer = BertTokenizer.from_pretrained('./vocab.txt', do_lower_case=False, do_basic_tokenize=False)

model_path = './nezha-cn-base/'
config = NeZhaConfig.from_pretrained(model_path)
model = NeZhaForMaskedLM.from_pretrained(model_path, config=config)
# model.resize_token_embeddings(len(tokenizer))
train_dataset = LineByLineTextDataset(
    tokenizer=tokenizer,
    file_path='../data/bert_data/mlm_data/train.txt',
    block_size=128,
)

# Data collator for the MLM objective
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=True, mlm_probability=0.15)

# Training arguments
pretrain_batch_size = 64
num_train_epochs = 300
training_args = TrainingArguments(
    output_dir='./outputs/',
    overwrite_output_dir=True,
    num_train_epochs=num_train_epochs,
    learning_rate=6e-5,
    per_device_train_batch_size=pretrain_batch_size,
    save_steps=10000,
    save_total_limit=10,
)

# Train the model through the Trainer API
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset,
)
trainer.train(True)
```

```
ValueError                                Traceback (most recent call last)
<ipython-input> in <module>
     29 trainer = Trainer(
     30     model=model, args=training_args, data_collator=data_collator, train_dataset=train_dataset)
---> 31 trainer.train(True)

D:\Program Files (x86)\Anconda3\lib\site-packages\transformers\trainer.py in train(self, resume_from_checkpoint, trial, ignore_keys_for_eval, **kwargs)
   1088             resume_from_checkpoint = get_last_checkpoint(args.output_dir)
   1089             if resume_from_checkpoint is None:
-> 1090                 raise ValueError(f"No valid checkpoint found in output directory ({args.output_dir})")
   1091 
   1092         if resume_from_checkpoint is not None:

ValueError: No valid checkpoint found in output directory (./outputs/)
```
github-actions[bot] commented 2 years ago

This issue has been automatically marked as stale because it has not had recent activity. If you think this still needs to be addressed please comment on this thread.

Please note that issues that do not follow the contributing guidelines are likely to be ignored.

wallon-ai commented 2 years ago

I got the same error. But I do not know how to fix it.

xiaolang01 commented 2 years ago

Change `trainer.train(True)` to `trainer.train()` and it works.
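
The first positional argument of `Trainer.train` is `resume_from_checkpoint`, so `trainer.train(True)` asks the `Trainer` to resume from the most recent `checkpoint-*` folder inside `output_dir`; on a fresh run no such folder exists yet, which is exactly the `ValueError` above. A minimal sketch of the fix, reusing the objects from the snippet in the first post:

```python
# Fresh pre-training run: do not ask the Trainer to resume from a checkpoint.
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset,
)
trainer.train()  # same as trainer.train(resume_from_checkpoint=None)

# To resume a later run, pass an existing checkpoint directory instead
# (hypothetical path; it only exists after save_steps has produced one):
# trainer.train(resume_from_checkpoint="./outputs/checkpoint-10000")
```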

github-actions[bot] commented 2 years ago

This issue has been automatically marked as stale because it has not had recent activity. If you think this still needs to be addressed please comment on this thread.

Please note that issues that do not follow the contributing guidelines are likely to be ignored.