LWL-cpu opened this issue 7 months ago
Hi, I was in the middle of finals season recently, so apologies for the late reply! Is this the problem you hit when running bash ./scripts/train_kairos.sh? I just tried it again and it trains normally and produces a checkpoint, so the issue you are seeing is most likely caused by mismatched versions of some dependencies. Please first make sure the versions of the following dependencies are correct:

Also, the correct order in which to run the bash scripts is: train_kairos.sh -> calibrate.sh -> test_kairos.sh. I will update the README accordingly. If the problem persists, please let me know. Thanks!
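As a quick sanity check on the dependency versions, the sketch below prints what is currently installed for the packages that appear in the traceback. The package names are taken from the traceback; the exact versions the repo expects are not listed in this thread, so compare the output against the project's requirements file.

```python
# Minimal sketch: report the installed versions of the packages that show up
# in the traceback. The required versions are not specified in this thread;
# compare the output against the repo's requirements.
import importlib.metadata as md

for pkg in ("torch", "pytorch-lightning", "transformers"):
    try:
        print(f"{pkg}=={md.version(pkg)}")
    except md.PackageNotFoundError:
        print(f"{pkg} is not installed")
```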
I got this error when running the code. I ran it exactly as you provided; could you tell me where the problem might be?
File "/home/nlp/lwl/project/NLP/simple_to_complex/train.py", line 140, in
main()
File "/home/nlp/lwl/project/NLP/simple_to_complex/train.py", line 136, in main
trainer.fit(model, datamodule=dm)
File "/home/nlp/anaconda3/envs/simple_EAE/lib/python3.9/site-packages/pytorch_lightning/trainer/trainer.py", line 444, in fit
results = self.accelerator_backend.train()
File "/home/nlp/anaconda3/envs/simple_EAE/lib/python3.9/site-packages/pytorch_lightning/accelerators/gpu_accelerator.py", line 63, in train
results = self.train_or_test()
File "/home/nlp/anaconda3/envs/simple_EAE/lib/python3.9/site-packages/pytorch_lightning/accelerators/accelerator.py", line 74, in train_or_test
results = self.trainer.train()
File "/home/nlp/anaconda3/envs/simple_EAE/lib/python3.9/site-packages/pytorch_lightning/trainer/trainer.py", line 493, in train
self.train_loop.run_training_epoch()
File "/home/nlp/anaconda3/envs/simple_EAE/lib/python3.9/site-packages/pytorch_lightning/trainer/training_loop.py", line 561, in run_training_epoch
batch_output = self.run_training_batch(batch, batch_idx, dataloader_idx)
File "/home/nlp/anaconda3/envs/simple_EAE/lib/python3.9/site-packages/pytorch_lightning/trainer/training_loop.py", line 728, in run_training_batch
self.optimizer_step(optimizer, opt_idx, batch_idx, train_step_and_backward_closure)
File "/home/nlp/anaconda3/envs/simple_EAE/lib/python3.9/site-packages/pytorch_lightning/trainer/training_loop.py", line 469, in optimizer_step
self.trainer.accelerator_backend.optimizer_step(
File "/home/nlp/anaconda3/envs/simple_EAE/lib/python3.9/site-packages/pytorch_lightning/accelerators/accelerator.py", line 122, in optimizer_step
model_ref.optimizer_step(
File "/home/nlp/anaconda3/envs/simple_EAE/lib/python3.9/site-packages/pytorch_lightning/core/lightning.py", line 1270, in optimizer_step
optimizer.step(closure=optimizer_closure)
File "/home/nlp/anaconda3/envs/simple_EAE/lib/python3.9/site-packages/torch/optim/lr_scheduler.py", line 75, in wrapper
return wrapped(*args, **kwargs)
File "/home/nlp/anaconda3/envs/simple_EAE/lib/python3.9/site-packages/torch/optim/optimizer.py", line 385, in wrapper
out = func(*args, **kwargs)
File "/home/nlp/anaconda3/envs/simple_EAE/lib/python3.9/site-packages/torch/utils/_contextlib.py", line 115, in decorate_context
return func(*args, **kwargs)
File "/home/nlp/anaconda3/envs/simple_EAE/lib/python3.9/site-packages/transformers/optimization.py", line 549, in step
loss = closure()
File "/home/nlp/anaconda3/envs/simple_EAE/lib/python3.9/site-packages/pytorch_lightning/trainer/training_loop.py", line 718, in train_step_and_backward_closure
result = self.training_step_and_backward(
File "/home/nlp/anaconda3/envs/simple_EAE/lib/python3.9/site-packages/pytorch_lightning/trainer/training_loop.py", line 823, in training_step_and_backward
self.backward(result, optimizer, opt_idx)
File "/home/nlp/anaconda3/envs/simple_EAE/lib/python3.9/site-packages/pytorch_lightning/trainer/training_loop.py", line 843, in backward
result.closure_loss = self.trainer.accelerator_backend.backward(
File "/home/nlp/anaconda3/envs/simple_EAE/lib/python3.9/site-packages/pytorch_lightning/accelerators/accelerator.py", line 103, in backward
model.backward(closure_loss, optimizer, opt_idx, *args, **kwargs)
File "/home/nlp/anaconda3/envs/simple_EAE/lib/python3.9/site-packages/pytorch_lightning/core/lightning.py", line 1152, in backward
loss.backward(*args, **kwargs)
File "/home/nlp/anaconda3/envs/simple_EAE/lib/python3.9/site-packages/torch/_tensor.py", line 522, in backward
torch.autograd.backward(
File "/home/nlp/anaconda3/envs/simple_EAE/lib/python3.9/site-packages/torch/autograd/init.py", line 266, in backward
Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass
RuntimeError: element 0 of tensors does not require grad and does not have a grad_fn
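For context, this RuntimeError is raised when the tensor handed to loss.backward() was built without autograd tracking, i.e. it has requires_grad=False and no grad_fn. That typically happens when the loss is computed under torch.no_grad() or gets detached somewhere in the training step, which is consistent with the version-mismatch explanation above. A minimal standalone reproduction of the same error (illustrative only, not code from this repo):

```python
import torch

x = torch.randn(3, requires_grad=True)

# Building the loss inside no_grad() detaches it from the autograd graph,
# so the resulting tensor has no grad_fn ...
with torch.no_grad():
    loss = (x * 2).sum()

# ... and backward() raises:
# RuntimeError: element 0 of tensors does not require grad and does not have a grad_fn
loss.backward()
```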