Thanks for your excellent work! I followed the README to train MaskPLS. However, I encountered the following error during the first iteration.
I have checked outputs, loss_dict, total_loss, and they all seem to be fine.
My environment uses torch==1.10.1+cu111, and the remaining packages are the same as in the requirements.txt file.
Could you please help me understand how this problem arose and how to solve it? Thanks in advance!
Traceback (most recent call last):
File "scripts/train_model.py", line 81, in <module>
File "/home/yangyu/envs/PS/lib/python3.8/site-packages/click/core.py", line 1130, in __call__
return self.main(*args, **kwargs)
File "/home/yangyu/envs/PS/lib/python3.8/site-packages/click/core.py", line 1055, in main
rv = self.invoke(ctx)
File "/home/yangyu/envs/PS/lib/python3.8/site-packages/click/core.py", line 1404, in invoke
return ctx.invoke(self.callback, **ctx.params)
File "/home/yangyu/envs/PS/lib/python3.8/site-packages/click/core.py", line 760, in invoke
return __callback(*args, **kwargs)
File "scripts/train_model.py", line 40, in main
"experiments/" + cfg.EXPERIMENT.ID, default_hp_metric=False
File "/home/yangyu/envs/PS/lib/python3.8/site-packages/pytorch_lightning/trainer/trainer.py", line 737, in fit
self._call_and_handle_interrupt(
File "/home/yangyu/envs/PS/lib/python3.8/site-packages/pytorch_lightning/trainer/trainer.py", line 682, in _call_and_handle_interrupt
return trainer_fn(*args, **kwargs)
File "/home/yangyu/envs/PS/lib/python3.8/site-packages/pytorch_lightning/trainer/trainer.py", line 772, in _fit_impl
self._run(model, ckpt_path=ckpt_path)
File "/home/yangyu/envs/PS/lib/python3.8/site-packages/pytorch_lightning/trainer/trainer.py", line 1194, in _run
self._dispatch()
File "/home/yangyu/envs/PS/lib/python3.8/site-packages/pytorch_lightning/trainer/trainer.py", line 1274, in _dispatch
self.training_type_plugin.start_training(self)
File "/home/yangyu/envs/PS/lib/python3.8/site-packages/pytorch_lightning/plugins/training_type/training_type_plugin.py", line 202, in start_training
self._results = trainer.run_stage()
File "/home/yangyu/envs/PS/lib/python3.8/site-packages/pytorch_lightning/trainer/trainer.py", line 1284, in run_stage
return self._run_train()
File "/home/yangyu/envs/PS/lib/python3.8/site-packages/pytorch_lightning/trainer/trainer.py", line 1314, in _run_train
self.fit_loop.run()
File "/home/yangyu/envs/PS/lib/python3.8/site-packages/pytorch_lightning/loops/base.py", line 145, in run
self.advance(*args, **kwargs)
File "/home/yangyu/envs/PS/lib/python3.8/site-packages/pytorch_lightning/loops/fit_loop.py", line 234, in advance
self.epoch_loop.run(data_fetcher)
File "/home/yangyu/envs/PS/lib/python3.8/site-packages/pytorch_lightning/loops/base.py", line 145, in run
self.advance(*args, **kwargs)
File "/home/yangyu/envs/PS/lib/python3.8/site-packages/pytorch_lightning/loops/epoch/training_epoch_loop.py", line 193, in advance
batch_output = self.batch_loop.run(batch, batch_idx)
File "/home/yangyu/envs/PS/lib/python3.8/site-packages/pytorch_lightning/loops/base.py", line 145, in run
self.advance(*args, **kwargs)
File "/home/yangyu/envs/PS/lib/python3.8/site-packages/pytorch_lightning/loops/batch/training_batch_loop.py", line 88, in advance
outputs = self.optimizer_loop.run(split_batch, optimizers, batch_idx)
File "/home/yangyu/envs/PS/lib/python3.8/site-packages/pytorch_lightning/loops/base.py", line 145, in run
self.advance(*args, **kwargs)
File "/home/yangyu/envs/PS/lib/python3.8/site-packages/pytorch_lightning/loops/optimization/optimizer_loop.py", line 215, in advance
result = self._run_optimization(
File "/home/yangyu/envs/PS/lib/python3.8/site-packages/pytorch_lightning/loops/optimization/optimizer_loop.py", line 259, in _run_optimization
closure()
File "/home/yangyu/envs/PS/lib/python3.8/site-packages/pytorch_lightning/loops/optimization/optimizer_loop.py", line 160, in __call__
self._result = self.closure(*args, **kwargs)
File "/home/yangyu/envs/PS/lib/python3.8/site-packages/pytorch_lightning/loops/optimization/optimizer_loop.py", line 155, in closure
self._backward_fn(step_output.closure_loss)
File "/home/yangyu/envs/PS/lib/python3.8/site-packages/pytorch_lightning/loops/optimization/optimizer_loop.py", line 327, in backward_fn
self.trainer.accelerator.backward(loss, optimizer, opt_idx)
File "/home/yangyu/envs/PS/lib/python3.8/site-packages/pytorch_lightning/accelerators/accelerator.py", line 311, in backward
self.precision_plugin.backward(self.lightning_module, closure_loss, *args, **kwargs)
File "/home/yangyu/envs/PS/lib/python3.8/site-packages/pytorch_lightning/plugins/precision/precision_plugin.py", line 91, in backward
model.backward(closure_loss, optimizer, *args, **kwargs)
File "/home/yangyu/envs/PS/lib/python3.8/site-packages/pytorch_lightning/core/lightning.py", line 1434, in backward
loss.backward(*args, **kwargs)
File "/home/yangyu/envs/PS/lib/python3.8/site-packages/torch/_tensor.py", line 307, in backward
torch.autograd.backward(self, gradient, retain_graph, create_graph, inputs=inputs)
File "/home/yangyu/envs/PS/lib/python3.8/site-packages/torch/autograd/__init__.py", line 154, in backward
Variable._execution_engine.run_backward(
RuntimeError: merge_sort: failed to synchronize: cudaErrorIllegalAddress: an illegal memory access was encountered
Hi!
Thanks for your excellent work! I followed the README to train MaskPLS. However, I encountered the following error during the first iteration. I have checked `outputs`, `loss_dict`, and `total_loss`, and they all seem to be fine. My environment is `torch==1.10.1+cu111`, and the rest are the same as in the requirements.txt file. Could you please help me understand how this problem arose and how to solve it? Thanks in advance!