Resuming training from a checkpoint with the auto-resume option doesn't work for all4one. I tried with simclr, and it works. I'm using version 1.0.7, to which I manually added all4one (without moving to Lightning).
Error executing job with overrides: []
Traceback (most recent call last):
File "/leonardo_scratch/large/userexternal/tceccone/solo-learn-radio/main_pretrain_custom.py", line 157, in main
trainer.fit(model, train_loader, val_loader, ckpt_path=ckpt_path)
File "/leonardo_scratch/large/userexternal/tceccone/radiovenv/lib/python3.10/site-packages/pytorch_lightning/trainer/trainer.py", line 532, in fit
call._call_and_handle_interrupt(
File "/leonardo_scratch/large/userexternal/tceccone/radiovenv/lib/python3.10/site-packages/pytorch_lightning/trainer/call.py", line 42, in _call_and_handle_interrupt
return trainer.strategy.launcher.launch(trainer_fn, *args, trainer=trainer, **kwargs)
File "/leonardo_scratch/large/userexternal/tceccone/radiovenv/lib/python3.10/site-packages/pytorch_lightning/strategies/launchers/subprocess_script.py", line 93, in launch
return function(*args, **kwargs)
File "/leonardo_scratch/large/userexternal/tceccone/radiovenv/lib/python3.10/site-packages/pytorch_lightning/trainer/trainer.py", line 571, in _fit_impl
self._run(model, ckpt_path=ckpt_path)
File "/leonardo_scratch/large/userexternal/tceccone/radiovenv/lib/python3.10/site-packages/pytorch_lightning/trainer/trainer.py", line 946, in _run
self._checkpoint_connector._restore_modules_and_callbacks(ckpt_path)
File "/leonardo_scratch/large/userexternal/tceccone/radiovenv/lib/python3.10/site-packages/pytorch_lightning/trainer/connectors/checkpoint_connector.py", line 400, in _restore_modules_and_callbacks
self.restore_model()
File "/leonardo_scratch/large/userexternal/tceccone/radiovenv/lib/python3.10/site-packages/pytorch_lightning/trainer/connectors/checkpoint_connector.py", line 280, in restore_model
trainer.strategy.load_model_state_dict(self._loaded_checkpoint)
File "/leonardo_scratch/large/userexternal/tceccone/radiovenv/lib/python3.10/site-packages/pytorch_lightning/strategies/strategy.py", line 364, in load_model_state_dict
self.lightning_module.load_state_dict(checkpoint["state_dict"])
File "/leonardo_scratch/large/userexternal/tceccone/radiovenv/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1671, in load_state_dict
raise RuntimeError('Error(s) in loading state_dict for {}:\n\t{}'.format(
RuntimeError: Error(s) in loading state_dict for All4One:
Unexpected key(s) in state_dict: "pos_enc.penc.penc.cached_penc".
Resuming training from a checkpoint with the auto-resume option doesn't work for all4one. I tried with simclr, and it works. I'm using version 1.0.7, to which I manually added all4one (without moving to Lightning).
Error executing job with overrides: [] Traceback (most recent call last): File "/leonardo_scratch/large/userexternal/tceccone/solo-learn-radio/main_pretrain_custom.py", line 157, in main trainer.fit(model, train_loader, val_loader, ckpt_path=ckpt_path) File "/leonardo_scratch/large/userexternal/tceccone/radiovenv/lib/python3.10/site-packages/pytorch_lightning/trainer/trainer.py", line 532, in fit call._call_and_handle_interrupt( File "/leonardo_scratch/large/userexternal/tceccone/radiovenv/lib/python3.10/site-packages/pytorch_lightning/trainer/call.py", line 42, in _call_and_handle_interrupt return trainer.strategy.launcher.launch(trainer_fn, *args, trainer=trainer, *kwargs) File "/leonardo_scratch/large/userexternal/tceccone/radiovenv/lib/python3.10/site-packages/pytorch_lightning/strategies/launchers/subprocess_script.py", line 93, in launch return function(args, **kwargs) File "/leonardo_scratch/large/userexternal/tceccone/radiovenv/lib/python3.10/site-packages/pytorch_lightning/trainer/trainer.py", line 571, in _fit_impl self._run(model, ckpt_path=ckpt_path) File "/leonardo_scratch/large/userexternal/tceccone/radiovenv/lib/python3.10/site-packages/pytorch_lightning/trainer/trainer.py", line 946, in _run self._checkpoint_connector._restore_modules_and_callbacks(ckpt_path) File "/leonardo_scratch/large/userexternal/tceccone/radiovenv/lib/python3.10/site-packages/pytorch_lightning/trainer/connectors/checkpoint_connector.py", line 400, in _restore_modules_and_callbacks self.restore_model() File "/leonardo_scratch/large/userexternal/tceccone/radiovenv/lib/python3.10/site-packages/pytorch_lightning/trainer/connectors/checkpoint_connector.py", line 280, in restore_model trainer.strategy.load_model_state_dict(self._loaded_checkpoint) File "/leonardo_scratch/large/userexternal/tceccone/radiovenv/lib/python3.10/site-packages/pytorch_lightning/strategies/strategy.py", line 364, in load_model_state_dict self.lightning_module.load_state_dict(checkpoint["state_dict"]) File 
"/leonardo_scratch/large/userexternal/tceccone/radiovenv/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1671, in load_state_dict raise RuntimeError('Error(s) in loading state_dict for {}:\n\t{}'.format( RuntimeError: Error(s) in loading state_dict for All4One: Unexpected key(s) in state_dict: "pos_enc.penc.penc.cached_penc".