When I run training on hi-mia, I get the following exception (note: I removed some speakers from the dataset):
~/repos/algo/app/aliSrc/aliEnv/lib/python3.8/site-packages/pytorch_lightning/trainer/trainer.py in fit(self, model, train_dataloader, val_dataloaders, datamodule)
456 )
457
--> 458 self._run(model)
459
460 assert self.state.stopped
~/repos/algo/app/aliSrc/aliEnv/lib/python3.8/site-packages/pytorch_lightning/trainer/trainer.py in _run(self, model)
754
755 # dispatch `start_training` or `start_evaluating` or `start_predicting`
--> 756 self.dispatch()
757
758 # plugin will finalized fitting (e.g. ddp_spawn will load trained model)
~/repos/algo/app/aliSrc/aliEnv/lib/python3.8/site-packages/pytorch_lightning/trainer/trainer.py in dispatch(self)
795 self.accelerator.start_predicting(self)
796 else:
--> 797 self.accelerator.start_training(self)
798
799 def run_stage(self):
~/repos/algo/app/aliSrc/aliEnv/lib/python3.8/site-packages/pytorch_lightning/accelerators/accelerator.py in start_training(self, trainer)
94
95 def start_training(self, trainer: 'pl.Trainer') -> None:
--> 96 self.training_type_plugin.start_training(trainer)
97
98 def start_evaluating(self, trainer: 'pl.Trainer') -> None:
~/repos/algo/app/aliSrc/aliEnv/lib/python3.8/site-packages/pytorch_lightning/plugins/training_type/training_type_plugin.py in start_training(self, trainer)
142 def start_training(self, trainer: 'pl.Trainer') -> None:
143 # double dispatch to initiate the training loop
--> 144 self._results = trainer.run_stage()
145
146 def start_evaluating(self, trainer: 'pl.Trainer') -> None:
~/repos/algo/app/aliSrc/aliEnv/lib/python3.8/site-packages/pytorch_lightning/trainer/trainer.py in run_stage(self)
805 if self.predicting:
806 return self.run_predict()
--> 807 return self.run_train()
808
809 def _pre_training_routine(self):
~/repos/algo/app/aliSrc/aliEnv/lib/python3.8/site-packages/pytorch_lightning/trainer/trainer.py in run_train(self)
867 with self.profiler.profile("run_training_epoch"):
868 # run train epoch
--> 869 self.train_loop.run_training_epoch()
870
871 if self.max_steps and self.max_steps <= self.global_step:
~/repos/algo/app/aliSrc/aliEnv/lib/python3.8/site-packages/pytorch_lightning/trainer/training_loop.py in run_training_epoch(self)
582 if should_check_val:
583 self.trainer.validating = True
--> 584 self.trainer.run_evaluation(on_epoch=True)
585 self.trainer.training = True
586
~/repos/algo/app/aliSrc/aliEnv/lib/python3.8/site-packages/pytorch_lightning/trainer/trainer.py in run_evaluation(self, on_epoch)
1004
1005 # hook
--> 1006 self.evaluation_loop.on_evaluation_end()
1007
1008 # log epoch metrics
~/repos/algo/app/aliSrc/aliEnv/lib/python3.8/site-packages/pytorch_lightning/trainer/evaluation_loop.py in on_evaluation_end(self, *args, **kwargs)
100 self.trainer.call_hook('on_test_end', *args, **kwargs)
101 else:
--> 102 self.trainer.call_hook('on_validation_end', *args, **kwargs)
103
104 if self.trainer.state.fn != TrainerFn.FITTING:
~/repos/algo/app/aliSrc/aliEnv/lib/python3.8/site-packages/pytorch_lightning/trainer/trainer.py in call_hook(self, hook_name, *args, **kwargs)
1221 if hasattr(self, hook_name):
1222 trainer_hook = getattr(self, hook_name)
--> 1223 trainer_hook(*args, **kwargs)
1224
1225 # next call hook in lightningModule
~/repos/algo/app/aliSrc/aliEnv/lib/python3.8/site-packages/pytorch_lightning/trainer/callback_hook.py in on_validation_end(self)
225 """Called when the validation loop ends."""
226 for callback in self.callbacks:
--> 227 callback.on_validation_end(self, self.lightning_module)
228
229 def on_test_start(self):
~/repos/algo/app/aliSrc/aliEnv/lib/python3.8/site-packages/pytorch_lightning/callbacks/model_checkpoint.py in on_validation_end(self, trainer, pl_module)
247 if skip:
248 return
--> 249 self.save_checkpoint(trainer)
250
251 def on_save_checkpoint(
~/repos/algo/app/aliSrc/aliEnv/lib/python3.8/site-packages/pytorch_lightning/callbacks/model_checkpoint.py in save_checkpoint(self, trainer, unused)
296 # here we call each mode sequentially
297 # Mode 1: save the top k checkpoints
--> 298 self._save_top_k_checkpoint(trainer, monitor_candidates)
299 # Mode 2: save monitor=None checkpoints
300 self._save_none_monitor_checkpoint(trainer, monitor_candidates)
~/repos/algo/app/aliSrc/aliEnv/lib/python3.8/site-packages/pytorch_lightning/callbacks/model_checkpoint.py in _save_top_k_checkpoint(self, trainer, monitor_candidates)
667
668 if self.check_monitor_top_k(trainer, current):
--> 669 self._update_best_and_save(current, trainer, monitor_candidates)
670 elif self.verbose:
671 epoch = monitor_candidates.get("epoch")
~/repos/algo/app/aliSrc/aliEnv/lib/python3.8/site-packages/pytorch_lightning/callbacks/model_checkpoint.py in _update_best_and_save(self, current, trainer, monitor_candidates)
718
719 _op = min if self.mode == "min" else max
--> 720 self.best_model_path = _op(self.best_k_models, key=self.best_k_models.get)
721 self.best_model_score = self.best_k_models[self.best_model_path]
722
RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cuda:0 and cpu!
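For reference, the failing line (720 in _update_best_and_save) picks the best checkpoint by comparing the scores stored in best_k_models, so I tried to reproduce that comparison in isolation. The snippet below is only my own sketch of what I think is happening; the checkpoint paths and score values are made up for illustration and are not taken from my run:

import torch

# Hypothetical contents of ModelCheckpoint.best_k_models: a dict mapping
# checkpoint paths to monitored scores, with the scores living on
# different devices (made-up values, not from my training run).
best_k_models = {
    "epoch=0.ckpt": torch.tensor(0.42, device="cuda:0"),  # score on the GPU
    "epoch=1.ckpt": torch.tensor(0.37),                   # score on the CPU
}

# Same pattern as the failing line with mode="min": min() compares the
# stored score tensors, which raises the device-mismatch error when they
# are not on the same device.
best_model_path = min(best_k_models, key=best_k_models.get)
# RuntimeError: Expected all tensors to be on the same device,
# but found at least two devices, cuda:0 and cpu!

If I move both scores to the same device (or store them as plain Python floats) the comparison above works, so my guess is that the monitored validation metric ends up on cuda:0 in some epochs and on the CPU in others, but I have not found where that happens in my code.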