camlab-bioml / starling

Segmentation error aware clustering for multiplexed imaging
https://camlab-bioml.github.io/starling/

cannot use cpu? #46

Open SarahAsbury opened 3 months ago

SarahAsbury commented 3 months ago
st.train_and_fit(
    callbacks=[cb_early_stopping],
    accelerator = "cpu",
    logger=[log_tb],
)

This generates the error: RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cuda:0 and cpu!
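A possible stopgap, assuming the problem is that the model parameters get created on the GPU while the trainer itself runs on the CPU: hide the GPU from PyTorch before anything is built, so every tensor starts out on the CPU. This is only a sketch, not a confirmed fix, and the commented-out setup lines are placeholders rather than the exact calls from my notebook.

import os

# Hide all CUDA devices before torch / starling allocate any tensors,
# so the model parameters and the training batches both live on the CPU.
os.environ["CUDA_VISIBLE_DEVICES"] = ""

import torch  # imported only after the environment variable is set

# ... build the ST object and callbacks as before, then:
# st.train_and_fit(
#     callbacks=[cb_early_stopping],
#     accelerator="cpu",
#     logger=[log_tb],
# )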

Full error:


GPU available: True (cuda), used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs

  | Name | Type | Params

0 Trainable params
0 Non-trainable params
0 Total params
0.000 Total estimated model params size (MB)

Epoch 0: 0%| | 0/27 [00:00<?, ?it/s]

RuntimeError Traceback (most recent call last) Cell In[33], line 3 1 ## train ST ----> 3 st.train_and_fit( 4 callbacks=[cb_early_stopping], 5 accelerator = "cpu", 6 logger=[log_tb], 7 )

File ~/.virtualenvs/starling/lib/python3.10/site-packages/starling/starling.py:353, in ST.train_and_fit(self, accelerator, strategy, devices, num_nodes, precision, logger, callbacks, fast_dev_run, max_epochs, min_epochs, max_steps, min_steps, max_time, limit_train_batches, limit_val_batches, limit_test_batches, limit_predict_batches, overfit_batches, val_check_interval, check_val_every_n_epoch, num_sanity_val_steps, log_every_n_steps, enable_checkpointing, enable_progress_bar, enable_model_summary, accumulate_grad_batches, gradient_clip_val, gradient_clip_algorithm, deterministic, benchmark, inference_mode, use_distributed_sampler, profiler, detect_anomaly, barebones, plugins, sync_batchnorm, reload_dataloaders_every_n_epochs, default_root_dir) 349 _locals.pop("self") 351 trainer = pl.Trainer(**_locals) --> 353 trainer.fit(self)

File ~/.virtualenvs/starling/lib/python3.10/site-packages/pytorch_lightning/trainer/trainer.py:545, in Trainer.fit(self, model, train_dataloaders, val_dataloaders, datamodule, ckpt_path) 543 self.state.status = TrainerStatus.RUNNING 544 self.training = True --> 545 call._call_and_handle_interrupt( 546 self, self._fit_impl, model, train_dataloaders, val_dataloaders, datamodule, ckpt_path 547 )

File ~/.virtualenvs/starling/lib/python3.10/site-packages/pytorch_lightning/trainer/call.py:44, in _call_and_handle_interrupt(trainer, trainer_fn, *args, **kwargs) 42 if trainer.strategy.launcher is not None: 43 return trainer.strategy.launcher.launch(trainer_fn, *args, trainer=trainer, **kwargs) ---> 44 return trainer_fn(*args, **kwargs) 46 except _TunerExitException: 47 _call_teardown_hook(trainer)

File ~/.virtualenvs/starling/lib/python3.10/site-packages/pytorch_lightning/trainer/trainer.py:581, in Trainer._fit_impl(self, model, train_dataloaders, val_dataloaders, datamodule, ckpt_path) 574 assert self.state.fn is not None 575 ckpt_path = self._checkpoint_connector._select_ckpt_path( 576 self.state.fn, 577 ckpt_path, 578 model_provided=True, 579 model_connected=self.lightning_module is not None, 580 ) --> 581 self._run(model, ckpt_path=ckpt_path) 583 assert self.state.stopped 584 self.training = False

File ~/.virtualenvs/starling/lib/python3.10/site-packages/pytorch_lightning/trainer/trainer.py:990, in Trainer._run(self, model, ckpt_path) 985 self._signal_connector.register_signal_handlers() 987 # ---------------------------- 988 # RUN THE TRAINER 989 # ---------------------------- --> 990 results = self._run_stage() 992 # ---------------------------- 993 # POST-Training CLEAN UP 994 # ---------------------------- 995 log.debug(f"{self.__class__.__name__}: trainer tearing down")

File ~/.virtualenvs/starling/lib/python3.10/site-packages/pytorch_lightning/trainer/trainer.py:1036, in Trainer._run_stage(self) 1034 self._run_sanity_check() 1035 with torch.autograd.set_detect_anomaly(self._detect_anomaly): -> 1036 self.fit_loop.run() 1037 return None 1038 raise RuntimeError(f"Unexpected state {self.state}")

File ~/.virtualenvs/starling/lib/python3.10/site-packages/pytorch_lightning/loops/fit_loop.py:202, in _FitLoop.run(self) 200 try: 201 self.on_advance_start() --> 202 self.advance() 203 self.on_advance_end() 204 self._restarting = False

File ~/.virtualenvs/starling/lib/python3.10/site-packages/pytorch_lightning/loops/fit_loop.py:359, in _FitLoop.advance(self) 357 with self.trainer.profiler.profile("run_training_epoch"): 358 assert self._data_fetcher is not None --> 359 self.epoch_loop.run(self._data_fetcher)

File ~/.virtualenvs/starling/lib/python3.10/site-packages/pytorch_lightning/loops/training_epoch_loop.py:136, in _TrainingEpochLoop.run(self, data_fetcher) 134 while not self.done: 135 try: --> 136 self.advance(data_fetcher) 137 self.on_advance_end(data_fetcher) 138 self._restarting = False

File ~/.virtualenvs/starling/lib/python3.10/site-packages/pytorch_lightning/loops/training_epoch_loop.py:240, in _TrainingEpochLoop.advance(self, data_fetcher) 237 with trainer.profiler.profile("run_training_batch"): 238 if trainer.lightning_module.automatic_optimization: 239 # in automatic optimization, there can only be one optimizer --> 240 batch_output = self.automatic_optimization.run(trainer.optimizers[0], batch_idx, kwargs) 241 else: 242 batch_output = self.manual_optimization.run(kwargs)

File ~/.virtualenvs/starling/lib/python3.10/site-packages/pytorch_lightning/loops/optimization/automatic.py:187, in _AutomaticOptimization.run(self, optimizer, batch_idx, kwargs) 180 closure() 182 # ------------------------------ 183 # BACKWARD PASS 184 # ------------------------------ 185 # gradient update with accumulated gradients 186 else: --> 187 self._optimizer_step(batch_idx, closure) 189 result = closure.consume_result() 190 if result.loss is None:

File ~/.virtualenvs/starling/lib/python3.10/site-packages/pytorch_lightning/loops/optimization/automatic.py:265, in _AutomaticOptimization._optimizer_step(self, batch_idx, train_step_and_backward_closure) 262 self.optim_progress.optimizer.step.increment_ready() 264 # model hook --> 265 call._call_lightning_module_hook( 266 trainer, 267 "optimizer_step", 268 trainer.current_epoch, 269 batch_idx, 270 optimizer, 271 train_step_and_backward_closure, 272 ) 274 if not should_accumulate: 275 self.optim_progress.optimizer.step.increment_completed()

File ~/.virtualenvs/starling/lib/python3.10/site-packages/pytorch_lightning/trainer/call.py:157, in _call_lightning_module_hook(trainer, hook_name, pl_module, *args, **kwargs) 154 pl_module._current_fx_name = hook_name 156 with trainer.profiler.profile(f"[LightningModule]{pl_module.__class__.__name__}.{hook_name}"): --> 157 output = fn(*args, **kwargs) 159 # restore current_fx when nested context 160 pl_module._current_fx_name = prev_fx_name

File ~/.virtualenvs/starling/lib/python3.10/site-packages/pytorch_lightning/core/module.py:1282, in LightningModule.optimizer_step(self, epoch, batch_idx, optimizer, optimizer_closure) 1243 def optimizer_step( 1244 self, 1245 epoch: int, (...) 1248 optimizer_closure: Optional[Callable[[], Any]] = None, 1249 ) -> None: 1250 r"""Override this method to adjust the default way the :class:`~pytorch_lightning.trainer.trainer.Trainer` calls 1251 the optimizer. 1252 (...) 1280 1281 """ -> 1282 optimizer.step(closure=optimizer_closure)

File ~/.virtualenvs/starling/lib/python3.10/site-packages/pytorch_lightning/core/optimizer.py:151, in LightningOptimizer.step(self, closure, **kwargs) 148 raise MisconfigurationException("When optimizer.step(closure) is called, the closure should be callable") 150 assert self._strategy is not None --> 151 step_output = self._strategy.optimizer_step(self._optimizer, closure, **kwargs) 153 self._on_after_step() 155 return step_output

File ~/.virtualenvs/starling/lib/python3.10/site-packages/pytorch_lightning/strategies/strategy.py:230, in Strategy.optimizer_step(self, optimizer, closure, model, **kwargs) 228 # TODO(fabric): remove assertion once strategy's optimizer_step typing is fixed 229 assert isinstance(model, pl.LightningModule) --> 230 return self.precision_plugin.optimizer_step(optimizer, model=model, closure=closure, **kwargs)

File ~/.virtualenvs/starling/lib/python3.10/site-packages/pytorch_lightning/plugins/precision/precision_plugin.py:117, in PrecisionPlugin.optimizer_step(self, optimizer, model, closure, **kwargs) 115 """Hook to run the optimizer step.""" 116 closure = partial(self._wrap_closure, model, optimizer, closure) --> 117 return optimizer.step(closure=closure, **kwargs)

File ~/.virtualenvs/starling/lib/python3.10/site-packages/torch/optim/optimizer.py:113, in Optimizer._hook_for_profile.<locals>.profile_hook_step.<locals>.wrapper(*args, **kwargs) 111 profile_name = "Optimizer.step#{}.step".format(obj.__class__.__name__) 112 with torch.autograd.profiler.record_function(profile_name): --> 113 return func(*args, **kwargs)

File ~/.virtualenvs/starling/lib/python3.10/site-packages/torch/autograd/grad_mode.py:27, in _DecoratorContextManager.__call__.<locals>.decorate_context(*args, **kwargs) 24 @functools.wraps(func) 25 def decorate_context(*args, **kwargs): 26 with self.clone(): ---> 27 return func(*args, **kwargs)

File ~/.virtualenvs/starling/lib/python3.10/site-packages/torch/optim/adam.py:118, in Adam.step(self, closure) 116 if closure is not None: 117 with torch.enable_grad(): --> 118 loss = closure() 120 for group in self.param_groups: 121 params_with_grad = []

File ~/.virtualenvs/starling/lib/python3.10/site-packages/pytorch_lightning/plugins/precision/precision_plugin.py:104, in PrecisionPlugin._wrap_closure(self, model, optimizer, closure) 91 def _wrap_closure( 92 self, 93 model: "pl.LightningModule", 94 optimizer: Optimizer, 95 closure: Callable[[], Any], 96 ) -> Any: 97 """This double-closure allows makes sure the closure is executed before the on_before_optimizer_step 98 hook is called. 99 (...) 102 103 """ --> 104 closure_result = closure() 105 self._after_closure(model, optimizer) 106 return closure_result

File ~/.virtualenvs/starling/lib/python3.10/site-packages/pytorch_lightning/loops/optimization/automatic.py:140, in Closure.__call__(self, *args, **kwargs) 139 def __call__(self, *args: Any, **kwargs: Any) -> Optional[Tensor]: --> 140 self._result = self.closure(*args, **kwargs) 141 return self._result.loss

File ~/.virtualenvs/starling/lib/python3.10/site-packages/torch/autograd/grad_mode.py:27, in _DecoratorContextManager.__call__.<locals>.decorate_context(*args, **kwargs) 24 @functools.wraps(func) 25 def decorate_context(*args, **kwargs): 26 with self.clone(): ---> 27 return func(*args, **kwargs)

File ~/.virtualenvs/starling/lib/python3.10/site-packages/pytorch_lightning/loops/optimization/automatic.py:126, in Closure.closure(self, *args, **kwargs) 124 @torch.enable_grad() 125 def closure(self, *args: Any, **kwargs: Any) -> ClosureResult: --> 126 step_output = self._step_fn() 128 if step_output.closure_loss is None: 129 self.warning_cache.warn("training_step returned None. If this was on purpose, ignore this warning...")

File ~/.virtualenvs/starling/lib/python3.10/site-packages/pytorch_lightning/loops/optimization/automatic.py:315, in _AutomaticOptimization._training_step(self, kwargs) 312 trainer = self.trainer 314 # manually capture logged metrics --> 315 training_step_output = call._call_strategy_hook(trainer, "training_step", *kwargs.values()) 316 self.trainer.strategy.post_training_step() # unused hook - call anyway for backward compatibility 318 return self.output_result_cls.from_training_step_output(training_step_output, trainer.accumulate_grad_batches)

File ~/.virtualenvs/starling/lib/python3.10/site-packages/pytorch_lightning/trainer/call.py:309, in _call_strategy_hook(trainer, hook_name, *args, **kwargs) 306 return None 308 with trainer.profiler.profile(f"[Strategy]{trainer.strategy.__class__.__name__}.{hook_name}"): --> 309 output = fn(*args, **kwargs) 311 # restore current_fx when nested context 312 pl_module._current_fx_name = prev_fx_name

File ~/.virtualenvs/starling/lib/python3.10/site-packages/pytorch_lightning/strategies/strategy.py:382, in Strategy.training_step(self, *args, **kwargs) 380 if self.model != self.lightning_module: 381 return self._forward_redirection(self.model, self.lightning_module, "training_step", *args, **kwargs) --> 382 return self.lightning_module.training_step(*args, **kwargs)

File ~/.virtualenvs/starling/lib/python3.10/site-packages/starling/starling.py:121, in ST.training_step(self, batch) 114 """Compute and return the training loss 115 116 :param batch: A list of tensors of size m x n 117 118 :returns: Total loss 119 """ 120 # y, s, fy, fs, fl = batch --> 121 model_nll, fake_loss, p_fake_singlet = self(batch) 123 # total loss 124 loss = model_nll + self.model_regularizer * fake_loss

File ~/.virtualenvs/starling/lib/python3.10/site-packages/torch/nn/modules/module.py:1130, in Module._call_impl(self, *input, **kwargs) 1126 # If we don't have any hooks, we want to skip the rest of the logic in 1127 # this function, and just call forward. 1128 if not (self._backward_hooks or self._forward_hooks or self._forward_pre_hooks or _global_backward_hooks 1129 or _global_forward_hooks or _global_forward_pre_hooks): -> 1130 return forward_call(*input, **kwargs) 1131 # Do not call functions when jit is used 1132 full_backward_hooks, non_full_backward_hooks = [], []

File ~/.virtualenvs/starling/lib/python3.10/site-packages/starling/starling.py:94, in ST.forward(self, batch) 92 if self.model_cell_size: 93 y, s, fy, fs, fl = batch ---> 94 _, _, model_nll, _ = utility.compute_posteriors( 95 y, s, self.model_params, self.dist_option, self.model_zplane_overlap 96 ) 97 _, _, _, p_fake_singlet = utility.compute_posteriors( 98 fy, fs, self.model_params, self.dist_option, self.model_zplane_overlap 99 ) 100 else:

File ~/.virtualenvs/starling/lib/python3.10/site-packages/starling/utility.py:435, in compute_posteriors(Y, S, Theta, dist_option, model_overlap) 428 log_tau = torch.nn.functional.log_softmax( 429 Theta["is_tau"].reshape(-1), dim=0 430 ).reshape( 431 log_pi.shape[0], log_pi.shape[0] 432 ) ## CxC 433 log_delta = torch.nn.functional.log_softmax(Theta["is_delta"], dim=0) ## 2 --> 435 prob_y_given_z = compute_p_y_given_z( 436 Y, Theta, dist_option 437 ) ## log p(y_n|z=c) -> NxC 438 prob_data_given_z_d0 = ( 439 prob_y_given_z + log_pi 440 ) ## log p(y_n|z=c) + log p(z=c) -> NxC + C -> NxC 442 if S is not None:

File ~/.virtualenvs/starling/lib/python3.10/site-packages/starling/utility.py:325, in compute_p_y_given_z(Y, Theta, dist_option) 322 else: 323 dist_Y = torch.distributions.StudentT(df=2, loc=mu, scale=sigma) --> 325 return dist_Y.log_prob(Y.reshape(-1, 1, Y.shape[1])).sum( 326 2 327 )

File ~/.virtualenvs/starling/lib/python3.10/site-packages/torch/distributions/studentT.py:82, in StudentT.log_prob(self, value) 80 if self._validate_args: 81 self._validate_sample(value) ---> 82 y = (value - self.loc) / self.scale 83 Z = (self.scale.log() + 84 0.5 * self.df.log() + 85 0.5 * math.log(math.pi) + 86 torch.lgamma(0.5 * self.df) - 87 torch.lgamma(0.5 * (self.df + 1.))) 88 return -0.5 * (self.df + 1.) * torch.log1p(y**2. / self.df) - Z

RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cuda:0 and cpu!
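For context, the failing frame is value - self.loc inside StudentT.log_prob: the batch tensor Y arrives on the CPU (as requested with accelerator="cpu"), while the parameters in Theta apparently still live on cuda:0. A minimal sketch of the same mismatch, independent of starling (the tensor names and shapes below are illustrative, not taken from the library):

import torch

if torch.cuda.is_available():
    # Distribution parameters on the GPU, mimicking model parameters
    # that were initialized on cuda:0.
    mu = torch.zeros(5, 3, device="cuda")
    sigma = torch.ones(5, 3, device="cuda")

    # Batch on the CPU, as delivered when training with accelerator="cpu".
    y = torch.randn(10, 1, 3)

    dist = torch.distributions.StudentT(df=2, loc=mu, scale=sigma)
    try:
        dist.log_prob(y)  # mixes cuda:0 and cpu tensors in (value - self.loc)
    except RuntimeError as err:
        print(err)  # Expected all tensors to be on the same device, ...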