Hello @baj12
Thanks for your interest in Scyan!
Indeed, it appears that the GPU usage was broken at some point. I made a minor change and it should now work again (just run pip install scyan --upgrade to get version 1.5.2).
I checked on the same system as yours (Linux/Slurm cluster) and it went well, but I'll wait for your feedback before closing the issue.
NB: for me, CPUs are usually enough, even for 100 million cells. Have you encountered any slowdown? Or maybe you have a very large dataset?
Thanks for the quick response.
Sorry to say that the error persists:
```
File ~/venvs/P3.9.18.scyan/lib/python3.9/site-packages/torch/distributions/multivariate_normal.py:59, in _batch_mahalanobis(bL, bx)
     57 flat_x = bx.reshape(-1, flat_L.size(0), n)  # shape = c x b x n
     58 flat_x_swap = flat_x.permute(1, 2, 0)  # shape = b x n x c
---> 59 M_swap = torch.linalg.solve_triangular(flat_L, flat_x_swap, upper=False).pow(2).sum(-2)  # shape = b x c
     60 M = M_swap.t()  # shape = c x b
     62 # Now we revert the above reshape and permute operators.

RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cpu and cuda:0! (when checking argument for argument B in method wrapper__linalg_solve_triangular)
```
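For context, the error means one operand of solve_triangular is still on the CPU while the other is on the GPU. A tiny standalone sketch (made-up tensors, not scyan code) reproduces the same message:

```python
# Minimal standalone sketch: mixing a CPU tensor with a CUDA tensor in
# solve_triangular raises the same "same device" RuntimeError as above.
import torch

if torch.cuda.is_available():
    L = torch.eye(3)                     # triangular matrix left on the CPU
    b = torch.ones(3, 1, device="cuda")  # right-hand side moved to the GPU
    torch.linalg.solve_triangular(L, b, upper=False)  # -> RuntimeError: Expected all tensors to be on the same device
```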
Using the updated version:
scipy 1.11.3
scyan 1.5.2
seaborn 0.12.2
Since we have these new GPUs, I wanted to see what happens. I am still trying to figure out how scyan works... ;) I am working on a relatively large Sony dataset and I am struggling to create a reasonable knowledge table. I just found some of your example tables, which will probably help. I need some time...
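For anyone else landing here, a minimal sketch of what a knowledge table can look like, following the format of the example tables mentioned above; the populations, markers, and values below are made up for illustration:

```python
import numpy as np
import pandas as pd

# Hypothetical knowledge table: one row per population, one column per marker.
# Entries encode prior knowledge (e.g. 1 = expressed, -1 = not expressed,
# NaN = unknown / not discriminative). These populations and markers are invented.
table = pd.DataFrame(
    {
        "CD3":  [1, 1, -1],
        "CD4":  [1, -1, -1],
        "CD8":  [-1, 1, -1],
        "CD19": [-1, -1, 1],
        "CD20": [np.nan, np.nan, 1],
    },
    index=["CD4 T cells", "CD8 T cells", "B cells"],
)
```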
How do you define the Trainer? Are we using the same one? trainer=pl.Trainer(accelerator='gpu', devices=1)
Yes @baj12, this might be due to how you initialize the Trainer.
In fact, model.fit already creates a Trainer: you don't have to provide one, except for very specific usages. Here, you just want to pass some kwargs to the Trainer, so you can simply do model.fit(accelerator='gpu', devices=1).
You can see in the docs that the kwargs are forwarded to the Trainer: https://mics-lab.github.io/scyan/api/model/#scyan.model.Scyan.fit
I don't know if it will fix the issue, but at least this is cleaner. If it doesn't solve it, can you send me the full log?
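A short sketch of the suggested usage; the data-loading and model-construction lines follow the scyan example-data tutorial and should be treated as assumptions, only the final fit call is the form recommended above:

```python
import scyan

# Example dataset and knowledge table (assumed setup, as in the scyan tutorials).
adata, table = scyan.data.load("aml")
model = scyan.Scyan(adata, table)

# No need to build a pl.Trainer yourself: these kwargs are forwarded to the Trainer.
model.fit(accelerator="gpu", devices=1)
```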
model.fit(accelerator='gpu', devices=1)
Okay thanks. Does it run a few epochs or does it crash immediately? For now I can't reproduce the error, but I will try something else this week
How can I tell if it runs for a few epochs? It takes a few seconds for the error message to appear. But you have all the output I have... B
Because in your log it says "Ended warm up epochs", which only appears after a few epochs. So it seems that it can run, but it fails at some point. I'm surprised that the GPU support works for a few epochs but not for the full training.
I will really need to reproduce this issue to be able to fix it. Maybe it's due to your Python version 3.9.18 (because everything else is similar on my test); I'll try this.
Which version are you using? I am eager to test on my side as well.
I'm using Python version 3.10.13.
EDIT: I have reproduced the error, I'll work on this
Ok, thanks! I have tested with 3.10.7 and got a bit further but with the same result. 3.10.13 breaks even earlier than 3.10.7... B
Hello @baj12, the issue may be solved now.
I needed to add a .to(self.device) at a very specific location of the code, which is weird because PyTorch Lightning is supposed to handle it.
Anyway, I pushed the changes to the master branch but didn't release a new version yet. Can you pull the latest changes from the master branch (see "pip installation in dev mode" in the readme) and let me know if it works for you?
Not sure, I am using the updated version... Here is what I did: git clone the repository, then pip3.9 install -e .
Then, using RStudio/Jupyter on a GPU cluster:
import scyan
print(scyan.__version__)
=> 1.5.2
and after a while I still get the CPU error, see below
No, it seems you don't have the new updates. Have you recently cloned the repository or did you do it a long time ago? If you did it before my change, then run git pull to get the latest changes.
In particular, you should see self.register_buffer("cov", cov.to(self.device)) at line 42 of the file scyan/module/distribution.py.
If you can't get these updates, I will release a new version (but I try to avoid this, just in case the issue wasn't fixed)
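For readers who don't want to open the file, a minimal sketch of the pattern behind the fix; the class below is hypothetical and only the register_buffer line mirrors scyan/module/distribution.py:

```python
import torch
import pytorch_lightning as pl

class CovarianceModule(pl.LightningModule):  # hypothetical module, not scyan's actual class
    def __init__(self, cov: torch.Tensor):
        super().__init__()
        # Registering the covariance as a buffer (and moving it to the module's
        # current device) lets Lightning transfer it to the GPU together with the
        # parameters, so ops like solve_triangular see tensors on the same device.
        self.register_buffer("cov", cov.to(self.device))

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        L = torch.linalg.cholesky(self.cov)  # same device as x once the model is moved
        return torch.linalg.solve_triangular(L, x, upper=False)
```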
Yes, it is working. Thanks. Somehow I was working in the wrong venv...
Great, sounds good, I will make a 1.5.3 release with the fix!
Description
I am having trouble getting the GPU to work on the example data
Reproducing the issue
When fitting the model on the GPU, I get the error message shown under "Error message" below.
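A sketch of the call that triggers it; the fit line is taken from the traceback under "Error message", while the data-loading lines are assumptions based on the scyan example-data tutorial:

```python
import pytorch_lightning as pl
import scyan

# Assumed setup: example dataset and knowledge table, as in the scyan tutorials.
adata, table = scyan.data.load("aml")
model = scyan.Scyan(adata, table)

# This is the call that fails with the device error below.
model.fit(trainer=pl.Trainer(accelerator="gpu", devices=1))
```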
To me, this is an issue with setting up the trainer and is more related to Lightning, but I hope that you have encountered this before and know where the problem stems from: the GPU is available and used, yet it complains that the tensors should be on the same device. I have no clue how to change this...
Thanks,
Bernd
Error message
``` GPU available: True (cuda), used: True TPU available: False, using: 0 TPU cores IPU available: False, using: 0 IPUs HPU available: False, using: 0 HPUs [INFO] (scyan.model) Training scyan with the following hyperparameters: "batch_key": None "batch_size": 8192 "hidden_size": 16 "lr": 0.0005 "max_samples": 200000 "modulo_temp": 3 "n_hidden_layers": 6 "n_layers": 7 "prior_std": 0.3 "temperature": 0.5 "warm_up": (0.35, 4) LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0] | Name | Type | Params --------------------------------------- 0 | module | ScyanModule | 29.6 K --------------------------------------- 29.6 K Trainable params 0 Non-trainable params 29.6 K Total params 0.118 Total estimated model params size (MB) --------------------------------------------------------------------------- RuntimeError Traceback (most recent call last) Cell In[5], line 1 ----> 1 model.fit(trainer=pl.Trainer(accelerator='gpu', devices=1)) File ~/venvs/P3.9.18.scyan/lib/python3.9/site-packages/scyan/model.py:551, in Scyan.fit(self, max_epochs, min_delta, patience, num_workers, log_every_n_steps, callbacks, logger, enable_checkpointing, trainer, **trainer_args) 541 log_every_n_steps = min(log_every_n_steps, len(self.x) // self._batch_size) 542 trainer = pl.Trainer( 543 max_epochs=max_epochs, 544 callbacks=[esc] + (callbacks or []), (...) 548 **trainer_args, 549 ) --> 551 trainer.fit(self) 553 self._is_fitted = True 554 log.info("Successfully ended training.") File ~/venvs/P3.9.18.scyan/lib/python3.9/site-packages/pytorch_lightning/trainer/trainer.py:608, in Trainer.fit(self, model, train_dataloaders, val_dataloaders, datamodule, ckpt_path) 606 model = self._maybe_unwrap_optimized(model) 607 self.strategy._lightning_module = model --> 608 call._call_and_handle_interrupt( 609 self, self._fit_impl, model, train_dataloaders, val_dataloaders, datamodule, ckpt_path 610 ) File ~/venvs/P3.9.18.scyan/lib/python3.9/site-packages/pytorch_lightning/trainer/call.py:38, in _call_and_handle_interrupt(trainer, trainer_fn, *args, **kwargs) 36 return trainer.strategy.launcher.launch(trainer_fn, *args, trainer=trainer, **kwargs) 37 else: ---> 38 return trainer_fn(*args, **kwargs) 40 except _TunerExitException: 41 trainer._call_teardown_hook() File ~/venvs/P3.9.18.scyan/lib/python3.9/site-packages/pytorch_lightning/trainer/trainer.py:650, in Trainer._fit_impl(self, model, train_dataloaders, val_dataloaders, datamodule, ckpt_path) 643 ckpt_path = ckpt_path or self.resume_from_checkpoint 644 self._ckpt_path = self._checkpoint_connector._set_ckpt_path( 645 self.state.fn, 646 ckpt_path, # type: ignore[arg-type] 647 model_provided=True, 648 model_connected=self.lightning_module is not None, 649 ) --> 650 self._run(model, ckpt_path=self.ckpt_path) 652 assert self.state.stopped 653 self.training = False File ~/venvs/P3.9.18.scyan/lib/python3.9/site-packages/pytorch_lightning/trainer/trainer.py:1112, in Trainer._run(self, model, ckpt_path) 1108 self._checkpoint_connector.restore_training_state() 1110 self._checkpoint_connector.resume_end() -> 1112 results = self._run_stage() 1114 log.detail(f"{self.__class__.__name__}: trainer tearing down") 1115 self._teardown() File ~/venvs/P3.9.18.scyan/lib/python3.9/site-packages/pytorch_lightning/trainer/trainer.py:1191, in Trainer._run_stage(self) 1189 if self.predicting: 1190 return self._run_predict() -> 1191 self._run_train() File ~/venvs/P3.9.18.scyan/lib/python3.9/site-packages/pytorch_lightning/trainer/trainer.py:1214, in Trainer._run_train(self) 1211 self.fit_loop.trainer = self 1213 with 
torch.autograd.set_detect_anomaly(self._detect_anomaly): -> 1214 self.fit_loop.run() File ~/venvs/P3.9.18.scyan/lib/python3.9/site-packages/pytorch_lightning/loops/loop.py:199, in Loop.run(self, *args, **kwargs) 197 try: 198 self.on_advance_start(*args, **kwargs) --> 199 self.advance(*args, **kwargs) 200 self.on_advance_end() 201 self._restarting = False File ~/venvs/P3.9.18.scyan/lib/python3.9/site-packages/pytorch_lightning/loops/fit_loop.py:267, in FitLoop.advance(self) 265 self._data_fetcher.setup(dataloader, batch_to_device=batch_to_device) 266 with self.trainer.profiler.profile("run_training_epoch"): --> 267 self._outputs = self.epoch_loop.run(self._data_fetcher) File ~/venvs/P3.9.18.scyan/lib/python3.9/site-packages/pytorch_lightning/loops/loop.py:199, in Loop.run(self, *args, **kwargs) 197 try: 198 self.on_advance_start(*args, **kwargs) --> 199 self.advance(*args, **kwargs) 200 self.on_advance_end() 201 self._restarting = False File ~/venvs/P3.9.18.scyan/lib/python3.9/site-packages/pytorch_lightning/loops/epoch/training_epoch_loop.py:213, in TrainingEpochLoop.advance(self, data_fetcher) 210 self.batch_progress.increment_started() 212 with self.trainer.profiler.profile("run_training_batch"): --> 213 batch_output = self.batch_loop.run(kwargs) 215 self.batch_progress.increment_processed() 217 # update non-plateau LR schedulers 218 # update epoch-interval ones only when we are at the end of training epoch File ~/venvs/P3.9.18.scyan/lib/python3.9/site-packages/pytorch_lightning/loops/loop.py:199, in Loop.run(self, *args, **kwargs) 197 try: 198 self.on_advance_start(*args, **kwargs) --> 199 self.advance(*args, **kwargs) 200 self.on_advance_end() 201 self._restarting = False File ~/venvs/P3.9.18.scyan/lib/python3.9/site-packages/pytorch_lightning/loops/batch/training_batch_loop.py:88, in TrainingBatchLoop.advance(self, kwargs) 84 if self.trainer.lightning_module.automatic_optimization: 85 optimizers = _get_active_optimizers( 86 self.trainer.optimizers, self.trainer.optimizer_frequencies, kwargs.get("batch_idx", 0) 87 ) ---> 88 outputs = self.optimizer_loop.run(optimizers, kwargs) 89 else: 90 outputs = self.manual_loop.run(kwargs) File ~/venvs/P3.9.18.scyan/lib/python3.9/site-packages/pytorch_lightning/loops/loop.py:199, in Loop.run(self, *args, **kwargs) 197 try: 198 self.on_advance_start(*args, **kwargs) --> 199 self.advance(*args, **kwargs) 200 self.on_advance_end() 201 self._restarting = False File ~/venvs/P3.9.18.scyan/lib/python3.9/site-packages/pytorch_lightning/loops/optimization/optimizer_loop.py:202, in OptimizerLoop.advance(self, optimizers, kwargs) 199 def advance(self, optimizers: List[Tuple[int, Optimizer]], kwargs: OrderedDict) -> None: 200 kwargs = self._build_kwargs(kwargs, self.optimizer_idx, self._hiddens) --> 202 result = self._run_optimization(kwargs, self._optimizers[self.optim_progress.optimizer_position]) 203 if result.loss is not None: 204 # automatic optimization assumes a loss needs to be returned for extras to be considered as the batch 205 # would be skipped otherwise 206 self._outputs[self.optimizer_idx] = result.asdict() File ~/venvs/P3.9.18.scyan/lib/python3.9/site-packages/pytorch_lightning/loops/optimization/optimizer_loop.py:249, in OptimizerLoop._run_optimization(self, kwargs, optimizer) 241 closure() 243 # ------------------------------ 244 # BACKWARD PASS 245 # ------------------------------ 246 # gradient update with accumulated gradients 247 else: 248 # the `batch_idx` is optional with inter-batch parallelism --> 249 self._optimizer_step(optimizer, 
opt_idx, kwargs.get("batch_idx", 0), closure) 251 result = closure.consume_result() 253 if result.loss is not None: 254 # if no result, user decided to skip optimization 255 # otherwise update running loss + reset accumulated loss 256 # TODO: find proper way to handle updating running loss File ~/venvs/P3.9.18.scyan/lib/python3.9/site-packages/pytorch_lightning/loops/optimization/optimizer_loop.py:370, in OptimizerLoop._optimizer_step(self, optimizer, opt_idx, batch_idx, train_step_and_backward_closure) 362 rank_zero_deprecation( 363 "The NVIDIA/apex AMP implementation has been deprecated upstream. Consequently, its integration inside" 364 " PyTorch Lightning has been deprecated in v1.9.0 and will be removed in v2.0.0." (...) 367 " return True." 368 ) 369 kwargs["using_native_amp"] = isinstance(self.trainer.precision_plugin, MixedPrecisionPlugin) --> 370 self.trainer._call_lightning_module_hook( 371 "optimizer_step", 372 self.trainer.current_epoch, 373 batch_idx, 374 optimizer, 375 opt_idx, 376 train_step_and_backward_closure, 377 on_tpu=isinstance(self.trainer.accelerator, TPUAccelerator), 378 **kwargs, # type: ignore[arg-type] 379 using_lbfgs=is_lbfgs, 380 ) 382 if not should_accumulate: 383 self.optim_progress.optimizer.step.increment_completed() File ~/venvs/P3.9.18.scyan/lib/python3.9/site-packages/pytorch_lightning/trainer/trainer.py:1356, in Trainer._call_lightning_module_hook(self, hook_name, pl_module, *args, **kwargs) 1353 pl_module._current_fx_name = hook_name 1355 with self.profiler.profile(f"[LightningModule]{pl_module.__class__.__name__}.{hook_name}"): -> 1356 output = fn(*args, **kwargs) 1358 # restore current_fx when nested context 1359 pl_module._current_fx_name = prev_fx_name File ~/venvs/P3.9.18.scyan/lib/python3.9/site-packages/pytorch_lightning/core/module.py:1754, in LightningModule.optimizer_step(self, epoch, batch_idx, optimizer, optimizer_idx, optimizer_closure, on_tpu, using_lbfgs) 1675 def optimizer_step( 1676 self, 1677 epoch: int, (...) 1683 using_lbfgs: bool = False, 1684 ) -> None: 1685 r""" 1686 Override this method to adjust the default way the :class:`~pytorch_lightning.trainer.trainer.Trainer` calls 1687 each optimizer. (...) 
1752 1753 """ -> 1754 optimizer.step(closure=optimizer_closure) File ~/venvs/P3.9.18.scyan/lib/python3.9/site-packages/pytorch_lightning/core/optimizer.py:169, in LightningOptimizer.step(self, closure, **kwargs) 166 raise MisconfigurationException("When `optimizer.step(closure)` is called, the closure should be callable") 168 assert self._strategy is not None --> 169 step_output = self._strategy.optimizer_step(self._optimizer, self._optimizer_idx, closure, **kwargs) 171 self._on_after_step() 173 return step_output File ~/venvs/P3.9.18.scyan/lib/python3.9/site-packages/pytorch_lightning/strategies/strategy.py:234, in Strategy.optimizer_step(self, optimizer, opt_idx, closure, model, **kwargs) 232 # TODO(fabric): remove assertion once strategy's optimizer_step typing is fixed 233 assert isinstance(model, pl.LightningModule) --> 234 return self.precision_plugin.optimizer_step( 235 optimizer, model=model, optimizer_idx=opt_idx, closure=closure, **kwargs 236 ) File ~/venvs/P3.9.18.scyan/lib/python3.9/site-packages/pytorch_lightning/plugins/precision/precision_plugin.py:119, in PrecisionPlugin.optimizer_step(self, optimizer, model, optimizer_idx, closure, **kwargs) 117 """Hook to run the optimizer step.""" 118 closure = partial(self._wrap_closure, model, optimizer, optimizer_idx, closure) --> 119 return optimizer.step(closure=closure, **kwargs) File ~/venvs/P3.9.18.scyan/lib/python3.9/site-packages/torch/optim/optimizer.py:140, in Optimizer._hook_for_profile.System
```
Dependencies versions
```
Package                  Version
------------------------ ------------
aiohttp                  3.8.6
aiosignal                1.3.1
anndata                  0.8.0
asttokens                2.4.1
async-timeout            4.0.3
attrs                    23.1.0
certifi                  2023.7.22
charset-normalizer       3.3.1
comm                     0.1.4
contourpy                1.1.1
cycler                   0.12.1
debugpy                  1.8.0
decorator                5.1.1
exceptiongroup           1.1.3
executing                2.0.1
fcsparser                0.2.8
fcswrite                 0.6.2
FlowUtils                1.0.0
fonttools                4.43.1
frozenlist               1.4.0
fsspec                   2023.10.0
h5py                     3.10.0
idna                     3.4
importlib-metadata       6.8.0
importlib-resources      6.1.0
ipykernel                6.26.0
ipython                  8.17.2
ipywidgets               8.1.1
jedi                     0.19.1
joblib                   1.3.2
jupyter_client           8.5.0
jupyter_core             5.5.0
jupyterlab-widgets       3.0.9
kiwisolver               1.4.5
lightning                2.1.0
lightning-utilities      0.9.0
llvmlite                 0.38.1
matplotlib               3.8.0
matplotlib-inline        0.1.6
multidict                6.0.4
natsort                  8.4.0
nest-asyncio             1.5.8
numba                    0.55.2
numpy                    1.22.4
nvidia-cublas-cu11       11.10.3.66
nvidia-cuda-nvrtc-cu11   11.7.99
nvidia-cuda-runtime-cu11 11.7.99
nvidia-cudnn-cu11        8.5.0.96
packaging                23.2
pandas                   2.1.2
parso                    0.8.3
pexpect                  4.8.0
Pillow                   10.1.0
pip                      23.3.1
platformdirs             3.11.0
prompt-toolkit           3.0.39
psutil                   5.9.6
ptyprocess               0.7.0
pure-eval                0.2.2
Pygments                 2.16.1
pynndescent              0.5.10
pyparsing                3.1.1
python-dateutil          2.8.2
pytorch-lightning        1.9.5
pytz                     2023.3.post1
PyYAML                   6.0.1
pyzmq                    25.1.1
requests                 2.31.0
scikit-learn             1.3.2
scipy                    1.11.3
scyan                    1.5.1
seaborn                  0.12.2
setuptools               58.1.0
six                      1.16.0
stack-data               0.6.3
tbb                      2021.10.0
threadpoolctl            3.2.0
torch                    1.13.1
torchmetrics             1.2.0
tornado                  6.3.3
tqdm                     4.66.1
traitlets                5.13.0
typing_extensions        4.8.0
tzdata                   2023.3
umap-learn               0.5.4
urllib3                  2.0.7
wcwidth                  0.2.9
wheel                    0.41.3
widgetsnbextension       4.0.9
yarl                     1.9.2
zipp                     3.17.0
```