sktime / pytorch-forecasting

Time series forecasting with PyTorch
https://pytorch-forecasting.readthedocs.io/
MIT License

"RuntimeError: element 0 of tensors does not require grad and does not have a grad_fn" when using my loss-function #689

Open TimothyLiuu opened 3 years ago

TimothyLiuu commented 3 years ago

Expected behavior

Hello! Thank you for your brilliant work! When using the TemporalFusionTransformer, I used the class "QuantileLoss(MultiHorizonMetric)" as a reference and modified the loss function, expecting the model's predictions to become more accurate.

Actual behavior

tft = TemporalFusionTransformer.from_dataset(
    training,
    learning_rate=0.03,
    hidden_size=16,
    attention_head_size=1,
    dropout=0.1,
    hidden_continuous_size=8,
    output_size=7,  # 7 quantiles by default
    loss=MyLoss(),
    log_interval=10,  # e.g. 10 to log every 10 batches
    reduce_on_plateau_patience=4,
)

However, the error is: RuntimeError: element 0 of tensors does not require grad and does not have a grad_fn

Code to reproduce the problem

The definition of MyLoss() is:

import torch
from typing import List

from pytorch_forecasting.metrics import MultiHorizonMetric


class MyLoss(MultiHorizonMetric):
    def __init__(
            self,
            quantiles: List[float] = [0.02, 0.1, 0.25, 0.5, 0.75, 0.9, 0.98],
            **kwargs,
    ):
        super().__init__(quantiles=quantiles, **kwargs)

    def loss(self, y_pred: torch.Tensor, target: torch.Tensor) -> torch.Tensor:
        # calculate quantile loss
        diff_2 = torch.zeros_like(target)
        for n in range(target.size(0)):
            for m in range(target.size(1)):
                if m == 0 or m == (target.size(1) - 1):
                    diff_2[n][m] = 0
                else:
                    diff_2[n][m] = torch.abs(target[n][m - 1] - 2 * target[n][m] + target[n][m + 1])

        losses = []
        for i, q in enumerate(self.quantiles):
            mae = torch.abs(y_pred[..., i] - target) / target.size(1)
            rmse = torch.sqrt(torch.pow(y_pred[..., i] - target, 2)) / target.size(1)

            loss = q * rmse + (1 - q) * mae + 0.2 * torch.pow(diff_2, 2)
            # loss = q * rmse + (1 - q) * mae

            losses.append(loss.unsqueeze(-1))

        losses = torch.cat(losses, dim=2)
        return losses

    def to_prediction(self, y_pred: torch.Tensor) -> torch.Tensor:
        """
        Convert network prediction into a point prediction.

        Args:
            y_pred: prediction output of network

        Returns:
            torch.Tensor: point prediction
        """
        if y_pred.ndim == 3:
            idx = self.quantiles.index(0.5)
            y_pred = y_pred[..., idx]
        return y_pred

    def to_quantiles(self, y_pred: torch.Tensor) -> torch.Tensor:
        """
        Convert network prediction into a quantile prediction.

        Args:
            y_pred: prediction output of network

        Returns:
            torch.Tensor: prediction quantiles
        """
        return y_pred
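
As an aside, a quick way to check whether the metric itself breaks the autograd graph is to call its loss() directly on a dummy prediction that requires gradients and inspect the result's grad_fn; a minimal sketch, assuming the MyLoss class above is in scope (the shapes are made up):

import torch

# Made-up shapes: batch of 4, prediction horizon of 6, 7 quantiles.
y_pred = torch.randn(4, 6, 7, requires_grad=True)
target = torch.randn(4, 6)

metric = MyLoss()
out = metric.loss(y_pred, target)

# If grad_fn is None here, the graph is already broken inside the metric;
# otherwise the break happens elsewhere (reduction, trainer, dataloader, ...).
print(out.requires_grad, out.grad_fn)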

And the error is:

 File "F:/TimothyLiu/ICONIP 2021/TFI/trian_TFT.py", line 197, in <module>
    val_dataloaders=val_dataloader,
  File "D:\DeepLearning\Anaconda3\envs\pytorch-transformer\lib\site-packages\pytorch_lightning\trainer\trainer.py", line 458, in fit
    self._run(model)
  File "D:\DeepLearning\Anaconda3\envs\pytorch-transformer\lib\site-packages\pytorch_lightning\trainer\trainer.py", line 756, in _run
    self.dispatch()
  File "D:\DeepLearning\Anaconda3\envs\pytorch-transformer\lib\site-packages\pytorch_lightning\trainer\trainer.py", line 797, in dispatch
    self.accelerator.start_training(self)
  File "D:\DeepLearning\Anaconda3\envs\pytorch-transformer\lib\site-packages\pytorch_lightning\accelerators\accelerator.py", line 96, in start_training
    self.training_type_plugin.start_training(trainer)
  File "D:\DeepLearning\Anaconda3\envs\pytorch-transformer\lib\site-packages\pytorch_lightning\plugins\training_type\training_type_plugin.py", line 144, in start_training
    self._results = trainer.run_stage()
  File "D:\DeepLearning\Anaconda3\envs\pytorch-transformer\lib\site-packages\pytorch_lightning\trainer\trainer.py", line 807, in run_stage
    return self.run_train()
  File "D:\DeepLearning\Anaconda3\envs\pytorch-transformer\lib\site-packages\pytorch_lightning\trainer\trainer.py", line 869, in run_train
    self.train_loop.run_training_epoch()
  File "D:\DeepLearning\Anaconda3\envs\pytorch-transformer\lib\site-packages\pytorch_lightning\trainer\training_loop.py", line 499, in run_training_epoch
    batch_output = self.run_training_batch(batch, batch_idx, dataloader_idx)
  File "D:\DeepLearning\Anaconda3\envs\pytorch-transformer\lib\site-packages\pytorch_lightning\trainer\training_loop.py", line 738, in run_training_batch
    self.optimizer_step(optimizer, opt_idx, batch_idx, train_step_and_backward_closure)
  File "D:\DeepLearning\Anaconda3\envs\pytorch-transformer\lib\site-packages\pytorch_lightning\trainer\training_loop.py", line 442, in optimizer_step
    using_lbfgs=is_lbfgs,
  File "D:\DeepLearning\Anaconda3\envs\pytorch-transformer\lib\site-packages\pytorch_lightning\core\lightning.py", line 1403, in optimizer_step
    optimizer.step(closure=optimizer_closure)
  File "D:\DeepLearning\Anaconda3\envs\pytorch-transformer\lib\site-packages\pytorch_lightning\core\optimizer.py", line 214, in step
    self.__optimizer_step(*args, closure=closure, profiler_name=profiler_name, **kwargs)
  File "D:\DeepLearning\Anaconda3\envs\pytorch-transformer\lib\site-packages\pytorch_lightning\core\optimizer.py", line 134, in __optimizer_step
    trainer.accelerator.optimizer_step(optimizer, self._optimizer_idx, lambda_closure=closure, **kwargs)
  File "D:\DeepLearning\Anaconda3\envs\pytorch-transformer\lib\site-packages\pytorch_lightning\accelerators\accelerator.py", line 329, in optimizer_step
    self.run_optimizer_step(optimizer, opt_idx, lambda_closure, **kwargs)
  File "D:\DeepLearning\Anaconda3\envs\pytorch-transformer\lib\site-packages\pytorch_lightning\accelerators\accelerator.py", line 336, in run_optimizer_step
    self.training_type_plugin.optimizer_step(optimizer, lambda_closure=lambda_closure, **kwargs)
  File "D:\DeepLearning\Anaconda3\envs\pytorch-transformer\lib\site-packages\pytorch_lightning\plugins\training_type\training_type_plugin.py", line 193, in optimizer_step
    optimizer.step(closure=lambda_closure, **kwargs)
  File "D:\DeepLearning\Anaconda3\envs\pytorch-transformer\lib\site-packages\torch\optim\optimizer.py", line 88, in wrapper
    return func(*args, **kwargs)
  File "D:\DeepLearning\Anaconda3\envs\pytorch-transformer\lib\site-packages\pytorch_forecasting\optim.py", line 131, in step
    _ = closure()
  File "D:\DeepLearning\Anaconda3\envs\pytorch-transformer\lib\site-packages\pytorch_lightning\trainer\training_loop.py", line 733, in train_step_and_backward_closure
    split_batch, batch_idx, opt_idx, optimizer, self.trainer.hiddens
  File "D:\DeepLearning\Anaconda3\envs\pytorch-transformer\lib\site-packages\pytorch_lightning\trainer\training_loop.py", line 836, in training_step_and_backward
    self.backward(result, optimizer, opt_idx)
  File "D:\DeepLearning\Anaconda3\envs\pytorch-transformer\lib\site-packages\pytorch_lightning\trainer\training_loop.py", line 870, in backward
    result.closure_loss, optimizer, opt_idx, should_accumulate, *args, **kwargs
  File "D:\DeepLearning\Anaconda3\envs\pytorch-transformer\lib\site-packages\pytorch_lightning\accelerators\accelerator.py", line 309, in backward
    self.lightning_module, closure_loss, optimizer, optimizer_idx, should_accumulate, *args, **kwargs
  File "D:\DeepLearning\Anaconda3\envs\pytorch-transformer\lib\site-packages\pytorch_lightning\plugins\precision\precision_plugin.py", line 79, in backward
    model.backward(closure_loss, optimizer, opt_idx)
  File "D:\DeepLearning\Anaconda3\envs\pytorch-transformer\lib\site-packages\pytorch_lightning\core\lightning.py", line 1275, in backward
    loss.backward(*args, **kwargs)
  File "D:\DeepLearning\Anaconda3\envs\pytorch-transformer\lib\site-packages\torch\_tensor.py", line 255, in backward
    torch.autograd.backward(self, gradient, retain_graph, create_graph, inputs=inputs)
  File "D:\DeepLearning\Anaconda3\envs\pytorch-transformer\lib\site-packages\torch\autograd\__init__.py", line 149, in backward
    allow_unreachable=True, accumulate_grad=True)  # allow_unreachable flag
RuntimeError: element 0 of tensors does not require grad and does not have a grad_fn
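
One way to narrow this down (a sketch only; it reuses the hyperparameters from the snippet above together with the library's default QuantileLoss) is to check whether the identical setup trains with the stock loss:

from pytorch_forecasting import TemporalFusionTransformer
from pytorch_forecasting.metrics import QuantileLoss

# Same configuration as above, but with the default loss: if this trains,
# the problem is most likely inside MyLoss rather than in the data or trainer.
baseline_tft = TemporalFusionTransformer.from_dataset(
    training,
    learning_rate=0.03,
    hidden_size=16,
    attention_head_size=1,
    dropout=0.1,
    hidden_continuous_size=8,
    output_size=7,  # matches the 7 default quantiles
    loss=QuantileLoss(),
    log_interval=10,
    reduce_on_plateau_patience=4,
)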
Joel-OPC commented 3 years ago

I got the same error but with the default QuantileLoss.

Tested on:

Thank you for your help!

russellbrooks commented 3 years ago

Are you using multiple GPUs? For what it's worth – I'm also hitting this error, but only when using multiple GPUs and multiple targets.

I've ensured there's no nulls in my dataset, values are normalized, low learning rate with clipped gradients to reduce instability. Here's what I'm noticing:

Single target    + CPU           --> works
Multiple targets + CPU           --> works
Single target    + 1 GPU         --> works
Multiple targets + 1 GPU         --> works
Single target    + multiple GPUs --> works
Multiple targets + multiple GPUs --> broken
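
For anyone trying to reproduce the table above, a rough sketch of the two Trainer configurations (flag names follow newer PyTorch Lightning releases; older ones used gpus=... and accelerator="ddp" instead):

import pytorch_lightning as pl

# Single GPU: reported to work for both single and multiple targets.
trainer_single_gpu = pl.Trainer(max_epochs=30, accelerator="gpu", devices=1)

# Multiple GPUs with DDP: the combination reported broken with multiple targets.
trainer_multi_gpu = pl.Trainer(max_epochs=30, accelerator="gpu", devices=2, strategy="ddp")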
TimothyLiuu commented 3 years ago

Are you using multiple GPUs? For what it's worth – I'm also hitting this error, but only when using multiple GPUs and multiple targets.

I've ensured there's no nulls in my dataset, values are normalized, low learning rate with clipped gradients to reduce instability. Here's what I'm noticing:

Single target    + CPU           --> works
Multiple targets + CPU           --> works
Single target    + 1 GPU         --> works
Multiple targets + 1 GPU         --> works
Single target    + multiple GPUs --> works
Multiple targets + multiple GPUs --> broken

I use "Single target + 1 GPU ", but it ends with the error.

LumingSun commented 2 years ago

Are you using multiple GPUs? For what it's worth – I'm also hitting this error, but only when using multiple GPUs and multiple targets.

I've ensured there's no nulls in my dataset, values are normalized, low learning rate with clipped gradients to reduce instability. Here's what I'm noticing:

Single target    + CPU           --> works
Multiple targets + CPU           --> works
Single target    + 1 GPU         --> works
Multiple targets + 1 GPU         --> works
Single target    + multiple GPUs --> works
Multiple targets + multiple GPUs --> broken

Same problem. Any progress?

nicocheh commented 2 years ago

I am having the same issue with a single target + multiple GPUs, using the TFT with QuantileLoss(). Any news on this? @jdb78, thank you for this awesome project!

processadd commented 2 years ago

@jdb78, I saw several similar cases in the issues but could not work out a solution. I use 1 GPU and 1 target and get the same error. Sorry, I cannot provide a Colab; the code looks like this:

training = TimeSeriesDataSet(
    data[lambda x: x.time_idx <= training_cutoff],
    time_idx="time_idx",
    target="usage",
    group_ids=["A", "B", "C", "D"],

    min_encoder_length=max_encoder_length // 2,  
    max_encoder_length=max_encoder_length,
    min_prediction_length=1,
    max_prediction_length=max_prediction_length,
    static_categoricals=["A", "B", "C", "D"],
    time_varying_known_categoricals=["day"],
    time_varying_known_reals=["time_idx"],
    time_varying_unknown_categoricals=[],
    time_varying_unknown_reals=[
        "usage",
        "log_usage",
        "avg_usage_by_A",
        "avg_usage_by_B",
        "avg_usage_by_C",
    ],
   allow_missing_timesteps=True,
   categorical_encoders={'A': NaNLabelEncoder(add_nan=True), 
                          'B': NaNLabelEncoder(add_nan=True), 
                          'C': NaNLabelEncoder(add_nan=True), 
                          'D': NaNLabelEncoder(add_nan=True), 
                          }
  ... ...
# throws RuntimeError: element 0 of tensors does not require grad and does not have a grad_fn
res = trainer.tuner.lr_find(
    tft,
    train_dataloader=train_dataloader,
    val_dataloaders=val_dataloader,
    max_lr=1,
    min_lr=1e-6,
)
nicocheh commented 2 years ago

@jdb78, I saw several similar cases in the issues but could not work out a solution. I use 1 GPU and 1 target and get the same error. Sorry, I cannot provide a Colab; the code looks like this:

training = TimeSeriesDataSet(
    data[lambda x: x.time_idx <= training_cutoff],
    time_idx="time_idx",
    target="usage",
    group_ids=["A", "B", "C", "D"],

    min_encoder_length=max_encoder_length // 2,  
    max_encoder_length=max_encoder_length,
    min_prediction_length=1,
    max_prediction_length=max_prediction_length,
    static_categoricals=["A", "B", "C", "D"],
    time_varying_known_categoricals=["day"],
    time_varying_known_reals=["time_idx"],
    time_varying_unknown_categoricals=[],
    time_varying_unknown_reals=[
        "usage",
        "log_usage",
        "avg_usage_by_A",
        "avg_usage_by_B",
        "avg_usage_by_C",
    ],
   allow_missing_timesteps=True,
   categorical_encoders={'A': NaNLabelEncoder(add_nan=True), 
                          'B': NaNLabelEncoder(add_nan=True), 
                          'C': NaNLabelEncoder(add_nan=True), 
                          'D': NaNLabelEncoder(add_nan=True), 
                          }
  ... ...
# throws RuntimeError: element 0 of tensors does not require grad and does not have a grad_fn
res = trainer.tuner.lr_find(
    tft,
    train_dataloader=train_dataloader,
    val_dataloaders=val_dataloader,
    max_lr=1,
    min_lr=1e-6,
)

I tried setting min_prediction_length=max_prediction_length (sketched below) and installing the master version of pytorch-lightning, and that worked for me; maybe you can try that. Hope it helps.
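
For reference, a sketch of that dataset change applied to the configuration quoted above (only the prediction-length arguments change; everything else stays as posted):

training = TimeSeriesDataSet(
    data[lambda x: x.time_idx <= training_cutoff],
    time_idx="time_idx",
    target="usage",
    group_ids=["A", "B", "C", "D"],
    min_encoder_length=max_encoder_length // 2,
    max_encoder_length=max_encoder_length,
    min_prediction_length=max_prediction_length,  # was 1
    max_prediction_length=max_prediction_length,
    # ... remaining arguments unchanged from the snippet above ...
)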

jdb78 commented 2 years ago

#908

boydjc commented 1 year ago

@jdb78, I saw several similar cases in the issues but could not work out a solution. I use 1 GPU and 1 target and get the same error. Sorry, I cannot provide a Colab; the code looks like this:

training = TimeSeriesDataSet(
    data[lambda x: x.time_idx <= training_cutoff],
    time_idx="time_idx",
    target="usage",
    group_ids=["A", "B", "C", "D"],

    min_encoder_length=max_encoder_length // 2,  
    max_encoder_length=max_encoder_length,
    min_prediction_length=1,
    max_prediction_length=max_prediction_length,
    static_categoricals=["A", "B", "C", "D"],
    time_varying_known_categoricals=["day"],
    time_varying_known_reals=["time_idx"],
    time_varying_unknown_categoricals=[],
    time_varying_unknown_reals=[
        "usage",
        "log_usage",
        "avg_usage_by_A",
        "avg_usage_by_B",
        "avg_usage_by_C",
    ],
   allow_missing_timesteps=True,
   categorical_encoders={'A': NaNLabelEncoder(add_nan=True), 
                          'B': NaNLabelEncoder(add_nan=True), 
                          'C': NaNLabelEncoder(add_nan=True), 
                          'D': NaNLabelEncoder(add_nan=True), 
                          }
  ... ...
# throws RuntimeError: element 0 of tensors does not require grad and does not have a grad_fn
res = trainer.tuner.lr_find(
    tft,
    train_dataloader=train_dataloader,
    val_dataloaders=val_dataloader,
    max_lr=1,
    min_lr=1e-6,
)

My code kept hitting this error. I lowered gradient_clip_val in the trainer, and the issue has not come back since.

My trainer before fixing the error


trainer = pl.Trainer(
        max_epochs=45,
        accelerator='mps', 
        devices=1,
        limit_train_batches=50, # comment in for training, running validation every 30 batches
        #fast_dev_run=True,  # comment in to check that network or dataset has no serious bugs
        enable_model_summary=True,
        gradient_clip_val=0.1,
        callbacks=[lr_logger, early_stop_callback],
        logger=logger,
    )

My trainer after the error went away


trainer = pl.Trainer(
        max_epochs=45,
        accelerator='mps', 
        devices=1,
        limit_train_batches=50, # comment in for training, running validation every 30 batches
        #fast_dev_run=True,  # comment in to check that network or dataset has no serious bugs
        enable_model_summary=True,
        gradient_clip_val=0.05,
        callbacks=[lr_logger, early_stop_callback],
        logger=logger,
    )
tobiaswandersleb commented 11 months ago

@TimothyLiuu were you able to solve the problem after all?

We are training a TFT (Temporal Fusion Transformer) model on 4 GPUs using AWS SageMaker training jobs. Here is an overview of what we have done so far:

  1. Looking at the error trace, we found that the failure is accompanied by ERROR - An error occurred: DeadLock detected from rank: 2.
     We therefore tried training on a single GPU, but that did not resolve the problem; we got the same error, just without the deadlock exception.
  2. After some digging, we found that this error normally occurs during the backward pass when no gradient tensor is available for a variable. We therefore tried setting requires_grad = True via the context manager, i.e. torch.set_grad_enabled(True).
  3. Our last guess is that our loss tensor contains non-finite values, but we don't know how to prevent that (see the sketch just below). We tried different values of gradient clipping, to no avail.
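
For point 3, a minimal sketch of how one might confirm the non-finite-value hypothesis on a single batch before backprop (train_dataloader is assumed to be the training dataloader built from the TimeSeriesDataSet, and the check only covers a single-target setup):

import torch

# One batch: x is a dict of tensors, y is a (target, weight) tuple for a single target.
x, y = next(iter(train_dataloader))
target = y[0]

# A single NaN or inf in the inputs or the target is enough to make the loss non-finite.
for name, tensor in {**x, "target": target}.items():
    if torch.is_tensor(tensor) and tensor.is_floating_point():
        if not torch.isfinite(tensor).all():
            print(f"non-finite values in {name}")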

We are using pytorch-forecasting 0.10.3 and pytorch-lightning 1.7.7.

Any feedback would be appreciated. :)

pytorch_lightning.utilities.exceptions.DeadlockDetectedException: DeadLock detected from rank: 2
Traceback (most recent call last):
  File "/usr/local/lib/python3.8/site-packages/pytorch_lightning/trainer/trainer.py", line 650, in _call_and_handle_interrupt
    return trainer_fn(*args, **kwargs)
  File "/usr/local/lib/python3.8/site-packages/pytorch_lightning/trainer/trainer.py", line 735, in _fit_impl
    results = self._run(model, ckpt_path=self.ckpt_path)
  File "/usr/local/lib/python3.8/site-packages/pytorch_lightning/trainer/trainer.py", line 1166, in _run
    results = self._run_stage()
  File "/usr/local/lib/python3.8/site-packages/pytorch_lightning/trainer/trainer.py", line 1252, in _run_stage
    return self._run_train()
  File "/usr/local/lib/python3.8/site-packages/pytorch_lightning/trainer/trainer.py", line 1283, in _run_train
    self.fit_loop.run()
  File "/usr/local/lib/python3.8/site-packages/pytorch_lightning/loops/loop.py", line 200, in run
    self.advance(*args, **kwargs)
  File "/usr/local/lib/python3.8/site-packages/pytorch_lightning/loops/fit_loop.py", line 271, in advance
    self._outputs = self.epoch_loop.run(self._data_fetcher)
  File "/usr/local/lib/python3.8/site-packages/pytorch_lightning/loops/loop.py", line 200, in run
    self.advance(*args, **kwargs)
  File "/usr/local/lib/python3.8/site-packages/pytorch_lightning/loops/epoch/training_epoch_loop.py", line 203, in advance
    batch_output = self.batch_loop.run(kwargs)
  File "/usr/local/lib/python3.8/site-packages/pytorch_lightning/loops/loop.py", line 200, in run
    self.advance(*args, **kwargs)
  File "/usr/local/lib/python3.8/site-packages/pytorch_lightning/loops/batch/training_batch_loop.py", line 87, in advance
    outputs = self.optimizer_loop.run(optimizers, kwargs)
  File "/usr/local/lib/python3.8/site-packages/pytorch_lightning/loops/loop.py", line 200, in run
    self.advance(*args, **kwargs)
  File "/usr/local/lib/python3.8/site-packages/pytorch_lightning/loops/optimization/optimizer_loop.py", line 201, in advance
    result = self._run_optimization(kwargs, self._optimizers[self.optim_progress.optimizer_position])
  File "/usr/local/lib/python3.8/site-packages/pytorch_lightning/loops/optimization/optimizer_loop.py", line 248, in _run_optimization
    self._optimizer_step(optimizer, opt_idx, kwargs.get("batch_idx", 0), closure)
  File "/usr/local/lib/python3.8/site-packages/pytorch_lightning/loops/optimization/optimizer_loop.py", line 358, in _optimizer_step
    self.trainer._call_lightning_module_hook(
  File "/usr/local/lib/python3.8/site-packages/pytorch_lightning/trainer/trainer.py", line 1550, in _call_lightning_module_hook
    output = fn(*args, **kwargs)
  File "/usr/local/lib/python3.8/site-packages/pytorch_lightning/core/module.py", line 1705, in optimizer_step
    optimizer.step(closure=optimizer_closure)
  File "/usr/local/lib/python3.8/site-packages/pytorch_lightning/core/optimizer.py", line 168, in step
    step_output = self._strategy.optimizer_step(self._optimizer, self._optimizer_idx, closure, **kwargs)
  File "/usr/local/lib/python3.8/site-packages/pytorch_lightning/strategies/ddp.py", line 289, in optimizer_step
    optimizer_output = super().optimizer_step(optimizer, opt_idx, closure, model, **kwargs)
  File "/usr/local/lib/python3.8/site-packages/pytorch_lightning/strategies/strategy.py", line 216, in optimizer_step
    return self.precision_plugin.optimizer_step(model, optimizer, opt_idx, closure, **kwargs)
  File "/usr/local/lib/python3.8/site-packages/pytorch_lightning/plugins/precision/precision_plugin.py", line 153, in optimizer_step
    return optimizer.step(closure=closure, **kwargs)
  File "/usr/local/lib/python3.8/site-packages/torch/optim/optimizer.py", line 113, in wrapper
    return func(*args, **kwargs)
  File "/usr/local/lib/python3.8/site-packages/pytorch_forecasting/optim.py", line 143, in step
    _ = closure()
  File "/usr/local/lib/python3.8/site-packages/pytorch_lightning/plugins/precision/precision_plugin.py", line 138, in _wrap_closure
    closure_result = closure()
  File "/usr/local/lib/python3.8/site-packages/pytorch_lightning/loops/optimization/optimizer_loop.py", line 146, in __call__
    self._result = self.closure(*args, **kwargs)
  File "/usr/local/lib/python3.8/site-packages/pytorch_lightning/loops/optimization/optimizer_loop.py", line 141, in closure
    self._backward_fn(step_output.closure_loss)
  File "/usr/local/lib/python3.8/site-packages/pytorch_lightning/loops/optimization/optimizer_loop.py", line 304, in backward_fn
    self.trainer._call_strategy_hook("backward", loss, optimizer, opt_idx)
  File "/usr/local/lib/python3.8/site-packages/pytorch_lightning/trainer/trainer.py", line 1704, in _call_strategy_hook
    output = fn(*args, **kwargs)
  File "/usr/local/lib/python3.8/site-packages/pytorch_lightning/strategies/strategy.py", line 191, in backward
    self.precision_plugin.backward(self.lightning_module, closure_loss, optimizer, optimizer_idx, *args, **kwargs)
  File "/usr/local/lib/python3.8/site-packages/pytorch_lightning/plugins/precision/precision_plugin.py", line 80, in backward
    model.backward(closure_loss, optimizer, optimizer_idx, *args, **kwargs)
  File "/usr/local/lib/python3.8/site-packages/pytorch_lightning/core/module.py", line 1450, in backward
    loss.backward(*args, **kwargs)
  File "/usr/local/lib/python3.8/site-packages/torch/_tensor.py", line 396, in backward
    torch.autograd.backward(self, gradient, retain_graph, create_graph, inputs=inputs)
  File "/usr/local/lib/python3.8/site-packages/torch/autograd/__init__.py", line 173, in backward
    Variable._execution_engine.run_backward(  # Calls into the C++ engine to run the backward pass

2023-11-22T16:33:55.181+01:00 RuntimeError: element 0 of tensors does not require grad and does not have a grad_fn

ari62 commented 3 days ago

I get this error when running on mps, but not on cpu. The y_pred passed to my loss function is all NaNs.
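
If it is only the mps backend that produces NaNs, a rough way to confirm (a sketch; tft and train_dataloader stand for the model and dataloader from the earlier snippets, and it assumes the network output exposes a "prediction" field, as the TFT does) is to run one forward pass per device and compare:

import torch

x, y = next(iter(train_dataloader))

for device in ("cpu", "mps"):
    if device == "mps" and not torch.backends.mps.is_available():
        continue
    model = tft.to(device)
    batch = {k: v.to(device) for k, v in x.items() if torch.is_tensor(v)}
    with torch.no_grad():
        prediction = model(batch)["prediction"]
    print(device, "NaNs in prediction:", torch.isnan(prediction).any().item())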