sktime / pytorch-forecasting

Time series forecasting with PyTorch
https://pytorch-forecasting.readthedocs.io/
MIT License

NaN when training DeepAR #750

Open y-yang42 opened 2 years ago

y-yang42 commented 2 years ago

Expected behavior

I am trying to run the DeepAR model on my data set.

Actual behavior

I get a ValueError, and the RNN weights and outputs become NaN. A typical example is a time series that is all zeros except for a very large spike at a single point, as shown below.

It seems to be related to max_encoder_length or max_prediction_length, since changing those values influences whether the error occurs. Using a different target_normalizer also influences whether the error appears (see the normalizer sketch after the traceback below).

Code to reproduce the problem

import numpy as np
import pandas as pd
import random

import torch
import pytorch_lightning as pl
from pytorch_forecasting import TimeSeriesDataSet
from pytorch_forecasting.data.encoders import EncoderNormalizer
from pytorch_forecasting.models import DeepAR

import warnings
warnings.filterwarnings("ignore")

length = 100
np.random.seed(3)
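# single series: all zeros except one very large spike at index 70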
data = np.zeros(length)
df = pd.DataFrame({'value':data, 'time_idx':range(length)})
df.iloc[70,0] = np.random.rand(1)*10**5
df['group']=0
print(df)

random.seed(0)
torch.manual_seed(0)
np.random.seed(0)

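# encoder/prediction lengths; changing these affects whether the error appears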
max_encoder_length = 37
max_prediction_length = 6
training_cutoff = df["time_idx"].max() - max_prediction_length

training = TimeSeriesDataSet(
    df[lambda x: x.time_idx < training_cutoff],
    time_idx="time_idx",
    target="value",
    group_ids=["group"],
    max_encoder_length=max_encoder_length,
    max_prediction_length=max_prediction_length,
    time_varying_unknown_reals=["value"],
    time_varying_known_reals=["time_idx"],
)

validation = TimeSeriesDataSet.from_dataset(training, df, min_prediction_idx=training.index.time.max() + 1, stop_randomization=True)
batch_size = 4
train_dataloader = training.to_dataloader(train=True, batch_size=batch_size, num_workers=2)
val_dataloader = validation.to_dataloader(train=False, batch_size=batch_size, num_workers=2)

net = DeepAR.from_dataset(
        training,
        learning_rate=0.001
    )
print(f"Number of parameters in network: {net.size()/1e3:.1f}k")

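# trainer with gradient clipping (gradient_clip_val) already enabled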
trainer = pl.Trainer(
    gpus=0, 
    gradient_clip_val=0.1, 
    max_epochs=5
)

trainer.fit(
    net,
    train_dataloader=train_dataloader,
    val_dataloaders=val_dataloader,
)

Traceback:

ValueError                                Traceback (most recent call last)
/tmp/ipykernel_25063/3740955615.py in <module>
----> 1 trainer.fit(
      2     net,
      3     train_dataloader=train_dataloader,
      4     val_dataloaders=val_dataloader,
      5 )

~/.conda/envs/pytorch-cu102/lib/python3.9/site-packages/pytorch_lightning/trainer/trainer.py in fit(self, model, train_dataloaders, val_dataloaders, datamodule, train_dataloader)
    550         self.checkpoint_connector.resume_start()
    551 
--> 552         self._run(model)
    553 
    554         assert self.state.stopped

~/.conda/envs/pytorch-cu102/lib/python3.9/site-packages/pytorch_lightning/trainer/trainer.py in _run(self, model)
    920 
    921         # dispatch `start_training` or `start_evaluating` or `start_predicting`
--> 922         self._dispatch()
    923 
    924         # plugin will finalized fitting (e.g. ddp_spawn will load trained model)

~/.conda/envs/pytorch-cu102/lib/python3.9/site-packages/pytorch_lightning/trainer/trainer.py in _dispatch(self)
    988             self.accelerator.start_predicting(self)
    989         else:
--> 990             self.accelerator.start_training(self)
    991 
    992     def run_stage(self):

~/.conda/envs/pytorch-cu102/lib/python3.9/site-packages/pytorch_lightning/accelerators/accelerator.py in start_training(self, trainer)
     90 
     91     def start_training(self, trainer: "pl.Trainer") -> None:
---> 92         self.training_type_plugin.start_training(trainer)
     93 
     94     def start_evaluating(self, trainer: "pl.Trainer") -> None:

~/.conda/envs/pytorch-cu102/lib/python3.9/site-packages/pytorch_lightning/plugins/training_type/training_type_plugin.py in start_training(self, trainer)
    159     def start_training(self, trainer: "pl.Trainer") -> None:
    160         # double dispatch to initiate the training loop
--> 161         self._results = trainer.run_stage()
    162 
    163     def start_evaluating(self, trainer: "pl.Trainer") -> None:

~/.conda/envs/pytorch-cu102/lib/python3.9/site-packages/pytorch_lightning/trainer/trainer.py in run_stage(self)
    998         if self.predicting:
    999             return self._run_predict()
-> 1000         return self._run_train()
   1001 
   1002     def _pre_training_routine(self):

~/.conda/envs/pytorch-cu102/lib/python3.9/site-packages/pytorch_lightning/trainer/trainer.py in _run_train(self)
   1047             # reset trainer on this loop and all child loops in case user connected a custom loop
   1048             self.fit_loop.trainer = self
-> 1049             self.fit_loop.run()
   1050         except KeyboardInterrupt:
   1051             rank_zero_warn("Detected KeyboardInterrupt, attempting graceful shutdown...")

~/.conda/envs/pytorch-cu102/lib/python3.9/site-packages/pytorch_lightning/loops/base.py in run(self, *args, **kwargs)
    109             try:
    110                 self.on_advance_start(*args, **kwargs)
--> 111                 self.advance(*args, **kwargs)
    112                 self.on_advance_end()
    113                 self.iteration_count += 1

~/.conda/envs/pytorch-cu102/lib/python3.9/site-packages/pytorch_lightning/loops/fit_loop.py in advance(self)
    198         with self.trainer.profiler.profile("run_training_epoch"):
    199             # run train epoch
--> 200             epoch_output = self.epoch_loop.run(train_dataloader)
    201 
    202             if epoch_output is None:

~/.conda/envs/pytorch-cu102/lib/python3.9/site-packages/pytorch_lightning/loops/base.py in run(self, *args, **kwargs)
    109             try:
    110                 self.on_advance_start(*args, **kwargs)
--> 111                 self.advance(*args, **kwargs)
    112                 self.on_advance_end()
    113                 self.iteration_count += 1

~/.conda/envs/pytorch-cu102/lib/python3.9/site-packages/pytorch_lightning/loops/epoch/training_epoch_loop.py in advance(self, dataloader_iter, **kwargs)
    128 
    129         with self.trainer.profiler.profile("run_training_batch"):
--> 130             batch_output = self.batch_loop.run(batch, self.iteration_count, self._dataloader_idx)
    131             self.batches_seen += 1
    132 

~/.conda/envs/pytorch-cu102/lib/python3.9/site-packages/pytorch_lightning/loops/batch/training_batch_loop.py in run(self, batch, batch_idx, dataloader_idx)
     98         self.trainer.fit_loop.epoch_loop.batch_progress.increment_started()
     99 
--> 100         super().run(batch, batch_idx, dataloader_idx)
    101         output = AttributeDict(signal=0, training_step_output=self.batch_outputs)
    102         self.batch_outputs = None  # free memory

~/.conda/envs/pytorch-cu102/lib/python3.9/site-packages/pytorch_lightning/loops/base.py in run(self, *args, **kwargs)
    109             try:
    110                 self.on_advance_start(*args, **kwargs)
--> 111                 self.advance(*args, **kwargs)
    112                 self.on_advance_end()
    113                 self.iteration_count += 1

~/.conda/envs/pytorch-cu102/lib/python3.9/site-packages/pytorch_lightning/loops/batch/training_batch_loop.py in advance(self, batch, batch_idx, dataloader_idx)
    145                 self.optim_progress.optimizer_idx = opt_idx
    146 
--> 147                 result = self._run_optimization(batch_idx, split_batch, opt_idx, optimizer)
    148                 if result:
    149                     self.batch_outputs[opt_idx].append(deepcopy(result.training_step_output))

~/.conda/envs/pytorch-cu102/lib/python3.9/site-packages/pytorch_lightning/loops/batch/training_batch_loop.py in _run_optimization(self, batch_idx, split_batch, opt_idx, optimizer)
    199         else:
    200             if self.trainer.lightning_module.automatic_optimization:
--> 201                 self._optimizer_step(optimizer, opt_idx, batch_idx, closure)
    202             else:
    203                 result = self._training_step(split_batch, batch_idx, opt_idx, self._hiddens)

~/.conda/envs/pytorch-cu102/lib/python3.9/site-packages/pytorch_lightning/loops/batch/training_batch_loop.py in _optimizer_step(self, optimizer, opt_idx, batch_idx, train_step_and_backward_closure)
    393 
    394         # model hook
--> 395         model_ref.optimizer_step(
    396             self.trainer.current_epoch,
    397             batch_idx,

~/.conda/envs/pytorch-cu102/lib/python3.9/site-packages/pytorch_lightning/core/lightning.py in optimizer_step(self, epoch, batch_idx, optimizer, optimizer_idx, optimizer_closure, on_tpu, using_native_amp, using_lbfgs)
   1614 
   1615         """
-> 1616         optimizer.step(closure=optimizer_closure)
   1617 
   1618     def optimizer_zero_grad(self, epoch: int, batch_idx: int, optimizer: Optimizer, optimizer_idx: int):

~/.conda/envs/pytorch-cu102/lib/python3.9/site-packages/pytorch_lightning/core/optimizer.py in step(self, closure, **kwargs)
    204             profiler_name = f"optimizer_step_and_closure_{self._optimizer_idx}"
    205 
--> 206         self.__optimizer_step(closure=closure, profiler_name=profiler_name, **kwargs)
    207         self._total_optimizer_step_calls += 1
    208 

~/.conda/envs/pytorch-cu102/lib/python3.9/site-packages/pytorch_lightning/core/optimizer.py in __optimizer_step(self, closure, profiler_name, **kwargs)
    126 
    127         with trainer.profiler.profile(profiler_name):
--> 128             trainer.accelerator.optimizer_step(self._optimizer, self._optimizer_idx, lambda_closure=closure, **kwargs)
    129 
    130     def step(self, closure: Optional[Callable] = None, **kwargs):

~/.conda/envs/pytorch-cu102/lib/python3.9/site-packages/pytorch_lightning/accelerators/accelerator.py in optimizer_step(self, optimizer, opt_idx, lambda_closure, **kwargs)
    294         )
    295         if make_optimizer_step:
--> 296             self.run_optimizer_step(optimizer, opt_idx, lambda_closure, **kwargs)
    297         self.precision_plugin.post_optimizer_step(optimizer, opt_idx)
    298         self.training_type_plugin.post_optimizer_step(optimizer, opt_idx, **kwargs)

~/.conda/envs/pytorch-cu102/lib/python3.9/site-packages/pytorch_lightning/accelerators/accelerator.py in run_optimizer_step(self, optimizer, optimizer_idx, lambda_closure, **kwargs)
    301         self, optimizer: Optimizer, optimizer_idx: int, lambda_closure: Callable, **kwargs: Any
    302     ) -> None:
--> 303         self.training_type_plugin.optimizer_step(optimizer, lambda_closure=lambda_closure, **kwargs)
    304 
    305     def optimizer_zero_grad(self, current_epoch: int, batch_idx: int, optimizer: Optimizer, opt_idx: int) -> None:

~/.conda/envs/pytorch-cu102/lib/python3.9/site-packages/pytorch_lightning/plugins/training_type/training_type_plugin.py in optimizer_step(self, optimizer, lambda_closure, **kwargs)
    224 
    225     def optimizer_step(self, optimizer: torch.optim.Optimizer, lambda_closure: Callable, **kwargs):
--> 226         optimizer.step(closure=lambda_closure, **kwargs)
    227 
    228     @property

~/.local/lib/python3.9/site-packages/torch/optim/optimizer.py in wrapper(*args, **kwargs)
     86                 profile_name = "Optimizer.step#{}.step".format(obj.__class__.__name__)
     87                 with torch.autograd.profiler.record_function(profile_name):
---> 88                     return func(*args, **kwargs)
     89             return wrapper
     90 

~/.conda/envs/pytorch-cu102/lib/python3.9/site-packages/pytorch_forecasting/optim.py in step(self, closure)
    129             closure: A closure that reevaluates the model and returns the loss.
    130         """
--> 131         _ = closure()
    132         loss = None
    133         # note - below is commented out b/c I have other work that passes back

~/.conda/envs/pytorch-cu102/lib/python3.9/site-packages/pytorch_lightning/loops/batch/training_batch_loop.py in _training_step_and_backward_closure(self, split_batch, batch_idx, opt_idx, optimizer, hiddens, return_result)
    233         """
    234 
--> 235         result = self.training_step_and_backward(split_batch, batch_idx, opt_idx, optimizer, hiddens)
    236         if result is not None:
    237             return_result.update(result)

~/.conda/envs/pytorch-cu102/lib/python3.9/site-packages/pytorch_lightning/loops/batch/training_batch_loop.py in training_step_and_backward(self, split_batch, batch_idx, opt_idx, optimizer, hiddens)
    534         with self.trainer.profiler.profile("training_step_and_backward"):
    535             # lightning module hook
--> 536             result = self._training_step(split_batch, batch_idx, opt_idx, hiddens)
    537 
    538             if not self._skip_backward and self.trainer.lightning_module.automatic_optimization:

~/.conda/envs/pytorch-cu102/lib/python3.9/site-packages/pytorch_lightning/loops/batch/training_batch_loop.py in _training_step(self, split_batch, batch_idx, opt_idx, hiddens)
    304             model_ref._current_fx_name = "training_step"
    305             with self.trainer.profiler.profile("training_step"):
--> 306                 training_step_output = self.trainer.accelerator.training_step(step_kwargs)
    307                 self.trainer.accelerator.post_training_step()
    308 

~/.conda/envs/pytorch-cu102/lib/python3.9/site-packages/pytorch_lightning/accelerators/accelerator.py in training_step(self, step_kwargs)
    191         """
    192         with self.precision_plugin.train_step_context(), self.training_type_plugin.train_step_context():
--> 193             return self.training_type_plugin.training_step(*step_kwargs.values())
    194 
    195     def post_training_step(self) -> None:

~/.conda/envs/pytorch-cu102/lib/python3.9/site-packages/pytorch_lightning/plugins/training_type/training_type_plugin.py in training_step(self, *args, **kwargs)
    170 
    171     def training_step(self, *args, **kwargs):
--> 172         return self.model.training_step(*args, **kwargs)
    173 
    174     def post_training_step(self):

~/.conda/envs/pytorch-cu102/lib/python3.9/site-packages/pytorch_forecasting/models/base_model.py in training_step(self, batch, batch_idx)
    359         """
    360         x, y = batch
--> 361         log, out = self.step(x, y, batch_idx)
    362         log.update(self.create_log(x, y, out, batch_idx))
    363         return log

~/.conda/envs/pytorch-cu102/lib/python3.9/site-packages/pytorch_forecasting/models/base_model.py in step(self, x, y, batch_idx, **kwargs)
    499                 )
    500             else:
--> 501                 loss = self.loss(prediction, y)
    502 
    503         self.log(f"{['val', 'train'][self.training]}_loss", loss, on_step=self.training, on_epoch=True, prog_bar=True)

~/.local/lib/python3.9/site-packages/torch/nn/modules/module.py in _call_impl(self, *input, **kwargs)
   1100         if not (self._backward_hooks or self._forward_hooks or self._forward_pre_hooks or _global_backward_hooks
   1101                 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1102             return forward_call(*input, **kwargs)
   1103         # Do not call functions when jit is used
   1104         full_backward_hooks, non_full_backward_hooks = [], []

~/.local/lib/python3.9/site-packages/torchmetrics/metric.py in forward(self, *args, **kwargs)
    195 
    196         with torch.no_grad():
--> 197             self.update(*args, **kwargs)
    198 
    199         if self.compute_on_step:

~/.local/lib/python3.9/site-packages/torchmetrics/metric.py in wrapped_func(*args, **kwargs)
    253             self._computed = None
    254             self._update_called = True
--> 255             return update(*args, **kwargs)
    256 
    257         return wrapped_func

~/.conda/envs/pytorch-cu102/lib/python3.9/site-packages/pytorch_forecasting/metrics.py in update(self, y_pred, target)
    534             lengths = torch.full((target.size(0),), fill_value=target.size(1), dtype=torch.long, device=target.device)
    535 
--> 536         losses = self.loss(y_pred, target)
    537         # weight samples
    538         if weight is not None:

~/.conda/envs/pytorch-cu102/lib/python3.9/site-packages/pytorch_forecasting/metrics.py in loss(self, y_pred, y_actual)
    959             torch.Tensor: metric value on which backpropagation can be applied
    960         """
--> 961         distribution = self.map_x_to_distribution(y_pred)
    962         loss = -distribution.log_prob(y_actual)
    963         return loss

~/.conda/envs/pytorch-cu102/lib/python3.9/site-packages/pytorch_forecasting/metrics.py in map_x_to_distribution(self, x)
   1033 
   1034     def map_x_to_distribution(self, x: torch.Tensor) -> distributions.Normal:
-> 1035         return self.distribution_class(loc=x[..., 0], scale=x[..., 1])
   1036 
   1037     def rescale_parameters(

~/.local/lib/python3.9/site-packages/torch/distributions/normal.py in __init__(self, loc, scale, validate_args)
     48         else:
     49             batch_shape = self.loc.size()
---> 50         super(Normal, self).__init__(batch_shape, validate_args=validate_args)
     51 
     52     def expand(self, batch_shape, _instance=None):

~/.local/lib/python3.9/site-packages/torch/distributions/distribution.py in __init__(self, batch_shape, event_shape, validate_args)
     53                 valid = constraint.check(value)
     54                 if not valid.all():
---> 55                     raise ValueError(
     56                         f"Expected parameter {param} "
     57                         f"({type(value).__name__} of shape {tuple(value.shape)}) "

ValueError: Expected parameter loc (Tensor of shape (4, 6)) of distribution Normal(loc: torch.Size([4, 6]), scale: torch.Size([4, 6])) to satisfy the constraint Real(), but found invalid values:
tensor([[nan, nan, nan, nan, nan, nan],
        [nan, nan, nan, nan, nan, nan],
        [nan, nan, nan, nan, nan, nan],
        [nan, nan, nan, nan, nan, nan]], requires_grad=True)
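For reference, here is a minimal sketch of what I mean by changing the target_normalizer (same df, cutoff and lengths as in the reproduction script above; GroupNormalizer is just one of the variants I tried, not a suggested fix):

from pytorch_forecasting.data.encoders import GroupNormalizer

# identical dataset, but with an explicit target_normalizer instead of the default
training_alt = TimeSeriesDataSet(
    df[lambda x: x.time_idx < training_cutoff],
    time_idx="time_idx",
    target="value",
    group_ids=["group"],
    max_encoder_length=max_encoder_length,
    max_prediction_length=max_prediction_length,
    time_varying_unknown_reals=["value"],
    time_varying_known_reals=["time_idx"],
    # e.g. EncoderNormalizer() (imported above) or GroupNormalizer(groups=["group"])
    target_normalizer=GroupNormalizer(groups=["group"]),
)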
lagvna commented 2 years ago

I'm facing the same issue - have you reached any conclusions as to why this is happening?

15m43lk4155y commented 2 years ago

You can try gradient clipping; it fixed it in my case.
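In PyTorch Lightning this is the gradient_clip_val argument of the Trainer; a minimal sketch, assuming the same setup as in the original post:

trainer = pl.Trainer(
    gpus=0,
    max_epochs=5,
    gradient_clip_val=0.1,  # clip gradient norm during training
)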

Titaniumtown commented 1 year ago

Exact same issue; resolved by using gradient clipping. Thanks @15m43lk4155y

zangyu00544 commented 4 months ago

Exact same issue, but gradient clipping does not work for me. I have tried gradient_clip_val=0.1, 0.5, 0.6, and 1.0. Do you mean this gradient clipping, @15m43lk4155y?

trainer = pl.Trainer(
    gpus=[0] if torch.cuda.is_available() else None,
    max_epochs=max_epochs,
    gradient_clip_val=0.1,  # <----- this arg?
    callbacks=[early_stop_callback, model_checkpt],
    log_every_n_steps=50,
)
do you mean the this gradient clipping? @15m43lk4155y trainer = pl.Trainer( gpus=[0] if torch.cuda.is_available() else None, max_epochs=max_epochs, gradient_clip_val=0.1, <-----this arg ? callbacks=[early_stop_callback, model_checkpt], log_every_n_steps=50)