jdb78 / pytorch-forecasting

Time series forecasting with PyTorch
https://pytorch-forecasting.readthedocs.io/
MIT License

Retrain NBEATs Model #638

Closed aamrb96 closed 3 years ago

aamrb96 commented 3 years ago

Hi all,

I am currently working with the NBEATs implementation in this package. I wrapped all the training code from the tutorial into a function called `nbeats_train`:

import pytorch_lightning as pl
from pytorch_lightning.callbacks import EarlyStopping
from pytorch_forecasting import NBeats, TimeSeriesDataSet
from pytorch_forecasting.data import NaNLabelEncoder


def nbeats_train(data,
                 context_length: int = context_length,
                 prediction_length: int = prediction_length,
                 batch_size: int = 128,
                 trained_model=None):

    training_cutoff = data["time_idx"].max() - prediction_length

    training = TimeSeriesDataSet(
        data[lambda x: x.time_idx <= training_cutoff],
        time_idx="time_idx",
        target="value",
        categorical_encoders={"identifier": NaNLabelEncoder().fit(data["identifier"])},
        group_ids=["identifier"],

        # the only unknown variable is "value"; N-BEATS cannot use additional covariates
        time_varying_unknown_reals=["value"],
        allow_missing_timesteps=True,
        max_encoder_length=context_length,
        max_prediction_length=prediction_length,
    )

    validation = TimeSeriesDataSet.from_dataset(training, data, min_prediction_idx=training_cutoff + 1)
    train_dataloader = training.to_dataloader(train=True, batch_size=batch_size, num_workers=4)
    val_dataloader = validation.to_dataloader(train=False, batch_size=batch_size, num_workers=4)

    pl.seed_everything(42)
    trainer = pl.Trainer(gpus=0, gradient_clip_val=0.01)
    net = NBeats.from_dataset(training, learning_rate=3e-2, weight_decay=1e-2, widths=[32, 512], backcast_loss_ratio=0.1)

    res = trainer.tuner.lr_find(net, train_dataloader=train_dataloader, val_dataloaders=val_dataloader, min_lr=1e-5)
    print(f"suggested learning rate: {res.suggestion()}")
    fig = res.plot(show=True, suggest=True)
    fig.show()
    net.hparams.learning_rate = res.suggestion()

    early_stop_callback = EarlyStopping(monitor="val_loss", min_delta=1e-4, patience=10, verbose=False, mode="min")

    trainer = pl.Trainer(
        max_epochs=100,
        gpus=0,
        weights_summary="top",
        gradient_clip_val=0.01,
        callbacks=[early_stop_callback],
        limit_train_batches=30,
    )
    # `trained_model` is provided after the first dataset has been used for training.
    if trained_model is None:

        net = NBeats.from_dataset(
            training,
            learning_rate=4e-3,
            log_interval=10,
            log_val_interval=1,
            weight_decay=1e-2,
            widths=[32, 512],
            backcast_loss_ratio=1.0,
        )

    else:
        net = trained_model

    trainer.fit(
        net,
        train_dataloader=train_dataloader,
        val_dataloaders=val_dataloader,
    )

    return net

I would like to train it on three very large datasets (the datasets cannot be combined into one large dataset due to hardware limitations).

Thus I wrote a loop that iterates over my data dictionary (`m4_data = {"dataset_name": DataFrame}`). After the first iteration, each training run should start from the weights produced by the previous iteration:

for iteration, key in enumerate(m4_data.keys()):
    print("{}/{} Done".format(iteration + 1, len(m4_data)))

    if iteration == 0:
        nbeats_trained = nbeats_train(data=m4_data[key])

    else:
        nbeats_trained = nbeats_train(data=m4_data[key],
                                      trained_model=nbeats_trained)

However, I receive the following error:

RuntimeError                              Traceback (most recent call last)
/tmp/ipykernel_121057/432736534.py in <module>
     10             param.requires_grad = False
     11 
---> 12         nbeats_trained = nbeats_train(data = m4_data[key],
     13                                      trained_model = nbeats_trained)
     14 

/tmp/ipykernel_121057/3884378714.py in nbeats_train(data, context_length, prediction_length, batch_size, trained_model)
     62         net = trained_model
     63 
---> 64     trainer.fit(
     65         net,
     66         train_dataloader=train_dataloader,

~/Desktop/Python Projects/nbeats/venv/lib/python3.8/site-packages/pytorch_lightning/trainer/trainer.py in fit(self, model, train_dataloader, val_dataloaders, datamodule)
    458         )
    459 
--> 460         self._run(model)
    461 
    462         assert self.state.stopped

~/Desktop/Python Projects/nbeats/venv/lib/python3.8/site-packages/pytorch_lightning/trainer/trainer.py in _run(self, model)
    756 
    757         # dispatch `start_training` or `start_evaluating` or `start_predicting`
--> 758         self.dispatch()
    759 
    760         # plugin will finalized fitting (e.g. ddp_spawn will load trained model)

~/Desktop/Python Projects/nbeats/venv/lib/python3.8/site-packages/pytorch_lightning/trainer/trainer.py in dispatch(self)
    797             self.accelerator.start_predicting(self)
    798         else:
--> 799             self.accelerator.start_training(self)
    800 
    801     def run_stage(self):

~/Desktop/Python Projects/nbeats/venv/lib/python3.8/site-packages/pytorch_lightning/accelerators/accelerator.py in start_training(self, trainer)
     94 
     95     def start_training(self, trainer: 'pl.Trainer') -> None:
---> 96         self.training_type_plugin.start_training(trainer)
     97 
     98     def start_evaluating(self, trainer: 'pl.Trainer') -> None:

~/Desktop/Python Projects/nbeats/venv/lib/python3.8/site-packages/pytorch_lightning/plugins/training_type/training_type_plugin.py in start_training(self, trainer)
    142     def start_training(self, trainer: 'pl.Trainer') -> None:
    143         # double dispatch to initiate the training loop
--> 144         self._results = trainer.run_stage()
    145 
    146     def start_evaluating(self, trainer: 'pl.Trainer') -> None:

~/Desktop/Python Projects/nbeats/venv/lib/python3.8/site-packages/pytorch_lightning/trainer/trainer.py in run_stage(self)
    807         if self.predicting:
    808             return self.run_predict()
--> 809         return self.run_train()
    810 
    811     def _pre_training_routine(self):

~/Desktop/Python Projects/nbeats/venv/lib/python3.8/site-packages/pytorch_lightning/trainer/trainer.py in run_train(self)
    869                 with self.profiler.profile("run_training_epoch"):
    870                     # run train epoch
--> 871                     self.train_loop.run_training_epoch()
    872 
    873                 if self.max_steps and self.max_steps <= self.global_step:

~/Desktop/Python Projects/nbeats/venv/lib/python3.8/site-packages/pytorch_lightning/trainer/training_loop.py in run_training_epoch(self)
    497             # ------------------------------------
    498             with self.trainer.profiler.profile("run_training_batch"):
--> 499                 batch_output = self.run_training_batch(batch, batch_idx, dataloader_idx)
    500 
    501             # when returning -1 from train_step, we end epoch early

~/Desktop/Python Projects/nbeats/venv/lib/python3.8/site-packages/pytorch_lightning/trainer/training_loop.py in run_training_batch(self, batch, batch_idx, dataloader_idx)
    736 
    737                         # optimizer step
--> 738                         self.optimizer_step(optimizer, opt_idx, batch_idx, train_step_and_backward_closure)
    739                         if len(self.trainer.optimizers) > 1:
    740                             # revert back to previous state

~/Desktop/Python Projects/nbeats/venv/lib/python3.8/site-packages/pytorch_lightning/trainer/training_loop.py in optimizer_step(self, optimizer, opt_idx, batch_idx, train_step_and_backward_closure)
    432 
    433         # model hook
--> 434         model_ref.optimizer_step(
    435             self.trainer.current_epoch,
    436             batch_idx,

~/Desktop/Python Projects/nbeats/venv/lib/python3.8/site-packages/pytorch_lightning/core/lightning.py in optimizer_step(self, epoch, batch_idx, optimizer, optimizer_idx, optimizer_closure, on_tpu, using_native_amp, using_lbfgs)
   1401 
   1402         """
-> 1403         optimizer.step(closure=optimizer_closure)
   1404 
   1405     def optimizer_zero_grad(self, epoch: int, batch_idx: int, optimizer: Optimizer, optimizer_idx: int):

~/Desktop/Python Projects/nbeats/venv/lib/python3.8/site-packages/pytorch_lightning/core/optimizer.py in step(self, closure, *args, **kwargs)
    212             profiler_name = f"optimizer_step_and_closure_{self._optimizer_idx}"
    213 
--> 214         self.__optimizer_step(*args, closure=closure, profiler_name=profiler_name, **kwargs)
    215         self._total_optimizer_step_calls += 1
    216 

~/Desktop/Python Projects/nbeats/venv/lib/python3.8/site-packages/pytorch_lightning/core/optimizer.py in __optimizer_step(self, closure, profiler_name, **kwargs)
    132 
    133         with trainer.profiler.profile(profiler_name):
--> 134             trainer.accelerator.optimizer_step(optimizer, self._optimizer_idx, lambda_closure=closure, **kwargs)
    135 
    136     def step(self, *args, closure: Optional[Callable] = None, **kwargs):

~/Desktop/Python Projects/nbeats/venv/lib/python3.8/site-packages/pytorch_lightning/accelerators/accelerator.py in optimizer_step(self, optimizer, opt_idx, lambda_closure, **kwargs)
    327         )
    328         if make_optimizer_step:
--> 329             self.run_optimizer_step(optimizer, opt_idx, lambda_closure, **kwargs)
    330         self.precision_plugin.post_optimizer_step(optimizer, opt_idx)
    331         self.training_type_plugin.post_optimizer_step(optimizer, opt_idx, **kwargs)

~/Desktop/Python Projects/nbeats/venv/lib/python3.8/site-packages/pytorch_lightning/accelerators/accelerator.py in run_optimizer_step(self, optimizer, optimizer_idx, lambda_closure, **kwargs)
    334         self, optimizer: Optimizer, optimizer_idx: int, lambda_closure: Callable, **kwargs: Any
    335     ) -> None:
--> 336         self.training_type_plugin.optimizer_step(optimizer, lambda_closure=lambda_closure, **kwargs)
    337 
    338     def optimizer_zero_grad(self, current_epoch: int, batch_idx: int, optimizer: Optimizer, opt_idx: int) -> None:

~/Desktop/Python Projects/nbeats/venv/lib/python3.8/site-packages/pytorch_lightning/plugins/training_type/training_type_plugin.py in optimizer_step(self, optimizer, lambda_closure, **kwargs)
    191 
    192     def optimizer_step(self, optimizer: torch.optim.Optimizer, lambda_closure: Callable, **kwargs):
--> 193         optimizer.step(closure=lambda_closure, **kwargs)
    194 
    195     @property

~/Desktop/Python Projects/nbeats/venv/lib/python3.8/site-packages/torch/optim/optimizer.py in wrapper(*args, **kwargs)
     86                 profile_name = "Optimizer.step#{}.step".format(obj.__class__.__name__)
     87                 with torch.autograd.profiler.record_function(profile_name):
---> 88                     return func(*args, **kwargs)
     89             return wrapper
     90 

~/Desktop/Python Projects/nbeats/venv/lib/python3.8/site-packages/pytorch_forecasting/optim.py in step(self, closure)
    129             closure: A closure that reevaluates the model and returns the loss.
    130         """
--> 131         _ = closure()
    132         loss = None
    133         # note - below is commented out b/c I have other work that passes back

~/Desktop/Python Projects/nbeats/venv/lib/python3.8/site-packages/pytorch_lightning/trainer/training_loop.py in train_step_and_backward_closure()
    730 
    731                         def train_step_and_backward_closure():
--> 732                             result = self.training_step_and_backward(
    733                                 split_batch, batch_idx, opt_idx, optimizer, self.trainer.hiddens
    734                             )

~/Desktop/Python Projects/nbeats/venv/lib/python3.8/site-packages/pytorch_lightning/trainer/training_loop.py in training_step_and_backward(self, split_batch, batch_idx, opt_idx, optimizer, hiddens)
    834                 if result is not None:
    835                     with self.trainer.profiler.profile("backward"):
--> 836                         self.backward(result, optimizer, opt_idx)
    837 
    838                     # hook - call this hook only

~/Desktop/Python Projects/nbeats/venv/lib/python3.8/site-packages/pytorch_lightning/trainer/training_loop.py in backward(self, result, optimizer, opt_idx, *args, **kwargs)
    867             self.trainer.accelerator.backward(result, optimizer, opt_idx, should_accumulate, *args, **kwargs)
    868         else:
--> 869             result.closure_loss = self.trainer.accelerator.backward(
    870                 result.closure_loss, optimizer, opt_idx, should_accumulate, *args, **kwargs
    871             )

~/Desktop/Python Projects/nbeats/venv/lib/python3.8/site-packages/pytorch_lightning/accelerators/accelerator.py in backward(self, closure_loss, optimizer, optimizer_idx, should_accumulate, *args, **kwargs)
    306         self.training_type_plugin.pre_backward(closure_loss, should_accumulate, optimizer, optimizer_idx)
    307 
--> 308         output = self.precision_plugin.backward(
    309             self.lightning_module, closure_loss, optimizer, optimizer_idx, should_accumulate, *args, **kwargs
    310         )

~/Desktop/Python Projects/nbeats/venv/lib/python3.8/site-packages/pytorch_lightning/plugins/precision/precision_plugin.py in backward(self, model, closure_loss, optimizer, opt_idx, should_accumulate, *args, **kwargs)
     77         # do backward pass
     78         if automatic_optimization:
---> 79             model.backward(closure_loss, optimizer, opt_idx)
     80         else:
     81             closure_loss.backward(*args, **kwargs)

~/Desktop/Python Projects/nbeats/venv/lib/python3.8/site-packages/pytorch_lightning/core/lightning.py in backward(self, loss, optimizer, optimizer_idx, *args, **kwargs)
   1273         """
   1274         if self.automatic_optimization or self._running_manual_backward:
-> 1275             loss.backward(*args, **kwargs)
   1276 
   1277     def toggle_optimizer(self, optimizer: Optimizer, optimizer_idx: int):

~/Desktop/Python Projects/nbeats/venv/lib/python3.8/site-packages/torch/_tensor.py in backward(self, gradient, retain_graph, create_graph, inputs)
    253                 create_graph=create_graph,
    254                 inputs=inputs)
--> 255         torch.autograd.backward(self, gradient, retain_graph, create_graph, inputs=inputs)
    256 
    257     def register_hook(self, hook):

~/Desktop/Python Projects/nbeats/venv/lib/python3.8/site-packages/torch/autograd/__init__.py in backward(tensors, grad_tensors, retain_graph, create_graph, grad_variables, inputs)
    145         retain_graph = create_graph
    146 
--> 147     Variable._execution_engine.run_backward(
    148         tensors, grad_tensors_, retain_graph, create_graph, inputs,
    149         allow_unreachable=True, accumulate_grad=True)  # allow_unreachable flag

RuntimeError: element 0 of tensors does not require grad and does not have a grad_fn

Does anyone know where the problem comes from or how to solve it?

Thanks in advance!

aamrb96 commented 3 years ago

I found a simple solution for everyone else who might be facing a similar problem: don't reload the checkpoint/trained model from disk; instead, keep reusing the trainer and model objects from the first run inside the loop.

net = None
trainer = None

for data in files:

    training = ...
    validation = ...

    train_dataloader = ...
    val_dataloader = ...

    # create the trainer and model only on the first iteration;
    # later iterations continue fitting the same (already trained) model
    if net is None:
        trainer = ...
        net = NBeats.from_dataset(training, ...)

    trainer.fit(net, train_dataloader, val_dataloader)
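
For completeness, here is a minimal sketch of that pattern with the elided parts filled in, modelled on the `nbeats_train` function above. It assumes `files` is a list of DataFrames and that `context_length` and `prediction_length` are defined; the hyperparameters are just the tutorial defaults, not a recommendation:

import pytorch_lightning as pl
from pytorch_forecasting import NBeats, TimeSeriesDataSet
from pytorch_forecasting.data import NaNLabelEncoder

net = None
trainer = None

for data in files:  # assumption: one DataFrame per dataset
    training_cutoff = data["time_idx"].max() - prediction_length

    training = TimeSeriesDataSet(
        data[lambda x: x.time_idx <= training_cutoff],
        time_idx="time_idx",
        target="value",
        categorical_encoders={"identifier": NaNLabelEncoder().fit(data["identifier"])},
        group_ids=["identifier"],
        time_varying_unknown_reals=["value"],
        allow_missing_timesteps=True,
        max_encoder_length=context_length,
        max_prediction_length=prediction_length,
    )
    validation = TimeSeriesDataSet.from_dataset(training, data, min_prediction_idx=training_cutoff + 1)

    train_dataloader = training.to_dataloader(train=True, batch_size=128, num_workers=4)
    val_dataloader = validation.to_dataloader(train=False, batch_size=128, num_workers=4)

    # create the trainer and the model once; because `net` is never recreated,
    # every later `fit` call starts from the weights learned on the previous dataset
    if net is None:
        trainer = pl.Trainer(max_epochs=100, gpus=0, gradient_clip_val=0.01)
        net = NBeats.from_dataset(
            training,
            learning_rate=4e-3,
            weight_decay=1e-2,
            widths=[32, 512],
            backcast_loss_ratio=1.0,
        )

    trainer.fit(net, train_dataloader=train_dataloader, val_dataloaders=val_dataloader)

The key point is that the model object is created only once, so no state is ever read back from disk between datasets.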