sktime / pytorch-forecasting

Time series forecasting with PyTorch
https://pytorch-forecasting.readthedocs.io/
MIT License

Error passing CUDA tensor to nn.utils.rnn.pack_padded_sequence #135

Closed · yonghah closed this issue 4 years ago

yonghah commented 4 years ago

Expected behavior

I ran the code to find the optimal learning rate and to fit the network, and expected the results shown at pytorch-forecasting.readthedocs.io. The only difference was gpus=1 in the pl.Trainer parameters.

import pytorch_lightning as pl

from pytorch_forecasting import TemporalFusionTransformer
from pytorch_forecasting.metrics import QuantileLoss

# configure network and trainer
pl.seed_everything(42)
trainer = pl.Trainer(
    gpus=1,
    # clipping gradients is a hyperparameter and important to prevent divergence
    # of the gradient for recurrent neural networks
    gradient_clip_val=0.1,
)

tft = TemporalFusionTransformer.from_dataset(
    training,
    # not meaningful for finding the learning rate but otherwise very important
    learning_rate=0.03,
    hidden_size=16,  # most important hyperparameter apart from learning rate
    # number of attention heads. Set to up to 4 for large datasets
    attention_head_size=1,
    dropout=0.1,  # between 0.1 and 0.3 are good values
    hidden_continuous_size=8,  # set to <= hidden_size
    output_size=7,  # 7 quantiles by default
    loss=QuantileLoss(),
    # reduce learning rate if no improvement in validation loss after x epochs
    reduce_on_plateau_patience=4,
)
print(f"Number of parameters in network: {tft.size()/1e3:.1f}k")

# find optimal learning rate
res = trainer.tuner.lr_find(
    tft,
    train_dataloader=train_dataloader,
    val_dataloaders=val_dataloader,
    max_lr=10.0,
    min_lr=1e-6,
)

print(f"suggested learning rate: {res.suggestion()}")
fig = res.plot(show=True, suggest=True)
fig.show()

Actual behavior

However, it raises the following RuntimeError:

RuntimeError                              Traceback (most recent call last)
<ipython-input-11-a92b5627800b> in <module>
      5     val_dataloaders=val_dataloader,
      6     max_lr=10.0,
----> 7     min_lr=1e-6,
      8 )
      9 

~/repo/emart-promo/env/lib/python3.7/site-packages/pytorch_lightning/tuner/tuning.py in lr_find(self, model, train_dataloader, val_dataloaders, min_lr, max_lr, num_training, mode, early_stop_threshold, datamodule)
    128             mode,
    129             early_stop_threshold,
--> 130             datamodule,
    131         )
    132 

~/repo/emart-promo/env/lib/python3.7/site-packages/pytorch_lightning/tuner/lr_finder.py in lr_find(trainer, model, train_dataloader, val_dataloaders, min_lr, max_lr, num_training, mode, early_stop_threshold, datamodule)
    173                 train_dataloader=train_dataloader,
    174                 val_dataloaders=val_dataloaders,
--> 175                 datamodule=datamodule)
    176 
    177     # Prompt if we stopped early

~/repo/emart-promo/env/lib/python3.7/site-packages/pytorch_lightning/trainer/trainer.py in fit(self, model, train_dataloader, val_dataloaders, datamodule)
    437         self.call_hook('on_fit_start')
    438 
--> 439         results = self.accelerator_backend.train()
    440         self.accelerator_backend.teardown()
    441 

~/repo/emart-promo/env/lib/python3.7/site-packages/pytorch_lightning/accelerators/gpu_accelerator.py in train(self)
     52 
     53         # train or test
---> 54         results = self.train_or_test()
     55         return results
     56 

~/repo/emart-promo/env/lib/python3.7/site-packages/pytorch_lightning/accelerators/accelerator.py in train_or_test(self)
     64             results = self.trainer.run_test()
     65         else:
---> 66             results = self.trainer.train()
     67         return results
     68 

~/repo/emart-promo/env/lib/python3.7/site-packages/pytorch_lightning/trainer/trainer.py in train(self)
    459 
    460     def train(self):
--> 461         self.run_sanity_check(self.get_model())
    462 
    463         # enable train mode

~/repo/emart-promo/env/lib/python3.7/site-packages/pytorch_lightning/trainer/trainer.py in run_sanity_check(self, ref_model)
    645 
    646             # run eval step
--> 647             _, eval_results = self.run_evaluation(test_mode=False, max_batches=self.num_sanity_val_batches)
    648 
    649             # allow no returns from eval

~/repo/emart-promo/env/lib/python3.7/site-packages/pytorch_lightning/trainer/trainer.py in run_evaluation(self, test_mode, max_batches)
    565 
    566                 # lightning module methods
--> 567                 output = self.evaluation_loop.evaluation_step(test_mode, batch, batch_idx, dataloader_idx)
    568                 output = self.evaluation_loop.evaluation_step_end(output)
    569 

~/repo/emart-promo/env/lib/python3.7/site-packages/pytorch_lightning/trainer/evaluation_loop.py in evaluation_step(self, test_mode, batch, batch_idx, dataloader_idx)
    169             output = self.trainer.accelerator_backend.test_step(args)
    170         else:
--> 171             output = self.trainer.accelerator_backend.validation_step(args)
    172 
    173         # track batch size for weighted average

~/repo/emart-promo/env/lib/python3.7/site-packages/pytorch_lightning/accelerators/gpu_accelerator.py in validation_step(self, args)
     76                 output = self.__validation_step(args)
     77         else:
---> 78             output = self.__validation_step(args)
     79 
     80         return output

~/repo/emart-promo/env/lib/python3.7/site-packages/pytorch_lightning/accelerators/gpu_accelerator.py in __validation_step(self, args)
     84         batch = self.to_device(batch)
     85         args[0] = batch
---> 86         output = self.trainer.model.validation_step(*args)
     87         return output
     88 

~/repo/emart-promo/env/lib/python3.7/site-packages/pytorch_forecasting/models/base_model.py in validation_step(self, batch, batch_idx)
    138     def validation_step(self, batch, batch_idx):
    139         x, y = batch
--> 140         log, _ = self.step(x, y, batch_idx, label="val")  # log loss
    141         self.log("val_loss", log["loss"], on_step=False, on_epoch=True, prog_bar=True)
    142         return log

~/repo/emart-promo/env/lib/python3.7/site-packages/pytorch_forecasting/models/temporal_fusion_transformer/__init__.py in step(self, x, y, batch_idx, label)
    566         """
    567         # extract data and run model
--> 568         log, out = super().step(x, y, batch_idx, label=label)
    569         # calculate interpretations etc for latter logging
    570         if self.log_interval(label == "train") > 0:

~/repo/emart-promo/env/lib/python3.7/site-packages/pytorch_forecasting/models/base_model.py in step(self, x, y, batch_idx, label)
    194             loss = loss * (1 + monotinicity_loss)
    195         else:
--> 196             out = self(x)
    197             out["prediction"] = self.transform_output(out)
    198 

~/repo/emart-promo/env/lib/python3.7/site-packages/torch/nn/modules/module.py in _call_impl(self, *input, **kwargs)
    725             result = self._slow_forward(*input, **kwargs)
    726         else:
--> 727             result = self.forward(*input, **kwargs)
    728         for hook in itertools.chain(
    729                 _global_forward_hooks.values(),

~/repo/emart-promo/env/lib/python3.7/site-packages/pytorch_forecasting/models/temporal_fusion_transformer/__init__.py in forward(self, x)
    489         encoder_output, (hidden, cell) = self.lstm_encoder(
    490             rnn.pack_padded_sequence(
--> 491                 embeddings_varying_encoder, lstm_encoder_lengths, enforce_sorted=False, batch_first=True
    492             ),
    493             (input_hidden, input_cell),

~/repo/emart-promo/env/lib/python3.7/site-packages/torch/nn/utils/rnn.py in pack_padded_sequence(input, lengths, batch_first, enforce_sorted)
    242 
    243     data, batch_sizes = \
--> 244         _VF._pack_padded_sequence(input, lengths, batch_first)
    245     return _packed_sequence_init(data, batch_sizes, sorted_indices, None)
    246 

RuntimeError: 'lengths' argument should be a 1D CPU int64 tensor, but got 1D cuda:0 Long tensor

Seems related to these issues:
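For reference, the constraint in that final RuntimeError can be reproduced outside the library. A minimal sketch, assuming PyTorch >= 1.7 and a CUDA device:

import torch
from torch.nn.utils import rnn

# Since PyTorch 1.7, pack_padded_sequence requires `lengths` to live on the CPU,
# even when the padded input itself is on the GPU.
padded = torch.randn(2, 5, 3, device="cuda")   # (batch, seq, features)
lengths = torch.tensor([5, 3], device="cuda")  # CUDA lengths -> the RuntimeError above
packed = rnn.pack_padded_sequence(
    padded, lengths.cpu(), batch_first=True, enforce_sorted=False  # .cpu() avoids it
)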

jdb78 commented 4 years ago

Good point. Thanks for reporting this! The behaviour might have changed again with PyTorch 1.7. Currently there are no tests on GPU (and it used to work a couple of weeks ago), but I am planning to add some soon.
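Until GPU tests exist, a defensive pattern is to normalize the lengths tensor before packing. A hypothetical helper sketch (pack_on_cpu is not part of the library):

import torch
from torch.nn.utils import rnn

def pack_on_cpu(padded: torch.Tensor, lengths: torch.Tensor, **kwargs) -> rnn.PackedSequence:
    # Hypothetical helper, not library API: .cpu() is a no-op for tensors already
    # on the CPU, so this is safe across devices and PyTorch versions.
    return rnn.pack_padded_sequence(padded, lengths.cpu(), **kwargs)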

BeHappyForMe commented 3 years ago

Wonderful job! I have taken the latest master version, so why does it still happen? How can I solve this? Thanks. pytorch: 1.7.0+cu101, pytorch_lightning: 1.0.6, pytorch_forecasting: 0.6.0

BeHappyForMe commented 3 years ago

Just:

y = rnn.pack_padded_sequence(y, lengths=x["decoder_lengths"].cpu(), batch_first=True, enforce_sorted=False)
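The same pattern applies at the encoder call in the traceback above. A sketch only, with variable names copied from the traceback rather than from the merged fix:

# pytorch_forecasting/models/temporal_fusion_transformer/__init__.py, per the traceback:
encoder_output, (hidden, cell) = self.lstm_encoder(
    rnn.pack_padded_sequence(
        embeddings_varying_encoder,
        lstm_encoder_lengths.cpu(),  # was a cuda:0 Long tensor -> RuntimeError
        enforce_sorted=False,
        batch_first=True,
    ),
    (input_hidden, input_cell),
)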

ML-IEE commented 3 years ago

@BeHappyForMe I had the same issue and the .cpu() fix did it for me.

Clickative commented 3 years ago

This error still persists.

jdb78 commented 3 years ago

Missed this one. Fixed in #169