unit8co / darts

A python library for user-friendly forecasting and anomaly detection on time series.
https://unit8co.github.io/darts/
Apache License 2.0
8.1k stars 881 forks source link

RuntimeError: Lightning can't create new processes if CUDA is already initialized. #1901

Closed iDestro closed 1 year ago

iDestro commented 1 year ago

Describe the bug: When I call `fit` on the model, I get the following error: RuntimeError: Lightning can't create new processes if CUDA is already initialized.

To Reproduce

import numpy as np
import pandas as pd
import torch
import matplotlib.pyplot as plt

from darts.dataprocessing.transformers import Scaler
from darts.models import RNNModel
from darts.metrics import mape
from darts.datasets import AirPassengersDataset

# Read data:
series = AirPassengersDataset().load()
series = series.astype(np.float32)  # cast the series values to float32

# Create training and validation sets:
train, val = series.split_after(pd.Timestamp("19590101"))

# Normalize the time series (note: we avoid fitting the transformer on the validation set)
transformer = Scaler()
train_transformed = transformer.fit_transform(train)
val_transformed = transformer.transform(val)
series_transformed = transformer.transform(series)  # not used again in this snippet

torch.cuda.set_device(0)  # NOTE(review): manual torch.cuda.* call before fit(); the reported error message names such calls as a possible cause -- confirm

my_model = RNNModel(
    model="RNN",
    hidden_dim=20,
    dropout=0,
    batch_size=16,
    n_epochs=300,
    optimizer_kwargs={"lr": 1e-3},
    model_name="Air_RNN",
    log_tensorboard=True,
    random_state=42,
    training_length=20,
    input_chunk_length=14,
    force_reset=True,
)
my_model.fit(train_transformed, val_series=val_transformed)  # this call raises the RuntimeError shown in the traceback

Expected behavior: I expect the model to train without raising an error.

System (please complete the following information):

Additional context: Detailed error:

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
---------------------------------------------------------------------------
RuntimeError                              Traceback (most recent call last)
Cell In[9], line 40
     24 torch.cuda.set_device(0)
     26 my_model = RNNModel(
     27     model="RNN",
     28     hidden_dim=20,
   (...)
     38     force_reset=True,
     39 )
---> 40 my_model.fit(train_transformed, val_series=val_transformed)

File /data/tushihao/miniconda3/envs/jupyter/lib/python3.9/site-packages/darts/utils/torch.py:112, in random_method.<locals>.decorator(self, *args, **kwargs)
    110 with fork_rng():
    111     manual_seed(self._random_instance.randint(0, high=MAX_TORCH_SEED_VALUE))
--> 112     return decorated(self, *args, **kwargs)

File /data/tushihao/miniconda3/envs/jupyter/lib/python3.9/site-packages/darts/models/forecasting/torch_forecasting_model.py:705, in TorchForecastingModel.fit(self, series, past_covariates, future_covariates, val_series, val_past_covariates, val_future_covariates, trainer, verbose, epochs, max_samples_per_ts, num_loader_workers)
    699 # call super fit only if user is actually fitting the model
    700 super().fit(
    701     series=seq2series(series),
    702     past_covariates=seq2series(past_covariates),
    703     future_covariates=seq2series(future_covariates),
    704 )
--> 705 return self.fit_from_dataset(*params)

File /data/tushihao/miniconda3/envs/jupyter/lib/python3.9/site-packages/darts/utils/torch.py:112, in random_method.<locals>.decorator(self, *args, **kwargs)
    110 with fork_rng():
    111     manual_seed(self._random_instance.randint(0, high=MAX_TORCH_SEED_VALUE))
--> 112     return decorated(self, *args, **kwargs)

File /data/tushihao/miniconda3/envs/jupyter/lib/python3.9/site-packages/darts/models/forecasting/torch_forecasting_model.py:897, in TorchForecastingModel.fit_from_dataset(self, train_dataset, val_dataset, trainer, verbose, epochs, num_loader_workers)
    846 @random_method
    847 def fit_from_dataset(
    848     self,
   (...)
    854     num_loader_workers: int = 0,
    855 ) -> "TorchForecastingModel":
    856     """
    857     Train the model with a specific :class:`darts.utils.data.TrainingDataset` instance.
    858     These datasets implement a PyTorch ``Dataset``, and specify how the target and covariates are sliced
   (...)
    895         Fitted model.
    896     """
--> 897     self._train(
    898         *self._setup_for_train(
    899             train_dataset=train_dataset,
    900             val_dataset=val_dataset,
    901             trainer=trainer,
    902             verbose=verbose,
    903             epochs=epochs,
    904             num_loader_workers=num_loader_workers,
    905         )
    906     )
    907     return self

File /data/tushihao/miniconda3/envs/jupyter/lib/python3.9/site-packages/darts/models/forecasting/torch_forecasting_model.py:1041, in TorchForecastingModel._train(self, trainer, model, train_loader, val_loader)
   1038 ckpt_path = self.load_ckpt_path
   1039 self.load_ckpt_path = None
-> 1041 trainer.fit(
   1042     model,
   1043     train_dataloaders=train_loader,
   1044     val_dataloaders=val_loader,
   1045     ckpt_path=ckpt_path,
   1046 )
   1047 self.model = model
   1048 self.trainer = trainer

File /data/tushihao/miniconda3/envs/jupyter/lib/python3.9/site-packages/pytorch_lightning/trainer/trainer.py:531, in Trainer.fit(self, model, train_dataloaders, val_dataloaders, datamodule, ckpt_path)
    529 model = _maybe_unwrap_optimized(model)
    530 self.strategy._lightning_module = model
--> 531 call._call_and_handle_interrupt(
    532     self, self._fit_impl, model, train_dataloaders, val_dataloaders, datamodule, ckpt_path
    533 )

File /data/tushihao/miniconda3/envs/jupyter/lib/python3.9/site-packages/pytorch_lightning/trainer/call.py:41, in _call_and_handle_interrupt(trainer, trainer_fn, *args, **kwargs)
     39 try:
     40     if trainer.strategy.launcher is not None:
---> 41         return trainer.strategy.launcher.launch(trainer_fn, *args, trainer=trainer, **kwargs)
     42     return trainer_fn(*args, **kwargs)
     44 except _TunerExitException:

File /data/tushihao/miniconda3/envs/jupyter/lib/python3.9/site-packages/pytorch_lightning/strategies/launchers/multiprocessing.py:99, in _MultiProcessingLauncher.launch(self, function, trainer, *args, **kwargs)
     97 self._check_torchdistx_support()
     98 if self._start_method in ("fork", "forkserver"):
---> 99     _check_bad_cuda_fork()
    101 # The default cluster environment in Lightning chooses a random free port number
    102 # This needs to be done in the main process here before starting processes to ensure each rank will connect
    103 # through the same port
    104 assert self._strategy.cluster_environment is not None

File /data/tushihao/miniconda3/envs/jupyter/lib/python3.9/site-packages/lightning_fabric/strategies/launchers/multiprocessing.py:189, in _check_bad_cuda_fork()
    187 if _IS_INTERACTIVE:
    188     message += " You will have to restart the Python kernel."
--> 189 raise RuntimeError(message)

RuntimeError: Lightning can't create new processes if CUDA is already initialized. Did you manually call `torch.cuda.*` functions, have moved the model to the device, or allocated memory on the GPU any other way? Please remove any such calls, or change the selected strategy. You will have to restart the Python kernel.
dennisbader commented 1 year ago

Hi @iDestro, Darts TorchForecastingModels are built on top of pytorch-lightning, which handles setting the devices for you under the hood.

This user guide shows how to use GPU with Darts.

iDestro commented 1 year ago

Hi @iDestro, Darts TorchForecastingModels are built on top of pytorch-lightning, which handles setting the devices for you under the hood.

This user guide shows how to use GPU with Darts.

I have resolved this problem by adding os.environ['CUDA_VISIBLE_DEVICES']='0' at the beginning of my code.