mspils opened this issue 1 year ago
@mspils Does this problem still occur with the latest Optuna v3.4?
Yes and no. It crashes, which is probably an improvement:
[W 2023-11-21 13:45:48,635] Trial 0 failed with parameters: {'learning_rate': 0.009733867742024538, 'n_layers': 1, 'n_units_l0': 12} because of the following error: RuntimeError('DataLoader worker (pid(s) 3999530) exited unexpectedly').
Traceback (most recent call last):
File "/home/mspils/miniconda3/envs/optuna_test3/lib/python3.8/site-packages/torch/utils/data/dataloader.py", line 1132, in _try_get_data
data = self._data_queue.get(timeout=timeout)
File "/home/mspils/miniconda3/envs/optuna_test3/lib/python3.8/queue.py", line 179, in get
self.not_empty.wait(remaining)
File "/home/mspils/miniconda3/envs/optuna_test3/lib/python3.8/threading.py", line 306, in wait
gotit = waiter.acquire(True, timeout)
File "/home/mspils/miniconda3/envs/optuna_test3/lib/python3.8/site-packages/torch/utils/data/_utils/signal_handling.py", line 66, in handler
_error_if_any_worker_fails()
RuntimeError: DataLoader worker (pid 3999530) is killed by signal: Aborted.
The above exception was the direct cause of the following exception:
Traceback (most recent call last):
File "/home/mspils/miniconda3/envs/optuna_test3/lib/python3.8/site-packages/optuna/study/_optimize.py", line 200, in _run_trial
value_or_values = func(trial)
File "optuna_issue.py", line 108, in objective
trainer.fit(model)
File "/home/mspils/miniconda3/envs/optuna_test3/lib/python3.8/site-packages/pytorch_lightning/trainer/trainer.py", line 544, in fit
call._call_and_handle_interrupt(
File "/home/mspils/miniconda3/envs/optuna_test3/lib/python3.8/site-packages/pytorch_lightning/trainer/call.py", line 44, in _call_and_handle_interrupt
return trainer_fn(*args, **kwargs)
File "/home/mspils/miniconda3/envs/optuna_test3/lib/python3.8/site-packages/pytorch_lightning/trainer/trainer.py", line 580, in _fit_impl
self._run(model, ckpt_path=ckpt_path)
File "/home/mspils/miniconda3/envs/optuna_test3/lib/python3.8/site-packages/pytorch_lightning/trainer/trainer.py", line 989, in _run
results = self._run_stage()
File "/home/mspils/miniconda3/envs/optuna_test3/lib/python3.8/site-packages/pytorch_lightning/trainer/trainer.py", line 1035, in _run_stage
self.fit_loop.run()
File "/home/mspils/miniconda3/envs/optuna_test3/lib/python3.8/site-packages/pytorch_lightning/loops/fit_loop.py", line 202, in run
self.advance()
File "/home/mspils/miniconda3/envs/optuna_test3/lib/python3.8/site-packages/pytorch_lightning/loops/fit_loop.py", line 359, in advance
self.epoch_loop.run(self._data_fetcher)
File "/home/mspils/miniconda3/envs/optuna_test3/lib/python3.8/site-packages/pytorch_lightning/loops/training_epoch_loop.py", line 136, in run
self.advance(data_fetcher)
File "/home/mspils/miniconda3/envs/optuna_test3/lib/python3.8/site-packages/pytorch_lightning/loops/training_epoch_loop.py", line 202, in advance
batch, _, __ = next(data_fetcher)
File "/home/mspils/miniconda3/envs/optuna_test3/lib/python3.8/site-packages/pytorch_lightning/loops/fetchers.py", line 127, in __next__
batch = super().__next__()
File "/home/mspils/miniconda3/envs/optuna_test3/lib/python3.8/site-packages/pytorch_lightning/loops/fetchers.py", line 56, in __next__
batch = next(self.iterator)
File "/home/mspils/miniconda3/envs/optuna_test3/lib/python3.8/site-packages/pytorch_lightning/utilities/combined_loader.py", line 326, in __next__
out = next(self._iterator)
File "/home/mspils/miniconda3/envs/optuna_test3/lib/python3.8/site-packages/pytorch_lightning/utilities/combined_loader.py", line 74, in __next__
out[i] = next(self.iterators[i])
File "/home/mspils/miniconda3/envs/optuna_test3/lib/python3.8/site-packages/torch/utils/data/dataloader.py", line 633, in __next__
data = self._next_data()
File "/home/mspils/miniconda3/envs/optuna_test3/lib/python3.8/site-packages/torch/utils/data/dataloader.py", line 1328, in _next_data
idx, data = self._get_data()
File "/home/mspils/miniconda3/envs/optuna_test3/lib/python3.8/site-packages/torch/utils/data/dataloader.py", line 1284, in _get_data
success, data = self._try_get_data()
File "/home/mspils/miniconda3/envs/optuna_test3/lib/python3.8/site-packages/torch/utils/data/dataloader.py", line 1145, in _try_get_data
raise RuntimeError('DataLoader worker (pid(s) {}) exited unexpectedly'.format(pids_str)) from e
RuntimeError: DataLoader worker (pid(s) 3999530) exited unexpectedly
[W 2023-11-21 13:45:48,646] Trial 0 failed with value None.
Traceback (most recent call last):
File "/home/mspils/miniconda3/envs/optuna_test3/lib/python3.8/site-packages/torch/utils/data/dataloader.py", line 1132, in _try_get_data
data = self._data_queue.get(timeout=timeout)
File "/home/mspils/miniconda3/envs/optuna_test3/lib/python3.8/queue.py", line 179, in get
self.not_empty.wait(remaining)
File "/home/mspils/miniconda3/envs/optuna_test3/lib/python3.8/threading.py", line 306, in wait
gotit = waiter.acquire(True, timeout)
File "/home/mspils/miniconda3/envs/optuna_test3/lib/python3.8/site-packages/torch/utils/data/_utils/signal_handling.py", line 66, in handler
_error_if_any_worker_fails()
RuntimeError: DataLoader worker (pid 3999530) is killed by signal: Aborted.
The above exception was the direct cause of the following exception:
Traceback (most recent call last):
File "optuna_issue.py", line 119, in <module>
study.optimize(objective, n_trials=100)
File "/home/mspils/miniconda3/envs/optuna_test3/lib/python3.8/site-packages/optuna/study/study.py", line 451, in optimize
_optimize(
File "/home/mspils/miniconda3/envs/optuna_test3/lib/python3.8/site-packages/optuna/study/_optimize.py", line 66, in _optimize
_optimize_sequential(
File "/home/mspils/miniconda3/envs/optuna_test3/lib/python3.8/site-packages/optuna/study/_optimize.py", line 163, in _optimize_sequential
frozen_trial = _run_trial(study, func, catch)
File "/home/mspils/miniconda3/envs/optuna_test3/lib/python3.8/site-packages/optuna/study/_optimize.py", line 251, in _run_trial
raise func_err
File "/home/mspils/miniconda3/envs/optuna_test3/lib/python3.8/site-packages/optuna/study/_optimize.py", line 200, in _run_trial
value_or_values = func(trial)
File "optuna_issue.py", line 108, in objective
trainer.fit(model)
File "/home/mspils/miniconda3/envs/optuna_test3/lib/python3.8/site-packages/pytorch_lightning/trainer/trainer.py", line 544, in fit
call._call_and_handle_interrupt(
File "/home/mspils/miniconda3/envs/optuna_test3/lib/python3.8/site-packages/pytorch_lightning/trainer/call.py", line 44, in _call_and_handle_interrupt
return trainer_fn(*args, **kwargs)
File "/home/mspils/miniconda3/envs/optuna_test3/lib/python3.8/site-packages/pytorch_lightning/trainer/trainer.py", line 580, in _fit_impl
self._run(model, ckpt_path=ckpt_path)
File "/home/mspils/miniconda3/envs/optuna_test3/lib/python3.8/site-packages/pytorch_lightning/trainer/trainer.py", line 989, in _run
results = self._run_stage()
File "/home/mspils/miniconda3/envs/optuna_test3/lib/python3.8/site-packages/pytorch_lightning/trainer/trainer.py", line 1035, in _run_stage
self.fit_loop.run()
File "/home/mspils/miniconda3/envs/optuna_test3/lib/python3.8/site-packages/pytorch_lightning/loops/fit_loop.py", line 202, in run
self.advance()
File "/home/mspils/miniconda3/envs/optuna_test3/lib/python3.8/site-packages/pytorch_lightning/loops/fit_loop.py", line 359, in advance
self.epoch_loop.run(self._data_fetcher)
File "/home/mspils/miniconda3/envs/optuna_test3/lib/python3.8/site-packages/pytorch_lightning/loops/training_epoch_loop.py", line 136, in run
self.advance(data_fetcher)
File "/home/mspils/miniconda3/envs/optuna_test3/lib/python3.8/site-packages/pytorch_lightning/loops/training_epoch_loop.py", line 202, in advance
batch, _, __ = next(data_fetcher)
File "/home/mspils/miniconda3/envs/optuna_test3/lib/python3.8/site-packages/pytorch_lightning/loops/fetchers.py", line 127, in __next__
batch = super().__next__()
File "/home/mspils/miniconda3/envs/optuna_test3/lib/python3.8/site-packages/pytorch_lightning/loops/fetchers.py", line 56, in __next__
batch = next(self.iterator)
File "/home/mspils/miniconda3/envs/optuna_test3/lib/python3.8/site-packages/pytorch_lightning/utilities/combined_loader.py", line 326, in __next__
out = next(self._iterator)
File "/home/mspils/miniconda3/envs/optuna_test3/lib/python3.8/site-packages/pytorch_lightning/utilities/combined_loader.py", line 74, in __next__
out[i] = next(self.iterators[i])
File "/home/mspils/miniconda3/envs/optuna_test3/lib/python3.8/site-packages/torch/utils/data/dataloader.py", line 633, in __next__
data = self._next_data()
File "/home/mspils/miniconda3/envs/optuna_test3/lib/python3.8/site-packages/torch/utils/data/dataloader.py", line 1328, in _next_data
idx, data = self._get_data()
File "/home/mspils/miniconda3/envs/optuna_test3/lib/python3.8/site-packages/torch/utils/data/dataloader.py", line 1284, in _get_data
success, data = self._try_get_data()
File "/home/mspils/miniconda3/envs/optuna_test3/lib/python3.8/site-packages/torch/utils/data/dataloader.py", line 1145, in _try_get_data
raise RuntimeError('DataLoader worker (pid(s) {}) exited unexpectedly'.format(pids_str)) from e
RuntimeError: DataLoader worker (pid(s) 3999530) exited unexpectedly
Epoch 0: 0%| | 0/10 [00:00<?, ?it/s]
Same issue here.
Expected behavior
When using the PyTorchLightningPruningCallback, a pruned trial should resolve without errors.
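For reference, a minimal sketch of the expected lifecycle with plain Optuna and no Lightning (names and numbers here are illustrative, not from the report): the objective raises optuna.TrialPruned, Optuna catches it, and the trial finishes in the PRUNED state instead of crashing the study.

import optuna

def objective(trial: optuna.trial.Trial) -> float:
    loss = 1.0
    for step in range(10):
        loss = 1.0 / (step + 1)  # stand-in for a validation metric
        trial.report(loss, step)
        if trial.should_prune():
            # The supported early exit: Optuna catches this exception and
            # records the trial as PRUNED rather than FAILED.
            raise optuna.TrialPruned()
    return loss

study = optuna.create_study(direction="minimize", pruner=optuna.pruners.MedianPruner())
study.optimize(objective, n_trials=5)
print([t.state for t in study.trials])  # pruned trials show up as TrialState.PRUNED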
Environment
Python 3.8 (conda), Optuna, PyTorch, and PyTorch Lightning, per the traceback paths above; CUDA accelerator.
Error messages, stack traces, or logs
See the full traceback in the comment above.
Steps to reproduce
# import pytorch_lightning as pl  # duplicated import in the report;
import lightning.pytorch as pl    # the lightning.pytorch namespace is the one used below
import optuna
import torch
from lightning.pytorch.callbacks import Callback
from optuna.integration import PyTorchLightningPruningCallback
from torch import nn, optim
from torch.utils.data import DataLoader

torch.set_float32_matmul_precision('high')
BATCHSIZE = 1024
EPOCHS = 50
ACCELERATOR = 'cuda'
DEVICES = [1]


class OptunaPruningCallback(PyTorchLightningPruningCallback, Callback):
    """Custom Optuna pruning callback, because CUDA/Lightning do not play
    well with the default one."""


class ToyDataSet(torch.utils.data.Dataset):
    def __init__(self, count):
        super().__init__()
        self.x = torch.rand(count, dtype=torch.float32)
        self.y = torch.rand(count, dtype=torch.float32)
        self.count = count

    # __len__/__getitem__ were cut off in the report; minimal versions:
    def __len__(self):
        return self.count

    def __getitem__(self, idx):
        # (1,)-shaped so the first nn.Linear(1, ...) layer accepts it
        return self.x[idx].unsqueeze(0), self.y[idx].unsqueeze(0)


class LightningNet(pl.LightningModule):
    def __init__(self, output_dims, learning_rate) -> None:
        super().__init__()
        layers = []
        input_dim = 1
        for output_dim in output_dims:
            layers.append(nn.Linear(input_dim, output_dim))
            layers.append(nn.ReLU())
            input_dim = output_dim
        layers.append(nn.Linear(input_dim, 1))
        # The rest of the module was cut off; a minimal plausible completion:
        self.model = nn.Sequential(*layers)
        self.learning_rate = learning_rate

    def training_step(self, batch, batch_idx):
        x, y = batch
        return nn.functional.mse_loss(self.model(x), y)

    def validation_step(self, batch, batch_idx):
        x, y = batch
        self.log("val_loss", nn.functional.mse_loss(self.model(x), y))

    def configure_optimizers(self):
        return optim.Adam(self.parameters(), lr=self.learning_rate)

    def train_dataloader(self):
        # Multi-worker loading is what crashes; 10 batches matches the "0/10"
        # progress bar in the log. The exact num_workers value was not given.
        return DataLoader(ToyDataSet(10 * BATCHSIZE), batch_size=BATCHSIZE, num_workers=4)

    def val_dataloader(self):
        return DataLoader(ToyDataSet(2 * BATCHSIZE), batch_size=BATCHSIZE, num_workers=4)


def objective(trial: optuna.trial.Trial) -> float:
    learning_rate = trial.suggest_float("learning_rate", 1e-5, 1e-1, log=True)
    n_layers = trial.suggest_int("n_layers", 1, 2)
    output_dims = [trial.suggest_int(f"n_units_l{i}", 4, 64, log=True) for i in range(n_layers)]
    # Cut off in the report; reconstructed around the trainer.fit(model) call
    # that the traceback shows at optuna_issue.py line 108:
    model = LightningNet(output_dims, learning_rate)
    trainer = pl.Trainer(
        max_epochs=EPOCHS,
        accelerator=ACCELERATOR,
        devices=DEVICES,
        callbacks=[OptunaPruningCallback(trial, monitor="val_loss")],
    )
    trainer.fit(model)
    return trainer.callback_metrics["val_loss"].item()


if __name__ == "__main__":
    study = optuna.create_study(
        direction="minimize",
        pruner=optuna.pruners.HyperbandPruner(
            min_resource=1, max_resource='auto', reduction_factor=3, bootstrap_count=0
        ),
        load_if_exists=True,
    )
    # Cut off in the report; the traceback shows this call at line 119:
    study.optimize(objective, n_trials=100)
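If the abort comes from optuna.TrialPruned being raised inside the fit loop while DataLoader workers are still alive, one possible workaround is to stop the Trainer gracefully and raise TrialPruned only after trainer.fit() has torn the workers down. This is a sketch under that assumption, not a confirmed fix; ManualPruning is a hypothetical helper, not part of Optuna, and it reuses LightningNet and the constants from the script above.

import optuna
import lightning.pytorch as pl
from lightning.pytorch.callbacks import Callback

class ManualPruning(Callback):
    """Report to the trial on each validation and request a graceful stop."""

    def __init__(self, trial: optuna.trial.Trial, monitor: str):
        self.trial = trial
        self.monitor = monitor
        self.pruned = False

    def on_validation_end(self, trainer, pl_module):
        value = trainer.callback_metrics.get(self.monitor)
        if value is None:
            return
        self.trial.report(value.item(), step=trainer.current_epoch)
        if self.trial.should_prune():
            self.pruned = True
            trainer.should_stop = True  # stop cleanly instead of raising mid-loop

def objective(trial: optuna.trial.Trial) -> float:
    # Same search space and model as the script above.
    learning_rate = trial.suggest_float("learning_rate", 1e-5, 1e-1, log=True)
    n_layers = trial.suggest_int("n_layers", 1, 2)
    output_dims = [trial.suggest_int(f"n_units_l{i}", 4, 64, log=True) for i in range(n_layers)]
    model = LightningNet(output_dims, learning_rate)
    pruning_cb = ManualPruning(trial, monitor="val_loss")
    trainer = pl.Trainer(max_epochs=EPOCHS, accelerator=ACCELERATOR,
                         devices=DEVICES, callbacks=[pruning_cb])
    trainer.fit(model)
    if pruning_cb.pruned:
        raise optuna.TrialPruned()  # workers are already shut down at this point
    return trainer.callback_metrics["val_loss"].item()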