Closed myxyy closed 10 months ago
make testでtests/trainers/test_forward_dynamics_trainer.pyに対し以下のエラーが発生
make test
tests/trainers/test_forward_dynamics_trainer.py
=========================================================================================== FAILURES ============================================================================================ ___________________________________________________________________ TestForwardDynamicsTrainer.test_forward_dynamics_trainer ____________________________________________________________________ self = <tests.trainers.test_forward_dynamics_trainer.TestForwardDynamicsTrainer object at 0x7f4cdbddeda0> forward_dynamics_trainer = <src.trainers.simple_pl_trainer.SimplePLTrainer object at 0x7f4c7e5214e0> def test_forward_dynamics_trainer(self, forward_dynamics_trainer): > forward_dynamics_trainer.train() tests/trainers/test_forward_dynamics_trainer.py:102: _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ src/trainers/simple_pl_trainer.py:28: in train self.pl_trainer.fit(self.module, dataloader) ../.cache/pypoetry/virtualenvs/primitive-ami-fojGUKIN-py3.10/lib/python3.10/site-packages/lightning/pytorch/trainer/trainer.py:545: in fit call._call_and_handle_interrupt( ../.cache/pypoetry/virtualenvs/primitive-ami-fojGUKIN-py3.10/lib/python3.10/site-packages/lightning/pytorch/trainer/call.py:43: in _call_and_handle_interrupt return trainer.strategy.launcher.launch(trainer_fn, *args, trainer=trainer, **kwargs) ../.cache/pypoetry/virtualenvs/primitive-ami-fojGUKIN-py3.10/lib/python3.10/site-packages/lightning/pytorch/strategies/launchers/subprocess_script.py:102: in launch return function(*args, **kwargs) ../.cache/pypoetry/virtualenvs/primitive-ami-fojGUKIN-py3.10/lib/python3.10/site-packages/lightning/pytorch/trainer/trainer.py:581: in _fit_impl self._run(model, ckpt_path=ckpt_path) ../.cache/pypoetry/virtualenvs/primitive-ami-fojGUKIN-py3.10/lib/python3.10/site-packages/lightning/pytorch/trainer/trainer.py:990: in _run results = 
self._run_stage() ../.cache/pypoetry/virtualenvs/primitive-ami-fojGUKIN-py3.10/lib/python3.10/site-packages/lightning/pytorch/trainer/trainer.py:1036: in _run_stage self.fit_loop.run() ../.cache/pypoetry/virtualenvs/primitive-ami-fojGUKIN-py3.10/lib/python3.10/site-packages/lightning/pytorch/loops/fit_loop.py:202: in run self.advance() ../.cache/pypoetry/virtualenvs/primitive-ami-fojGUKIN-py3.10/lib/python3.10/site-packages/lightning/pytorch/loops/fit_loop.py:359: in advance self.epoch_loop.run(self._data_fetcher) ../.cache/pypoetry/virtualenvs/primitive-ami-fojGUKIN-py3.10/lib/python3.10/site-packages/lightning/pytorch/loops/training_epoch_loop.py:136: in run self.advance(data_fetcher) ../.cache/pypoetry/virtualenvs/primitive-ami-fojGUKIN-py3.10/lib/python3.10/site-packages/lightning/pytorch/loops/training_epoch_loop.py:240: in advance batch_output = self.automatic_optimization.run(trainer.optimizers[0], batch_idx, kwargs) ../.cache/pypoetry/virtualenvs/primitive-ami-fojGUKIN-py3.10/lib/python3.10/site-packages/lightning/pytorch/loops/optimization/automatic.py:187: in run self._optimizer_step(batch_idx, closure) ../.cache/pypoetry/virtualenvs/primitive-ami-fojGUKIN-py3.10/lib/python3.10/site-packages/lightning/pytorch/loops/optimization/automatic.py:265: in _optimizer_step call._call_lightning_module_hook( ../.cache/pypoetry/virtualenvs/primitive-ami-fojGUKIN-py3.10/lib/python3.10/site-packages/lightning/pytorch/trainer/call.py:157: in _call_lightning_module_hook output = fn(*args, **kwargs) ../.cache/pypoetry/virtualenvs/primitive-ami-fojGUKIN-py3.10/lib/python3.10/site-packages/lightning/pytorch/core/module.py:1282: in optimizer_step optimizer.step(closure=optimizer_closure) ../.cache/pypoetry/virtualenvs/primitive-ami-fojGUKIN-py3.10/lib/python3.10/site-packages/lightning/pytorch/core/optimizer.py:151: in step step_output = self._strategy.optimizer_step(self._optimizer, closure, **kwargs) 
../.cache/pypoetry/virtualenvs/primitive-ami-fojGUKIN-py3.10/lib/python3.10/site-packages/lightning/pytorch/strategies/ddp.py:263: in optimizer_step optimizer_output = super().optimizer_step(optimizer, closure, model, **kwargs) ../.cache/pypoetry/virtualenvs/primitive-ami-fojGUKIN-py3.10/lib/python3.10/site-packages/lightning/pytorch/strategies/strategy.py:230: in optimizer_step return self.precision_plugin.optimizer_step(optimizer, model=model, closure=closure, **kwargs) ../.cache/pypoetry/virtualenvs/primitive-ami-fojGUKIN-py3.10/lib/python3.10/site-packages/lightning/pytorch/plugins/precision/precision_plugin.py:117: in optimizer_step return optimizer.step(closure=closure, **kwargs) ../.cache/pypoetry/virtualenvs/primitive-ami-fojGUKIN-py3.10/lib/python3.10/site-packages/torch/optim/optimizer.py:280: in wrapper out = func(*args, **kwargs) ../.cache/pypoetry/virtualenvs/primitive-ami-fojGUKIN-py3.10/lib/python3.10/site-packages/torch/optim/optimizer.py:33: in _use_grad ret = func(self, *args, **kwargs) ../.cache/pypoetry/virtualenvs/primitive-ami-fojGUKIN-py3.10/lib/python3.10/site-packages/torch/optim/adam.py:121: in step loss = closure() ../.cache/pypoetry/virtualenvs/primitive-ami-fojGUKIN-py3.10/lib/python3.10/site-packages/lightning/pytorch/plugins/precision/precision_plugin.py:104: in _wrap_closure closure_result = closure() ../.cache/pypoetry/virtualenvs/primitive-ami-fojGUKIN-py3.10/lib/python3.10/site-packages/lightning/pytorch/loops/optimization/automatic.py:140: in __call__ self._result = self.closure(*args, **kwargs) ../.cache/pypoetry/virtualenvs/primitive-ami-fojGUKIN-py3.10/lib/python3.10/site-packages/torch/utils/_contextlib.py:115: in decorate_context return func(*args, **kwargs) ../.cache/pypoetry/virtualenvs/primitive-ami-fojGUKIN-py3.10/lib/python3.10/site-packages/lightning/pytorch/loops/optimization/automatic.py:126: in closure step_output = self._step_fn() 
../.cache/pypoetry/virtualenvs/primitive-ami-fojGUKIN-py3.10/lib/python3.10/site-packages/lightning/pytorch/loops/optimization/automatic.py:315: in _training_step training_step_output = call._call_strategy_hook(trainer, "training_step", *kwargs.values()) ../.cache/pypoetry/virtualenvs/primitive-ami-fojGUKIN-py3.10/lib/python3.10/site-packages/lightning/pytorch/trainer/call.py:309: in _call_strategy_hook output = fn(*args, **kwargs) ../.cache/pypoetry/virtualenvs/primitive-ami-fojGUKIN-py3.10/lib/python3.10/site-packages/lightning/pytorch/strategies/strategy.py:381: in training_step return self._forward_redirection(self.model, self.lightning_module, "training_step", *args, **kwargs) ../.cache/pypoetry/virtualenvs/primitive-ami-fojGUKIN-py3.10/lib/python3.10/site-packages/lightning/pytorch/strategies/strategy.py:628: in __call__ wrapper_output = wrapper_module(*args, **kwargs) ../.cache/pypoetry/virtualenvs/primitive-ami-fojGUKIN-py3.10/lib/python3.10/site-packages/torch/nn/modules/module.py:1501: in _call_impl return forward_call(*args, **kwargs) _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ self = DistributedDataParallel( (module): ForwardDynamicsLitModule( (obs_encoder): CNNObservationEncoder( (conv_n...ures=512, bias=True) (fc3): Linear(in_features=512, out_features=256, bias=True) (relu): ReLU() ) ) ) inputs = ([tensor([[ 0.5301, 0.8835, 2.0410, -1.2915, -1.9475, -0.4497, -0.9799, -1.0383, 0.3920, 0.3544, 1.3345,...5746, 1.4004], [-0.1153, -0.6541, -1.7018, ..., -1.6362, -0.5747, -0.3372]]]], device='cuda:0')], 1) kwargs = {}, work = None def forward(self, *inputs, **kwargs): with torch.autograd.profiler.record_function( "DistributedDataParallel.forward" ): if torch.is_grad_enabled() and self.require_backward_grad_sync: assert self.logger is not None self.logger.set_runtime_stats_and_log() self.num_iterations += 1 
self.reducer.prepare_for_forward() # Notify the join context that this process has not joined, if # needed work = Join.notify_join_context(self) if work: self.reducer._set_forward_pass_work_handle( work, self._divide_by_initial_world_size # type: ignore[arg-type] ) # Calling _rebuild_buckets before forward compuation, # It may allocate new buckets before deallocating old buckets # inside _rebuild_buckets. To save peak memory usage, # call _rebuild_buckets before the peak memory usage increases # during forward computation. # This should be called only once during whole training period. > if torch.is_grad_enabled() and self.reducer._rebuild_buckets(): E RuntimeError: It looks like your LightningModule has parameters that were not used in producing the loss returned by training_step. If this is intentional, you must enable the detection of unused parameters in DDP, either by setting the string value `strategy='ddp_find_unused_parameters_true'` or by setting the flag in the strategy with `strategy=DDPStrategy(find_unused_parameters=True)`. ../.cache/pypoetry/virtualenvs/primitive-ami-fojGUKIN-py3.10/lib/python3.10/site-packages/torch/nn/parallel/distributed.py:1139: RuntimeError
CPU: Ryzen Threadripper 1950x RAM: 128GB GPU: RTX3090 x 3
make test実行
テストが通らないです
pl.Trainer()はデフォルトでマルチGPUを使おうとしていてDDP周りのエラーが発生する?
pl.Trainer()
pl.Trainer()にdevices=1を追加
devices=1
修正したのでCloseします
概要
make test
でtests/trainers/test_forward_dynamics_trainer.py
に対し以下のエラーが発生

発生環境
CPU: Ryzen Threadripper 1950x RAM: 128GB GPU: RTX3090 x 3
再現手順
make test
を実行

修正しないとどう困るか
テストが通らないです
原因
pl.Trainer()
はデフォルトでマルチGPU（全デバイス）を使おうとしており、DDP周りのエラーが発生していると思われる

修正案
pl.Trainer()
にdevices=1
を追加