Bug description

I want to train a GAN model, using manual optimization (https://lightning.ai/docs/pytorch/stable/model/manual_optimization.html) to set up my training process. I got the following error when setting `strategy="ddp_find_unused_parameters_false"`:

```
Traceback (most recent call last):
File "/home/lee/Desktop/workspace2/project/tts/RVC/c/training/trainv1.py", line 296, in <module>
main(conf, exp_dir, training_dir, bool(args.use_pretrain))
File "/home/lee/Desktop/workspace2/project/tts/RVC/c/training/trainv1.py", line 205, in main
trainer.fit(system)
File "/home/lee/Documents/software/anaconda3/envs/rvc-webuibkp/lib/python3.9/site-packages/pytorch_lightning/trainer/trainer.py", line 545, in fit
call._call_and_handle_interrupt(
File "/home/lee/Documents/software/anaconda3/envs/rvc-webuibkp/lib/python3.9/site-packages/pytorch_lightning/trainer/call.py", line 43, in _call_and_handle_interrupt
return trainer.strategy.launcher.launch(trainer_fn, *args, trainer=trainer, **kwargs)
File "/home/lee/Documents/software/anaconda3/envs/rvc-webuibkp/lib/python3.9/site-packages/pytorch_lightning/strategies/launchers/subprocess_script.py", line 102, in launch
return function(*args, **kwargs)
File "/home/lee/Documents/software/anaconda3/envs/rvc-webuibkp/lib/python3.9/site-packages/pytorch_lightning/trainer/trainer.py", line 581, in _fit_impl
self._run(model, ckpt_path=ckpt_path)
File "/home/lee/Documents/software/anaconda3/envs/rvc-webuibkp/lib/python3.9/site-packages/pytorch_lightning/trainer/trainer.py", line 990, in _run
results = self._run_stage()
File "/home/lee/Documents/software/anaconda3/envs/rvc-webuibkp/lib/python3.9/site-packages/pytorch_lightning/trainer/trainer.py", line 1036, in _run_stage
self.fit_loop.run()
File "/home/lee/Documents/software/anaconda3/envs/rvc-webuibkp/lib/python3.9/site-packages/pytorch_lightning/loops/fit_loop.py", line 202, in run
self.advance()
File "/home/lee/Documents/software/anaconda3/envs/rvc-webuibkp/lib/python3.9/site-packages/pytorch_lightning/loops/fit_loop.py", line 359, in advance
self.epoch_loop.run(self._data_fetcher)
File "/home/lee/Documents/software/anaconda3/envs/rvc-webuibkp/lib/python3.9/site-packages/pytorch_lightning/loops/training_epoch_loop.py", line 136, in run
self.advance(data_fetcher)
File "/home/lee/Documents/software/anaconda3/envs/rvc-webuibkp/lib/python3.9/site-packages/pytorch_lightning/loops/training_epoch_loop.py", line 242, in advance
batch_output = self.manual_optimization.run(kwargs)
File "/home/lee/Documents/software/anaconda3/envs/rvc-webuibkp/lib/python3.9/site-packages/pytorch_lightning/loops/optimization/manual.py", line 92, in run
self.advance(kwargs)
File "/home/lee/Documents/software/anaconda3/envs/rvc-webuibkp/lib/python3.9/site-packages/pytorch_lightning/loops/optimization/manual.py", line 112, in advance
training_step_output = call._call_strategy_hook(trainer, "training_step", *kwargs.values())
File "/home/lee/Documents/software/anaconda3/envs/rvc-webuibkp/lib/python3.9/site-packages/pytorch_lightning/trainer/call.py", line 309, in _call_strategy_hook
output = fn(*args, **kwargs)
File "/home/lee/Documents/software/anaconda3/envs/rvc-webuibkp/lib/python3.9/site-packages/pytorch_lightning/strategies/strategy.py", line 381, in training_step
return self._forward_redirection(self.model, self.lightning_module, "training_step", *args, **kwargs)
File "/home/lee/Documents/software/anaconda3/envs/rvc-webuibkp/lib/python3.9/site-packages/pytorch_lightning/strategies/strategy.py", line 628, in __call__
wrapper_output = wrapper_module(*args, **kwargs)
File "/home/lee/Documents/software/anaconda3/envs/rvc-webuibkp/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1518, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
File "/home/lee/Documents/software/anaconda3/envs/rvc-webuibkp/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1527, in _call_impl
return forward_call(*args, **kwargs)
File "/home/lee/Documents/software/anaconda3/envs/rvc-webuibkp/lib/python3.9/site-packages/torch/nn/parallel/distributed.py", line 1519, in forward
else self._run_ddp_forward(*inputs, **kwargs)
File "/home/lee/Documents/software/anaconda3/envs/rvc-webuibkp/lib/python3.9/site-packages/torch/nn/parallel/distributed.py", line 1355, in _run_ddp_forward
return self.module(*inputs, **kwargs) # type: ignore[index]
File "/home/lee/Documents/software/anaconda3/envs/rvc-webuibkp/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1518, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
File "/home/lee/Documents/software/anaconda3/envs/rvc-webuibkp/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1527, in _call_impl
return forward_call(*args, **kwargs)
File "/home/lee/Documents/software/anaconda3/envs/rvc-webuibkp/lib/python3.9/site-packages/pytorch_lightning/strategies/strategy.py", line 621, in wrapped_forward
out = method(*_args, **_kwargs)
File "/home/lee/Desktop/workspace2/project/tts/RVC/c/training/core/model/systemv2.py", line 249, in training_step
loss_d, loss_g, total_loss = self.common_step(batch, batch_nb, train=True)
File "/home/lee/Desktop/workspace2/project/tts/RVC/c/training/core/model/systemv2.py", line 222, in common_step
self.manual_backward(loss_gen_all)
File "/home/lee/Documents/software/anaconda3/envs/rvc-webuibkp/lib/python3.9/site-packages/pytorch_lightning/core/module.py", line 1050, in manual_backward
self.trainer.strategy.backward(loss, None, *args, **kwargs)
File "/home/lee/Documents/software/anaconda3/envs/rvc-webuibkp/lib/python3.9/site-packages/pytorch_lightning/strategies/strategy.py", line 200, in backward
self.pre_backward(closure_loss)
File "/home/lee/Documents/software/anaconda3/envs/rvc-webuibkp/lib/python3.9/site-packages/pytorch_lightning/strategies/ddp.py", line 307, in pre_backward
prepare_for_backward(self.model, closure_loss)
File "/home/lee/Documents/software/anaconda3/envs/rvc-webuibkp/lib/python3.9/site-packages/pytorch_lightning/overrides/distributed.py", line 55, in prepare_for_backward
reducer._rebuild_buckets() # avoids "INTERNAL ASSERT FAILED" with `find_unused_parameters=False`
RuntimeError: It looks like your LightningModule has parameters that were not used in producing the loss returned by training_step. If this is intentional, you must enable the detection of unused parameters in DDP, either by setting the string value `strategy='ddp_find_unused_parameters_true'` or by setting the flag in the strategy with `strategy=DDPStrategy(find_unused_parameters=True)`.
Epoch 0: 0%| | 0/264 [00:02<?, ?it/s]
Process finished with exit code 1
```
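For reference, this is how the suggested flag gets passed to the Trainer — a minimal sketch only; the `accelerator`/`devices` values below are placeholders, not my actual config:

```python
# Minimal sketch of enabling unused-parameter detection in DDP, as the
# error message suggests. accelerator/devices are placeholder values.
import pytorch_lightning as pl
from pytorch_lightning.strategies import DDPStrategy

trainer = pl.Trainer(
    accelerator="gpu",
    devices=2,
    strategy=DDPStrategy(find_unused_parameters=True),
    # equivalent string form: strategy="ddp_find_unused_parameters_true"
)
```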
So I changed the setting to `strategy="ddp_find_unused_parameters_true"`, and then got the following warning instead:

```
[W reducer.cpp:1346] Warning: find_unused_parameters=True was specified in DDP constructor, but did not find any unused parameters in the forward pass. This flag results in an extra traversal of the autograd graph every iteration, which can adversely affect performance. If your model indeed never has any unused parameters in the forward pass, consider turning this flag off. Note that this warning may be a false positive if your model has flow control causing later iterations to have unused parameters. (function operator())
```

I haven't found an effective fix after searching GitHub issues.
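For context, my training step follows the standard manual-optimization GAN pattern from the docs. The sketch below is NOT my exact system: `net_g`/`net_d` and the hinge losses (and the latent size of 16) are stand-ins for the real generator, discriminator, and loss terms.

```python
# Simplified sketch of the manual-optimization GAN pattern, per
# https://lightning.ai/docs/pytorch/stable/model/manual_optimization.html.
# net_g/net_d and the hinge losses are placeholders, not the real code.
import pytorch_lightning as pl
import torch
import torch.nn.functional as F

class GANSystem(pl.LightningModule):
    def __init__(self, net_g: torch.nn.Module, net_d: torch.nn.Module):
        super().__init__()
        self.automatic_optimization = False  # required for manual optimization
        self.net_g = net_g
        self.net_d = net_d

    def training_step(self, batch, batch_idx):
        opt_g, opt_d = self.optimizers()
        real = batch
        fake = self.net_g(torch.randn(real.size(0), 16, device=self.device))

        # discriminator step (generator output detached)
        loss_d = (F.relu(1.0 - self.net_d(real))
                  + F.relu(1.0 + self.net_d(fake.detach()))).mean()
        opt_d.zero_grad()
        self.manual_backward(loss_d)
        opt_d.step()

        # generator step
        loss_g = -self.net_d(fake).mean()
        opt_g.zero_grad()
        self.manual_backward(loss_g)
        opt_g.step()

    def configure_optimizers(self):
        return (
            torch.optim.AdamW(self.net_g.parameters(), lr=2e-4),
            torch.optim.AdamW(self.net_d.parameters(), lr=2e-4),
        )
```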
What version are you seeing the problem on?
v2.1
How to reproduce the bug
No response
Error messages and logs
With `find_unused_parameters=True`, I added the following code to the model to print any parameter whose gradient is still `None` after each backward:

```python
def on_after_backward(self) -> None:
    print("on_after_backward enter")
    for name, p in self.named_parameters():
        if p.grad is None:
            print(name)
    print("on_after_backward exit")
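Since this is manual optimization, the hook runs once per `manual_backward` call, which is why each batch in the log below produces two enter/exit pairs — one for each of the two backward calls in the GAN step.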
I don't know why this happens; the `net_g` model (the GAN generator) is definitely trained during the training process, yet on the very first backward these parameters report no gradient:

```
on_after_backward enter
net_g.flow.flows.6.enc.res_skip_layers.2.weight_v
net_g.flow.flows.6.enc.cond_layer.bias
net_g.flow.flows.6.enc.cond_layer.weight_g
net_g.flow.flows.6.enc.cond_layer.weight_v
net_g.flow.flows.6.post.weight
net_g.flow.flows.6.post.bias
net_g.emb_g.weight
on_after_backward exit
[W reducer.cpp:1346] Warning: find_unused_parameters=True was specified in DDP constructor, but did not find any unused parameters in the forward pass. This flag results in an extra traversal of the autograd graph every iteration, which can adversely affect performance. If your model indeed never has any unused parameters in the forward pass, consider turning this flag off. Note that this warning may be a false positive if your model has flow control causing later iterations to have unused parameters. (function operator())
on_after_backward enter
on_after_backward exit
Epoch 0: 0%| | 1/264 [00:02<10:06, 0.43it/s, v_num=15]/home/lee/Documents/software/anaconda3/envs/rvc-webuibkp/lib/python3.9/site-packages/torch/autograd/__init__.py:251: UserWarning: Grad strides do not match bucket view strides. This may indicate grad was not created according to the gradient layout contract, or that the param's strides changed since DDP was constructed. This is not an error, but may impair performance.
grad.sizes() = [64, 1, 4], strides() = [4, 1, 1]
bucket_view.sizes() = [64, 1, 4], strides() = [4, 4, 1] (Triggered internally at ../torch/csrc/distributed/c10d/reducer.cpp:320.)
Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass
on_after_backward enter
on_after_backward exit
on_after_backward enter
on_after_backward exit
Epoch 0: 1%| | 2/264 [00:03<06:58, 0.63it/s, v_num=15]on_after_backward enter
on_after_backward exit
on_after_backward enter
on_after_backward exit
Epoch 0: 1%| | 3/264 [00:04<05:53, 0.74it/s, v_num=15]on_after_backward enter
on_after_backward exit
on_after_backward enter
on_after_backward exit
Epoch 0: 2%|▏ | 4/264 [00:04<05:17, 0.82it/s, v_num=15]on_after_backward enter
on_after_backward exit
on_after_backward enter
on_after_backward exit
Epoch 0: 2%|▏ | 5/264 [00:05<04:57, 0.87it/s, v_num=15]on_after_backward enter
on_after_backward exit
on_after_backward enter
on_after_backward exit
Epoch 0: 2%|▏ | 6/264 [00:06<04:44, 0.91it/s, v_num=15]on_after_backward enter
on_after_backward exit
on_after_backward enter
on_after_backward exit
Epoch 0: 3%|▎ | 7/264 [00:07<04:35, 0.93it/s, v_num=15]on_after_backward enter
on_after_backward exit
on_after_backward enter
on_after_backward exit
Epoch 0: 3%|▎ | 8/264 [00:08<04:27, 0.96it/s, v_num=15]on_after_backward enter
on_after_backward exit
on_after_backward enter
on_after_backward exit
Epoch 0: 3%|▎ | 9/264 [00:09<04:22, 0.97it/s, v_num=15]on_after_backward enter
on_after_backward exit
```
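If I read the DDP warning's note about flow control correctly, parameters that only participate in some forward paths behave like this toy example (my own illustration, not the project's code):

```python
# Toy illustration (my own, not the project's code): a submodule skipped
# by control flow ends up with grad None, which is exactly what DDP with
# find_unused_parameters=False refuses to handle.
import torch
import torch.nn as nn

class ConditionalNet(nn.Module):
    def __init__(self):
        super().__init__()
        self.main = nn.Linear(8, 8)
        self.extra = nn.Linear(8, 8)  # only used on some batches

    def forward(self, x, use_extra: bool):
        h = self.main(x)
        if use_extra:
            h = self.extra(h)
        return h.sum()

net = ConditionalNet()
net(torch.randn(2, 8), use_extra=False).backward()
# extra.weight / extra.bias still have grad None:
print([n for n, p in net.named_parameters() if p.grad is None])
```

Maybe something similar applies to `net_g.flow` and `net_g.emb_g` here, but I haven't confirmed it.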
Environment
```
#- Lightning Component (e.g. Trainer, LightningModule, LightningApp, LightningWork, LightningFlow):
#- PyTorch Lightning Version (e.g., 1.5.0): 2.1.0
#- Lightning App Version (e.g., 0.5.2):
#- PyTorch Version (e.g., 2.0): 2.1.0+cu118
#- Python version (e.g., 3.9): 3.10
#- OS (e.g., Linux): ubuntu20.04
#- CUDA/cuDNN version: 2.1.0+cu118
#- GPU models and configuration: rtx1080ti
#- How you installed Lightning(`conda`, `pip`, source): pip
#- Running environment of LightningApp (e.g. local, cloud): local
```
More info

No response