Hi. I have PyTorch Lightning 1.6.0 and I'm getting the following error:
Epoch 0: 0%| | 0/846 [00:00<?, ?it/s]Fatal Python error: Floating point exception
Thread 0x00007f7fe9fff700 (most recent call first):
File "/opt/conda/lib/python3.8/threading.py", line 306 in wait
File "/opt/conda/lib/python3.8/threading.py", line 558 in wait
File "/opt/conda/lib/python3.8/site-packages/tqdm/_monitor.py", line 60 in run
File "/opt/conda/lib/python3.8/threading.py", line 932 in _bootstrap_inner
File "/opt/conda/lib/python3.8/threading.py", line 890 in _bootstrap
Thread 0x00007f7e3d312700 (most recent call first):
File "/opt/conda/lib/python3.8/threading.py", line 306 in wait
File "/opt/conda/lib/python3.8/queue.py", line 179 in get
File "/opt/conda/lib/python3.8/site-packages/tensorboard/summary/writer/event_file_writer.py", line 227 in run
File "/opt/conda/lib/python3.8/threading.py", line 932 in _bootstrap_inner
File "/opt/conda/lib/python3.8/threading.py", line 890 in _bootstrap
Current thread 0x00007f8c41d760c0 (most recent call first):
File "/run/determined/workdir/stable-diffusion/ldm/modules/diffusionmodules/util.py", line 162 in timestep_embedding
File "/run/determined/workdir/stable-diffusion/ldm/modules/diffusionmodules/openaimodel.py", line 758 in forward
File "/opt/conda/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1110 in _call_impl
File "/run/determined/workdir/stable-diffusion/ldm/models/diffusion/ddpm.py", line 1458 in forward
File "/opt/conda/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1110 in _call_impl
File "/run/determined/workdir/stable-diffusion/ldm/models/diffusion/ddpm.py", line 993 in apply_model
File "/run/determined/workdir/stable-diffusion/ldm/models/diffusion/ddpm.py", line 1021 in p_losses
File "/run/determined/workdir/stable-diffusion/ldm/models/diffusion/ddpm.py", line 885 in forward
File "/opt/conda/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1110 in _call_impl
File "/run/determined/workdir/stable-diffusion/ldm/models/diffusion/ddpm.py", line 873 in shared_step
File "/run/determined/workdir/stable-diffusion/ldm/models/diffusion/ddpm.py", line 406 in training_step
File "/opt/conda/lib/python3.8/site-packages/pytorch_lightning/overrides/base.py", line 82 in forward
File "/opt/conda/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1110 in _call_impl
File "/opt/conda/lib/python3.8/site-packages/torch/nn/parallel/distributed.py", line 963 in forward
File "/opt/conda/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1110 in _call_impl
File "/opt/conda/lib/python3.8/site-packages/pytorch_lightning/strategies/ddp.py", line 344 in training_step
File "/opt/conda/lib/python3.8/site-packages/pytorch_lightning/trainer/trainer.py", line 1766 in _call_strategy_hook
File "/opt/conda/lib/python3.8/site-packages/pytorch_lightning/loops/optimization/optimizer_loop.py", line 427 in _training_step
File "/opt/conda/lib/python3.8/site-packages/pytorch_lightning/loops/optimization/optimizer_loop.py", line 134 in closure
File "/opt/conda/lib/python3.8/site-packages/pytorch_lightning/loops/optimization/optimizer_loop.py", line 148 in __call__
File "/opt/conda/lib/python3.8/site-packages/pytorch_lightning/plugins/precision/precision_plugin.py", line 140 in _wrap_closure
File "/opt/conda/lib/python3.8/site-packages/torch/optim/adamw.py", line 100 in step
File "/opt/conda/lib/python3.8/site-packages/torch/autograd/grad_mode.py", line 27 in decorate_context
File "/opt/conda/lib/python3.8/site-packages/torch/optim/optimizer.py", line 88 in wrapper
File "/opt/conda/lib/python3.8/site-packages/torch/optim/lr_scheduler.py", line 65 in wrapper
File "/opt/conda/lib/python3.8/site-packages/pytorch_lightning/plugins/precision/precision_plugin.py", line 155 in optimizer_step
File "/opt/conda/lib/python3.8/site-packages/pytorch_lightning/strategies/strategy.py", line 193 in optimizer_step
File "/opt/conda/lib/python3.8/site-packages/pytorch_lightning/strategies/ddp.py", line 278 in optimizer_step
File "/opt/conda/lib/python3.8/site-packages/pytorch_lightning/core/optimizer.py", line 168 in step
File "/opt/conda/lib/python3.8/site-packages/pytorch_lightning/core/lightning.py", line 1625 in optimizer_step
File "/opt/conda/lib/python3.8/site-packages/pytorch_lightning/trainer/trainer.py", line 1596 in _call_lightning_module_hook
File "/opt/conda/lib/python3.8/site-packages/pytorch_lightning/loops/optimization/optimizer_loop.py", line 369 in _optimizer_step
File "/opt/conda/lib/python3.8/site-packages/pytorch_lightning/loops/optimization/optimizer_loop.py", line 256 in _run_optimization
File "/opt/conda/lib/python3.8/site-packages/pytorch_lightning/loops/optimization/optimizer_loop.py", line 203 in advance
File "/opt/conda/lib/python3.8/site-packages/pytorch_lightning/loops/base.py", line 204 in run
File "/opt/conda/lib/python3.8/site-packages/pytorch_lightning/loops/batch/training_batch_loop.py", line 88 in advance
File "/opt/conda/lib/python3.8/site-packages/pytorch_lightning/loops/base.py", line 204 in run
File "/opt/conda/lib/python3.8/site-packages/pytorch_lightning/loops/epoch/training_epoch_loop.py", line 208 in advance
File "/opt/conda/lib/python3.8/site-packages/pytorch_lightning/loops/base.py", line 204 in run
File "/opt/conda/lib/python3.8/site-packages/pytorch_lightning/loops/fit_loop.py", line 269 in advance
File "/opt/conda/lib/python3.8/site-packages/pytorch_lightning/loops/base.py", line 204 in run
File "/opt/conda/lib/python3.8/site-packages/pytorch_lightning/trainer/trainer.py", line 1354 in _run_train
File "/opt/conda/lib/python3.8/site-packages/pytorch_lightning/trainer/trainer.py", line 1324 in _run_stage
File "/opt/conda/lib/python3.8/site-packages/pytorch_lightning/trainer/trainer.py", line 1237 in _run
File "/opt/conda/lib/python3.8/site-packages/pytorch_lightning/trainer/trainer.py", line 812 in _fit_impl
File "/opt/conda/lib/python3.8/site-packages/pytorch_lightning/strategies/launchers/subprocess_script.py", line 93 in launch
File "/opt/conda/lib/python3.8/site-packages/pytorch_lightning/trainer/trainer.py", line 722 in _call_and_handle_interrupt
File "/opt/conda/lib/python3.8/site-packages/pytorch_lightning/trainer/trainer.py", line 771 in fit
File "main.py", line 816 in <module>
/bin/bash: line 1: 61125 Floating point exception(core dumped) ( python main.py -t --base configs/stable-diffusion/pokemon.yaml --gpus 1 --scale_lr False --num_nodes 1 --check_val_every_n_epoch 20 --finetune_from "/root/.cache/huggingface/hub/models--CompVis--stable-diffusion-v-1-4-original/snapshots/f0bb45b49990512c454cf2c5670b0952ef2f9c71/sd-v1-4-full-ema.ckpt" data.params.batch_size="1" lightning.trainer.accumulate_grad_batches="1" data.params.validation.params.n_gpus=1 )
The code it's pointing at in main.py is this:
trainer.fit(model, data)
except Exception:
if not opt.debug:
melk()
raise
if not opt.no_test and not trainer.interrupted:
trainer.test(model, data)
Hi. I have PyTorch Lightning 1.6.0 and I'm getting the following error:
The code it's pointing at in main.py is this:
Where am I going wrong?