To get past the basic errors in Google Colab, I found that I had to make at least the following changes to the files and environment:
In ljspeech.yaml, set workers to 2 instead of 20
!pip install --upgrade diffusers
!pip install --upgrade torch
!pip install --upgrade torchaudio
In pflow/text/symbols.py, set _punctuation = ';:,.!?¡¿—…"«»“” ()[]'
But even after those changes, I still get errors like this:
[2024-10-30 12:07:47,682][pflow.utils.utils][ERROR] -
Traceback (most recent call last):
File "/usr/local/lib/python3.10/dist-packages/lightning/pytorch/trainer/call.py", line 47, in _call_and_handle_interrupt
return trainer_fn(*args, **kwargs)
File "/usr/local/lib/python3.10/dist-packages/lightning/pytorch/trainer/trainer.py", line 574, in _fit_impl
self._run(model, ckpt_path=ckpt_path)
File "/usr/local/lib/python3.10/dist-packages/lightning/pytorch/trainer/trainer.py", line 981, in _run
results = self._run_stage()
File "/usr/local/lib/python3.10/dist-packages/lightning/pytorch/trainer/trainer.py", line 1023, in _run_stage
self._run_sanity_check()
File "/usr/local/lib/python3.10/dist-packages/lightning/pytorch/trainer/trainer.py", line 1052, in _run_sanity_check
val_loop.run()
File "/usr/local/lib/python3.10/dist-packages/lightning/pytorch/loops/utilities.py", line 178, in _decorator
return loop_run(self, *args, **kwargs)
File "/usr/local/lib/python3.10/dist-packages/lightning/pytorch/loops/evaluation_loop.py", line 135, in run
self._evaluation_step(batch, batch_idx, dataloader_idx, dataloader_iter)
File "/usr/local/lib/python3.10/dist-packages/lightning/pytorch/loops/evaluation_loop.py", line 396, in _evaluation_step
output = call._call_strategy_hook(trainer, hook_name, step_args)
File "/usr/local/lib/python3.10/dist-packages/lightning/pytorch/trainer/call.py", line 319, in _call_strategy_hook
output = fn(*args, **kwargs)
File "/usr/local/lib/python3.10/dist-packages/lightning/pytorch/strategies/strategy.py", line 411, in validation_step
return self.lightning_module.validation_step(*args, **kwargs)
File "/content/pflowtts_pytorch/pflow/models/baselightningmodule.py", line 150, in validation_step
loss_dict, attn_dict = self.get_losses(batch)
File "/content/pflowtts_pytorch/pflow/models/baselightningmodule.py", line 73, in get_losses
dur_loss, prior_loss, diff_loss, attn = self(
File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1736, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1747, in _call_impl
return forward_call(*args, **kwargs)
File "/content/pflowtts_pytorch/pflow/models/pflow_tts.py", line 125, in forward
mu_x, logw, x_mask = self.encoder(x, x_lengths, prompt_slice)
File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1736, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1747, in _call_impl
return forward_call(*args, **kwargs)
File "/content/pflowtts_pytorch/pflow/models/components/speech_prompt_encoder.py", line 569, in forward
x_emb = self.emb(x_input) * math.sqrt(self.n_channels)
RuntimeError: CUDA error: device-side assert triggered
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1
Compile with TORCH_USE_CUDA_DSA to enable device-side assertions.
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/content/pflowtts_pytorch/pflow/utils/utils.py", line 76, in wrap
metric_dict, object_dict = task_func(cfg=cfg)
File "/content/pflowtts_pytorch/pflow/train.py", line 97, in train
trainer.fit(model=model, datamodule=datamodule)
File "/usr/local/lib/python3.10/dist-packages/lightning/pytorch/trainer/trainer.py", line 538, in fit
call._call_and_handle_interrupt(
File "/usr/local/lib/python3.10/dist-packages/lightning/pytorch/trainer/call.py", line 68, in _call_and_handle_interrupt
trainer._teardown()
File "/usr/local/lib/python3.10/dist-packages/lightning/pytorch/trainer/trainer.py", line 1004, in _teardown
self.strategy.teardown()
File "/usr/local/lib/python3.10/dist-packages/lightning/pytorch/strategies/strategy.py", line 535, in teardown
self.lightning_module.cpu()
File "/usr/local/lib/python3.10/dist-packages/lightning/fabric/utilities/device_dtype_mixin.py", line 82, in cpu
return super().cpu()
File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1118, in cpu
return self._apply(lambda t: t.cpu())
File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 900, in _apply
module._apply(fn)
File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 900, in _apply
module._apply(fn)
File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 927, in _apply
param_applied = fn(param)
File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1118, in <lambda>
return self._apply(lambda t: t.cpu())
RuntimeError: CUDA error: device-side assert triggered
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1
Compile with TORCH_USE_CUDA_DSA to enable device-side assertions.
[2024-10-30 12:07:47,697][pflow.utils.utils][INFO] - Output dir: /content/pflowtts_pytorch/logs/train/ljspeech/runs/2024-10-30_12-07-30
Error executing job with overrides: ['experiment=ljspeech']
Traceback (most recent call last):
File "/usr/local/lib/python3.10/dist-packages/lightning/pytorch/trainer/call.py", line 47, in _call_and_handle_interrupt
return trainer_fn(*args, **kwargs)
File "/usr/local/lib/python3.10/dist-packages/lightning/pytorch/trainer/trainer.py", line 574, in _fit_impl
self._run(model, ckpt_path=ckpt_path)
File "/usr/local/lib/python3.10/dist-packages/lightning/pytorch/trainer/trainer.py", line 981, in _run
results = self._run_stage()
File "/usr/local/lib/python3.10/dist-packages/lightning/pytorch/trainer/trainer.py", line 1023, in _run_stage
self._run_sanity_check()
File "/usr/local/lib/python3.10/dist-packages/lightning/pytorch/trainer/trainer.py", line 1052, in _run_sanity_check
val_loop.run()
File "/usr/local/lib/python3.10/dist-packages/lightning/pytorch/loops/utilities.py", line 178, in _decorator
return loop_run(self, *args, **kwargs)
File "/usr/local/lib/python3.10/dist-packages/lightning/pytorch/loops/evaluation_loop.py", line 135, in run
self._evaluation_step(batch, batch_idx, dataloader_idx, dataloader_iter)
File "/usr/local/lib/python3.10/dist-packages/lightning/pytorch/loops/evaluation_loop.py", line 396, in _evaluation_step
output = call._call_strategy_hook(trainer, hook_name, step_args)
File "/usr/local/lib/python3.10/dist-packages/lightning/pytorch/trainer/call.py", line 319, in _call_strategy_hook
output = fn(*args, **kwargs)
File "/usr/local/lib/python3.10/dist-packages/lightning/pytorch/strategies/strategy.py", line 411, in validation_step
return self.lightning_module.validation_step(*args, **kwargs)
File "/content/pflowtts_pytorch/pflow/models/baselightningmodule.py", line 150, in validation_step
loss_dict, attn_dict = self.get_losses(batch)
File "/content/pflowtts_pytorch/pflow/models/baselightningmodule.py", line 73, in get_losses
dur_loss, prior_loss, diff_loss, attn = self(
File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1736, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1747, in _call_impl
return forward_call(*args, **kwargs)
File "/content/pflowtts_pytorch/pflow/models/pflow_tts.py", line 125, in forward
mu_x, logw, x_mask = self.encoder(x, x_lengths, prompt_slice)
File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1736, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1747, in _call_impl
return forward_call(*args, **kwargs)
File "/content/pflowtts_pytorch/pflow/models/components/speech_prompt_encoder.py", line 569, in forward
x_emb = self.emb(x_input) * math.sqrt(self.n_channels)
RuntimeError: CUDA error: device-side assert triggered
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1
Compile with TORCH_USE_CUDA_DSA to enable device-side assertions.
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/content/pflowtts_pytorch/pflow/train.py", line 130, in main
metric_dict, _ = train(cfg)
File "/content/pflowtts_pytorch/pflow/utils/utils.py", line 86, in wrap
raise ex
File "/content/pflowtts_pytorch/pflow/utils/utils.py", line 76, in wrap
metric_dict, object_dict = task_func(cfg=cfg)
File "/content/pflowtts_pytorch/pflow/train.py", line 97, in train
trainer.fit(model=model, datamodule=datamodule)
File "/usr/local/lib/python3.10/dist-packages/lightning/pytorch/trainer/trainer.py", line 538, in fit
call._call_and_handle_interrupt(
File "/usr/local/lib/python3.10/dist-packages/lightning/pytorch/trainer/call.py", line 68, in _call_and_handle_interrupt
trainer._teardown()
File "/usr/local/lib/python3.10/dist-packages/lightning/pytorch/trainer/trainer.py", line 1004, in _teardown
self.strategy.teardown()
File "/usr/local/lib/python3.10/dist-packages/lightning/pytorch/strategies/strategy.py", line 535, in teardown
self.lightning_module.cpu()
File "/usr/local/lib/python3.10/dist-packages/lightning/fabric/utilities/device_dtype_mixin.py", line 82, in cpu
return super().cpu()
File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1118, in cpu
return self._apply(lambda t: t.cpu())
File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 900, in _apply
module._apply(fn)
File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 900, in _apply
module._apply(fn)
File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 927, in _apply
param_applied = fn(param)
File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1118, in <lambda>
return self._apply(lambda t: t.cpu())
RuntimeError: CUDA error: device-side assert triggered
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1
Compile with TORCH_USE_CUDA_DSA to enable device-side assertions.
Set the environment variable HYDRA_FULL_ERROR=1 for a complete stack trace.
terminate called after throwing an instance of 'c10::Error'
what(): CUDA error: device-side assert triggered
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1
Compile with TORCH_USE_CUDA_DSA to enable device-side assertions.
Exception raised from c10_cuda_check_implementation at ../c10/cuda/CUDAException.cpp:43 (most recent call first):
frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x96 (0x78c8b68b9446 in /usr/local/lib/python3.10/dist-packages/torch/lib/libc10.so)
frame #1: c10::detail::torchCheckFail(char const*, char const*, unsigned int, std::string const&) + 0x64 (0x78c8b68636e4 in /usr/local/lib/python3.10/dist-packages/torch/lib/libc10.so)
frame #2: c10::cuda::c10_cuda_check_implementation(int, char const*, char const*, int, bool) + 0x118 (0x78c8b69a5a18 in /usr/local/lib/python3.10/dist-packages/torch/lib/libc10_cuda.so)
frame #3: <unknown function> + 0x1021c88 (0x78c86c67fc88 in /usr/local/lib/python3.10/dist-packages/torch/lib/libtorch_cuda.so)
frame #4: <unknown function> + 0x102a735 (0x78c86c688735 in /usr/local/lib/python3.10/dist-packages/torch/lib/libtorch_cuda.so)
frame #5: <unknown function> + 0x5faf70 (0x78c8b579af70 in /usr/local/lib/python3.10/dist-packages/torch/lib/libtorch_python.so)
frame #6: <unknown function> + 0x6f69f (0x78c8b689a69f in /usr/local/lib/python3.10/dist-packages/torch/lib/libc10.so)
frame #7: c10::TensorImpl::~TensorImpl() + 0x21b (0x78c8b689337b in /usr/local/lib/python3.10/dist-packages/torch/lib/libc10.so)
frame #8: c10::TensorImpl::~TensorImpl() + 0x9 (0x78c8b6893529 in /usr/local/lib/python3.10/dist-packages/torch/lib/libc10.so)
frame #9: <unknown function> + 0x8c1a98 (0x78c8b5a61a98 in /usr/local/lib/python3.10/dist-packages/torch/lib/libtorch_python.so)
frame #10: THPVariable_subclass_dealloc(_object*) + 0x2c6 (0x78c8b5a61de6 in /usr/local/lib/python3.10/dist-packages/torch/lib/libtorch_python.so)
<omitting python frames>
frame #36: <unknown function> + 0x29d90 (0x78c8b8716d90 in /lib/x86_64-linux-gnu/libc.so.6)
frame #37: __libc_start_main + 0x80 (0x78c8b8716e40 in /lib/x86_64-linux-gnu/libc.so.6)
I tried changing max_epochs in the trainer config, every_n_epochs in model_checkpoint.yaml, and batch_size in ljspeech.yaml, but this did not help and produced new errors such as:
../aten/src/ATen/native/cuda/Indexing.cu:1308: indexSelectLargeIndex: block: [13,0,0], thread: [0,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
I also don't really understand how all of these pieces relate to each other; I just wanted to try training as a first step towards understanding it. I'm sorry for this issue.
To get past the basic errors in Google Colab, I found that I had to make at least the following changes to the files and environment:
In ljspeech.yaml, set workers to 2 instead of 20
!pip install --upgrade diffusers
!pip install --upgrade torch
!pip install --upgrade torchaudio
In pflow/text/symbols.py, set _punctuation = ';:,.!?¡¿—…"«»“” ()[]'
But even after those changes, I still get errors like this:
[2024-10-30 12:07:47,682][pflow.utils.utils][ERROR] - Traceback (most recent call last): File "/usr/local/lib/python3.10/dist-packages/lightning/pytorch/trainer/call.py", line 47, in _call_and_handle_interrupt return trainer_fn(*args, kwargs) File "/usr/local/lib/python3.10/dist-packages/lightning/pytorch/trainer/trainer.py", line 574, in _fit_impl self._run(model, ckpt_path=ckpt_path) File "/usr/local/lib/python3.10/dist-packages/lightning/pytorch/trainer/trainer.py", line 981, in _run results = self._run_stage() File "/usr/local/lib/python3.10/dist-packages/lightning/pytorch/trainer/trainer.py", line 1023, in _run_stage self._run_sanity_check() File "/usr/local/lib/python3.10/dist-packages/lightning/pytorch/trainer/trainer.py", line 1052, in _run_sanity_check val_loop.run() File "/usr/local/lib/python3.10/dist-packages/lightning/pytorch/loops/utilities.py", line 178, in _decorator return loop_run(self, *args, kwargs) File "/usr/local/lib/python3.10/dist-packages/lightning/pytorch/loops/evaluation_loop.py", line 135, in run self._evaluation_step(batch, batch_idx, dataloader_idx, dataloader_iter) File "/usr/local/lib/python3.10/dist-packages/lightning/pytorch/loops/evaluation_loop.py", line 396, in _evaluation_step output = call._call_strategy_hook(trainer, hook_name, step_args) File "/usr/local/lib/python3.10/dist-packages/lightning/pytorch/trainer/call.py", line 319, in _call_strategy_hook output = fn(args, kwargs) File "/usr/local/lib/python3.10/dist-packages/lightning/pytorch/strategies/strategy.py", line 411, in validation_step return self.lightning_module.validation_step(*args, kwargs) File "/content/pflowtts_pytorch/pflow/models/baselightningmodule.py", line 150, in validation_step loss_dict, attn_dict = self.get_losses(batch) File "/content/pflowtts_pytorch/pflow/models/baselightningmodule.py", line 73, in get_losses dur_loss, prior_loss, diff_loss, attn = self( File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1736, in 
_wrapped_call_impl return self._call_impl(*args, kwargs) File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1747, in _call_impl return forward_call(*args, *kwargs) File "/content/pflowtts_pytorch/pflow/models/pflow_tts.py", line 125, in forward mu_x, logw, x_mask = self.encoder(x, x_lengths, prompt_slice) File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1736, in _wrapped_call_impl return self._call_impl(args, kwargs) File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1747, in _call_impl return forward_call(*args, *kwargs) File "/content/pflowtts_pytorch/pflow/models/components/speech_prompt_encoder.py", line 569, in forward x_emb = self.emb(x_input) math.sqrt(self.n_channels) RuntimeError: CUDA error: device-side assert triggered CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. For debugging consider passing CUDA_LAUNCH_BLOCKING=1 Compile with
TORCH_USE_CUDA_DSA
to enable device-side assertions.
During handling of the above exception, another exception occurred:
Traceback (most recent call last): File "/content/pflowtts_pytorch/pflow/utils/utils.py", line 76, in wrap metric_dict, object_dict = task_func(cfg=cfg) File "/content/pflowtts_pytorch/pflow/train.py", line 97, in train trainer.fit(model=model, datamodule=datamodule) File "/usr/local/lib/python3.10/dist-packages/lightning/pytorch/trainer/trainer.py", line 538, in fit call._call_and_handle_interrupt( File "/usr/local/lib/python3.10/dist-packages/lightning/pytorch/trainer/call.py", line 68, in _call_and_handle_interrupt trainer._teardown() File "/usr/local/lib/python3.10/dist-packages/lightning/pytorch/trainer/trainer.py", line 1004, in _teardown self.strategy.teardown() File "/usr/local/lib/python3.10/dist-packages/lightning/pytorch/strategies/strategy.py", line 535, in teardown self.lightning_module.cpu() File "/usr/local/lib/python3.10/dist-packages/lightning/fabric/utilities/device_dtype_mixin.py", line 82, in cpu return super().cpu() File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1118, in cpu return self._apply(lambda t: t.cpu()) File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 900, in _apply module._apply(fn) File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 900, in _apply module._apply(fn) File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 927, in _apply param_applied = fn(param) File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1118, in
return self._apply(lambda t: t.cpu())
RuntimeError: CUDA error: device-side assert triggered
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1
Compile with
TORCH_USE_CUDA_DSA
to enable device-side assertions.[2024-10-30 12:07:47,697][pflow.utils.utils][INFO] - Output dir: /content/pflowtts_pytorch/logs/train/ljspeech/runs/2024-10-30_12-07-30 Error executing job with overrides: ['experiment=ljspeech'] Traceback (most recent call last): File "/usr/local/lib/python3.10/dist-packages/lightning/pytorch/trainer/call.py", line 47, in _call_and_handle_interrupt return trainer_fn(*args, kwargs) File "/usr/local/lib/python3.10/dist-packages/lightning/pytorch/trainer/trainer.py", line 574, in _fit_impl self._run(model, ckpt_path=ckpt_path) File "/usr/local/lib/python3.10/dist-packages/lightning/pytorch/trainer/trainer.py", line 981, in _run results = self._run_stage() File "/usr/local/lib/python3.10/dist-packages/lightning/pytorch/trainer/trainer.py", line 1023, in _run_stage self._run_sanity_check() File "/usr/local/lib/python3.10/dist-packages/lightning/pytorch/trainer/trainer.py", line 1052, in _run_sanity_check val_loop.run() File "/usr/local/lib/python3.10/dist-packages/lightning/pytorch/loops/utilities.py", line 178, in _decorator return loop_run(self, *args, kwargs) File "/usr/local/lib/python3.10/dist-packages/lightning/pytorch/loops/evaluation_loop.py", line 135, in run self._evaluation_step(batch, batch_idx, dataloader_idx, dataloader_iter) File "/usr/local/lib/python3.10/dist-packages/lightning/pytorch/loops/evaluation_loop.py", line 396, in _evaluation_step output = call._call_strategy_hook(trainer, hook_name, step_args) File "/usr/local/lib/python3.10/dist-packages/lightning/pytorch/trainer/call.py", line 319, in _call_strategy_hook output = fn(args, kwargs) File "/usr/local/lib/python3.10/dist-packages/lightning/pytorch/strategies/strategy.py", line 411, in validation_step return self.lightning_module.validation_step(*args, kwargs) File "/content/pflowtts_pytorch/pflow/models/baselightningmodule.py", line 150, in validation_step loss_dict, attn_dict = self.get_losses(batch) File 
"/content/pflowtts_pytorch/pflow/models/baselightningmodule.py", line 73, in get_losses dur_loss, prior_loss, diff_loss, attn = self( File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1736, in _wrapped_call_impl return self._call_impl(*args, kwargs) File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1747, in _call_impl return forward_call(*args, *kwargs) File "/content/pflowtts_pytorch/pflow/models/pflow_tts.py", line 125, in forward mu_x, logw, x_mask = self.encoder(x, x_lengths, prompt_slice) File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1736, in _wrapped_call_impl return self._call_impl(args, kwargs) File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1747, in _call_impl return forward_call(*args, *kwargs) File "/content/pflowtts_pytorch/pflow/models/components/speech_prompt_encoder.py", line 569, in forward x_emb = self.emb(x_input) math.sqrt(self.n_channels) RuntimeError: CUDA error: device-side assert triggered CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. For debugging consider passing CUDA_LAUNCH_BLOCKING=1 Compile with
TORCH_USE_CUDA_DSA
to enable device-side assertions.
During handling of the above exception, another exception occurred:
Traceback (most recent call last): File "/content/pflowtts_pytorch/pflow/train.py", line 130, in main metricdict, = train(cfg) File "/content/pflowtts_pytorch/pflow/utils/utils.py", line 86, in wrap raise ex File "/content/pflowtts_pytorch/pflow/utils/utils.py", line 76, in wrap metric_dict, object_dict = task_func(cfg=cfg) File "/content/pflowtts_pytorch/pflow/train.py", line 97, in train trainer.fit(model=model, datamodule=datamodule) File "/usr/local/lib/python3.10/dist-packages/lightning/pytorch/trainer/trainer.py", line 538, in fit call._call_and_handle_interrupt( File "/usr/local/lib/python3.10/dist-packages/lightning/pytorch/trainer/call.py", line 68, in _call_and_handle_interrupt trainer._teardown() File "/usr/local/lib/python3.10/dist-packages/lightning/pytorch/trainer/trainer.py", line 1004, in _teardown self.strategy.teardown() File "/usr/local/lib/python3.10/dist-packages/lightning/pytorch/strategies/strategy.py", line 535, in teardown self.lightning_module.cpu() File "/usr/local/lib/python3.10/dist-packages/lightning/fabric/utilities/device_dtype_mixin.py", line 82, in cpu return super().cpu() File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1118, in cpu return self._apply(lambda t: t.cpu()) File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 900, in _apply module._apply(fn) File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 900, in _apply module._apply(fn) File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 927, in _apply param_applied = fn(param) File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1118, in
return self._apply(lambda t: t.cpu())
RuntimeError: CUDA error: device-side assert triggered
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1
Compile with
TORCH_USE_CUDA_DSA
to enable device-side assertions.Set the environment variable HYDRA_FULL_ERROR=1 for a complete stack trace. terminate called after throwing an instance of 'c10::Error' what(): CUDA error: device-side assert triggered CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. For debugging consider passing CUDA_LAUNCH_BLOCKING=1 Compile with
TORCH_USE_CUDA_DSA
to enable device-side assertions.Exception raised from c10_cuda_check_implementation at ../c10/cuda/CUDAException.cpp:43 (most recent call first): frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x96 (0x78c8b68b9446 in /usr/local/lib/python3.10/dist-packages/torch/lib/libc10.so) frame #1: c10::detail::torchCheckFail(char const, char const, unsigned int, std::string const&) + 0x64 (0x78c8b68636e4 in /usr/local/lib/python3.10/dist-packages/torch/lib/libc10.so) frame #2: c10::cuda::c10_cuda_check_implementation(int, char const, char const, int, bool) + 0x118 (0x78c8b69a5a18 in /usr/local/lib/python3.10/dist-packages/torch/lib/libc10_cuda.so) frame #3: + 0x1021c88 (0x78c86c67fc88 in /usr/local/lib/python3.10/dist-packages/torch/lib/libtorch_cuda.so)
frame #4: <unknown function> + 0x102a735 (0x78c86c688735 in /usr/local/lib/python3.10/dist-packages/torch/lib/libtorch_cuda.so)
frame #5: <unknown function> + 0x5faf70 (0x78c8b579af70 in /usr/local/lib/python3.10/dist-packages/torch/lib/libtorch_python.so)
frame #6: <unknown function> + 0x6f69f (0x78c8b689a69f in /usr/local/lib/python3.10/dist-packages/torch/lib/libc10.so)
frame #7: c10::TensorImpl::~TensorImpl() + 0x21b (0x78c8b689337b in /usr/local/lib/python3.10/dist-packages/torch/lib/libc10.so)
frame #8: c10::TensorImpl::~TensorImpl() + 0x9 (0x78c8b6893529 in /usr/local/lib/python3.10/dist-packages/torch/lib/libc10.so)
frame #9: <unknown function> + 0x8c1a98 (0x78c8b5a61a98 in /usr/local/lib/python3.10/dist-packages/torch/lib/libtorch_python.so)
frame #10: THPVariable_subclass_dealloc(_object*) + 0x2c6 (0x78c8b5a61de6 in /usr/local/lib/python3.10/dist-packages/torch/lib/libtorch_python.so)