**Describe the bug**\
DataLoader fails when running on multiple (2) GPUs using the "dp" strategy.

**FuseMedML version**\
commit b8935935a66e7dd7c01a897440f89ffff0f2ba2a (HEAD -> master, origin/master, origin/HEAD)
Author: Moshiko Raboh <86309179+mosheraboh@users.noreply.github.com>
Date: Tue Mar 14 18:16:38 2023 +0200

**Python version**\
Python 3.8.13

**To reproduce**
```bash
source /dccstor/mm_hcls/common/env.sh
python fuse_example/multimodality/ehr_transformer/main_train.py
```

In order to skip raw data loading and run faster:
- limit the number of batches (line 59 in main_train.py)
- use a pkl of the raw data (update config.yaml line 13 to `raw_data_pkl: ${oc.env:CINC_DATA_PKL}`; see the sketch after this list)
- set num_workers=0 (config.yaml, lines 35 and 38) to see the errors shown below
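As a rough illustration of the pkl step, here is a minimal sketch of how the `${oc.env:CINC_DATA_PKL}` interpolation on config.yaml line 13 resolves through OmegaConf's built-in `oc.env` resolver (the example path and the standalone snippet are assumptions; only the `raw_data_pkl` key comes from the config):

```python
import os

from omegaconf import OmegaConf

# Assumed example value; in practice CINC_DATA_PKL should already be exported
# (e.g. by env.sh) and point at the pre-built pkl of the raw data.
os.environ.setdefault("CINC_DATA_PKL", "/path/to/raw_data.pkl")

# Mirrors the config.yaml entry: raw_data_pkl: ${oc.env:CINC_DATA_PKL}
cfg = OmegaConf.create({"raw_data_pkl": "${oc.env:CINC_DATA_PKL}"})
print(cfg.raw_data_pkl)  # -> /path/to/raw_data.pkl
```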
**Expected behavior**\
Passing at least one epoch.
**Screenshots**

```
Epoch 0: 79%|██████████████████████████████████████▉ | 50/63 [01:44<00:27, 2.09s/it, loss=nan, v_num=1failed to process key data.sample_id | 0/13 [00:00<?, ?it/s]
Error executing job with overrides: []
Traceback (most recent call last):
File "/u/ella/.conda/miniconda3/envs/bio/lib/python3.8/site-packages/clearml/binding/hydra_bind.py", line 170, in _patched_task_function
return task_function(a_config, *a_args, **a_kwargs)
File "main_train.py", line 264, in main
train(model=nn_model, dl_train=dl_train, dl_valid=dl_valid, **cfg.train)
File "main_train.py", line 248, in train
pl_trainer.fit(pl_module, dl_train, dl_valid, ckpt_path=None)
File "/u/ella/.conda/miniconda3/envs/bio/lib/python3.8/site-packages/pytorch_lightning/trainer/trainer.py", line 696, in fit
self._call_and_handle_interrupt(
File "/u/ella/.conda/miniconda3/envs/bio/lib/python3.8/site-packages/pytorch_lightning/trainer/trainer.py", line 650, in _call_and_handle_interrupt
return trainer_fn(*args, **kwargs)
File "/u/ella/.conda/miniconda3/envs/bio/lib/python3.8/site-packages/pytorch_lightning/trainer/trainer.py", line 735, in _fit_impl
results = self._run(model, ckpt_path=self.ckpt_path)
File "/u/ella/.conda/miniconda3/envs/bio/lib/python3.8/site-packages/pytorch_lightning/trainer/trainer.py", line 1166, in _run
results = self._run_stage()
File "/u/ella/.conda/miniconda3/envs/bio/lib/python3.8/site-packages/pytorch_lightning/trainer/trainer.py", line 1252, in _run_stage
return self._run_train()
File "/u/ella/.conda/miniconda3/envs/bio/lib/python3.8/site-packages/pytorch_lightning/trainer/trainer.py", line 1283, in _run_train
self.fit_loop.run()
File "/u/ella/.conda/miniconda3/envs/bio/lib/python3.8/site-packages/pytorch_lightning/loops/loop.py", line 200, in run
self.advance(*args, **kwargs)
File "/u/ella/.conda/miniconda3/envs/bio/lib/python3.8/site-packages/pytorch_lightning/loops/fit_loop.py", line 271, in advance
self._outputs = self.epoch_loop.run(self._data_fetcher)
File "/u/ella/.conda/miniconda3/envs/bio/lib/python3.8/site-packages/pytorch_lightning/loops/loop.py", line 201, in run
self.on_advance_end()
File "/u/ella/.conda/miniconda3/envs/bio/lib/python3.8/site-packages/pytorch_lightning/loops/epoch/training_epoch_loop.py", line 241, in on_advance_end
self._run_validation()
File "/u/ella/.conda/miniconda3/envs/bio/lib/python3.8/site-packages/pytorch_lightning/loops/epoch/training_epoch_loop.py", line 299, in _run_validation
self.val_loop.run()
File "/u/ella/.conda/miniconda3/envs/bio/lib/python3.8/site-packages/pytorch_lightning/loops/loop.py", line 200, in run
self.advance(*args, **kwargs)
File "/u/ella/.conda/miniconda3/envs/bio/lib/python3.8/site-packages/pytorch_lightning/loops/dataloader/evaluation_loop.py", line 155, in advance
dl_outputs = self.epoch_loop.run(self._data_fetcher, dl_max_batches, kwargs)
File "/u/ella/.conda/miniconda3/envs/bio/lib/python3.8/site-packages/pytorch_lightning/loops/loop.py", line 200, in run
self.advance(*args, **kwargs)
File "/u/ella/.conda/miniconda3/envs/bio/lib/python3.8/site-packages/pytorch_lightning/loops/epoch/evaluation_epoch_loop.py", line 143, in advance
output = self._evaluation_step(**kwargs)
File "/u/ella/.conda/miniconda3/envs/bio/lib/python3.8/site-packages/pytorch_lightning/loops/epoch/evaluation_epoch_loop.py", line 240, in _evaluation_step
output = self.trainer._call_strategy_hook(hook_name, *kwargs.values())
File "/u/ella/.conda/miniconda3/envs/bio/lib/python3.8/site-packages/pytorch_lightning/trainer/trainer.py", line 1704, in _call_strategy_hook
output = fn(*args, **kwargs)
File "/u/ella/.conda/miniconda3/envs/bio/lib/python3.8/site-packages/pytorch_lightning/strategies/dp.py", line 139, in validation_step
return self.model(*args, **kwargs)
File "/u/ella/.conda/miniconda3/envs/bio/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1130, in _call_impl
return forward_call(*input, **kwargs)
File "/u/ella/.conda/miniconda3/envs/bio/lib/python3.8/site-packages/torch/nn/parallel/data_parallel.py", line 168, in forward
outputs = self.parallel_apply(replicas, inputs, kwargs)
File "/u/ella/.conda/miniconda3/envs/bio/lib/python3.8/site-packages/torch/nn/parallel/data_parallel.py", line 178, in parallel_apply
return parallel_apply(replicas, inputs, kwargs, self.device_ids[:len(replicas)])
File "/u/ella/.conda/miniconda3/envs/bio/lib/python3.8/site-packages/torch/nn/parallel/parallel_apply.py", line 86, in parallel_apply
output.reraise()
File "/u/ella/.conda/miniconda3/envs/bio/lib/python3.8/site-packages/torch/_utils.py", line 461, in reraise
raise exception
IndexError: Caught IndexError in replica 0 on device 0.
Original Traceback (most recent call last):
File "/u/ella/.conda/miniconda3/envs/bio/lib/python3.8/site-packages/torch/nn/parallel/parallel_apply.py", line 61, in _worker
output = module(*input, **kwargs)
File "/u/ella/.conda/miniconda3/envs/bio/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1130, in _call_impl
return forward_call(*input, **kwargs)
File "/u/ella/.conda/miniconda3/envs/bio/lib/python3.8/site-packages/pytorch_lightning/overrides/data_parallel.py", line 65, in forward
output = super().forward(*inputs, **kwargs)
File "/u/ella/.conda/miniconda3/envs/bio/lib/python3.8/site-packages/pytorch_lightning/overrides/base.py", line 90, in forward
return self.module.validation_step(*inputs, **kwargs)
File "/u/ella/GitHub/fuse-med-ml/fuse/dl/lightning/pl_module.py", line 152, in validation_step
_ = step_losses(self._losses, batch_dict)
File "/u/ella/GitHub/fuse-med-ml/fuse/dl/lightning/pl_funcs.py", line 165, in step_losses
current_loss_result = loss_function(batch_dict)
File "/u/ella/.conda/miniconda3/envs/bio/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1130, in _call_impl
return forward_call(*input, **kwargs)
File "/u/ella/GitHub/fuse-med-ml/fuse/dl/losses/loss_default.py", line 101, in forward
batch_dict = self.preprocess_func(batch_dict)
File "main_train.py", line 32, in filter_gender_label_unknown
batch_dict = batch_dict.indices(
File "/u/ella/GitHub/fuse-med-ml/fuse/utils/ndict.py", line 281, in indices
new_value = [item for i, item in enumerate(value) if indices[i]]
File "/u/ella/GitHub/fuse-med-ml/fuse/utils/ndict.py", line 281, in
new_value = [item for i, item in enumerate(value) if indices[i]]
IndexError: index 64 is out of bounds for dimension 0 with size 64
Epoch 0: 97%|████████████████terminate called after throwing an instance of 'c10::CUDAError'4.00it/s, loss=nan, v_num=1]
what(): CUDA error: initialization error
CUDA kernel errors might be asynchronously reported at some other API call,so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1.
Exception raised from insert_events at ../c10/cuda/CUDACachingAllocator.cpp:1423 (most recent call first):
frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x42 (0x148752e52612 in /u/ella/.conda/miniconda3/envs/bio/lib/python3.8/site-packages/torch/lib/libc10.so)
frame #1: <unknown function> + 0x22900 (0x1487530c1900 in /u/ella/.conda/miniconda3/envs/bio/lib/python3.8/site-packages/torch/lib/libc10_cuda.so)
frame #2: c10::cuda::CUDACachingAllocator::raw_delete(void*) + 0x22d (0x1487530c4c4d in /u/ella/.conda/miniconda3/envs/bio/lib/python3.8/site-packages/torch/lib/libc10_cuda.so)
frame #3: <unknown function> + 0x339668 (0x14879c6da668 in /u/ella/.conda/miniconda3/envs/bio/lib/python3.8/site-packages/torch/lib/libtorch_python.so)
frame #4: c10::TensorImpl::release_resources() + 0x175 (0x148752e37295 in /u/ella/.conda/miniconda3/envs/bio/lib/python3.8/site-packages/torch/lib/libc10.so)
frame #5: <unknown function> + 0x214cfd (0x14879c5b5cfd in /u/ella/.conda/miniconda3/envs/bio/lib/python3.8/site-packages/torch/lib/libtorch_python.so)
frame #6: <unknown function> + 0x541188 (0x14879c8e2188 in /u/ella/.conda/miniconda3/envs/bio/lib/python3.8/site-packages/torch/lib/libtorch_python.so)
frame #7: THPVariable_subclass_dealloc(_object*) + 0x2b2 (0x14879c8e2482 in /u/ella/.conda/miniconda3/envs/bio/lib/python3.8/site-packages/torch/lib/libtorch_python.so)
Error executing job with overrides: []
Traceback (most recent call last):
File "/u/ella/.conda/miniconda3/envs/bio/lib/python3.8/site-packages/torch/utils/data/dataloader.py", line 1163, in _try_get_data
data = self._data_queue.get(timeout=timeout)
File "/u/ella/.conda/miniconda3/envs/bio/lib/python3.8/multiprocessing/queues.py", line 108, in get
raise Empty
_queue.Empty
The above exception was the direct cause of the following exception:
Traceback (most recent call last):
File "/u/ella/.conda/miniconda3/envs/bio/lib/python3.8/site-packages/clearml/binding/hydra_bind.py", line 170, in _patched_task_function
return task_function(a_config, *a_args, **a_kwargs)
File "main_train.py", line 264, in main
train(model=nn_model, dl_train=dl_train, dl_valid=dl_valid, **cfg.train)
File "main_train.py", line 248, in train
pl_trainer.fit(pl_module, dl_train, dl_valid, ckpt_path=None)
File "/u/ella/.conda/miniconda3/envs/bio/lib/python3.8/site-packages/pytorch_lightning/trainer/trainer.py", line 696, in fit
self._call_and_handle_interrupt(
File "/u/ella/.conda/miniconda3/envs/bio/lib/python3.8/site-packages/pytorch_lightning/trainer/trainer.py", line 650, in _call_and_handle_interrupt
return trainer_fn(*args, **kwargs)
File "/u/ella/.conda/miniconda3/envs/bio/lib/python3.8/site-packages/pytorch_lightning/trainer/trainer.py", line 735, in _fit_impl
results = self._run(model, ckpt_path=self.ckpt_path)
File "/u/ella/.conda/miniconda3/envs/bio/lib/python3.8/site-packages/pytorch_lightning/trainer/trainer.py", line 1166, in _run
results = self._run_stage()
File "/u/ella/.conda/miniconda3/envs/bio/lib/python3.8/site-packages/pytorch_lightning/trainer/trainer.py", line 1252, in _run_stage
return self._run_train()
File "/u/ella/.conda/miniconda3/envs/bio/lib/python3.8/site-packages/pytorch_lightning/trainer/trainer.py", line 1283, in _run_train
self.fit_loop.run()
File "/u/ella/.conda/miniconda3/envs/bio/lib/python3.8/site-packages/pytorch_lightning/loops/loop.py", line 200, in run
self.advance(*args, **kwargs)
File "/u/ella/.conda/miniconda3/envs/bio/lib/python3.8/site-packages/pytorch_lightning/loops/fit_loop.py", line 271, in advance
self._outputs = self.epoch_loop.run(self._data_fetcher)
File "/u/ella/.conda/miniconda3/envs/bio/lib/python3.8/site-packages/pytorch_lightning/loops/loop.py", line 201, in run
self.on_advance_end()
File "/u/ella/.conda/miniconda3/envs/bio/lib/python3.8/site-packages/pytorch_lightning/loops/epoch/training_epoch_loop.py", line 241, in on_advance_end
self._run_validation()
File "/u/ella/.conda/miniconda3/envs/bio/lib/python3.8/site-packages/pytorch_lightning/loops/epoch/training_epoch_loop.py", line 299, in _run_validation
self.val_loop.run()
File "/u/ella/.conda/miniconda3/envs/bio/lib/python3.8/site-packages/pytorch_lightning/loops/loop.py", line 200, in run
self.advance(*args, **kwargs)
File "/u/ella/.conda/miniconda3/envs/bio/lib/python3.8/site-packages/pytorch_lightning/loops/dataloader/evaluation_loop.py", line 155, in advance
dl_outputs = self.epoch_loop.run(self._data_fetcher, dl_max_batches, kwargs)
File "/u/ella/.conda/miniconda3/envs/bio/lib/python3.8/site-packages/pytorch_lightning/loops/loop.py", line 200, in run
self.advance(*args, **kwargs)
File "/u/ella/.conda/miniconda3/envs/bio/lib/python3.8/site-packages/pytorch_lightning/loops/epoch/evaluation_epoch_loop.py", line 127, in advance
batch = next(data_fetcher)
File "/u/ella/.conda/miniconda3/envs/bio/lib/python3.8/site-packages/pytorch_lightning/utilities/fetching.py", line 184, in __next__
return self.fetching_function()
File "/u/ella/.conda/miniconda3/envs/bio/lib/python3.8/site-packages/pytorch_lightning/utilities/fetching.py", line 263, in fetching_function
self._fetch_next_batch(self.dataloader_iter)
File "/u/ella/.conda/miniconda3/envs/bio/lib/python3.8/site-packages/pytorch_lightning/utilities/fetching.py", line 277, in _fetch_next_batch
batch = next(iterator)
File "/u/ella/.conda/miniconda3/envs/bio/lib/python3.8/site-packages/torch/utils/data/dataloader.py", line 681, in __next__
data = self._next_data()
File "/u/ella/.conda/miniconda3/envs/bio/lib/python3.8/site-packages/torch/utils/data/dataloader.py", line 1359, in _next_data
idx, data = self._get_data()
File "/u/ella/.conda/miniconda3/envs/bio/lib/python3.8/site-packages/torch/utils/data/dataloader.py", line 1325, in _get_data
success, data = self._try_get_data()
File "/u/ella/.conda/miniconda3/envs/bio/lib/python3.8/site-packages/torch/utils/data/dataloader.py", line 1176, in _try_get_data
raise RuntimeError('DataLoader worker (pid(s) {}) exited unexpectedly'.format(pids_str)) from e
RuntimeError: DataLoader worker (pid(s) 3406969) exited unexpectedly
```
**Additional context**\
Add any other context about the problem here.
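For additional context on the first failure: below is a minimal sketch of one way the `IndexError` above could arise under the "dp" strategy. It is a hypothesized reconstruction with assumed names and sizes (batch size, label encoding, the internals of `filter_gender_label_unknown`), not the actual FuseMedML code: `torch.nn.DataParallel` chunks tensors across the two replicas but hands each replica a full, unchunked copy of non-tensor fields such as the `data.sample_id` list, so a mask built from a chunked tensor no longer matches the list length when every key in the batch is filtered with that mask.

```python
# Hypothesized reconstruction (names and sizes assumed) of how a per-replica
# tensor and an unchunked list can end up with different lengths when
# Trainer(strategy="dp", devices=2) scatters one batch across two GPUs.
import torch

full_batch = 128                 # assumed DataLoader batch size
per_replica = full_batch // 2    # 2 GPUs under "dp" -> 64 samples per replica

# What a single replica would see after DataParallel's scatter:
sample_ids = [f"sample_{i}" for i in range(full_batch)]  # list: replicated in full (128 items)
gender_label = torch.randint(0, 3, (per_replica,))       # tensor: chunked (64 items)

# A preprocess step like filter_gender_label_unknown builds a keep-mask from the
# chunked label tensor (assume label value 2 stands for "unknown")...
keep = gender_label != 2                                  # BoolTensor of length 64

# ...and an NDict.indices-style pass applies that mask to every key, including the
# 128-item sample_id list, which indexes past the end of the 64-element mask:
filtered_ids = [item for i, item in enumerate(sample_ids) if keep[i]]
# -> IndexError: index 64 is out of bounds for dimension 0 with size 64
```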