scverse / scvi-tools

Deep probabilistic analysis of single-cell and spatial omics data
http://scvi-tools.org/
BSD 3-Clause "New" or "Revised" License

torch.distributed.DistBackendError when training on multiple GPUs #2635

Open CytoLiorG opened 3 months ago

CytoLiorG commented 3 months ago

scvi crashes when trying to train on multiple GPUs (2x Tesla P100-PCIE-16GB)

As an attempt to work around https://github.com/Lightning-AI/pytorch-lightning/issues/17212, strategy='ddp_find_unused_parameters_true' was set.

from typing import Dict

import pandas as pd
import scanpy as sc
import scvi
import torch
from anndata import AnnData

# generate_seed_labels and get_train_ratio are project-specific helpers defined elsewhere.
def annotate(adata: AnnData, geneset: Dict, out_dir: str = 'out', epochs: int = None, visualize: bool = False, save: bool = False, random_seed=None) -> pd.Series:
    # Normalize a copy of the data to derive seed labels.
    normalized = adata.copy()
    sc.pp.normalize_total(normalized, target_sum=1e4)
    sc.pp.log1p(normalized)
    normalized = normalized[:, geneset['gene_subset']].copy()
    sc.pp.scale(normalized)
    adata.obs["seed_labels"] = generate_seed_labels(normalized, geneset['cell_geneset'])
    base_model_train_ratio, transfer_model_train_ratio = get_train_ratio(adata, unconstrained_train_ratio=0.9)
    torch.set_float32_matmul_precision("medium")
    scvi.model.SCVI.setup_anndata(adata, batch_key=None, labels_key="seed_labels")
    if random_seed is not None:
        scvi.settings.seed = random_seed

    # Base model: SCVI trained on all available GPUs with DDP.
    scvi_model = scvi.model.SCVI(adata, n_latent=30, n_layers=2)
    scvi_model.train(max_epochs=epochs, train_size=base_model_train_ratio, accelerator='gpu', devices=-1, strategy='ddp_find_unused_parameters_true')

    # Transfer model: SCANVI initialized from the SCVI model (this is the step that crashes).
    scanvi_model = scvi.model.SCANVI.from_scvi_model(scvi_model, 'unknown')
    print('Training transfer model:')
    scanvi_model.train(max_epochs=epochs, train_size=transfer_model_train_ratio, accelerator='gpu', devices=-1, strategy='ddp_find_unused_parameters_true')

Training transfer model:

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]

Training:   0%|          | 0/100 [00:00<?, ?it/s]
Epoch 1/100:   0%|          | 0/100 [00:00<?, ?it/s]/usr/local/lib/python3.10/dist-packages/lightning/pytorch/trainer/connectors/data_connector.py:441: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=7` in the `DataLoader` to improve performance.

Training:   0%|          | 0/100 [00:00<?, ?it/s]
Epoch 1/100:   0%|          | 0/100 [00:00<?, ?it/s]/usr/local/lib/python3.10/dist-packages/scvi/module/_scanvae.py:304: UserWarning: The value argument must be within the support of the distribution
  reconst_loss = -px.log_prob(x).sum(-1)
/usr/local/lib/python3.10/dist-packages/scvi/module/_scanvae.py:304: UserWarning: The value argument must be within the support of the distribution
  reconst_loss = -px.log_prob(x).sum(-1)
/usr/local/lib/python3.10/dist-packages/scvi/module/_scanvae.py:304: UserWarning: The value argument must be within the support of the distribution
  reconst_loss = -px.log_prob(x).sum(-1)

Epoch 1/100:   1%|          | 1/100 [00:15<26:13, 15.89s/it]/usr/local/lib/python3.10/dist-packages/lightning/pytorch/trainer/connectors/logger_connector/result.py:433: It is recommended to use `self.log('train_loss', ..., sync_dist=True)` when logging on epoch level in distributed setting to accumulate the metric across devices.

Epoch 1/100:   1%|          | 1/100 [00:18<30:16, 18.35s/it][rank1]:[W socket.cpp:432] [c10d] While waitForInput, poolFD failed with (errno: 0 - Success).
Traceback (most recent call last):
  File "/work/demo.py", line 8, in <module>
    run(args)
  File "/work/src/main.py", line 40, in run
    pred = annotate(
  File "/work/src/annotate.py", line 181, in annotate
    scanvi_model.train(
  File "/usr/local/lib/python3.10/dist-packages/scvi/model/_scanvi.py", line 438, in train
    return runner()
  File "/usr/local/lib/python3.10/dist-packages/scvi/train/_trainrunner.py", line 98, in __call__
    self.trainer.fit(self.training_plan, self.data_splitter)
  File "/usr/local/lib/python3.10/dist-packages/scvi/train/_trainer.py", line 219, in fit
    super().fit(*args, **kwargs)
  File "/usr/local/lib/python3.10/dist-packages/lightning/pytorch/trainer/trainer.py", line 544, in fit
    call._call_and_handle_interrupt(
  File "/usr/local/lib/python3.10/dist-packages/lightning/pytorch/trainer/call.py", line 44, in _call_and_handle_interrupt
    return trainer_fn(*args, **kwargs)
  File "/usr/local/lib/python3.10/dist-packages/lightning/pytorch/trainer/trainer.py", line 580, in _fit_impl
    self._run(model, ckpt_path=ckpt_path)
  File "/usr/local/lib/python3.10/dist-packages/lightning/pytorch/trainer/trainer.py", line 989, in _run
    results = self._run_stage()
  File "/usr/local/lib/python3.10/dist-packages/lightning/pytorch/trainer/trainer.py", line 1035, in _run_stage
    self.fit_loop.run()
  File "/usr/local/lib/python3.10/dist-packages/lightning/pytorch/loops/fit_loop.py", line 203, in run
    self.on_advance_end()
  File "/usr/local/lib/python3.10/dist-packages/lightning/pytorch/loops/fit_loop.py", line 372, in on_advance_end
    call._call_callback_hooks(trainer, "on_train_epoch_end", monitoring_callbacks=False)
  File "/usr/local/lib/python3.10/dist-packages/lightning/pytorch/trainer/call.py", line 208, in _call_callback_hooks
    fn(trainer, trainer.lightning_module, *args, **kwargs)
  File "/usr/local/lib/python3.10/dist-packages/scvi/train/_progress.py", line 85, in on_train_epoch_end
    self.main_progress_bar.set_postfix(self.get_metrics(trainer, pl_module))
  File "/usr/local/lib/python3.10/dist-packages/lightning/pytorch/callbacks/progress/progress_bar.py", line 195, in get_metrics
    pbar_metrics = trainer.progress_bar_metrics
  File "/usr/local/lib/python3.10/dist-packages/lightning/pytorch/trainer/trainer.py", line 1653, in progress_bar_metrics
    return self._logger_connector.progress_bar_metrics
  File "/usr/local/lib/python3.10/dist-packages/lightning/pytorch/trainer/connectors/logger_connector/logger_connector.py", line 245, in progress_bar_metrics
    metrics = self.metrics["pbar"]
  File "/usr/local/lib/python3.10/dist-packages/lightning/pytorch/trainer/connectors/logger_connector/logger_connector.py", line 226, in metrics
    return self.trainer._results.metrics(on_step)
  File "/usr/local/lib/python3.10/dist-packages/lightning/pytorch/trainer/connectors/logger_connector/result.py", line 475, in metrics
    value = self._get_cache(result_metric, on_step)
  File "/usr/local/lib/python3.10/dist-packages/lightning/pytorch/trainer/connectors/logger_connector/result.py", line 439, in _get_cache
    result_metric.compute()
  File "/usr/local/lib/python3.10/dist-packages/lightning/pytorch/trainer/connectors/logger_connector/result.py", line 284, in wrapped_func
    self._computed = compute(*args, **kwargs)
  File "/usr/local/lib/python3.10/dist-packages/lightning/pytorch/trainer/connectors/logger_connector/result.py", line 252, in compute
    return self.value.compute()
  File "/usr/local/lib/python3.10/dist-packages/torchmetrics/metric.py", line 1135, in compute
    val_a = self.metric_a.compute() if isinstance(self.metric_a, Metric) else self.metric_a
  File "/usr/local/lib/python3.10/dist-packages/torchmetrics/metric.py", line 1135, in compute
    val_a = self.metric_a.compute() if isinstance(self.metric_a, Metric) else self.metric_a
  File "/usr/local/lib/python3.10/dist-packages/torchmetrics/metric.py", line 610, in wrapped_func
    with self.sync_context(
  File "/usr/lib/python3.10/contextlib.py", line 135, in __enter__
    return next(self.gen)
  File "/usr/local/lib/python3.10/dist-packages/torchmetrics/metric.py", line 581, in sync_context
    self.sync(
  File "/usr/local/lib/python3.10/dist-packages/torchmetrics/metric.py", line 530, in sync
    self._sync_dist(dist_sync_fn, process_group=process_group)
  File "/usr/local/lib/python3.10/dist-packages/torchmetrics/metric.py", line 434, in _sync_dist
    output_dict = apply_to_collection(
  File "/usr/local/lib/python3.10/dist-packages/lightning_utilities/core/apply_func.py", line 70, in apply_to_collection
    return {k: function(v, *args, **kwargs) for k, v in data.items()}
  File "/usr/local/lib/python3.10/dist-packages/lightning_utilities/core/apply_func.py", line 70, in <dictcomp>
    return {k: function(v, *args, **kwargs) for k, v in data.items()}
  File "/usr/local/lib/python3.10/dist-packages/torchmetrics/utilities/distributed.py", line 122, in gather_all_tensors
    return _simple_gather_all_tensors(result, group, world_size)
  File "/usr/local/lib/python3.10/dist-packages/torchmetrics/utilities/distributed.py", line 93, in _simple_gather_all_tensors
    torch.distributed.all_gather(gathered_result, result, group)
  File "/usr/local/lib/python3.10/dist-packages/torch/distributed/c10d_logger.py", line 72, in wrapper
    return func(*args, **kwargs)
  File "/usr/local/lib/python3.10/dist-packages/torch/distributed/distributed_c10d.py", line 2617, in all_gather
    work = group.allgather([tensor_list], [tensor])
torch.distributed.DistBackendError: [1] is setting up NCCL communicator and retrieving ncclUniqueId from [0] via c10d key-value store by key '1', but store->get('1') got error: Socket Timeout
Exception raised from doWait at ../torch/csrc/distributed/c10d/TCPStore.cpp:550 (most recent call first):
frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fab06181d87 in /usr/local/lib/python3.10/dist-packages/torch/lib/libc10.so)
frame #1: <unknown function> + 0x15c0e57 (0x7faaee93fe57 in /usr/local/lib/python3.10/dist-packages/torch/lib/libtorch_cpu.so)
frame #2: c10d::TCPStore::doGet(std::string const&) + 0x32 (0x7faaf2c0ece2 in /usr/local/lib/python3.10/dist-packages/torch/lib/libtorch_cpu.so)
frame #3: c10d::TCPStore::get(std::string const&) + 0xa1 (0x7faaf2c0fb11 in /usr/local/lib/python3.10/dist-packages/torch/lib/libtorch_cpu.so)
frame #4: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7faaf2bc4f81 in /usr/local/lib/python3.10/dist-packages/torch/lib/libtorch_cpu.so)
frame #5: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7faaf2bc4f81 in /usr/local/lib/python3.10/dist-packages/torch/lib/libtorch_cpu.so)
frame #6: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7faaf2bc4f81 in /usr/local/lib/python3.10/dist-packages/torch/lib/libtorch_cpu.so)
frame #7: c10d::ProcessGroupNCCL::broadcastUniqueNCCLID(ncclUniqueId*, bool, std::string const&, int) + 0xa9 (0x7faabbe02c69 in /usr/local/lib/python3.10/dist-packages/torch/lib/libtorch_cuda.so)
frame #8: c10d::ProcessGroupNCCL::getNCCLComm(std::string const&, std::vector<c10::Device, std::allocator<c10::Device> > const&, c10d::OpType, int, bool) + 0x22b (0x7faabbe09c5b in /usr/local/lib/python3.10/dist-packages/torch/lib/libtorch_cuda.so)
frame #9: c10d::ProcessGroupNCCL::allgather(std::vector<std::vector<at::Tensor, std::allocator<at::Tensor> >, std::allocator<std::vector<at::Tensor, std::allocator<at::Tensor> > > >&, std::vector<at::Tensor, std::allocator<at::Tensor> >&, c10d::AllgatherOptions const&) + 0xb5c (0x7faabbe2005c in /usr/local/lib/python3.10/dist-packages/torch/lib/libtorch_cuda.so)
frame #10: <unknown function> + 0x583a31d (0x7faaf2bb931d in /usr/local/lib/python3.10/dist-packages/torch/lib/libtorch_cpu.so)
frame #11: <unknown function> + 0x5844218 (0x7faaf2bc3218 in /usr/local/lib/python3.10/dist-packages/torch/lib/libtorch_cpu.so)
frame #12: <unknown function> + 0x4e893cc (0x7faaf22083cc in /usr/local/lib/python3.10/dist-packages/torch/lib/libtorch_cpu.so)
frame #13: <unknown function> + 0x1a08a88 (0x7faaeed87a88 in /usr/local/lib/python3.10/dist-packages/torch/lib/libtorch_cpu.so)
frame #14: <unknown function> + 0x584ba33 (0x7faaf2bcaa33 in /usr/local/lib/python3.10/dist-packages/torch/lib/libtorch_cpu.so)
frame #15: <unknown function> + 0x5856e1f (0x7faaf2bd5e1f in /usr/local/lib/python3.10/dist-packages/torch/lib/libtorch_cpu.so)
frame #16: <unknown function> + 0xca3fae (0x7fab0545afae in /usr/local/lib/python3.10/dist-packages/torch/lib/libtorch_python.so)
frame #17: <unknown function> + 0x413ea4 (0x7fab04bcaea4 in /usr/local/lib/python3.10/dist-packages/torch/lib/libtorch_python.so)
frame #18: <unknown function> + 0x15a10e (0x59d15614610e in /usr/bin/python3)
frame #19: _PyObject_MakeTpCall + 0x25b (0x59d15613ca7b in /usr/bin/python3)
frame #20: <unknown function> + 0x168acb (0x59d156154acb in /usr/bin/python3)
frame #21: _PyEval_EvalFrameDefault + 0x614a (0x59d156134cfa in /usr/bin/python3)
frame #22: _PyFunction_Vectorcall + 0x7c (0x59d1561469fc in /usr/bin/python3)
frame #23: _PyEval_EvalFrameDefault + 0x2a27 (0x59d1561315d7 in /usr/bin/python3)
frame #24: _PyFunction_Vectorcall + 0x7c (0x59d1561469fc in /usr/bin/python3)
frame #25: _PyEval_EvalFrameDefault + 0x614a (0x59d156134cfa in /usr/bin/python3)
frame #26: _PyFunction_Vectorcall + 0x7c (0x59d1561469fc in /usr/bin/python3)
frame #27: _PyEval_EvalFrameDefault + 0x6bd (0x59d15612f26d in /usr/bin/python3)
frame #28: _PyFunction_Vectorcall + 0x7c (0x59d1561469fc in /usr/bin/python3)
frame #29: PyObject_Call + 0x122 (0x59d156155492 in /usr/bin/python3)
frame #30: _PyEval_EvalFrameDefault + 0x2a27 (0x59d1561315d7 in /usr/bin/python3)
frame #31: _PyFunction_Vectorcall + 0x7c (0x59d1561469fc in /usr/bin/python3)
frame #32: _PyEval_EvalFrameDefault + 0x6bd (0x59d15612f26d in /usr/bin/python3)
frame #33: _PyFunction_Vectorcall + 0x7c (0x59d1561469fc in /usr/bin/python3)
frame #34: _PyEval_EvalFrameDefault + 0x198c (0x59d15613053c in /usr/bin/python3)
frame #35: <unknown function> + 0x1687f1 (0x59d1561547f1 in /usr/bin/python3)
frame #36: _PyEval_EvalFrameDefault + 0x198c (0x59d15613053c in /usr/bin/python3)
frame #37: <unknown function> + 0x1687f1 (0x59d1561547f1 in /usr/bin/python3)
frame #38: _PyEval_EvalFrameDefault + 0x198c (0x59d15613053c in /usr/bin/python3)
frame #39: <unknown function> + 0x200175 (0x59d1561ec175 in /usr/bin/python3)
frame #40: <unknown function> + 0x15ac59 (0x59d156146c59 in /usr/bin/python3)
frame #41: _PyEval_EvalFrameDefault + 0x6bd (0x59d15612f26d in /usr/bin/python3)
frame #42: <unknown function> + 0x168a51 (0x59d156154a51 in /usr/bin/python3)
frame #43: _PyEval_EvalFrameDefault + 0x266d (0x59d15613121d in /usr/bin/python3)
frame #44: _PyFunction_Vectorcall + 0x7c (0x59d1561469fc in /usr/bin/python3)
frame #45: _PyEval_EvalFrameDefault + 0x614a (0x59d156134cfa in /usr/bin/python3)
frame #46: <unknown function> + 0x1687f1 (0x59d1561547f1 in /usr/bin/python3)
frame #47: _PyEval_EvalFrameDefault + 0x614a (0x59d156134cfa in /usr/bin/python3)
frame #48: <unknown function> + 0x1687f1 (0x59d1561547f1 in /usr/bin/python3)
frame #49: _PyEval_EvalFrameDefault + 0x614a (0x59d156134cfa in /usr/bin/python3)
frame #50: <unknown function> + 0x168a51 (0x59d156154a51 in /usr/bin/python3)
frame #51: _PyEval_EvalFrameDefault + 0x2a27 (0x59d1561315d7 in /usr/bin/python3)
frame #52: _PyFunction_Vectorcall + 0x7c (0x59d1561469fc in /usr/bin/python3)
frame #53: _PyEval_EvalFrameDefault + 0x614a (0x59d156134cfa in /usr/bin/python3)
frame #54: _PyFunction_Vectorcall + 0x7c (0x59d1561469fc in /usr/bin/python3)
frame #55: _PyEval_EvalFrameDefault + 0x614a (0x59d156134cfa in /usr/bin/python3)
frame #56: _PyFunction_Vectorcall + 0x7c (0x59d1561469fc in /usr/bin/python3)
frame #57: _PyEval_EvalFrameDefault + 0x8ac (0x59d15612f45c in /usr/bin/python3)
frame #58: <unknown function> + 0x16600e (0x59d15615200e in /usr/bin/python3)
frame #59: _PyObject_GenericGetAttrWithDict + 0x468 (0x59d1561448a8 in /usr/bin/python3)
frame #60: PyObject_GetAttr + 0x4d (0x59d156142e3d in /usr/bin/python3)
frame #61: _PyEval_EvalFrameDefault + 0x5dc1 (0x59d156134971 in /usr/bin/python3)
frame #62: <unknown function> + 0x16600e (0x59d15615200e in /usr/bin/python3)
frame #63: _PyObject_GenericGetAttrWithDict + 0x468 (0x59d1561448a8 in /usr/bin/python3)
. This may indicate a possible application crash on rank 0 or a network set up issue.

Epoch 1/100:   1%|          | 1/100 [30:19<50:01:35, 1819.15s/it]
[rank0]:[E ProcessGroupNCCL.cpp:523] [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=115017, OpType=ALLGATHER, NumelIn=1, NumelOut=2, Timeout(ms)=1800000) ran for 1800859 milliseconds before timing out.
[rank0]:[E ProcessGroupNCCL.cpp:537] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data.
[rank0]:[E ProcessGroupNCCL.cpp:543] To avoid data inconsistency, we are taking the entire process down.
[rank0]:[E ProcessGroupNCCL.cpp:1182] [Rank 0] NCCL watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=115017, OpType=ALLGATHER, NumelIn=1, NumelOut=2, Timeout(ms)=1800000) ran for 1800859 milliseconds before timing out.
Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:525 (most recent call first):
frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7d37dfb81d87 in /usr/local/lib/python3.10/dist-packages/torch/lib/libc10.so)
frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional<std::chrono::duration<long, std::ratio<1l, 1000l> > >) + 0x1e6 (0x7d37958026e6 in /usr/local/lib/python3.10/dist-packages/torch/lib/libtorch_cuda.so)
frame #2: c10d::ProcessGroupNCCL::workCleanupLoop() + 0x19d (0x7d3795805c3d in /usr/local/lib/python3.10/dist-packages/torch/lib/libtorch_cuda.so)
frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x119 (0x7d3795806839 in /usr/local/lib/python3.10/dist-packages/torch/lib/libtorch_cuda.so)
frame #4: <unknown function> + 0xdc253 (0x7d38071b9253 in /lib/x86_64-linux-gnu/libstdc++.so.6)
frame #5: <unknown function> + 0x94ac3 (0x7d381d459ac3 in /lib/x86_64-linux-gnu/libc.so.6)
frame #6: <unknown function> + 0x126a40 (0x7d381d4eba40 in /lib/x86_64-linux-gnu/libc.so.6)

terminate called after throwing an instance of 'c10::DistBackendError'
  what():  [Rank 0] NCCL watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=115017, OpType=ALLGATHER, NumelIn=1, NumelOut=2, Timeout(ms)=1800000) ran for 1800859 milliseconds before timing out.
Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:525 (most recent call first):
frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7d37dfb81d87 in /usr/local/lib/python3.10/dist-packages/torch/lib/libc10.so)
frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional<std::chrono::duration<long, std::ratio<1l, 1000l> > >) + 0x1e6 (0x7d37958026e6 in /usr/local/lib/python3.10/dist-packages/torch/lib/libtorch_cuda.so)
frame #2: c10d::ProcessGroupNCCL::workCleanupLoop() + 0x19d (0x7d3795805c3d in /usr/local/lib/python3.10/dist-packages/torch/lib/libtorch_cuda.so)
frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x119 (0x7d3795806839 in /usr/local/lib/python3.10/dist-packages/torch/lib/libtorch_cuda.so)
frame #4: <unknown function> + 0xdc253 (0x7d38071b9253 in /lib/x86_64-linux-gnu/libstdc++.so.6)
frame #5: <unknown function> + 0x94ac3 (0x7d381d459ac3 in /lib/x86_64-linux-gnu/libc.so.6)
frame #6: <unknown function> + 0x126a40 (0x7d381d4eba40 in /lib/x86_64-linux-gnu/libc.so.6)

Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1186 (most recent call first):
frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7d37dfb81d87 in /usr/local/lib/python3.10/dist-packages/torch/lib/libc10.so)
frame #1: <unknown function> + 0xdf6b11 (0x7d379555cb11 in /usr/local/lib/python3.10/dist-packages/torch/lib/libtorch_cuda.so)
frame #2: <unknown function> + 0xdc253 (0x7d38071b9253 in /lib/x86_64-linux-gnu/libstdc++.so.6)
frame #3: <unknown function> + 0x94ac3 (0x7d381d459ac3 in /lib/x86_64-linux-gnu/libc.so.6)
frame #4: <unknown function> + 0x126a40 (0x7d381d4eba40 in /lib/x86_64-linux-gnu/libc.so.6)

Aborted (core dumped)

Versions:

agonencyto commented 2 weeks ago

Any update?

martinkim0 commented 2 weeks ago

It looks like the issue is referring to a timeout in the NCCL library. Could you paste the installed versions of PyTorch, Lightning, and NCCL? Also, are any training steps being run or is it crashing right from the start (I see that it doesn't go past the first training epoch)?
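
A minimal sketch for gathering those versions, assuming torch, lightning, and scvi are importable in the environment that runs the training:

import torch
import lightning
import scvi

# Versions of the libraries relevant to this report.
print("torch:", torch.__version__)
print("lightning:", lightning.__version__)
print("scvi-tools:", scvi.__version__)
# NCCL version bundled with the installed PyTorch build.
print("NCCL:", torch.cuda.nccl.version())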

martinkim0 commented 2 weeks ago

OK, looking at the traceback further, it looks like there's an issue with syncing losses to display them in the progress bar. Could you try passing enable_progress_bar=False to scanvi_model.train(...)?
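
For example, a sketch of the transfer-model call from the report above with the progress bar disabled (the other arguments are unchanged):

# Same call as in the original snippet, with the progress bar disabled as suggested.
scanvi_model.train(
    max_epochs=epochs,
    train_size=transfer_model_train_ratio,
    accelerator='gpu',
    devices=-1,
    strategy='ddp_find_unused_parameters_true',
    enable_progress_bar=False,
)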

CytoLiorG commented 2 weeks ago

Here is the log with enable_progress_bar=False:

2024-06-13 09:03:53.504676: I external/xla/xla/stream_executor/cuda/cuda_executor.cc:991] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2024-06-13 09:03:53.509353: I external/xla/xla/stream_executor/cuda/cuda_executor.cc:991] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2024-06-13 09:03:53.510154: I external/xla/xla/service/service.cc:145] XLA service 0x5a219133c0d0 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
2024-06-13 09:03:53.510223: I external/xla/xla/service/service.cc:153]   StreamExecutor device (0): Tesla P100-PCIE-16GB, Compute Capability 6.0
2024-06-13 09:03:53.510250: I external/xla/xla/service/service.cc:153]   StreamExecutor device (1): Tesla P100-PCIE-16GB, Compute Capability 6.0
2024-06-13 09:03:53.516528: I external/xla/xla/pjrt/gpu/se_gpu_pjrt_client.cc:738] Using BFC allocator.
2024-06-13 09:03:53.516599: I external/xla/xla/pjrt/gpu/gpu_helpers.cc:105] XLA backend allocating 12794658816 bytes on device 0 for BFCAllocator.
2024-06-13 09:03:53.516649: I external/xla/xla/pjrt/gpu/gpu_helpers.cc:105] XLA backend allocating 12794658816 bytes on device 1 for BFCAllocator.
2024-06-13 09:03:53.516838: I external/xla/xla/stream_executor/cuda/cuda_executor.cc:991] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2024-06-13 09:03:53.517388: I external/xla/xla/stream_executor/cuda/cuda_executor.cc:991] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
/usr/local/lib/python3.10/dist-packages/scvi/data/fields/_base_field.py:64: UserWarning: adata.X does not contain unnormalized count data. Are you sure this is what you want?
  self.validate_field(adata)

Training base model:

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
Initializing distributed: GLOBAL_RANK: 0, MEMBER: 1/2
2024-06-13 09:08:55.251742: I external/xla/xla/stream_executor/cuda/cuda_executor.cc:991] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2024-06-13 09:08:55.256383: I external/xla/xla/stream_executor/cuda/cuda_executor.cc:991] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2024-06-13 09:08:55.257143: I external/xla/xla/service/service.cc:145] XLA service 0x59f238331a40 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
2024-06-13 09:08:55.257186: I external/xla/xla/service/service.cc:153]   StreamExecutor device (0): Tesla P100-PCIE-16GB, Compute Capability 6.0
2024-06-13 09:08:55.257199: I external/xla/xla/service/service.cc:153]   StreamExecutor device (1): Tesla P100-PCIE-16GB, Compute Capability 6.0
2024-06-13 09:08:55.258777: I external/xla/xla/pjrt/gpu/se_gpu_pjrt_client.cc:738] Using BFC allocator.
2024-06-13 09:08:55.258847: I external/xla/xla/pjrt/gpu/gpu_helpers.cc:105] XLA backend allocating 12794658816 bytes on device 0 for BFCAllocator.
2024-06-13 09:08:55.258973: I external/xla/xla/pjrt/gpu/gpu_helpers.cc:105] XLA backend allocating 12794658816 bytes on device 1 for BFCAllocator.
2024-06-13 09:08:55.259192: I external/xla/xla/stream_executor/cuda/cuda_executor.cc:991] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2024-06-13 09:08:55.259678: I external/xla/xla/stream_executor/cuda/cuda_executor.cc:991] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
/usr/local/lib/python3.10/dist-packages/scvi/data/fields/_base_field.py:64: UserWarning: adata.X does not contain unnormalized count data. Are you sure this is what you want?
  self.validate_field(adata)

Training base model:

Initializing distributed: GLOBAL_RANK: 1, MEMBER: 2/2
----------------------------------------------------------------------------------------------------
distributed_backend=nccl
All distributed processes registered. Starting with 2 processes
----------------------------------------------------------------------------------------------------

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]
LOCAL_RANK: 1 - CUDA_VISIBLE_DEVICES: [0,1]
/usr/local/lib/python3.10/dist-packages/lightning/pytorch/trainer/connectors/data_connector.py:441: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=15` in the `DataLoader` to improve performance.

Training:   0%|          | 0/100 [00:00<?, ?it/s]
Epoch 1/100:   0%|          | 0/100 [00:00<?, ?it/s]/usr/local/lib/python3.10/dist-packages/scvi/module/_vae.py:458: UserWarning: The value argument must be within the support of the distribution
  reconst_loss = -generative_outputs["px"].log_prob(x).sum(-1)
/usr/local/lib/python3.10/dist-packages/scvi/module/_vae.py:458: UserWarning: The value argument must be within the support of the distribution
  reconst_loss = -generative_outputs["px"].log_prob(x).sum(-1)
/usr/local/lib/python3.10/dist-packages/scvi/distributions/_negative_binomial.py:69: UserWarning: Specified kernel cache directory could not be created! This disables kernel caching. Specified directory is /root/.cache/torch/kernels. This warning will appear only once per process. (Triggered internally at ../aten/src/ATen/native/cuda/jit_utils.cpp:1442.)
  + torch.lgamma(x + theta)
/usr/local/lib/python3.10/dist-packages/scvi/module/_vae.py:458: UserWarning: The value argument must be within the support of the distribution
  reconst_loss = -generative_outputs["px"].log_prob(x).sum(-1)

Epoch 1/100:   1%|          | 1/100 [00:07<11:58,  7.26s/it]
Epoch 1/100:   1%|          | 1/100 [00:07<11:58,  7.26s/it, v_num=1, train_loss_step=5.37e+3, train_loss_epoch=5.84e+3]
Epoch 2/100:   1%|          | 1/100 [00:07<11:58,  7.26s/it, v_num=1, train_loss_step=5.37e+3, train_loss_epoch=5.84e+3]/usr/local/lib/python3.10/dist-packages/scvi/module/_vae.py:458: UserWarning: The value argument must be within the support of the distribution
  reconst_loss = -generative_outputs["px"].log_prob(x).sum(-1)
/usr/local/lib/python3.10/dist-packages/scvi/module/_vae.py:458: UserWarning: The value argument must be within the support of the distribution
  reconst_loss = -generative_outputs["px"].log_prob(x).sum(-1)

Epoch 2/100:   2%|▏         | 2/100 [00:12<09:29,  5.81s/it, v_num=1, train_loss_step=5.37e+3, train_loss_epoch=5.84e+3]

[...]

Epoch 98/100:  98%|█████████▊| 98/100 [07:59<00:09,  4.88s/it, v_num=1, train_loss_step=4.48e+3, train_loss_epoch=4.41e+3]
Epoch 98/100:  98%|█████████▊| 98/100 [07:59<00:09,  4.88s/it, v_num=1, train_loss_step=4.03e+3, train_loss_epoch=4.4e+3] 
Epoch 99/100:  98%|█████████▊| 98/100 [07:59<00:09,  4.88s/it, v_num=1, train_loss_step=4.03e+3, train_loss_epoch=4.4e+3]/usr/local/lib/python3.10/dist-packages/scvi/module/_vae.py:458: UserWarning: The value argument must be within the support of the distribution
  reconst_loss = -generative_outputs["px"].log_prob(x).sum(-1)
/usr/local/lib/python3.10/dist-packages/scvi/module/_vae.py:458: UserWarning: The value argument must be within the support of the distribution
  reconst_loss = -generative_outputs["px"].log_prob(x).sum(-1)

Epoch 99/100:  99%|█████████▉| 99/100 [08:04<00:04,  4.89s/it, v_num=1, train_loss_step=4.03e+3, train_loss_epoch=4.4e+3]
Epoch 99/100:  99%|█████████▉| 99/100 [08:04<00:04,  4.89s/it, v_num=1, train_loss_step=4.33e+3, train_loss_epoch=4.38e+3]
Epoch 100/100:  99%|█████████▉| 99/100 [08:04<00:04,  4.89s/it, v_num=1, train_loss_step=4.33e+3, train_loss_epoch=4.38e+3]/usr/local/lib/python3.10/dist-packages/scvi/module/_vae.py:458: UserWarning: The value argument must be within the support of the distribution
  reconst_loss = -generative_outputs["px"].log_prob(x).sum(-1)
/usr/local/lib/python3.10/dist-packages/scvi/module/_vae.py:458: UserWarning: The value argument must be within the support of the distribution
  reconst_loss = -generative_outputs["px"].log_prob(x).sum(-1)

Epoch 100/100: 100%|██████████| 100/100 [08:09<00:00,  4.87s/it, v_num=1, train_loss_step=4.33e+3, train_loss_epoch=4.38e+3]
Epoch 100/100: 100%|██████████| 100/100 [08:09<00:00,  4.87s/it, v_num=1, train_loss_step=4.2e+3, train_loss_epoch=4.39e+3] `Trainer.fit` stopped: `max_epochs=100` reached.

Epoch 100/100: 100%|██████████| 100/100 [08:09<00:00,  4.89s/it, v_num=1, train_loss_step=4.2e+3, train_loss_epoch=4.39e+3]
/usr/local/lib/python3.10/dist-packages/scvi/data/fields/_base_field.py:64: UserWarning: adata.X does not contain unnormalized count data. Are you sure this is what you want?
  self.validate_field(adata)
/usr/local/lib/python3.10/dist-packages/scvi/data/fields/_base_field.py:64: UserWarning: adata.X does not contain unnormalized count data. Are you sure this is what you want?
  self.validate_field(adata)

Training transfer model:

Training transfer model:

INFO     Training for 100 epochs.                                               
INFO     Training for 100 epochs.                                               
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]
/usr/local/lib/python3.10/dist-packages/lightning/pytorch/trainer/connectors/data_connector.py:441: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=31` in the `DataLoader` to improve performance.
/usr/local/lib/python3.10/dist-packages/scvi/module/_scanvae.py:304: UserWarning: The value argument must be within the support of the distribution
  reconst_loss = -px.log_prob(x).sum(-1)
/usr/local/lib/python3.10/dist-packages/scvi/module/_scanvae.py:304: UserWarning: The value argument must be within the support of the distribution
  reconst_loss = -px.log_prob(x).sum(-1)
/usr/local/lib/python3.10/dist-packages/scvi/module/_scanvae.py:304: UserWarning: The value argument must be within the support of the distribution
  reconst_loss = -px.log_prob(x).sum(-1)
/usr/local/lib/python3.10/dist-packages/lightning/pytorch/trainer/connectors/logger_connector/result.py:433: It is recommended to use `self.log('train_loss', ..., sync_dist=True)` when logging on epoch level in distributed setting to accumulate the metric across devices.
[rank0]:[E ProcessGroupNCCL.cpp:523] [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=115017, OpType=ALLGATHER, NumelIn=1, NumelOut=2, Timeout(ms)=1800000) ran for 1800068 milliseconds before timing out.
[rank0]:[E ProcessGroupNCCL.cpp:537] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data.
[rank0]:[E ProcessGroupNCCL.cpp:543] To avoid data inconsistency, we are taking the entire process down.
[rank0]:[E ProcessGroupNCCL.cpp:1182] [Rank 0] NCCL watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=115017, OpType=ALLGATHER, NumelIn=1, NumelOut=2, Timeout(ms)=1800000) ran for 1800068 milliseconds before timing out.
Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:525 (most recent call first):
frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7b7d508f4d87 in /usr/local/lib/python3.10/dist-packages/torch/lib/libc10.so)
frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional<std::chrono::duration<long, std::ratio<1l, 1000l> > >) + 0x1e6 (0x7b7d024026e6 in /usr/local/lib/python3.10/dist-packages/torch/lib/libtorch_cuda.so)
frame #2: c10d::ProcessGroupNCCL::workCleanupLoop() + 0x19d (0x7b7d02405c3d in /usr/local/lib/python3.10/dist-packages/torch/lib/libtorch_cuda.so)
frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x119 (0x7b7d02406839 in /usr/local/lib/python3.10/dist-packages/torch/lib/libtorch_cuda.so)
frame #4: <unknown function> + 0xdc253 (0x7b7dafd48253 in /lib/x86_64-linux-gnu/libstdc++.so.6)
frame #5: <unknown function> + 0x94ac3 (0x7b7e01ffdac3 in /lib/x86_64-linux-gnu/libc.so.6)
frame #6: <unknown function> + 0x126a40 (0x7b7e0208fa40 in /lib/x86_64-linux-gnu/libc.so.6)

terminate called after throwing an instance of 'c10::DistBackendError'
  what():  [Rank 0] NCCL watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=115017, OpType=ALLGATHER, NumelIn=1, NumelOut=2, Timeout(ms)=1800000) ran for 1800068 milliseconds before timing out.
Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:525 (most recent call first):
frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7b7d508f4d87 in /usr/local/lib/python3.10/dist-packages/torch/lib/libc10.so)
frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional<std::chrono::duration<long, std::ratio<1l, 1000l> > >) + 0x1e6 (0x7b7d024026e6 in /usr/local/lib/python3.10/dist-packages/torch/lib/libtorch_cuda.so)
frame #2: c10d::ProcessGroupNCCL::workCleanupLoop() + 0x19d (0x7b7d02405c3d in /usr/local/lib/python3.10/dist-packages/torch/lib/libtorch_cuda.so)
frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x119 (0x7b7d02406839 in /usr/local/lib/python3.10/dist-packages/torch/lib/libtorch_cuda.so)
frame #4: <unknown function> + 0xdc253 (0x7b7dafd48253 in /lib/x86_64-linux-gnu/libstdc++.so.6)
frame #5: <unknown function> + 0x94ac3 (0x7b7e01ffdac3 in /lib/x86_64-linux-gnu/libc.so.6)
frame #6: <unknown function> + 0x126a40 (0x7b7e0208fa40 in /lib/x86_64-linux-gnu/libc.so.6)

Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1186 (most recent call first):
frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7b7d508f4d87 in /usr/local/lib/python3.10/dist-packages/torch/lib/libc10.so)
frame #1: <unknown function> + 0xdf6b11 (0x7b7d0215cb11 in /usr/local/lib/python3.10/dist-packages/torch/lib/libtorch_cuda.so)
frame #2: <unknown function> + 0xdc253 (0x7b7dafd48253 in /lib/x86_64-linux-gnu/libstdc++.so.6)
frame #3: <unknown function> + 0x94ac3 (0x7b7e01ffdac3 in /lib/x86_64-linux-gnu/libc.so.6)
frame #4: <unknown function> + 0x126a40 (0x7b7e0208fa40 in /lib/x86_64-linux-gnu/libc.so.6)

[rank1]:[W socket.cpp:432] [c10d] While waitForInput, poolFD failed with (errno: 0 - Success).
Traceback (most recent call last):
  File "/work/[demo.py](http://demo.py/)", line 8, in <module>
    run(args)
  File "/work/src/[main.py](http://main.py/)", line 40, in run
    pred = annotate(
  File "/work/src/[annotate.py](http://annotate.py/)", line 181, in annotate
    scanvi_model.train(
  File "/usr/local/lib/python3.10/dist-packages/scvi/model/_[scanvi.py](http://scanvi.py/)", line 438, in train
    return runner()
  File "/usr/local/lib/python3.10/dist-packages/scvi/train/_[trainrunner.py](http://trainrunner.py/)", line 98, in __call__
    [self.trainer.fit](http://self.trainer.fit/)([self.training](http://self.training/)_plan, [self.data](http://self.data/)_splitter)
  File "/usr/local/lib/python3.10/dist-packages/scvi/train/_[trainer.py](http://trainer.py/)", line 219, in fit
    super().fit(*args, **kwargs)
  File "/usr/local/lib/python3.10/dist-packages/lightning/pytorch/trainer/[trainer.py](http://trainer.py/)", line 544, in fit
    call._call_and_handle_interrupt(
  File "/usr/local/lib/python3.10/dist-packages/lightning/pytorch/trainer/[call.py](http://call.py/)", line 44, in _call_and_handle_interrupt
    return trainer_fn(*args, **kwargs)
  File "/usr/local/lib/python3.10/dist-packages/lightning/pytorch/trainer/[trainer.py](http://trainer.py/)", line 580, in _fit_impl
    self._run(model, ckpt_path=ckpt_path)
  File "/usr/local/lib/python3.10/dist-packages/lightning/pytorch/trainer/[trainer.py](http://trainer.py/)", line 989, in _run
    results = self._run_stage()
  File "/usr/local/lib/python3.10/dist-packages/lightning/pytorch/trainer/[trainer.py](http://trainer.py/)", line 1035, in _run_stage
    [self.fit](http://self.fit/)_[loop.run](http://loop.run/)()
  File "/usr/local/lib/python3.10/dist-packages/lightning/pytorch/loops/fit_[loop.py](http://loop.py/)", line 203, in run
    self.on_advance_end()
  File "/usr/local/lib/python3.10/dist-packages/lightning/pytorch/loops/fit_[loop.py](http://loop.py/)", line 376, in on_advance_end
    trainer._logger_connector.on_epoch_end()
  File "/usr/local/lib/python3.10/dist-packages/lightning/pytorch/trainer/connectors/logger_connector/logger_[connector.py](http://connector.py/)", line 187, in on_epoch_end
    metrics = self.metrics
  File "/usr/local/lib/python3.10/dist-packages/lightning/pytorch/trainer/connectors/logger_connector/logger_[connector.py](http://connector.py/)", line 226, in metrics
    return self.trainer._results.metrics(on_step)
  File "/usr/local/lib/python3.10/dist-packages/lightning/pytorch/trainer/connectors/logger_connector/[result.py](http://result.py/)", line 475, in metrics
    value = self._get_cache(result_metric, on_step)
  File "/usr/local/lib/python3.10/dist-packages/lightning/pytorch/trainer/connectors/logger_connector/[result.py](http://result.py/)", line 439, in _get_cache
    result_metric.compute()
  File "/usr/local/lib/python3.10/dist-packages/lightning/pytorch/trainer/connectors/logger_connector/[result.py](http://result.py/)", line 284, in wrapped_func
    self._computed = compute(*args, **kwargs)
  File "/usr/local/lib/python3.10/dist-packages/lightning/pytorch/trainer/connectors/logger_connector/[result.py](http://result.py/)", line 252, in compute
    return self.value.compute()
  File "/usr/local/lib/python3.10/dist-packages/torchmetrics/[metric.py](http://metric.py/)", line 1135, in compute
    val_a = self.metric_a.compute() if isinstance(self.metric_a, Metric) else self.metric_a
  File "/usr/local/lib/python3.10/dist-packages/torchmetrics/[metric.py](http://metric.py/)", line 1135, in compute
    val_a = self.metric_a.compute() if isinstance(self.metric_a, Metric) else self.metric_a
  File "/usr/local/lib/python3.10/dist-packages/torchmetrics/[metric.py](http://metric.py/)", line 610, in wrapped_func
    with self.sync_context(
  File "/usr/lib/python3.10/[contextlib.py](http://contextlib.py/)", line 135, in __enter__
    return next(self.gen)
  File "/usr/local/lib/python3.10/dist-packages/torchmetrics/[metric.py](http://metric.py/)", line 581, in sync_context
    self.sync(
  File "/usr/local/lib/python3.10/dist-packages/torchmetrics/[metric.py](http://metric.py/)", line 530, in sync
    self._sync_dist(dist_sync_fn, process_group=process_group)
  File "/usr/local/lib/python3.10/dist-packages/torchmetrics/[metric.py](http://metric.py/)", line 434, in _sync_dist
    output_dict = apply_to_collection(
  File "/usr/local/lib/python3.10/dist-packages/lightning_utilities/core/apply_[func.py](http://func.py/)", line 70, in apply_to_collection
    return {k: function(v, *args, **kwargs) for k, v in data.items()}
  File "/usr/local/lib/python3.10/dist-packages/lightning_utilities/core/apply_[func.py](http://func.py/)", line 70, in <dictcomp>
    return {k: function(v, *args, **kwargs) for k, v in data.items()}
  File "/usr/local/lib/python3.10/dist-packages/torchmetrics/utilities/[distributed.py](http://distributed.py/)", line 122, in gather_all_tensors
    return _simple_gather_all_tensors(result, group, world_size)
  File "/usr/local/lib/python3.10/dist-packages/torchmetrics/utilities/[distributed.py](http://distributed.py/)", line 93, in _simple_gather_all_tensors
    torch.distributed.all_gather(gathered_result, result, group)
  File "/usr/local/lib/python3.10/dist-packages/torch/distributed/c10d_[logger.py](http://logger.py/)", line 72, in wrapper
    return func(*args, **kwargs)
  File "/usr/local/lib/python3.10/dist-packages/torch/distributed/distributed_[c10d.py](http://c10d.py/)", line 2617, in all_gather
    work = group.allgather([tensor_list], [tensor])
torch.distributed.DistBackendError: [1] is setting up NCCL communicator and retrieving ncclUniqueId from [0] via c10d key-value store by key '1', but store->get('1') got error: Connection reset by peer
Exception raised from recvBytes at ../torch/csrc/distributed/c10d/Utils.hpp:667 (most recent call first):
frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x787d33d50d87 in /usr/local/lib/python3.10/dist-packages/torch/lib/libc10.so)
frame #1: <unknown function> + 0x589525c (0x787d2041425c in /usr/local/lib/python3.10/dist-packages/torch/lib/libtorch_cpu.so)
frame #2: c10d::TCPStore::doWait(c10::ArrayRef<std::string>, std::chrono::duration<long, std::ratio<1l, 1000l> >) + 0x2ee (0x787d2040e92e in /usr/local/lib/python3.10/dist-packages/torch/lib/libtorch_cpu.so)
frame #3: c10d::TCPStore::doGet(std::string const&) + 0x32 (0x787d2040ece2 in /usr/local/lib/python3.10/dist-packages/torch/lib/libtorch_cpu.so)
frame #4: c10d::TCPStore::get(std::string const&) + 0xa1 (0x787d2040fb11 in /usr/local/lib/python3.10/dist-packages/torch/lib/libtorch_cpu.so)
frame #5: c10d::PrefixStore::get(std::string const&) + 0x31 (0x787d203c4f81 in /usr/local/lib/python3.10/dist-packages/torch/lib/libtorch_cpu.so)
frame #6: c10d::PrefixStore::get(std::string const&) + 0x31 (0x787d203c4f81 in /usr/local/lib/python3.10/dist-packages/torch/lib/libtorch_cpu.so)
frame #7: c10d::PrefixStore::get(std::string const&) + 0x31 (0x787d203c4f81 in /usr/local/lib/python3.10/dist-packages/torch/lib/libtorch_cpu.so)
frame #8: c10d::ProcessGroupNCCL::broadcastUniqueNCCLID(ncclUniqueId*, bool, std::string const&, int) + 0xa9 (0x787ce9602c69 in /usr/local/lib/python3.10/dist-packages/torch/lib/libtorch_cuda.so)
frame #9: c10d::ProcessGroupNCCL::getNCCLComm(std::string const&, std::vector<c10::Device, std::allocator<c10::Device> > const&, c10d::OpType, int, bool) + 0x22b (0x787ce9609c5b in /usr/local/lib/python3.10/dist-packages/torch/lib/libtorch_cuda.so)
frame #10: c10d::ProcessGroupNCCL::allgather(std::vector<std::vector<at::Tensor, std::allocator<at::Tensor> >, std::allocator<std::vector<at::Tensor, std::allocator<at::Tensor> > > >&, std::vector<at::Tensor, std::allocator<at::Tensor> >&, c10d::AllgatherOptions const&) + 0xb5c (0x787ce962005c in /usr/local/lib/python3.10/dist-packages/torch/lib/libtorch_cuda.so)
frame #11: <unknown function> + 0x583a31d (0x787d203b931d in /usr/local/lib/python3.10/dist-packages/torch/lib/libtorch_cpu.so)
frame #12: <unknown function> + 0x5844218 (0x787d203c3218 in /usr/local/lib/python3.10/dist-packages/torch/lib/libtorch_cpu.so)
frame #13: <unknown function> + 0x4e893cc (0x787d1fa083cc in /usr/local/lib/python3.10/dist-packages/torch/lib/libtorch_cpu.so)
frame #14: <unknown function> + 0x1a08a88 (0x787d1c587a88 in /usr/local/lib/python3.10/dist-packages/torch/lib/libtorch_cpu.so)
frame #15: <unknown function> + 0x584ba33 (0x787d203caa33 in /usr/local/lib/python3.10/dist-packages/torch/lib/libtorch_cpu.so)
frame #16: <unknown function> + 0x5856e1f (0x787d203d5e1f in /usr/local/lib/python3.10/dist-packages/torch/lib/libtorch_cpu.so)
frame #17: <unknown function> + 0xca3fae (0x787d32c5afae in /usr/local/lib/python3.10/dist-packages/torch/lib/libtorch_python.so)
frame #18: <unknown function> + 0x413ea4 (0x787d323caea4 in /usr/local/lib/python3.10/dist-packages/torch/lib/libtorch_python.so)
frame #19: <unknown function> + 0x15a10e (0x59f22582b10e in /usr/bin/python3)
frame #20: _PyObject_MakeTpCall + 0x25b (0x59f225821a7b in /usr/bin/python3)
frame #21: <unknown function> + 0x168acb (0x59f225839acb in /usr/bin/python3)
frame #22: _PyEval_EvalFrameDefault + 0x614a (0x59f225819cfa in /usr/bin/python3)
frame #23: _PyFunction_Vectorcall + 0x7c (0x59f22582b9fc in /usr/bin/python3)
frame #24: _PyEval_EvalFrameDefault + 0x2a27 (0x59f2258165d7 in /usr/bin/python3)
frame #25: _PyFunction_Vectorcall + 0x7c (0x59f22582b9fc in /usr/bin/python3)
frame #26: _PyEval_EvalFrameDefault + 0x614a (0x59f225819cfa in /usr/bin/python3)
frame #27: _PyFunction_Vectorcall + 0x7c (0x59f22582b9fc in /usr/bin/python3)
frame #28: _PyEval_EvalFrameDefault + 0x6bd (0x59f22581426d in /usr/bin/python3)
frame #29: _PyFunction_Vectorcall + 0x7c (0x59f22582b9fc in /usr/bin/python3)
frame #30: PyObject_Call + 0x122 (0x59f22583a492 in /usr/bin/python3)
frame #31: _PyEval_EvalFrameDefault + 0x2a27 (0x59f2258165d7 in /usr/bin/python3)
frame #32: _PyFunction_Vectorcall + 0x7c (0x59f22582b9fc in /usr/bin/python3)
frame #33: _PyEval_EvalFrameDefault + 0x6bd (0x59f22581426d in /usr/bin/python3)
frame #34: _PyFunction_Vectorcall + 0x7c (0x59f22582b9fc in /usr/bin/python3)
frame #35: _PyEval_EvalFrameDefault + 0x198c (0x59f22581553c in /usr/bin/python3)
frame #36: <unknown function> + 0x1687f1 (0x59f2258397f1 in /usr/bin/python3)
frame #37: _PyEval_EvalFrameDefault + 0x198c (0x59f22581553c in /usr/bin/python3)
frame #38: <unknown function> + 0x1687f1 (0x59f2258397f1 in /usr/bin/python3)
frame #39: _PyEval_EvalFrameDefault + 0x198c (0x59f22581553c in /usr/bin/python3)
frame #40: <unknown function> + 0x200175 (0x59f2258d1175 in /usr/bin/python3)
frame #41: <unknown function> + 0x15ac59 (0x59f22582bc59 in /usr/bin/python3)
frame #42: _PyEval_EvalFrameDefault + 0x6bd (0x59f22581426d in /usr/bin/python3)
frame #43: <unknown function> + 0x168a51 (0x59f225839a51 in /usr/bin/python3)
frame #44: _PyEval_EvalFrameDefault + 0x266d (0x59f22581621d in /usr/bin/python3)
frame #45: _PyFunction_Vectorcall + 0x7c (0x59f22582b9fc in /usr/bin/python3)
frame #46: _PyEval_EvalFrameDefault + 0x614a (0x59f225819cfa in /usr/bin/python3)
frame #47: <unknown function> + 0x1687f1 (0x59f2258397f1 in /usr/bin/python3)
frame #48: _PyEval_EvalFrameDefault + 0x614a (0x59f225819cfa in /usr/bin/python3)
frame #49: <unknown function> + 0x1687f1 (0x59f2258397f1 in /usr/bin/python3)
frame #50: _PyEval_EvalFrameDefault + 0x614a (0x59f225819cfa in /usr/bin/python3)
frame #51: <unknown function> + 0x168a51 (0x59f225839a51 in /usr/bin/python3)
frame #52: _PyEval_EvalFrameDefault + 0x2a27 (0x59f2258165d7 in /usr/bin/python3)
frame #53: _PyFunction_Vectorcall + 0x7c (0x59f22582b9fc in /usr/bin/python3)
frame #54: _PyEval_EvalFrameDefault + 0x614a (0x59f225819cfa in /usr/bin/python3)
frame #55: _PyFunction_Vectorcall + 0x7c (0x59f22582b9fc in /usr/bin/python3)
frame #56: _PyEval_EvalFrameDefault + 0x614a (0x59f225819cfa in /usr/bin/python3)
frame #57: _PyFunction_Vectorcall + 0x7c (0x59f22582b9fc in /usr/bin/python3)
frame #58: _PyEval_EvalFrameDefault + 0x8ac (0x59f22581445c in /usr/bin/python3)
frame #59: <unknown function> + 0x16600e (0x59f22583700e in /usr/bin/python3)
frame #60: _PyObject_GenericGetAttrWithDict + 0x468 (0x59f2258298a8 in /usr/bin/python3)
frame #61: PyObject_GetAttr + 0x4d (0x59f225827e3d in /usr/bin/python3)
frame #62: _PyEval_EvalFrameDefault + 0x5dc1 (0x59f225819971 in /usr/bin/python3)
frame #63: _PyFunction_Vectorcall + 0x7c (0x59f22582b9fc in /usr/bin/python3)
. This may indicate a possible application crash on rank 0 or a network set up issue.
Aborted (core dumped)
CytoLiorG commented 2 weeks ago

> It looks like the issue is referring to a timeout in the NCCL library. Could you paste the installed versions of PyTorch, Lightning, and NCCL? Also, are any training steps being run or is it crashing right from the start (I see that it doesn't go past the first training epoch)?

installed versions:

lightning - 2.1.4
pytorch-lightning - 2.2.1
nvidia-nccl-cu12 - 2.19.3
torch - 2.2.1

Yes, please see the (more complete) log attached above: the base model training succeeds; it's the subsequent transfer (SCANVI) training that crashes.