dlrm_main/0 [0]: warnings.warn(
dlrm_main/0 [0]:
dlrm_main/0 [0]:Epoch 0: 0it [00:00, ?it/s]dlrm_main/0 [0]:
dlrm_main/0 [0]:Epoch 0: 0it [00:00, ?it/s]
dlrm_main/0 [0]:Traceback (most recent call last):
dlrm_main/0 [0]: File "/home/ubuntu/code/DLRM/dlrm/torchrec_dlrm/dlrm_main.py", line 550, in <module>
dlrm_main/0 [0]: main(sys.argv[1:])
dlrm_main/0 [0]: File "/home/ubuntu/code/DLRM/dlrm/torchrec_dlrm/dlrm_main.py", line 544, in main
dlrm_main/0 [0]: train_val_test(
dlrm_main/0 [0]: File "/home/ubuntu/code/DLRM/dlrm/torchrec_dlrm/dlrm_main.py", line 405, in train_val_test
dlrm_main/0 [0]: _train(
dlrm_main/0 [0]: File "/home/ubuntu/code/DLRM/dlrm/torchrec_dlrm/dlrm_main.py", line 344, in _train
dlrm_main/0 [0]: train_pipeline.progress(combined_iterator)
dlrm_main/0 [0]: File "/home/ubuntu/anaconda3/envs/myenv/lib/python3.10/site-packages/torchrec/distributed/train_pipeline.py", line 499, in progress
dlrm_main/0 [0]: self._connect(dataloader_iter)
dlrm_main/0 [0]: File "/home/ubuntu/anaconda3/envs/myenv/lib/python3.10/site-packages/torchrec/distributed/train_pipeline.py", line 475, in _connect
dlrm_main/0 [0]: model(self._batch_i)
dlrm_main/0 [0]: File "/home/ubuntu/anaconda3/envs/myenv/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1129, in _call_impl
dlrm_main/0 [0]: return forward_call(*input, **kwargs)
dlrm_main/0 [0]: File "/home/ubuntu/anaconda3/envs/myenv/lib/python3.10/site-packages/torchrec/distributed/model_parallel.py", line 248, in forward
dlrm_main/0 [0]: return self._dmp_wrapped_module(*args, **kwargs)
dlrm_main/0 [0]: File "/home/ubuntu/anaconda3/envs/myenv/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1129, in _call_impl
dlrm_main/0 [0]: return forward_call(*input, **kwargs)
dlrm_main/0 [0]: File "/home/ubuntu/anaconda3/envs/myenv/lib/python3.10/site-packages/torch/nn/parallel/distributed.py", line 1007, in forward
dlrm_main/0 [0]: output = self._run_ddp_forward(*inputs, **kwargs)
dlrm_main/0 [0]: File "/home/ubuntu/anaconda3/envs/myenv/lib/python3.10/site-packages/torch/nn/parallel/distributed.py", line 968, in _run_ddp_forward
dlrm_main/0 [0]: return module_to_run(*inputs[0], **kwargs[0])
dlrm_main/0 [0]: File "/home/ubuntu/anaconda3/envs/myenv/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1129, in _call_impl
dlrm_main/0 [0]: return forward_call(*input, **kwargs)
dlrm_main/0 [0]: File "/home/ubuntu/code/DLRM/dlrm/torchrec_dlrm/modules/dlrm_train.py", line 79, in forward
dlrm_main/0 [0]: logits = self.model(batch.dense_features, batch.sparse_features)
dlrm_main/0 [0]: File "/home/ubuntu/anaconda3/envs/myenv/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1129, in _call_impl
dlrm_main/0 [0]: return forward_call(*input, **kwargs)
dlrm_main/0 [0]: File "/home/ubuntu/anaconda3/envs/myenv/lib/python3.10/site-packages/torchrec/models/dlrm.py", line 398, in forward
dlrm_main/0 [0]: embedded_sparse = self.sparse_arch(sparse_features)
dlrm_main/0 [0]: File "/home/ubuntu/anaconda3/envs/myenv/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1129, in _call_impl
dlrm_main/0 [0]: return forward_call(*input, **kwargs)
dlrm_main/0 [0]: File "/home/ubuntu/anaconda3/envs/myenv/lib/python3.10/site-packages/torchrec/models/dlrm.py", line 97, in forward
dlrm_main/0 [0]: sparse_features: KeyedTensor = self.embedding_bag_collection(features)
dlrm_main/0 [0]: File "/home/ubuntu/anaconda3/envs/myenv/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1129, in _call_impl
dlrm_main/0 [0]: return forward_call(*input, **kwargs)
dlrm_main/0 [0]: File "/home/ubuntu/anaconda3/envs/myenv/lib/python3.10/site-packages/torchrec/distributed/types.py", line 422, in forward
dlrm_main/0 [0]: return self.compute_and_output_dist(ctx, dist_input)
dlrm_main/0 [0]: File "/home/ubuntu/anaconda3/envs/myenv/lib/python3.10/site-packages/torchrec/distributed/embeddingbag.py", line 392, in compute_and_output_dist
dlrm_main/0 [0]: awaitables=[
dlrm_main/0 [0]: File "/home/ubuntu/anaconda3/envs/myenv/lib/python3.10/site-packages/torchrec/distributed/embeddingbag.py", line 393, in <listcomp>
dlrm_main/0 [0]: dist(lookup(features))
dlrm_main/0 [0]: File "/home/ubuntu/anaconda3/envs/myenv/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1129, in _call_impl
dlrm_main/0 [0]: return forward_call(*input, **kwargs)
dlrm_main/0 [0]: File "/home/ubuntu/anaconda3/envs/myenv/lib/python3.10/site-packages/torchrec/distributed/embedding_lookup.py", line 312, in forward
dlrm_main/0 [0]: embeddings.append(emb_op(features))
dlrm_main/0 [0]: File "/home/ubuntu/anaconda3/envs/myenv/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1129, in _call_impl
dlrm_main/0 [0]: return forward_call(*input, **kwargs)
dlrm_main/0 [0]: File "/home/ubuntu/anaconda3/envs/myenv/lib/python3.10/site-packages/torchrec/distributed/batched_embedding_kernel.py", line 552, in forward
dlrm_main/0 [0]: offsets=features.offsets().long(),
dlrm_main/0 [0]: File "/home/ubuntu/anaconda3/envs/myenv/lib/python3.10/site-packages/torchrec/sparse/jagged_tensor.py", line 797, in offsets
dlrm_main/0 [0]: _offsets = _maybe_compute_offsets(self._lengths, self._offsets)
dlrm_main/0 [0]: File "/home/ubuntu/anaconda3/envs/myenv/lib/python3.10/site-packages/torchrec/sparse/jagged_tensor.py", line 57, in _maybe_compute_offsets
dlrm_main/0 [0]: offsets = _to_offsets(lengths)
dlrm_main/0 [0]: File "/home/ubuntu/anaconda3/envs/myenv/lib/python3.10/site-packages/torchrec/sparse/jagged_tensor.py", line 36, in _to_offsets
dlrm_main/0 [0]: return torch.ops.fbgemm.asynchronous_complete_cumsum(lengths)
dlrm_main/0 [0]: File "/home/ubuntu/anaconda3/envs/myenv/lib/python3.10/site-packages/torch/_ops.py", line 142, in __call__
dlrm_main/0 [0]: return self._op(*args, **kwargs or {})
dlrm_main/0 [0]:NotImplementedError: Could not run 'fbgemm::asynchronous_complete_cumsum' with arguments from the 'CUDA' backend. This could be because the operator doesn't exist for this backend, or was omitted during the selective/custom build process (if using custom build). If you are a Facebook employee using PyTorch on mobile, please visit https://fburl.com/ptmfixes for possible resolutions. 'fbgemm::asynchronous_complete_cumsum' is only available for these backends: [Dense, UNKNOWN_TENSOR_TYPE_ID, UNKNOWN_TENSOR_TYPE_ID, UNKNOWN_TENSOR_TYPE_ID, UNKNOWN_TENSOR_TYPE_ID, UNKNOWN_TENSOR_TYPE_ID, UNKNOWN_TENSOR_TYPE_ID, SparseCPU, SparseCUDA, SparseHIP, UNKNOWN_TENSOR_TYPE_ID, UNKNOWN_TENSOR_TYPE_ID, SparseXPU, SparseVE, UNKNOWN_TENSOR_TYPE_ID, UNKNOWN_TENSOR_TYPE_ID, UNKNOWN_TENSOR_TYPE_ID, UNKNOWN_TENSOR_TYPE_ID, UNKNOWN_TENSOR_TYPE_ID, UNKNOWN_TENSOR_TYPE_ID, UNKNOWN_TENSOR_TYPE_ID, UNKNOWN_TENSOR_TYPE_ID, UNKNOWN_TENSOR_TYPE_ID].
My environment has torch version 1.12.0.dev20220420 and CUDA version 11.3. OS is Ubuntu 18.04. I'm not a Facebook employee, so the link does not work. I also wonder why there are so many "UNKNOWN_TENSOR_TYPE_ID" entries in the error message?
I was running
torchx run -s local_cwd dist.ddp -j 1x1 --script dlrm_main.py
as a test (from https://github.com/facebookresearch/dlrm/tree/main/torchrec_dlrm) and I saw this error.