I am trying to re-run the benchmark in results/v0.5.0/nvidia/submission/code/object_detection/pytorch/ and am getting a CUDA error. Does anyone have any hints? Thanks.
Machine: Intel Xeon with 8 x V100-SXM2
OS: CentOS Linux release 7.5.1804 (Core)
NVIDIA driver 410.79
CUDA 10.0
nvidia-docker2 2.0.3-1.docker18.06.1.ce
Commands that were run:
cd /bfs/hpc_cluster/work/mlperf/src/results/v0.5.0/nvidia/submission/code/object_detection/pytorch
DATADIR=/bfs/hpc_cluster/work/mlperf/src/results/v0.5.0/nvidia/submission/code/object_detection/detectron/lib/datasets/data/coco LOGDIR=/bfs/hpc_cluster/work/mlperf/src/results/v0.5.0/nvidia/submission/code/object_detection/logs DGXSYSTEM=DGX2 ./run.sub
Log:
THCudaCheck FAIL file=torch/csrc/cuda/Module.cpp line=34 error=10 : invalid device ordinal
THCudaCheck FAIL file=torch/csrc/cuda/Module.cpp line=34 error=10 : invalid device ordinal
THCudaCheck FAIL file=torch/csrc/cuda/Module.cpp line=34 error=10 : invalid device ordinal
THCudaCheck FAIL file=torch/csrc/cuda/Module.cpp line=34 error=10 : invalid device ordinal
THCudaCheck FAIL file=torch/csrc/cuda/Module.cpp line=34 error=10 : invalid device ordinal
THCudaCheck FAIL file=torch/csrc/cuda/Module.cpp line=34 error=10 : invalid device ordinal
THCudaCheck FAIL file=torch/csrc/cuda/Module.cpp line=34 error=10 : invalid device ordinal
THCudaCheck FAIL file=torch/csrc/cuda/Module.cpp line=34 error=10 : invalid device ordinal
Traceback (most recent call last):
File "tools/train_net.py", line 328, in
Traceback (most recent call last):
File "tools/train_net.py", line 328, in
Traceback (most recent call last):
Traceback (most recent call last):
File "tools/train_net.py", line 328, in
File "tools/train_net.py", line 328, in
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
File "tools/train_net.py", line 328, in
File "tools/train_net.py", line 328, in
File "tools/train_net.py", line 328, in
main()
File "tools/train_net.py", line 239, in main
main()
torch.cuda.set_device(args.local_rank)
main()
File "/opt/conda/lib/python3.6/site-packages/torch/cuda/init.py", line 264, in set_device
File "tools/train_net.py", line 239, in main
File "tools/train_net.py", line 239, in main
main()
File "tools/train_net.py", line 239, in main
torch.cuda.set_device(args.local_rank)
File "/opt/conda/lib/python3.6/site-packages/torch/cuda/init.py", line 264, in set_device
torch.cuda.set_device(args.local_rank)
File "/opt/conda/lib/python3.6/site-packages/torch/cuda/init.py", line 264, in set_device
torch.cuda.set_device(args.local_rank)
torch._C._cuda_setDevice(device)
File "/opt/conda/lib/python3.6/site-packages/torch/cuda/init.py", line 264, in set_device
RuntimeError: cuda runtime error (10) : invalid device ordinal at torch/csrc/cuda/Module.cpp:34
main()
File "tools/train_net.py", line 239, in main
main()
File "tools/train_net.py", line 239, in main
torch._C._cuda_setDevice(device)
main()
torch._C._cuda_setDevice(device)
File "tools/train_net.py", line 239, in main
RuntimeError: cuda runtime error (10) : invalid device ordinal at torch/csrc/cuda/Module.cpp:34
RuntimeError: cuda runtime error (10) : invalid device ordinal at torch/csrc/cuda/Module.cpp:34
torch._C._cuda_setDevice(device)
RuntimeError: cuda runtime error (10) : invalid device ordinal at torch/csrc/cuda/Module.cpp:34
torch.cuda.set_device(args.local_rank)
File "/opt/conda/lib/python3.6/site-packages/torch/cuda/init.py", line 264, in set_device
torch.cuda.set_device(args.local_rank)
File "/opt/conda/lib/python3.6/site-packages/torch/cuda/init.py", line 264, in set_device
torch.cuda.set_device(args.local_rank)
File "/opt/conda/lib/python3.6/site-packages/torch/cuda/init.py", line 264, in set_device
torch._C._cuda_setDevice(device)
torch._C._cuda_setDevice(device)
RuntimeError: cuda runtime error (10) : invalid device ordinal at torch/csrc/cuda/Module.cpp:34
torch._C._cuda_setDevice(device)
RuntimeError: cuda runtime error (10) : invalid device ordinal at torch/csrc/cuda/Module.cpp:34
RuntimeError: cuda runtime error (10) : invalid device ordinal at torch/csrc/cuda/Module.cpp:34
Traceback (most recent call last):
File "tools/train_net.py", line 328, in <module>
main()
File "tools/train_net.py", line 239, in main
torch.cuda.set_device(args.local_rank)
File "/opt/conda/lib/python3.6/site-packages/torch/cuda/__init__.py", line 264, in set_device
torch._C._cuda_setDevice(device)
RuntimeError: cuda runtime error (10) : invalid device ordinal at torch/csrc/cuda/Module.cpp:34
Hello,
I am trying to re-run the benchmark in results/v0.5.0/nvidia/submission/code/object_detection/pytorch/ and am getting a CUDA error. Does anyone have any hints? Thanks.
Machine: Intel Xeon with 8 x V100-SXM2 OS: CentOS Linux release 7.5.1804 (Core) nVidia driver 410.79 Cuda 10.0 nvidia-docker2 2.0.3-1.docker18.06.1.ce
Commands that were run: cd /bfs/hpc_cluster/work/mlperf/src/results/v0.5.0/nvidia/submission/code/object_detection/pytorch DATADIR=/bfs/hpc_cluster/work/mlperf/src/results/v0.5.0/nvidia/submission/code/object_detection/detectron/lib/datasets/data/coco LOGDIR=/bfs/hpc_cluster/work/mlperf/src/results/v0.5.0/nvidia/submission/code/object_detection/logs DGXSYSTEM=DGX2 ./run.sub
Log: THCudaCheck FAIL file=torch/csrc/cuda/Module.cpp line=34 error=10 : invalid device ordinal THCudaCheck FAIL file=torch/csrc/cuda/Module.cpp line=34 error=10 : invalid device ordinal THCudaCheck FAIL file=torch/csrc/cuda/Module.cpp line=34 error=10 : invalid device ordinal THCudaCheck FAIL file=torch/csrc/cuda/Module.cpp line=34 error=10 : invalid device ordinal THCudaCheck FAIL file=torch/csrc/cuda/Module.cpp line=34 error=10 : invalid device ordinal THCudaCheck FAIL file=torch/csrc/cuda/Module.cpp line=34 error=10 : invalid device ordinal THCudaCheck FAIL file=torch/csrc/cuda/Module.cpp line=34 error=10 : invalid device ordinal THCudaCheck FAIL file=torch/csrc/cuda/Module.cpp line=34 error=10 : invalid device ordinal Traceback (most recent call last): File "tools/train_net.py", line 328, in
Traceback (most recent call last):
File "tools/train_net.py", line 328, in
Traceback (most recent call last):
Traceback (most recent call last):
File "tools/train_net.py", line 328, in
File "tools/train_net.py", line 328, in
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
File "tools/train_net.py", line 328, in
File "tools/train_net.py", line 328, in
File "tools/train_net.py", line 328, in
main()
File "tools/train_net.py", line 239, in main
main()
torch.cuda.set_device(args.local_rank)
main()
File "/opt/conda/lib/python3.6/site-packages/torch/cuda/init.py", line 264, in set_device
File "tools/train_net.py", line 239, in main
File "tools/train_net.py", line 239, in main
main()
File "tools/train_net.py", line 239, in main
torch.cuda.set_device(args.local_rank)
File "/opt/conda/lib/python3.6/site-packages/torch/cuda/init.py", line 264, in set_device
torch.cuda.set_device(args.local_rank)
File "/opt/conda/lib/python3.6/site-packages/torch/cuda/init.py", line 264, in set_device
torch.cuda.set_device(args.local_rank)
torch._C._cuda_setDevice(device)
File "/opt/conda/lib/python3.6/site-packages/torch/cuda/init.py", line 264, in set_device
RuntimeError: cuda runtime error (10) : invalid device ordinal at torch/csrc/cuda/Module.cpp:34
main()
File "tools/train_net.py", line 239, in main
main()
File "tools/train_net.py", line 239, in main
torch._C._cuda_setDevice(device)
main()
torch._C._cuda_setDevice(device)
File "tools/train_net.py", line 239, in main
RuntimeError: cuda runtime error (10) : invalid device ordinal at torch/csrc/cuda/Module.cpp:34
RuntimeError: cuda runtime error (10) : invalid device ordinal at torch/csrc/cuda/Module.cpp:34
torch._C._cuda_setDevice(device)
RuntimeError: cuda runtime error (10) : invalid device ordinal at torch/csrc/cuda/Module.cpp:34
torch.cuda.set_device(args.local_rank)
File "/opt/conda/lib/python3.6/site-packages/torch/cuda/init.py", line 264, in set_device
torch.cuda.set_device(args.local_rank)
File "/opt/conda/lib/python3.6/site-packages/torch/cuda/init.py", line 264, in set_device
torch.cuda.set_device(args.local_rank)
File "/opt/conda/lib/python3.6/site-packages/torch/cuda/init.py", line 264, in set_device
torch._C._cuda_setDevice(device)
torch._C._cuda_setDevice(device)
RuntimeError: cuda runtime error (10) : invalid device ordinal at torch/csrc/cuda/Module.cpp:34
torch._C._cuda_setDevice(device)
RuntimeError: cuda runtime error (10) : invalid device ordinal at torch/csrc/cuda/Module.cpp:34
RuntimeError: cuda runtime error (10) : invalid device ordinal at torch/csrc/cuda/Module.cpp:34
Traceback (most recent call last):
File "tools/train_net.py", line 328, in <module>
main()
File "tools/train_net.py", line 239, in main
torch.cuda.set_device(args.local_rank)
File "/opt/conda/lib/python3.6/site-packages/torch/cuda/__init__.py", line 264, in set_device
torch._C._cuda_setDevice(device)
RuntimeError: cuda runtime error (10) : invalid device ordinal at torch/csrc/cuda/Module.cpp:34