Status: Open — minsub0922 opened this issue 1 year ago
root@1dd007c03d48:/horovod# horovodrun -np 1 -H localhost:1 python examples/pytorch/pytorch_mnist.py
[1,0]<stdout>:1dd007c03d48:5078:5144 [0] NCCL INFO Bootstrap : Using lo:127.0.0.1<0>
[1,0]<stdout>:1dd007c03d48:5078:5144 [0] NCCL INFO NET/Plugin : No plugin found (libnccl-net.so), using internal implementation
[1,0]<stdout>:1dd007c03d48:5078:5144 [0] NCCL INFO Failed to open libibverbs.so[.1]
[1,0]<stdout>:1dd007c03d48:5078:5144 [0] NCCL INFO NET/Socket : Using [0]lo:127.0.0.1<0>
[1,0]<stdout>:1dd007c03d48:5078:5144 [0] NCCL INFO Using network Socket
[1,0]<stdout>:NCCL version 2.12.12+cuda11.6
[1,0]<stdout>:
[1,0]<stdout>:1dd007c03d48:5078:5144 [0] init.cc:255 NCCL WARN Cuda failure 'CUDA driver is a stub library'
[1,0]<stdout>:1dd007c03d48:5078:5144 [0] NCCL INFO init.cc:913 -> 1
[1,0]<stdout>:1dd007c03d48:5078:5144 [0] NCCL INFO init.cc:950 -> 1
[1,0]<stdout>:1dd007c03d48:5078:5144 [0] NCCL INFO init.cc:963 -> 1
[1,0]<stdout>:
[1,0]<stdout>:1dd007c03d48:5078:5144 [0] init.cc:255 NCCL WARN Cuda failure 'CUDA driver is a stub library'
[1,0]<stdout>:1dd007c03d48:5078:5144 [0] NCCL INFO init.cc:913 -> 1
[1,0]<stdout>:1dd007c03d48:5078:5144 [0] NCCL INFO init.cc:950 -> 1
[1,0]<stdout>:1dd007c03d48:5078:5144 [0] NCCL INFO init.cc:963 -> 1
[1,0]<stdout>:
[1,0]<stdout>:1dd007c03d48:5078:5144 [0] init.cc:255 NCCL WARN Cuda failure 'CUDA driver is a stub library'
[1,0]<stdout>:1dd007c03d48:5078:5144 [0] NCCL INFO init.cc:913 -> 1
[1,0]<stdout>:1dd007c03d48:5078:5144 [0] NCCL INFO init.cc:950 -> 1
[1,0]<stdout>:1dd007c03d48:5078:5144 [0] NCCL INFO init.cc:963 -> 1
[1,0]<stdout>:
[1,0]<stdout>:1dd007c03d48:5078:5144 [0] init.cc:255 NCCL WARN Cuda failure 'CUDA driver is a stub library'
[1,0]<stdout>:1dd007c03d48:5078:5144 [0] NCCL INFO init.cc:913 -> 1
[1,0]<stdout>:1dd007c03d48:5078:5144 [0] NCCL INFO init.cc:950 -> 1
[1,0]<stdout>:1dd007c03d48:5078:5144 [0] NCCL INFO init.cc:963 -> 1
[1,0]<stdout>:
[1,0]<stdout>:1dd007c03d48:5078:5144 [0] init.cc:255 NCCL WARN Cuda failure 'CUDA driver is a stub library'
[1,0]<stdout>:1dd007c03d48:5078:5144 [0] NCCL INFO init.cc:913 -> 1
[1,0]<stdout>:1dd007c03d48:5078:5144 [0] NCCL INFO init.cc:950 -> 1
[1,0]<stdout>:1dd007c03d48:5078:5144 [0] NCCL INFO init.cc:963 -> 1
[1,0]<stdout>:
[1,0]<stdout>:1dd007c03d48:5078:5144 [0] init.cc:255 NCCL WARN Cuda failure 'CUDA driver is a stub library'
[1,0]<stdout>:1dd007c03d48:5078:5144 [0] NCCL INFO init.cc:913 -> 1
[1,0]<stdout>:1dd007c03d48:5078:5144 [0] NCCL INFO init.cc:950 -> 1
[1,0]<stdout>:1dd007c03d48:5078:5144 [0] NCCL INFO init.cc:963 -> 1
[1,0]<stderr>:Traceback (most recent call last):
[1,0]<stderr>: File "/usr/local/lib/python3.8/dist-packages/horovod/torch/mpi_ops.py", line 1285, in synchronize
[1,0]<stderr>: mpi_lib.horovod_torch_wait_and_clear(handle)
[1,0]<stderr>:RuntimeError: ncclCommInitRank failed: unhandled cuda error
[1,0]<stderr>:
[1,0]<stderr>:During handling of the above exception, another exception occurred:
[1,0]<stderr>:
[1,0]<stderr>:Traceback (most recent call last):
[1,0]<stderr>: File "examples/pytorch/pytorch_mnist.py", line 263, in <module>
[1,0]<stderr>: main(args)
[1,0]<stderr>: File "examples/pytorch/pytorch_mnist.py", line 222, in main
[1,0]<stderr>: hvd.broadcast_parameters(model.state_dict(), root_rank=0)
[1,0]<stderr>: File "/usr/local/lib/python3.8/dist-packages/horovod/torch/functions.py", line 59, in broadcast_parameters
[1,0]<stderr>: synchronize(handle)
[1,0]<stderr>: File "/usr/local/lib/python3.8/dist-packages/horovod/torch/mpi_ops.py", line 1290, in synchronize
[1,0]<stderr>: raise HorovodInternalError(e)
[1,0]<stderr>:horovod.common.exceptions.HorovodInternalError: ncclCommInitRank failed: unhandled cuda error
[1,0]<stdout>:
[1,0]<stdout>:1dd007c03d48:5078:5144 [0] init.cc:255 NCCL WARN Cuda failure 'CUDA driver is a stub library'
[1,0]<stdout>:1dd007c03d48:5078:5144 [0] NCCL INFO init.cc:913 -> 1
[1,0]<stdout>:1dd007c03d48:5078:5144 [0] NCCL INFO init.cc:950 -> 1
[1,0]<stdout>:1dd007c03d48:5078:5144 [0] NCCL INFO init.cc:963 -> 1
[1,0]<stdout>:
[1,0]<stdout>:1dd007c03d48:5078:5144 [0] init.cc:255 NCCL WARN Cuda failure 'CUDA driver is a stub library'
[1,0]<stdout>:1dd007c03d48:5078:5144 [0] NCCL INFO init.cc:913 -> 1
[1,0]<stdout>:1dd007c03d48:5078:5144 [0] NCCL INFO init.cc:950 -> 1
[1,0]<stdout>:1dd007c03d48:5078:5144 [0] NCCL INFO init.cc:963 -> 1
[1,0]<stdout>:
[1,0]<stdout>:1dd007c03d48:5078:5144 [0] misc/argcheck.cc:30 NCCL WARN ncclGetAsyncError : comm argument is NULL
[1,0]<stdout>:1dd007c03d48:5078:5144 [0] NCCL INFO init.cc:1084 -> 4
--------------------------------------------------------------------------
Primary job terminated normally, but 1 process returned
a non-zero exit code. Per user-direction, the job has been aborted.
--------------------------------------------------------------------------
--------------------------------------------------------------------------
mpirun detected that one or more processes exited with non-zero status, thus causing
the job to be terminated. The first process to do so was:
Process name: [[5094,1],0]
Exit code: 1
--------------------------------------------------------------------------
Dockerfile Error