This may be a Dask issue rather than a pynvml one, but I am getting a pynvml-related error when launching a Dask cluster (the worker fails to start):
distributed.nanny - ERROR - Failed to start worker
Traceback (most recent call last):
File "/home/nfs/bzaitlen/GitRepos/distributed/distributed/nanny.py", line 674, in run
await worker
File "/home/nfs/bzaitlen/GitRepos/distributed/distributed/worker.py", line 1016, in start
await self._register_with_scheduler()
File "/home/nfs/bzaitlen/GitRepos/distributed/distributed/worker.py", line 811, in _register_with_scheduler
metrics=await self.get_metrics(),
File "/home/nfs/bzaitlen/GitRepos/distributed/distributed/worker.py", line 740, in get_metrics
result = await result
File "/home/nfs/bzaitlen/miniconda3/envs/cudf-dev/lib/python3.7/site-packages/tornado/gen.py", line 742, in run
yielded = self.gen.throw(*exc_info) # type: ignore
File "/home/nfs/bzaitlen/GitRepos/distributed/distributed/worker.py", line 3406, in gpu_metric
result = yield offload(nvml.real_time)
File "/home/nfs/bzaitlen/miniconda3/envs/cudf-dev/lib/python3.7/site-packages/tornado/gen.py", line 735, in run
value = future.result()
File "/home/nfs/bzaitlen/miniconda3/envs/cudf-dev/lib/python3.7/site-packages/tornado/gen.py", line 742, in run
yielded = self.gen.throw(*exc_info) # type: ignore
File "/home/nfs/bzaitlen/GitRepos/distributed/distributed/utils.py", line 1489, in offload
return (yield _offload_executor.submit(fn, *args, **kwargs))
File "/home/nfs/bzaitlen/miniconda3/envs/cudf-dev/lib/python3.7/site-packages/tornado/gen.py", line 735, in run
value = future.result()
File "/home/nfs/bzaitlen/miniconda3/envs/cudf-dev/lib/python3.7/concurrent/futures/_base.py", line 425, in result
return self.__get_result()
File "/home/nfs/bzaitlen/miniconda3/envs/cudf-dev/lib/python3.7/concurrent/futures/_base.py", line 384, in __get_result
raise self._exception
File "/home/nfs/bzaitlen/miniconda3/envs/cudf-dev/lib/python3.7/concurrent/futures/thread.py", line 57, in run
result = self.fn(*self.args, **self.kwargs)
File "/home/nfs/bzaitlen/GitRepos/distributed/distributed/diagnostics/nvml.py", line 11, in real_time
"utilization": [pynvml.nvmlDeviceGetUtilizationRates(h).gpu for h in handles],
File "/home/nfs/bzaitlen/GitRepos/distributed/distributed/diagnostics/nvml.py", line 11, in <listcomp>
"utilization": [pynvml.nvmlDeviceGetUtilizationRates(h).gpu for h in handles],
File "/home/nfs/bzaitlen/GitRepos/pynvml/pynvml/nvml.py", line 1347, in nvmlDeviceGetUtilizationRates
check_return(ret)
File "/home/nfs/bzaitlen/GitRepos/pynvml/pynvml/nvml.py", line 366, in check_return
raise NVMLError(ret)
pynvml.nvml.NVMLError_Uninitialized: Uninitialized
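In short: the worker fails to start because `nvml.real_time` in `distributed/diagnostics/nvml.py` calls `pynvml.nvmlDeviceGetUtilizationRates`, which raises `NVMLError_Uninitialized`. It looks like NVML has not been initialized in the context where the worker's GPU metric runs (via `offload`), so this could be a Dask issue rather than a pynvml one.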
cc @mrocklin