It looks like the PCIe and NVLink info is not supported on this vGPU, but jupyterlab-nvdashboard assumes it is supported. Details below:
$ docker run -it --gpus all nvcr.io/nvaie/nvidia-rapids:21.08-cuda11.4-ubuntu20.04-py3.8 bash

This container image and its contents are governed by the NVIDIA Deep Learning Container License.
By pulling and using the container, you accept the terms and conditions of this license:
https://developer.download.nvidia.com/licenses/NVIDIA_Deep_Learning_Container_License.pdf
(rapids) root@955ce2376d75:/workspace# conda list | grep nvml
pynvml 11.0.0 pyhd8ed1ab_0 conda-forge
(rapids) root@955ce2376d75:/workspace# conda list jupyterlab-nvdashboard
# packages in environment at /opt/conda/envs/rapids:
#
# Name Version Build Channel
jupyterlab-nvdashboard 0.6.0 py_0 rapidsai
(rapids) root@955ce2376d75:/workspace# nvidia-smi
Wed Nov  3 00:18:24 2021
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 470.63.01    Driver Version: 470.63.01    CUDA Version: 11.4     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|===============================+======================+======================|
|   0  GRID T4-16C         On   | 00000000:02:00.0 Off |                    0 |
| N/A   N/A    P8    N/A /  N/A |   2220MiB / 16384MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+

+-----------------------------------------------------------------------------+
| Processes:                                                                  |
|  GPU   GI   CI        PID   Type   Process name                  GPU Memory |
|        ID   ID                                                   Usage      |
|=============================================================================|
|  No running processes found                                                 |
+-----------------------------------------------------------------------------+
(rapids) root@955ce2376d75:/workspace# grep nvmlDeviceGet /opt/conda/envs/rapids/lib/python3.8/site-packages/jupyterlab_nvdashboard/apps/gpu.py
ngpus = pynvml.nvmlDeviceGetCount()
gpu_handles = [pynvml.nvmlDeviceGetHandleByIndex(i) for i in range(ngpus)]
pynvml.nvmlDeviceGetUtilizationRates(gpu_handles[i]).gpu
return [pynvml.nvmlDeviceGetMemoryInfo(handle).used for handle in gpu_handles]
return pynvml.nvmlDeviceGetMemoryInfo(gpu_handles[0]).total
pci_gen = pynvml.nvmlDeviceGetMaxPcieLinkGeneration(gpu_handles[0])
pci_width = pynvml.nvmlDeviceGetMaxPcieLinkWidth(gpu_handles[0])
pynvml.nvmlDeviceGetPcieThroughput(
pynvml.nvmlDeviceGetPcieThroughput(
pynvml.nvmlDeviceGetPcieThroughput(
pynvml.nvmlDeviceGetPcieThroughput(
nvlink_ver = pynvml.nvmlDeviceGetNvLinkVersion(gpu_handles[0], 0)
pynvml.nvmlDeviceGetNvLinkUtilizationCounter(
pynvml.nvmlDeviceGetNvLinkUtilizationCounter(
pynvml.nvmlDeviceGetNvLinkUtilizationCounter(
pynvml.nvmlDeviceGetNvLinkUtilizationCounter(
pynvml.nvmlDeviceGetNvLinkUtilizationCounter(
pynvml.nvmlDeviceGetNvLinkUtilizationCounter(
pynvml.nvmlDeviceGetNvLinkUtilizationCounter(
pynvml.nvmlDeviceGetNvLinkUtilizationCounter(
pynvml.nvmlDeviceGetMemoryInfo(handle).total / (1024 * 1024)
gpu = pynvml.nvmlDeviceGetUtilizationRates(gpu_handles[i]).gpu
mem = pynvml.nvmlDeviceGetMemoryInfo(gpu_handles[i]).used
pynvml.nvmlDeviceGetPcieThroughput(
pynvml.nvmlDeviceGetPcieThroughput(
(rapids) root@955ce2376d75:/workspace# python
Python 3.8.10 | packaged by conda-forge | (default, May 11 2021, 07:01:05)
[GCC 9.3.0] on linux
Type "help", "copyright", "credits" or "license" for more information.
>>> from pynvml import *
>>> nvmlInit()
>>> nvmlSystemGetDriverVersion()
b'470.63.01'
>>> nvmlDeviceGetName(nvmlDeviceGetHandleByIndex(0))
b'GRID T4-16C'
>>> nvmlDeviceGetPcieThroughput(nvmlDeviceGetHandleByIndex(0),
...                             NVML_PCIE_UTIL_TX_BYTES)
Traceback (most recent call last):
File "<stdin>", line 1, in <module>
File "/opt/conda/envs/rapids/lib/python3.8/site-packages/pynvml/nvml.py", line 2542, in nvmlDeviceGetPcieThroughput
_nvmlCheckReturn(ret)
File "/opt/conda/envs/rapids/lib/python3.8/site-packages/pynvml/nvml.py", line 743, in _nvmlCheckReturn
raise NVMLError(ret)
pynvml.nvml.NVMLError_NotSupported: Not Supported
>>> nvmlDeviceGetNvLinkVersion(nvmlDeviceGetHandleByIndex(0), 0)
Traceback (most recent call last):
File "<stdin>", line 1, in <module>
File "/opt/conda/envs/rapids/lib/python3.8/site-packages/pynvml/nvml.py", line 2656, in nvmlDeviceGetNvLinkVersion
_nvmlCheckReturn(ret)
File "/opt/conda/envs/rapids/lib/python3.8/site-packages/pynvml/nvml.py", line 743, in _nvmlCheckReturn
raise NVMLError(ret)
pynvml.nvml.NVMLError_NotSupported: Not Supported
>>> nvmlDeviceGetMemoryInfo(nvmlDeviceGetHandleByIndex(0)).total
17179869184
>>> nvmlDeviceGetUtilizationRates(nvmlDeviceGetHandleByIndex(0)).gpu
0
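
The same checks can be scripted in one pass. Here is a small probe (a sketch of mine, not part of nvdashboard; the function list is taken from the grep of gpu.py above) that reports which of the NVML queries the dashboard relies on are supported on this device:

import pynvml

# Sketch: report which of the NVML queries used by jupyterlab-nvdashboard
# are supported on this GPU/vGPU.
pynvml.nvmlInit()
handle = pynvml.nvmlDeviceGetHandleByIndex(0)

probes = {
    "memory_info": lambda h: pynvml.nvmlDeviceGetMemoryInfo(h),
    "utilization": lambda h: pynvml.nvmlDeviceGetUtilizationRates(h),
    "pcie_link_gen": lambda h: pynvml.nvmlDeviceGetMaxPcieLinkGeneration(h),
    "pcie_link_width": lambda h: pynvml.nvmlDeviceGetMaxPcieLinkWidth(h),
    "pcie_tx_throughput": lambda h: pynvml.nvmlDeviceGetPcieThroughput(
        h, pynvml.NVML_PCIE_UTIL_TX_BYTES),
    "nvlink_version": lambda h: pynvml.nvmlDeviceGetNvLinkVersion(h, 0),
}

for name, fn in probes.items():
    try:
        fn(handle)
        print(f"{name}: supported")
    except pynvml.NVMLError_NotSupported:
        print(f"{name}: not supported")
    except pynvml.NVMLError as err:
        print(f"{name}: NVML error: {err}")

pynvml.nvmlShutdown()

On this GRID T4-16C the PCIe and NVLink probes come back "not supported" while memory and utilization work, matching the REPL session above.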
Below is the NVML stack trace from the dashboard's Bokeh periodic callback:
ERROR:bokeh.util.tornado:Traceback (most recent call last):
File "/opt/conda/envs/rapids/lib/python3.8/site-packages/tornado/gen.py", line 526, in callback
result_list.append(f.result())
File "/opt/conda/envs/rapids/lib/python3.8/site-packages/bokeh/server/session.py", line 67, in _needs_document_lock_wrapper
result = func(self, *args, **kwargs)
File "/opt/conda/envs/rapids/lib/python3.8/site-packages/bokeh/server/session.py", line 195, in with_document_locked
return func(*args, **kwargs)
File "/opt/conda/envs/rapids/lib/python3.8/site-packages/bokeh/document/document.py", line 1212, in wrapper
return doc._with_self_as_curdoc(invoke)
File "/opt/conda/envs/rapids/lib/python3.8/site-packages/bokeh/document/document.py", line 1198, in _with_self_as_curdoc
return f()
File "/opt/conda/envs/rapids/lib/python3.8/site-packages/bokeh/document/document.py", line 1211, in invoke
return f(*args, **kwargs)
File "/opt/conda/envs/rapids/lib/python3.8/site-packages/jupyterlab_nvdashboard/apps/gpu.py", line 575, in cb
pynvml.nvmlDeviceGetPcieThroughput(
File "/opt/conda/envs/rapids/lib/python3.8/site-packages/pynvml/nvml.py", line 2542, in nvmlDeviceGetPcieThroughput
_nvmlCheckReturn(ret)
File "/opt/conda/envs/rapids/lib/python3.8/site-packages/pynvml/nvml.py", line 743, in _nvmlCheckReturn
raise NVMLError(ret)
pynvml.nvml.NVMLError_NotSupported: Not Supported
ERROR:bokeh.util.tornado:Error thrown from periodic callback:
(the same NVMLError_NotSupported traceback repeats on every tick of the periodic callback)
So the PCIe throughput and NVLink queries return NVMLError_NotSupported on this GRID T4-16C vGPU, but jupyterlab-nvdashboard 0.6.0 calls them unconditionally, and the dashboard's periodic callbacks keep crashing as a result.
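
A possible mitigation, sketched here only as an illustration (the wrapper name safe_pcie_throughput is hypothetical, not nvdashboard's API), would be to catch NVMLError_NotSupported instead of letting it escape into the Bokeh callback:

import pynvml

def safe_pcie_throughput(handle, counter):
    # Hypothetical guard: return PCIe throughput in KB/s, or None when
    # the device (e.g. a GRID vGPU like this one) doesn't expose it.
    try:
        return pynvml.nvmlDeviceGetPcieThroughput(handle, counter)
    except pynvml.NVMLError_NotSupported:
        return None

The periodic callback in gpu.py (line 575 in the traceback) could use such a wrapper and simply skip the PCIe plot when it gets None; the NVLink counters could be guarded the same way.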