1. The error logs are as follows:
271b488e-6801-ad8b-6811-bbfebebb285e","Count":10,"Devmem":22528,"Devcore":100,"Type":"NVIDIA-NVIDIA GeForce RTX 2080 Ti","Numa":0,"Health":false}]
I0814 11:17:32.683919 6623 util.go:128] Encoded node Devices: GPU-61101651-d822-7cac-e1f7-bd627a4b7d5a,10,11264,100,NVIDIA-NVIDIA GeForce RTX 2080 Ti,0,false:GPU-271b488e-6801-ad8b-6811-bbfebebb285e,10,22528,100,NVIDIA-NVIDIA GeForce RTX 2080 Ti,0,false:
I0814 11:17:32.683997 6623 register.go:176] patch node with the following annos map[hami.io/node-handshake:Reported 2024-08-14 11:17:32.683965176 +0000 UTC m=+1419629.497106978 hami.io/node-nvidia-register:GPU-61101651-d822-7cac-e1f7-bd627a4b7d5a,10,11264,100,NVIDIA-NVIDIA GeForce RTX 2080 Ti,0,false:GPU-271b488e-6801-ad8b-6811-bbfebebb285e,10,22528,100,NVIDIA-NVIDIA GeForce RTX 2080 Ti,0,false:]
I0814 11:17:32.725644 6623 register.go:196] Successfully registered annotation. Next check in 30s seconds...
I0814 11:17:54.699758 6623 register.go:131] MemoryScaling= 1 registeredmem= 11264
E0814 11:17:54.718619 6623 register.go:147] "failed to get numa information" err="exit status 255" idx=0
I0814 11:17:54.718646 6623 register.go:159] nvml registered device id=1, memory=11264, type=NVIDIA GeForce RTX 2080 Ti, numa=0
I0814 11:17:54.718722 6623 register.go:131] MemoryScaling= 1 registeredmem= 22528
E0814 11:17:54.737675 6623 register.go:147] "failed to get numa information" err="exit status 255" idx=1
I0814 11:17:54.737697 6623 register.go:159] nvml registered device id=2, memory=22528, type=NVIDIA GeForce RTX 2080 Ti, numa=0
I0814 11:17:54.737720 6623 register.go:166] "start working on the devices" devices=[{"Index":0,"Id":"GPU-61101651-d822-7cac-e1f7-bd627a4b7d5a","Count":10,"Devmem":11264,"Devcore":100,"Type":"NVIDIA-NVIDIA GeForce RTX 2080 Ti","Numa":0,"Health":true},{"Index":0,"Id":"GPU-271b488e-6801-ad8b-6811-bbfebebb285e","Count":10,"Devmem":22528,"Devcore":100,"Type":"NVIDIA-NVIDIA GeForce RTX 2080 Ti","Numa":0,"Health":true}]
I0814 11:17:54.745370 6623 util.go:128] Encoded node Devices: GPU-61101651-d822-7cac-e1f7-bd627a4b7d5a,10,11264,100,NVIDIA-NVIDIA GeForce RTX 2080 Ti,0,true:GPU-271b488e-6801-ad8b-6811-bbfebebb285e,10,22528,100,NVIDIA-NVIDIA GeForce RTX 2080 Ti,0,true:
I0814 11:17:54.745406 6623 register.go:176] patch node with the following annos map[hami.io/node-handshake:Reported 2024-08-14 11:17:54.745386452 +0000 UTC m=+1419651.558528254 hami.io/node-nvidia-register:GPU-61101651-d822-7cac-e1f7-bd627a4b7d5a,10,11264,100,NVIDIA-NVIDIA GeForce RTX 2080 Ti,0,true:GPU-271b488e-6801-ad8b-6811-bbfebebb285e,10,22528,100,NVIDIA-NVIDIA GeForce RTX 2080 Ti,0,true:]
I0814 11:17:54.786685 6623 register.go:196] Successfully registered annotation. Next check in 30s seconds...
I0814 11:18:02.726492 6623 register.go:131] MemoryScaling= 1 registeredmem= 11264
E0814 11:18:02.744360 6623 register.go:147] "failed to get numa information" err="exit status 255" idx=0
I0814 11:18:02.744386 6623 register.go:159] nvml registered device id=1, memory=11264, type=NVIDIA GeForce RTX 2080 Ti, numa=0
I0814 11:18:02.744465 6623 register.go:131] MemoryScaling= 1 registeredmem= 22528
E0814 11:18:02.762666 6623 register.go:147] "failed to get numa information" err="exit status 255" idx=1
I0814 11:18:02.762686 6623 register.go:159] nvml registered device id=2, memory=22528, type=NVIDIA GeForce RTX 2080 Ti, numa=0
I0814 11:18:02.762708 6623 register.go:166] "start working on the devices" devices=[{"Index":0,"Id":"GPU-61101651-d822-7cac-e1f7-bd627a4b7d5a","Count":10,"Devmem":11264,"Devcore":100,"Type":"NVIDIA-NVIDIA GeForce RTX 2080 Ti","Numa":0,"Health":false},{"Index":0,"Id":"GPU-271b488e-6801-ad8b-6811-bbfebebb285e","Count":10,"Devmem":22528,"Devcore":100,"Type":"NVIDIA-NVIDIA GeForce RTX 2080 Ti","Numa":0,"Health":false}]
I0814 11:18:02.769978 6623 util.go:128] Encoded node Devices: GPU-61101651-d822-7cac-e1f7-bd627a4b7d5a,10,11264,100,NVIDIA-NVIDIA GeForce RTX 2080 Ti,0,false:GPU-271b488e-6801-ad8b-6811-bbfebebb285e,10,22528,100,NVIDIA-NVIDIA GeForce RTX 2080 Ti,0,false:
I0814 11:18:02.770028 6623 register.go:176] patch node with the following annos map[hami.io/node-handshake:Reported 2024-08-14 11:18:02.770000669 +0000 UTC m=+1419659.583142471 hami.io/node-nvidia-register:GPU-61101651-d822-7cac-e1f7-bd627a4b7d5a,10,11264,100,NVIDIA-NVIDIA GeForce RTX 2080 Ti,0,false:GPU-271b488e-6801-ad8b-6811-bbfebebb285e,10,22528,100,NVIDIA-NVIDIA GeForce RTX 2080 Ti,0,false:]
I0814 11:18:02.810168 6623 register.go:196] Successfully registered annotation. Next check in 30s seconds...
I0814 11:18:24.805913 6623 register.go:131] MemoryScaling= 1 registeredmem= 11264
E0814 11:18:24.822966 6623 register.go:147] "failed to get numa information" err="exit status 255" idx=0
I0814 11:18:24.822991 6623 register.go:159] nvml registered device id=1, memory=11264, type=NVIDIA GeForce RTX 2080 Ti, numa=0
I0814 11:18:24.823065 6623 register.go:131] MemoryScaling= 1 registeredmem= 22528
E0814 11:18:24.834024 6623 register.go:147] "failed to get numa information" err="exit status 255" idx=1
I0814 11:18:24.834041 6623 register.go:159] nvml registered device id=2, memory=22528, type=NVIDIA GeForce RTX 2080 Ti, numa=0
I0814 11:18:24.834062 6623 register.go:166] "start working on the devices" devices=[{"Index":0,"Id":"GPU-61101651-d822-7cac-e1f7-bd627a4b7d5a","Count":10,"Devmem":11264,"Devcore":100,"Type":"NVIDIA-NVIDIA GeForce RTX 2080 Ti","Numa":0,"Health":true},{"Index":0,"Id":"GPU-271b488e-6801-ad8b-6811-bbfebebb285e","Count":10,"Devmem":22528,"Devcore":100,"Type":"NVIDIA-NVIDIA GeForce RTX 2080 Ti","Numa":0,"Health":true}]
I0814 11:18:24.841508 6623 util.go:128] Encoded node Devices: GPU-61101651-d822-7cac-e1f7-bd627a4b7d5a,10,11264,100,NVIDIA-NVIDIA GeForce RTX 2080 Ti,0,true:GPU-271b488e-6801-ad8b-6811-bbfebebb285e,10,22528,100,NVIDIA-NVIDIA GeForce RTX 2080 Ti,0,true:
I0814 11:18:24.841554 6623 register.go:176] patch node with the following annos map[hami.io/node-handshake:Reported 2024-08-14 11:18:24.841527879 +0000 UTC m=+1419681.654669681 hami.io/node-nvidia-register:GPU-61101651-d822-7cac-e1f7-bd627a4b7d5a,10,11264,100,NVIDIA-NVIDIA GeForce RTX 2080 Ti,0,true:GPU-271b488e-6801-ad8b-6811-bbfebebb285e,10,22528,100,NVIDIA-NVIDIA GeForce RTX 2080 Ti,0,true:]
I0814 11:18:24.884709 6623 register.go:196] Successfully registered annotation. Next check in 30s seconds...
I0814 11:18:32.815798 6623 register.go:131] MemoryScaling= 1 registeredmem= 11264
E0814 11:18:32.834401 6623 register.go:147] "failed to get numa information" err="exit status 255" idx=0
I0814 11:18:32.834429 6623 register.go:159] nvml registered device id=1, memory=11264, type=NVIDIA GeForce RTX 2080 Ti, numa=0
I0814 11:18:32.834504 6623 register.go:131] MemoryScaling= 1 registeredmem= 22528
E0814 11:18:32.853124 6623 register.go:147] "failed to get numa information" err="exit status 255" idx=1
I0814 11:18:32.853140 6623 register.go:159] nvml registered device id=2, memory=22528, type=NVIDIA GeForce RTX 2080 Ti, numa=0
I0814 11:18:32.853163 6623 register.go:166] "start working on the devices" devices=[{"Index":0,"Id":"GPU-61101651-d822-7cac-e1f7-bd627a4b7d5a","Count":10,"Devmem":11264,"Devcore":100,"Type":"NVIDIA-NVIDIA GeForce RTX 2080 Ti","Numa":0,"Health":false},{"Index":0,"Id":"GPU-271b488e-6801-ad8b-6811-bbfebebb285e","Count":10,"Devmem":22528,"Devcore":100,"Type":"NVIDIA-NVIDIA GeForce RTX 2080 Ti","Numa":0,"Health":false}]
I0814 11:18:32.860586 6623 util.go:128] Encoded node Devices: GPU-61101651-d822-7cac-e1f7-bd627a4b7d5a,10,11264,100,NVIDIA-NVIDIA GeForce RTX 2080 Ti,0,false:GPU-271b488e-6801-ad8b-6811-bbfebebb285e,10,22528,100,NVIDIA-NVIDIA GeForce RTX 2080 Ti,0,false:
I0814 11:18:32.860634 6623 register.go:176] patch node with the following annos map[hami.io/node-handshake:Reported 2024-08-14 11:18:32.860609992 +0000 UTC m=+1419689.673751784 hami.io/node-nvidia-register:GPU-61101651-d822-7cac-e1f7-bd627a4b7d5a,10,11264,100,NVIDIA-NVIDIA GeForce RTX 2080 Ti,0,false:GPU-271b488e-6801-ad8b-6811-bbfebebb285e,10,22528,100,NVIDIA-NVIDIA GeForce RTX 2080 Ti,0,false:]
I0814 11:18:32.907729 6623 register.go:196] Successfully registered annotation. Next check in 30s seconds...
I0814 11:18:54.904814 6623 register.go:131] MemoryScaling= 1 registeredmem= 11264
E0814 11:18:54.924491 6623 register.go:147] "failed to get numa information" err="exit status 255" idx=0
I0814 11:18:54.924520 6623 register.go:159] nvml registered device id=1, memory=11264, type=NVIDIA GeForce RTX 2080 Ti, numa=0
I0814 11:18:54.924598 6623 register.go:131] MemoryScaling= 1 registeredmem= 22528
E0814 11:18:54.943078 6623 register.go:147] "failed to get numa information" err="exit status 255" idx=1
I0814 11:18:54.943098 6623 register.go:159] nvml registered device id=2, memory=22528, type=NVIDIA GeForce RTX 2080 Ti, numa=0
I0814 11:18:54.943120 6623 register.go:166] "start working on the devices" devices=[{"Index":0,"Id":"GPU-61101651-d822-7cac-e1f7-bd627a4b7d5a","Count":10,"Devmem":11264,"Devcore":100,"Type":"NVIDIA-NVIDIA GeForce RTX 2080 Ti","Numa":0,"Health":true},{"Index":0,"Id":"GPU-271b488e-6801-ad8b-6811-bbfebebb285e","Count":10,"Devmem":22528,"Devcore":100,"Type":"NVIDIA-NVIDIA GeForce RTX 2080 Ti","Numa":0,"Health":true}]
I0814 11:18:54.952281 6623 util.go:128] Encoded node Devices: GPU-61101651-d822-7cac-e1f7-bd627a4b7d5a,10,11264,100,NVIDIA-NVIDIA GeForce RTX 2080 Ti,0,true:GPU-271b488e-6801-ad8b-6811-bbfebebb285e,10,22528,100,NVIDIA-NVIDIA GeForce RTX 2080 Ti,0,true:
I0814 11:18:54.952341 6623 register.go:176] patch node with the following annos map[hami.io/node-handshake:Reported 2024-08-14 11:18:54.952317947 +0000 UTC m=+1419711.765459749 hami.io/node-nvidia-register:GPU-61101651-d822-7cac-e1f7-bd627a4b7d5a,10,11264,100,NVIDIA-NVIDIA GeForce RTX 2080 Ti,0,true:GPU-271b488e-6801-ad8b-6811-bbfebebb285e,10,22528,100,NVIDIA-NVIDIA GeForce RTX 2080 Ti,0,true:]
I0814 11:18:54.994680 6623 register.go:196] Successfully registered annotation. Next check in 30s seconds...
I0814 11:19:02.908939 6623 register.go:131] MemoryScaling= 1 registeredmem= 11264
E0814 11:19:02.926555 6623 register.go:147] "failed to get numa information" err="exit status 255" idx=0
I0814 11:19:02.926598 6623 register.go:159] nvml registered device id=1, memory=11264, type=NVIDIA GeForce RTX 2080 Ti, numa=0
I0814 11:19:02.926673 6623 register.go:131] MemoryScaling= 1 registeredmem= 22528
E0814 11:19:02.945609 6623 register.go:147] "failed to get numa information" err="exit status 255" idx=1
I0814 11:19:02.945626 6623 register.go:159] nvml registered device id=2, memory=22528, type=NVIDIA GeForce RTX 2080 Ti, numa=0
I0814 11:19:02.945648 6623 register.go:166] "start working on the devices" devices=[{"Index":0,"Id":"GPU-61101651-d822-7cac-e1f7-bd627a4b7d5a","Count":10,"Devmem":11264,"Devcore":100,"Type":"NVIDIA-NVIDIA GeForce RTX 2080 Ti","Numa":0,"Health":false},{"Index":0,"Id":"GPU-271b488e-6801-ad8b-6811-bbfebebb285e","Count":10,"Devmem":22528,"Devcore":100,"Type":"NVIDIA-NVIDIA GeForce RTX 2080 Ti","Numa":0,"Health":false}]
I0814 11:19:02.953035 6623 util.go:128] Encoded node Devices: GPU-61101651-d822-7cac-e1f7-bd627a4b7d5a,10,11264,100,NVIDIA-NVIDIA GeForce RTX 2080 Ti,0,false:GPU-271b488e-6801-ad8b-6811-bbfebebb285e,10,22528,100,NVIDIA-NVIDIA GeForce RTX 2080 Ti,0,false:
I0814 11:19:02.953092 6623 register.go:176] patch node with the following annos map[hami.io/node-handshake:Reported 2024-08-14 11:19:02.953065957 +0000 UTC m=+1419719.766207749 hami.io/node-nvidia-register:GPU-61101651-d822-7cac-e1f7-bd627a4b7d5a,10,11264,100,NVIDIA-NVIDIA GeForce RTX 2080 Ti,0,false:GPU-271b488e-6801-ad8b-6811-bbfebebb285e,10,22528,100,NVIDIA-NVIDIA GeForce RTX 2080 Ti,0,false:]
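Note that across these registration cycles the devices' Health field alternates between true and false on successive passes, while "failed to get numa information" err="exit status 255" is logged for both GPUs every time. For reference, the device list written into the hami.io/node-nvidia-register annotation uses the encoding visible in the "Encoded node Devices" lines: one device per ':'-separated segment, each with seven comma-separated fields (UUID, count, device memory, device core, type, NUMA node, health), matching the JSON device dump. A minimal Go sketch of a decoder for that encoding (the struct and function names here are illustrative, not HAMi's own):

```go
package main

import (
	"fmt"
	"strconv"
	"strings"
)

// DeviceInfo mirrors the fields visible in the encoded annotation;
// the type and field names are illustrative, not HAMi's actual struct.
type DeviceInfo struct {
	UUID    string
	Count   int
	Devmem  int
	Devcore int
	Type    string
	Numa    int
	Health  bool
}

// decodeNodeDevices parses the ':'-separated, comma-delimited encoding
// shown in the "Encoded node Devices" log lines above.
func decodeNodeDevices(s string) ([]DeviceInfo, error) {
	var devs []DeviceInfo
	for _, seg := range strings.Split(strings.TrimSuffix(s, ":"), ":") {
		f := strings.Split(seg, ",")
		if len(f) != 7 {
			return nil, fmt.Errorf("unexpected segment %q", seg)
		}
		count, _ := strconv.Atoi(f[1])
		mem, _ := strconv.Atoi(f[2])
		core, _ := strconv.Atoi(f[3])
		numa, _ := strconv.Atoi(f[5])
		health, _ := strconv.ParseBool(f[6])
		devs = append(devs, DeviceInfo{f[0], count, mem, core, f[4], numa, health})
	}
	return devs, nil
}

func main() {
	// Sample segment taken verbatim from the logs above.
	anno := "GPU-61101651-d822-7cac-e1f7-bd627a4b7d5a,10,11264,100,NVIDIA-NVIDIA GeForce RTX 2080 Ti,0,false:"
	devs, err := decodeNodeDevices(anno)
	if err != nil {
		panic(err)
	}
	for _, d := range devs {
		fmt.Printf("%s health=%v mem=%dMiB\n", d.UUID, d.Health, d.Devmem)
	}
}
```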
2. The node driver is working normally; the nvidia-smi topo output is as follows (the plugin had been restarted by this point):
root@k8s-121:~# nvidia-smi topo -m
        GPU0    GPU1    CPU Affinity    NUMA Affinity   GPU NUMA ID
GPU0     X      SYS     0-95            0               N/A
GPU1    SYS      X      0-95            0               N/A
Legend:

  X    = Self
  SYS  = Connection traversing PCIe as well as the SMP interconnect between NUMA nodes (e.g., QPI/UPI)
  NODE = Connection traversing PCIe as well as the interconnect between PCIe Host Bridges within a NUMA node
  PHB  = Connection traversing PCIe as well as a PCIe Host Bridge (typically the CPU)
  PXB  = Connection traversing multiple PCIe bridges (without traversing the PCIe Host Bridge)
  PIX  = Connection traversing at most a single PCIe bridge
  NV#  = Connection traversing a bonded set of # NVLinks
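Since the plugin reports "failed to get numa information" err="exit status 255" even though the topology above shows NUMA Affinity 0 for both GPUs, the kernel's own view can be cross-checked directly. Below is a minimal Go sketch that reads the numa_node attribute from sysfs for the two PCI bus IDs shown in the nvidia-smi output that follows; this is a generic kernel interface and not necessarily the exact lookup HAMi itself performs:

```go
package main

import (
	"fmt"
	"os"
	"strings"
)

func main() {
	// PCI bus IDs taken from the nvidia-smi output below.
	for _, busID := range []string{"0000:41:00.0", "0000:81:00.0"} {
		// numa_node is -1 when the platform reports no NUMA affinity,
		// otherwise the NUMA node the device is attached to.
		data, err := os.ReadFile("/sys/bus/pci/devices/" + busID + "/numa_node")
		if err != nil {
			fmt.Printf("%s: %v\n", busID, err)
			continue
		}
		fmt.Printf("%s: numa_node=%s\n", busID, strings.TrimSpace(string(data)))
	}
}
```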
root@k8s-121:~# nvidia-smi
Wed Aug 14 20:07:11 2024
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 555.58                 Driver Version: 555.58         CUDA Version: 12.5     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|=========================================+========================+======================|
|   0  NVIDIA GeForce RTX 2080 Ti    Off  |   00000000:41:00.0 Off |                  N/A |
|  0%   35C    P8              21W / 280W |       1MiB / 11264MiB  |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
|   1  NVIDIA GeForce RTX 2080 Ti    Off  |   00000000:81:00.0 Off |                  N/A |
|  0%   35C    P8               1W / 280W |       1MiB / 22528MiB  |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+

+-----------------------------------------------------------------------------------------+
| Processes:                                                                              |
|  GPU   GI   CI        PID   Type   Process name                              GPU Memory |
|        ID   ID                                                               Usage      |
|=========================================================================================|
|  No running processes found                                                             |
+-----------------------------------------------------------------------------------------+
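The "nvml registered device" log lines suggest the plugin enumerates devices through NVML, so NVML itself can be ruled out by querying the same values the plugin logs (UUID, name, memory) with a standalone program. A minimal sketch using the go-nvml bindings, assuming github.com/NVIDIA/go-nvml is fetchable on the node:

```go
package main

import (
	"fmt"
	"log"

	"github.com/NVIDIA/go-nvml/pkg/nvml"
)

func main() {
	if ret := nvml.Init(); ret != nvml.SUCCESS {
		log.Fatalf("nvml init failed: %v", nvml.ErrorString(ret))
	}
	defer nvml.Shutdown()

	count, ret := nvml.DeviceGetCount()
	if ret != nvml.SUCCESS {
		log.Fatalf("device count failed: %v", nvml.ErrorString(ret))
	}
	for i := 0; i < count; i++ {
		dev, ret := nvml.DeviceGetHandleByIndex(i)
		if ret != nvml.SUCCESS {
			log.Fatalf("handle %d failed: %v", i, nvml.ErrorString(ret))
		}
		uuid, _ := dev.GetUUID()
		name, _ := dev.GetName()
		mem, _ := dev.GetMemoryInfo()
		// mem.Total is in bytes; the plugin logs MiB.
		fmt.Printf("idx=%d uuid=%s name=%s mem=%dMiB\n", i, uuid, name, mem.Total/1024/1024)
	}
}
```

If this standalone check reports both GPUs consistently while the plugin's Health keeps flipping between true and false, the problem is more likely in the plugin's health/NUMA probing than in the driver.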
Plugin log attached: 1.log