This issue appears to be resolved by 0.6.5-rc2, as verified below via a sandbox provider install and a reproduction of the issue.
grpcurl -insecure provider.akashtestprovider.xyz:8444 akash.provider.v1.ProviderRPC.GetStatus
{
"cluster": {
"leases": {},
"inventory": {
"cluster": {
"nodes": [
{
"name": "provider",
"resources": {
"cpu": {
"quantity": {
"allocatable": {
"string": "16"
},
"allocated": {
"string": "3051m"
}
},
"info": [
{
"id": "0",
"vendor": "GenuineIntel",
"model": "Intel(R) Xeon(R) CPU @ 2.30GHz",
"vcores": 16
}
]
},
"memory": {
"quantity": {
"allocatable": {
"string": "63193980928"
},
"allocated": {
"string": "2022Mi"
}
}
},
"gpu": {
"quantity": {
"allocatable": {
"string": "1"
},
"allocated": {
"string": "0"
}
},
"info": [
{
"vendor": "nvidia",
"name": "t4",
"modelid": "1eb8",
"interface": "PCIe",
"memorySize": "16Gi"
}
]
},
"ephemeralStorage": {
"allocatable": {
"string": "494114572106"
},
"allocated": {
"string": "0"
}
},
"volumesAttached": {
"allocatable": {
"string": "0"
},
"allocated": {
"string": "0"
}
},
"volumesMounted": {
"allocatable": {
"string": "0"
},
"allocated": {
"string": "0"
}
}
},
"capabilities": {}
}
]
},
"reservations": {
"pending": {
"resources": {
"cpu": {
"string": "0"
},
"memory": {
"string": "0"
},
"gpu": {
"string": "0"
},
"ephemeralStorage": {
"string": "0"
}
}
},
"active": {
"resources": {
"cpu": {
"string": "0"
},
"memory": {
"string": "0"
},
"gpu": {
"string": "0"
},
"ephemeralStorage": {
"string": "0"
}
}
}
}
}
},
"bidEngine": {},
"manifest": {},
"publicHostnames": [
"provider.akashtestprovider.xyz"
],
"timestamp": "2024-12-03T16:22:15.050654567Z"
}
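For quick checks, the GPU quantity can be pulled out of the GetStatus response with jq (a convenience sketch; assumes grpcurl and jq are installed locally):

grpcurl -insecure provider.akashtestprovider.xyz:8444 \
  akash.provider.v1.ProviderRPC.GetStatus \
  | jq '.cluster.inventory.cluster.nodes[].resources.gpu.quantity'
# expected while the device plugin is healthy (matching the output above):
# { "allocatable": { "string": "1" }, "allocated": { "string": "0" } }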
To simulate a device plugin failure, the nvdp-nvidia-device-plugin daemonset was edited to drop the SYS_ADMIN capability:
kubectl edit daemonset nvdp-nvidia-device-plugin -n nvidia-device-plugin
- settings before:
  securityContext:
    capabilities:
      add:
      - SYS_ADMIN
- settings after:
  securityContext:
    capabilities:
      drop:
      - SYS_ADMIN
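The same change can also be applied non-interactively; a sketch using kubectl patch (assumes the plugin container is the first entry in the daemonset's container list):

kubectl -n nvidia-device-plugin patch daemonset nvdp-nvidia-device-plugin --type=json -p='[
  {"op": "remove", "path": "/spec/template/spec/containers/0/securityContext/capabilities/add"},
  {"op": "add", "path": "/spec/template/spec/containers/0/securityContext/capabilities/drop", "value": ["SYS_ADMIN"]}
]'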
With SYS_ADMIN dropped, GetStatus now reports 0 allocatable GPUs:
grpcurl -insecure provider.akashtestprovider.xyz:8444 akash.provider.v1.ProviderRPC.GetStatus
{
"cluster": {
"leases": {},
"inventory": {
"cluster": {
"nodes": [
{
"name": "provider",
"resources": {
"cpu": {
"quantity": {
"allocatable": {
"string": "16"
},
"allocated": {
"string": "2050m"
}
},
"info": [
{
"id": "0",
"vendor": "GenuineIntel",
"model": "Intel(R) Xeon(R) CPU @ 2.30GHz",
"vcores": 16
}
]
},
"memory": {
"quantity": {
"allocatable": {
"string": "63193980928"
},
"allocated": {
"string": "998Mi"
}
}
},
"gpu": {
"quantity": {
"allocatable": {
"string": "0"
},
"allocated": {
"string": "0"
}
},
"info": [
{
"vendor": "nvidia",
"name": "t4",
"modelid": "1eb8",
"interface": "PCIe",
"memorySize": "16Gi"
}
]
},
"ephemeralStorage": {
"allocatable": {
"string": "494114572106"
},
"allocated": {
"string": "0"
}
},
"volumesAttached": {
"allocatable": {
"string": "0"
},
"allocated": {
"string": "0"
}
},
"volumesMounted": {
"allocatable": {
"string": "0"
},
"allocated": {
"string": "0"
}
}
},
"capabilities": {}
}
]
},
"reservations": {
"pending": {
"resources": {
"cpu": {
"string": "0"
},
"memory": {
"string": "0"
},
"gpu": {
"string": "0"
},
"ephemeralStorage": {
"string": "0"
}
}
},
"active": {
"resources": {
"cpu": {
"string": "0"
},
"memory": {
"string": "0"
},
"gpu": {
"string": "0"
},
"ephemeralStorage": {
"string": "0"
}
}
}
}
}
},
"bidEngine": {},
"manifest": {},
"publicHostnames": [
"provider.akashtestprovider.xyz"
],
"timestamp": "2024-12-03T17:07:19.072779701Z"
}
The device plugin now fails to initialize, as its pod logs show:
root@provider:~# kubectl logs nvdp-nvidia-device-plugin-nkdd9 -n nvidia-device-plugin
I1203 17:08:06.036618 1 main.go:199] Starting FS watcher.
I1203 17:08:06.036695 1 main.go:206] Starting OS watcher.
I1203 17:08:06.037001 1 main.go:221] Starting Plugins.
I1203 17:08:06.037037 1 main.go:278] Loading configuration.
I1203 17:08:06.037670 1 main.go:303] Updating config with default resource matching patterns.
I1203 17:08:06.037886 1 main.go:314]
Running with config:
{
"version": "v1",
"flags": {
"migStrategy": "none",
"failOnInitError": true,
"mpsRoot": "/run/nvidia/mps",
"nvidiaDriverRoot": "/",
"nvidiaDevRoot": "/",
"gdsEnabled": false,
"mofedEnabled": false,
"useNodeFeatureAPI": null,
"deviceDiscoveryStrategy": "auto",
"plugin": {
"passDeviceSpecs": false,
"deviceListStrategy": [
"volume-mounts"
],
"deviceIDStrategy": "uuid",
"cdiAnnotationPrefix": "cdi.k8s.io/",
"nvidiaCTKPath": "/usr/bin/nvidia-ctk",
"containerDriverRoot": "/driver-root"
}
},
"resources": {
"gpus": [
{
"pattern": "*",
"name": "nvidia.com/gpu"
}
]
},
"sharing": {
"timeSlicing": {}
}
}
I1203 17:08:06.037901 1 main.go:317] Retrieving plugins.
E1203 17:08:06.038066 1 factory.go:87] Incompatible strategy detected auto
E1203 17:08:06.038116 1 factory.go:88] If this is a GPU node, did you configure the NVIDIA Container Toolkit?
E1203 17:08:06.038126 1 factory.go:89] You can check the prerequisites at: https://github.com/NVIDIA/k8s-device-plugin#prerequisites
E1203 17:08:06.038142 1 factory.go:90] You can learn how to set the runtime at: https://github.com/NVIDIA/k8s-device-plugin#quick-start
E1203 17:08:06.038153 1 factory.go:91] If this is not a GPU node, you should set up a toleration or nodeSelector to only deploy this plugin on GPU nodes
E1203 17:08:06.038306 1 main.go:149] error starting plugins: error creating plugin manager: unable to create plugin manager: invalid device discovery strategy
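With the plugin failing to initialize, the node's advertised nvidia.com/gpu resource can be cross-checked directly (a sketch; "provider" is the node name from the outputs above):

kubectl get node provider -o jsonpath='{.status.allocatable.nvidia\.com/gpu}{"\n"}'
# expected to drop to 0 while the device plugin is down, in line with the GetStatus output above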
After reverting the daemonset to its original settings:
  securityContext:
    capabilities:
      add:
      - SYS_ADMIN
GetStatus reports 1 allocatable GPU again:
root@provider:~# grpcurl -insecure provider.akashtestprovider.xyz:8444 akash.provider.v1.ProviderRPC.GetStatus
{
"cluster": {
"leases": {},
"inventory": {
"cluster": {
"nodes": [
{
"name": "provider",
"resources": {
"cpu": {
"quantity": {
"allocatable": {
"string": "16"
},
"allocated": {
"string": "2050m"
}
},
"info": [
{
"id": "0",
"vendor": "GenuineIntel",
"model": "Intel(R) Xeon(R) CPU @ 2.30GHz",
"vcores": 16
}
]
},
"memory": {
"quantity": {
"allocatable": {
"string": "63193980928"
},
"allocated": {
"string": "998Mi"
}
}
},
"gpu": {
"quantity": {
"allocatable": {
"string": "1"
},
"allocated": {
"string": "0"
}
},
"info": [
{
"vendor": "nvidia",
"name": "t4",
"modelid": "1eb8",
"interface": "PCIe",
"memorySize": "16Gi"
}
]
},
"ephemeralStorage": {
"allocatable": {
"string": "494114572106"
},
"allocated": {
"string": "0"
}
},
"volumesAttached": {
"allocatable": {
"string": "0"
},
"allocated": {
"string": "0"
}
},
"volumesMounted": {
"allocatable": {
"string": "0"
},
"allocated": {
"string": "0"
}
}
},
"capabilities": {}
}
]
},
"reservations": {
"pending": {
"resources": {
"cpu": {
"string": "0"
},
"memory": {
"string": "0"
},
"gpu": {
"string": "0"
},
"ephemeralStorage": {
"string": "0"
}
}
},
"active": {
"resources": {
"cpu": {
"string": "0"
},
"memory": {
"string": "0"
},
"gpu": {
"string": "0"
},
"ephemeralStorage": {
"string": "0"
}
}
}
}
}
},
"bidEngine": {},
"manifest": {},
"publicHostnames": [
"provider.akashtestprovider.xyz"
],
"timestamp": "2024-12-03T17:12:30.754124642Z"
}
Additional note: the original issue also cited improper reporting via the 8443 status endpoint. With the fixes in this RC, that endpoint now reports correct allocatable/available GPU numbers as well, for example:
curl -ks https://10.43.227.108:8443/status
{"cluster":{"leases":0,"inventory":{"available":{"nodes":[{"name":"provider","allocatable":{"cpu":16000,"gpu":1,"memory":63193980928,"storage_ephemeral":494114572106},"available":{"cpu":13950,"gpu":1,"memory":62147502080,"storage_ephemeral":494114572106}}]}}},"bidengine":{"orders":0},"manifest":{"deployments":0},"cluster_public_hostname":"provider.akashtestprovider.xyz","address":"akash1ggk74pf9avxh3llu30yfhmr345h2yrpf7c2cdu"
curl -ks https://10.43.227.108:8443/status
{"cluster":{"leases":0,"inventory":{"available":{"nodes":[{"name":"provider","allocatable":{"cpu":16000,"gpu":0,"memory":63193980928,"storage_ephemeral":494114572106},"available":{"cpu":13950,"gpu":0,"memory":62147502080,"storage_ephemeral":494114572106}}]}}},"bidengine":{"orders":0},"manifest":{"deployments":0},"cluster_public_hostname":"provider.akashtestprovider.xyz","address":"akash1ggk74pf9avxh3llu30yfhmr345h2yrpf7c2cdu"}
curl -ks https://10.43.227.108:8443/status
{"cluster":{"leases":0,"inventory":{"available":{"nodes":[{"name":"provider","allocatable":{"cpu":16000,"gpu":1,"memory":63193980928,"storage_ephemeral":494114572106},"available":{"cpu":13950,"gpu":1,"memory":62147502080,"storage_ephemeral":494114572106}}]}}},"bidengine":{"orders":0},"manifest":{"deployments":0},"cluster_public_hostname":"provider.akashtestprovider.xyz","address":"akash1ggk74pf9avxh3llu30yfhmr345h2yrpf7c2cdu"}
Logs: https://gist.github.com/andy108369/cac9f968f1c6a3eb7c6e92135b8afd42
Querying the 8443/status endpoint would report all 8 GPUs as available, even though at least one of them was marked as unhealthy. Occasionally you can recover from this by bouncing the nvdp-nvidia-device-plugin pod on the node where the GPU was marked unhealthy. The point, though, is that the inventory-operator should ideally detect this condition, since otherwise GPU deployments will be stuck in "Pending" until all 8 GPUs become available again.
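One way to spot the unhealthy-GPU condition from the cluster side (a sketch of a possible check, not something the inventory-operator does today): unhealthy devices are normally excluded from a node's allocatable count while still being included in its capacity, so a capacity/allocatable mismatch for nvidia.com/gpu is a hint that a GPU went unhealthy:

kubectl get nodes -o custom-columns='NODE:.metadata.name,GPU_CAPACITY:.status.capacity.nvidia\.com/gpu,GPU_ALLOCATABLE:.status.allocatable.nvidia\.com/gpu'

# manual workaround mentioned above: bounce the plugin pod on the affected node
# (pod name taken from the logs above; the daemonset recreates it on the same node)
kubectl -n nvidia-device-plugin delete pod nvdp-nvidia-device-plugin-nkdd9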