NVIDIA / k8s-dra-driver

Dynamic Resource Allocation (DRA) for NVIDIA GPUs in Kubernetes
Apache License 2.0

dra_plugin has no nvidia-smi #99

Open Catbyxx opened 3 months ago

Catbyxx commented 3 months ago

$ kubectl logs -n nvidia-dra-driver nvidia-k8s-dra-driver-kubelet-plugin-cwnhs

Defaulted container "plugin" out of: plugin, init (init)
W0415 08:30:15.764261       1 warnings.go:70] unknown field "spec.allocatableDevices[0].gpu.cudaDriverVersion"
W0415 08:30:15.764372       1 warnings.go:70] unknown field "spec.allocatableDevices[0].gpu.driverVersion"
W0415 08:30:15.764382       1 warnings.go:70] unknown field "spec.allocatableDevices[1].gpu.cudaDriverVersion"
W0415 08:30:15.764391       1 warnings.go:70] unknown field "spec.allocatableDevices[1].gpu.driverVersion"
W0415 08:30:15.764400       1 warnings.go:70] unknown field "spec.allocatableDevices[2].gpu.cudaDriverVersion"
W0415 08:30:15.764408       1 warnings.go:70] unknown field "spec.allocatableDevices[2].gpu.driverVersion"
W0415 08:30:15.764419       1 warnings.go:70] unknown field "spec.allocatableDevices[3].gpu.cudaDriverVersion"
W0415 08:30:15.764427       1 warnings.go:70] unknown field "spec.allocatableDevices[3].gpu.driverVersion"
W0415 08:30:15.764436       1 warnings.go:70] unknown field "spec.allocatableDevices[4].gpu.cudaDriverVersion"
W0415 08:30:15.764445       1 warnings.go:70] unknown field "spec.allocatableDevices[4].gpu.driverVersion"
W0415 08:30:15.764454       1 warnings.go:70] unknown field "spec.allocatableDevices[5].gpu.cudaDriverVersion"
W0415 08:30:15.764463       1 warnings.go:70] unknown field "spec.allocatableDevices[5].gpu.driverVersion"
W0415 08:30:15.764471       1 warnings.go:70] unknown field "spec.allocatableDevices[6].gpu.cudaDriverVersion"
W0415 08:30:15.764480       1 warnings.go:70] unknown field "spec.allocatableDevices[6].gpu.driverVersion"
W0415 08:30:15.764489       1 warnings.go:70] unknown field "spec.allocatableDevices[7].gpu.cudaDriverVersion"
W0415 08:30:15.764497       1 warnings.go:70] unknown field "spec.allocatableDevices[7].gpu.driverVersion"
I0415 08:30:22.111877       1 device_state.go:146] using devRoot=/driver-root
W0415 08:31:20.942191       1 warnings.go:70] unknown field "spec.allocatableDevices[0].gpu.cudaDriverVersion"
W0415 08:31:20.942263       1 warnings.go:70] unknown field "spec.allocatableDevices[0].gpu.driverVersion"
W0415 08:31:20.942268       1 warnings.go:70] unknown field "spec.allocatableDevices[1].gpu.cudaDriverVersion"
W0415 08:31:20.942273       1 warnings.go:70] unknown field "spec.allocatableDevices[1].gpu.driverVersion"
W0415 08:31:20.942277       1 warnings.go:70] unknown field "spec.allocatableDevices[2].gpu.cudaDriverVersion"
W0415 08:31:20.942281       1 warnings.go:70] unknown field "spec.allocatableDevices[2].gpu.driverVersion"
W0415 08:31:20.942286       1 warnings.go:70] unknown field "spec.allocatableDevices[3].gpu.cudaDriverVersion"
W0415 08:31:20.942290       1 warnings.go:70] unknown field "spec.allocatableDevices[3].gpu.driverVersion"
W0415 08:31:20.942294       1 warnings.go:70] unknown field "spec.allocatableDevices[4].gpu.cudaDriverVersion"
W0415 08:31:20.942298       1 warnings.go:70] unknown field "spec.allocatableDevices[4].gpu.driverVersion"
W0415 08:31:20.942302       1 warnings.go:70] unknown field "spec.allocatableDevices[5].gpu.cudaDriverVersion"
W0415 08:31:20.942306       1 warnings.go:70] unknown field "spec.allocatableDevices[5].gpu.driverVersion"
W0415 08:31:20.942310       1 warnings.go:70] unknown field "spec.allocatableDevices[6].gpu.cudaDriverVersion"
W0415 08:31:20.942314       1 warnings.go:70] unknown field "spec.allocatableDevices[6].gpu.driverVersion"
W0415 08:31:20.942318       1 warnings.go:70] unknown field "spec.allocatableDevices[7].gpu.cudaDriverVersion"
W0415 08:31:20.942322       1 warnings.go:70] unknown field "spec.allocatableDevices[7].gpu.driverVersion"
W0415 08:31:20.956293       1 warnings.go:70] unknown field "spec.allocatableDevices[0].gpu.cudaDriverVersion"
W0415 08:31:20.956320       1 warnings.go:70] unknown field "spec.allocatableDevices[0].gpu.driverVersion"
W0415 08:31:20.956330       1 warnings.go:70] unknown field "spec.allocatableDevices[1].gpu.cudaDriverVersion"
W0415 08:31:20.956338       1 warnings.go:70] unknown field "spec.allocatableDevices[1].gpu.driverVersion"
W0415 08:31:20.956346       1 warnings.go:70] unknown field "spec.allocatableDevices[2].gpu.cudaDriverVersion"
W0415 08:31:20.956354       1 warnings.go:70] unknown field "spec.allocatableDevices[2].gpu.driverVersion"
W0415 08:31:20.956362       1 warnings.go:70] unknown field "spec.allocatableDevices[3].gpu.cudaDriverVersion"
W0415 08:31:20.956370       1 warnings.go:70] unknown field "spec.allocatableDevices[3].gpu.driverVersion"
W0415 08:31:20.956379       1 warnings.go:70] unknown field "spec.allocatableDevices[4].gpu.cudaDriverVersion"
W0415 08:31:20.956387       1 warnings.go:70] unknown field "spec.allocatableDevices[4].gpu.driverVersion"
W0415 08:31:20.956394       1 warnings.go:70] unknown field "spec.allocatableDevices[5].gpu.cudaDriverVersion"
W0415 08:31:20.956403       1 warnings.go:70] unknown field "spec.allocatableDevices[5].gpu.driverVersion"
W0415 08:31:20.956412       1 warnings.go:70] unknown field "spec.allocatableDevices[6].gpu.cudaDriverVersion"
W0415 08:31:20.956420       1 warnings.go:70] unknown field "spec.allocatableDevices[6].gpu.driverVersion"
W0415 08:31:20.956428       1 warnings.go:70] unknown field "spec.allocatableDevices[7].gpu.cudaDriverVersion"
W0415 08:31:20.956436       1 warnings.go:70] unknown field "spec.allocatableDevices[7].gpu.driverVersion"
I0415 08:31:20.975031       1 nonblockinggrpcserver.go:105] "GRPC server started" logger="dra"
I0415 08:31:20.975212       1 nonblockinggrpcserver.go:105] "GRPC server started" logger="registrar"
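
The repeated "unknown field" warnings above suggest that the NodeAllocationState CRD installed in the cluster is older than the one this plugin version expects: it does not know the gpu.driverVersion and gpu.cudaDriverVersion fields the plugin tries to publish. A quick way to check, assuming the CRD is named nodeallocationstates.nas.gpu.resource.nvidia.com as the API group implies:

$ kubectl get crd nodeallocationstates.nas.gpu.resource.nvidia.com -o yaml | grep -i driverversion

If the grep finds nothing, the CRD schema predates those fields, and reinstalling the chart's CRDs may silence the warnings.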
Catbyxx commented 3 months ago

root@nvidia-k8s-dra-driver-kubelet-plugin-stnbc:/# nvidia-smi
bash: nvidia-smi: command not found
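
Note that nvidia-smi is not shipped in the plugin image itself. The earlier log line "using devRoot=/driver-root" indicates the driver root is mounted at /driver-root inside the container, so the host's binary may be reachable there instead. A minimal check, assuming a standard driver layout under that mount:

root@nvidia-k8s-dra-driver-kubelet-plugin-stnbc:/# ls /driver-root/usr/bin/nvidia-smi
root@nvidia-k8s-dra-driver-kubelet-plugin-stnbc:/# chroot /driver-root nvidia-smi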

Catbyxx commented 3 months ago

I have installed the NVIDIA Container Toolkit according to the guide:

  1. sudo apt-get install -y nvidia-container-toolkit
  2. sudo nvidia-ctk runtime configure --runtime=containerd
  3. sudo systemctl restart containerd

I also tried sudo nvidia-ctk runtime configure --runtime=containerd --set-as-default, but no luck.
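
To verify that nvidia-ctk actually registered the runtime, one way (assuming the default containerd config path) is to look for the nvidia runtime block in /etc/containerd/config.toml:

$ sudo grep -B 1 -A 3 'nvidia' /etc/containerd/config.toml

If no [plugins."io.containerd.grpc.v1.cri".containerd.runtimes.nvidia] section shows up, the configure step wrote to a different config file than the one containerd loads.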

Catbyxx commented 3 months ago

I'm not using kind to deploy DRA; it's on my local k8s environment, which has one regular master and two GPU workers. I can see the GPU info when I run kubectl describe nas/gpu2 -n nvidia-dra-driver:


Name:         gpu2
Namespace:    nvidia-dra-driver
Labels:       <none>
Annotations:  <none>
API Version:  nas.gpu.resource.nvidia.com/v1alpha1
Kind:         NodeAllocationState
Metadata:
  Creation Timestamp:  2024-04-11T09:39:45Z
  Generation:          19
  Owner References:
    API Version:     v1
    Kind:            Node
    Name:            gpu2
    UID:             3ca77a28-25f6-4b89-a137-635987543afc
  Resource Version:  926080
  UID:               a91cfc16-9743-4b4e-b89a-e359734fc319
Spec:
  Allocatable Devices:
    Gpu:
      Architecture:             Ampere
      Brand:                    Nvidia
      Cuda Compute Capability:  8.0
      Index:                    7
      Memory Bytes:             42949672960
      Mig Enabled:              true
      Product Name:             NVIDIA A100-PCIE-40GB
      Uuid:                     GPU-7cf93fdc-c371-c74c-5754-f345f2ddb6f8
    Gpu:
      Architecture:             Ampere
      Brand:                    Nvidia
      Cuda Compute Capability:  8.0
      Index:                    0
      Memory Bytes:             42949672960
      Mig Enabled:              true
      Product Name:             NVIDIA A100-PCIE-40GB
      Uuid:                     GPU-1c292c2a-868c-f91a-22e1-281f1bf1593a
    Gpu:
      Architecture:             Ampere
      Brand:                    Nvidia
      Cuda Compute Capability:  8.0
      Index:                    1
      Memory Bytes:             42949672960
      Mig Enabled:              true
      Product Name:             NVIDIA A100-PCIE-40GB
      Uuid:                     GPU-66dda9e9-faca-6eed-1a16-1d7d04850f0f
    Gpu:
      Architecture:             Ampere
      Brand:                    Nvidia
      Cuda Compute Capability:  8.0
      Index:                    2
      Memory Bytes:             42949672960
      Mig Enabled:              true
      Product Name:             NVIDIA A100-PCIE-40GB
      Uuid:                     GPU-b4e1e38e-06ef-b632-baf1-a3a2c50353f9
    Gpu:
      Architecture:             Ampere
      Brand:                    Nvidia
      Cuda Compute Capability:  8.0
      Index:                    3
      Memory Bytes:             42949672960
      Mig Enabled:              true
      Product Name:             NVIDIA A100-PCIE-40GB
      Uuid:                     GPU-c75f5a54-788d-0b56-e4f0-5beaac32f10e
    Gpu:
      Architecture:             Ampere
      Brand:                    Nvidia
      Cuda Compute Capability:  8.0
      Index:                    4
      Memory Bytes:             42949672960
      Mig Enabled:              true
      Product Name:             NVIDIA A100-PCIE-40GB
      Uuid:                     GPU-3fc6215c-893e-f683-8f43-20df4caebe19
    Gpu:
      Architecture:             Ampere
      Brand:                    Nvidia
      Cuda Compute Capability:  8.0
      Index:                    5
      Memory Bytes:             42949672960
      Mig Enabled:              true
      Product Name:             NVIDIA A100-PCIE-40GB
      Uuid:                     GPU-71288810-0f54-bf2d-0b88-2725a024a867
    Gpu:
      Architecture:             Ampere
      Brand:                    Nvidia
      Cuda Compute Capability:  8.0
      Index:                    6
      Memory Bytes:             42949672960
      Mig Enabled:              true
      Product Name:             NVIDIA A100-PCIE-40GB
      Uuid:                     GPU-ea5e7ad5-5309-edb6-a140-1d848d8f2fa3
    Mig:
      Parent Product Name:  NVIDIA A100-PCIE-40GB
      Placements:
        Size:   8
        Start:  0
      Profile:  7g.40gb
    Mig:
      Parent Product Name:  NVIDIA A100-PCIE-40GB
      Placements:
        Size:   1
        Start:  0
        Size:   1
        Start:  1
        Size:   1
        Start:  2
        Size:   1
        Start:  3
        Size:   1
        Start:  4
        Size:   1
        Start:  5
        Size:   1
        Start:  6
      Profile:  1g.5gb+me
    Mig:
      Parent Product Name:  NVIDIA A100-PCIE-40GB
      Placements:
        Size:   2
        Start:  0
        Size:   2
        Start:  2
        Size:   2
        Start:  4
        Size:   2
        Start:  6
      Profile:  1g.10gb
    Mig:
      Parent Product Name:  NVIDIA A100-PCIE-40GB
      Placements:
        Size:   1
        Start:  0
        Size:   1
        Start:  1
        Size:   1
        Start:  2
        Size:   1
        Start:  3
        Size:   1
        Start:  4
        Size:   1
        Start:  5
        Size:   1
        Start:  6
      Profile:  1g.5gb
    Mig:
      Parent Product Name:  NVIDIA A100-PCIE-40GB
      Placements:
        Size:   2
        Start:  0
        Size:   2
        Start:  2
        Size:   2
        Start:  4
      Profile:  2g.10gb
    Mig:
      Parent Product Name:  NVIDIA A100-PCIE-40GB
      Placements:
        Size:   4
        Start:  0
        Size:   4
        Start:  4
      Profile:  3g.20gb
    Mig:
      Parent Product Name:  NVIDIA A100-PCIE-40GB
      Placements:
        Size:   4
        Start:  0
      Profile:  4g.20gb
Status:         Ready
Events:         <none>
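
As a sanity check, each GPU worker should publish one of these objects and report Ready (gpu-master is not expected to have one, since it has no GPUs):

$ kubectl get nas -n nvidia-dra-driver
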
Catbyxx commented 3 months ago

When I use gpu-test1.yaml to test, the pods stay Pending:


$ kubectl get pod -n gpu-test1
NAME   READY   STATUS    RESTARTS   AGE
pod1   0/1     Pending   0          15s
pod2   0/1     Pending   0          15s

$ kubectl logs -n nvidia-dra-driver nvidia-k8s-dra-driver-controller-6c7c6b995f-jm5jx
I0415 08:55:27.568069       1 controller.go:302] "Adding new work item" logger="resource controller" type="ResourceClaim" object="{\"metadata\":{\"name\":\"pod2-gpu-99zjz\",\"generateName\":\"pod2-gpu-\",\"namespace\":\"gpu-test1\",\"uid\":\"d34546e7-2cf4-4061-8e4b-e1837ff3abf4\",\"resourceVersion\":\"929182\",\"creationTimestamp\":\"2024-04-15T08:55:27Z\",\"annotations\":{\"resource.kubernetes.io/pod-claim-name\":\"gpu\"},\"ownerReferences\":[{\"apiVersion\":\"v1\",\"kind\":\"Pod\",\"name\":\"pod2\",\"uid\":\"1b090750-72aa-4352-b7ca-c03017c17bc6\",\"controller\":true,\"blockOwnerDeletion\":true}],\"managedFields\":[{\"manager\":\"kube-controller-manager\",\"operation\":\"Update\",\"apiVersion\":\"resource.k8s.io/v1alpha2\",\"time\":\"2024-04-15T08:55:27Z\",\"fieldsType\":\"FieldsV1\",\"fieldsV1\":{\"f:metadata\":{\"f:annotations\":{\".\":{},\"f:resource.kubernetes.io/pod-claim-name\":{}},\"f:generateName\":{},\"f:ownerReferences\":{\".\":{},\"k:{\\\"uid\\\":\\\"1b090750-72aa-4352-b7ca-c03017c17bc6\\\"}\":{}}},\"f:spec\":{\"f:allocationMode\":{},\"f:resourceClassName\":{}}}}]},\"spec\":{\"resourceClassName\":\"gpu.nvidia.com\",\"allocationMode\":\"WaitForFirstConsumer\"},\"status\":{}}" key="claim:gpu-test1/pod2-gpu-99zjz"
I0415 08:55:27.568252       1 controller.go:302] "Adding new work item" logger="resource controller" type="ResourceClaim" object="{\"metadata\":{\"name\":\"pod1-gpu-h7hr9\",\"generateName\":\"pod1-gpu-\",\"namespace\":\"gpu-test1\",\"uid\":\"c0b909c3-f033-460b-927d-1612a79fa8ab\",\"resourceVersion\":\"929183\",\"creationTimestamp\":\"2024-04-15T08:55:27Z\",\"annotations\":{\"resource.kubernetes.io/pod-claim-name\":\"gpu\"},\"ownerReferences\":[{\"apiVersion\":\"v1\",\"kind\":\"Pod\",\"name\":\"pod1\",\"uid\":\"024aa85b-c07a-4e26-b32d-5d05ae650519\",\"controller\":true,\"blockOwnerDeletion\":true}],\"managedFields\":[{\"manager\":\"kube-controller-manager\",\"operation\":\"Update\",\"apiVersion\":\"resource.k8s.io/v1alpha2\",\"time\":\"2024-04-15T08:55:27Z\",\"fieldsType\":\"FieldsV1\",\"fieldsV1\":{\"f:metadata\":{\"f:annotations\":{\".\":{},\"f:resource.kubernetes.io/pod-claim-name\":{}},\"f:generateName\":{},\"f:ownerReferences\":{\".\":{},\"k:{\\\"uid\\\":\\\"024aa85b-c07a-4e26-b32d-5d05ae650519\\\"}\":{}}},\"f:spec\":{\"f:allocationMode\":{},\"f:resourceClassName\":{}}}}]},\"spec\":{\"resourceClassName\":\"gpu.nvidia.com\",\"allocationMode\":\"WaitForFirstConsumer\"},\"status\":{}}" key="claim:gpu-test1/pod1-gpu-h7hr9"
I0415 08:55:27.568330       1 controller.go:373] "processing" logger="resource controller" key="claim:gpu-test1/pod2-gpu-99zjz"
I0415 08:55:27.568364       1 controller.go:517] "ResourceClaim waiting for first consumer" logger="resource controller" key="claim:gpu-test1/pod2-gpu-99zjz"
I0415 08:55:27.568434       1 controller.go:377] "completed" logger="resource controller" key="claim:gpu-test1/pod2-gpu-99zjz"
I0415 08:55:27.568483       1 controller.go:373] "processing" logger="resource controller" key="claim:gpu-test1/pod1-gpu-h7hr9"
I0415 08:55:27.568498       1 controller.go:517] "ResourceClaim waiting for first consumer" logger="resource controller" key="claim:gpu-test1/pod1-gpu-h7hr9"
I0415 08:55:27.568506       1 controller.go:377] "completed" logger="resource controller" key="claim:gpu-test1/pod1-gpu-h7hr9"
I0415 08:55:27.588070       1 controller.go:302] "Adding new work item" logger="resource controller" type="PodSchedulingContext" object="{\"metadata\":{\"name\":\"pod1\",\"namespace\":\"gpu-test1\",\"uid\":\"b52bbfa1-d9cd-4c11-aa4f-f4f4f4a42aff\",\"resourceVersion\":\"929186\",\"creationTimestamp\":\"2024-04-15T08:55:27Z\",\"ownerReferences\":[{\"apiVersion\":\"v1\",\"kind\":\"Pod\",\"name\":\"pod1\",\"uid\":\"024aa85b-c07a-4e26-b32d-5d05ae650519\",\"controller\":true,\"blockOwnerDeletion\":true}],\"managedFields\":[{\"manager\":\"kube-scheduler\",\"operation\":\"Update\",\"apiVersion\":\"resource.k8s.io/v1alpha2\",\"time\":\"2024-04-15T08:55:27Z\",\"fieldsType\":\"FieldsV1\",\"fieldsV1\":{\"f:metadata\":{\"f:ownerReferences\":{\".\":{},\"k:{\\\"uid\\\":\\\"024aa85b-c07a-4e26-b32d-5d05ae650519\\\"}\":{}}},\"f:spec\":{\"f:potentialNodes\":{},\"f:selectedNode\":{}}}}]},\"spec\":{\"selectedNode\":\"gpu2\",\"potentialNodes\":[\"gpu-master\",\"gpu2\",\"gpu3\"]},\"status\":{}}" key="schedulingCtx:gpu-test1/pod1"
I0415 08:55:27.588162       1 controller.go:373] "processing" logger="resource controller" key="schedulingCtx:gpu-test1/pod1"
I0415 08:55:27.592619       1 round_trippers.go:553] GET https://10.96.0.1:443/api/v1/namespaces/gpu-test1/pods/pod1 200 OK in 4 milliseconds
I0415 08:55:27.599347       1 round_trippers.go:553] GET https://10.96.0.1:443/version 200 OK in 1 milliseconds
I0415 08:55:27.605582       1 round_trippers.go:553] GET https://10.96.0.1:443/apis/nas.gpu.resource.nvidia.com/v1alpha1/namespaces/nvidia-dra-driver/nodeallocationstates/gpu-master 404 Not Found in 5 milliseconds
I0415 08:55:27.605868       1 controller.go:302] "Adding new work item" logger="resource controller" type="PodSchedulingContext" object="{\"metadata\":{\"name\":\"pod2\",\"namespace\":\"gpu-test1\",\"uid\":\"d4679e65-6a2b-4c25-a58c-bbae310cd358\",\"resourceVersion\":\"929189\",\"creationTimestamp\":\"2024-04-15T08:55:27Z\",\"ownerReferences\":[{\"apiVersion\":\"v1\",\"kind\":\"Pod\",\"name\":\"pod2\",\"uid\":\"1b090750-72aa-4352-b7ca-c03017c17bc6\",\"controller\":true,\"blockOwnerDeletion\":true}],\"managedFields\":[{\"manager\":\"kube-scheduler\",\"operation\":\"Update\",\"apiVersion\":\"resource.k8s.io/v1alpha2\",\"time\":\"2024-04-15T08:55:27Z\",\"fieldsType\":\"FieldsV1\",\"fieldsV1\":{\"f:metadata\":{\"f:ownerReferences\":{\".\":{},\"k:{\\\"uid\\\":\\\"1b090750-72aa-4352-b7ca-c03017c17bc6\\\"}\":{}}},\"f:spec\":{\"f:potentialNodes\":{},\"f:selectedNode\":{}}}}]},\"spec\":{\"selectedNode\":\"gpu3\",\"potentialNodes\":[\"gpu-master\",\"gpu2\",\"gpu3\"]},\"status\":{}}" key="schedulingCtx:gpu-test1/pod2"
I0415 08:55:27.605919       1 controller.go:373] "processing" logger="resource controller" key="schedulingCtx:gpu-test1/pod2"
I0415 08:55:27.609081       1 round_trippers.go:553] GET https://10.96.0.1:443/api/v1/namespaces/gpu-test1/pods/pod2 200 OK in 3 milliseconds
I0415 08:55:27.611849       1 round_trippers.go:553] GET https://10.96.0.1:443/apis/nas.gpu.resource.nvidia.com/v1alpha1/namespaces/nvidia-dra-driver/nodeallocationstates/gpu2 200 OK in 5 milliseconds
I0415 08:55:27.614863       1 round_trippers.go:553] GET https://10.96.0.1:443/apis/nas.gpu.resource.nvidia.com/v1alpha1/namespaces/nvidia-dra-driver/nodeallocationstates/gpu-master 404 Not Found in 5 milliseconds
I0415 08:55:27.619641       1 round_trippers.go:553] GET https://10.96.0.1:443/apis/nas.gpu.resource.nvidia.com/v1alpha1/namespaces/nvidia-dra-driver/nodeallocationstates/gpu3 200 OK in 6 milliseconds
I0415 08:55:27.620825       1 controller.go:788] "pending pod claims" logger="resource controller" key="schedulingCtx:gpu-test1/pod1" claims=[{"PodClaimName":"gpu","Claim":{"metadata":{"name":"pod1-gpu-h7hr9","generateName":"pod1-gpu-","namespace":"gpu-test1","uid":"c0b909c3-f033-460b-927d-1612a79fa8ab","resourceVersion":"929183","creationTimestamp":"2024-04-15T08:55:27Z","annotations":{"resource.kubernetes.io/pod-claim-name":"gpu"},"ownerReferences":[{"apiVersion":"v1","kind":"Pod","name":"pod1","uid":"024aa85b-c07a-4e26-b32d-5d05ae650519","controller":true,"blockOwnerDeletion":true}],"managedFields":[{"manager":"kube-controller-manager","operation":"Update","apiVersion":"resource.k8s.io/v1alpha2","time":"2024-04-15T08:55:27Z","fieldsType":"FieldsV1","fieldsV1":{"f:metadata":{"f:annotations":{".":{},"f:resource.kubernetes.io/pod-claim-name":{}},"f:generateName":{},"f:ownerReferences":{".":{},"k:{\"uid\":\"024aa85b-c07a-4e26-b32d-5d05ae650519\"}":{}}},"f:spec":{"f:allocationMode":{},"f:resourceClassName":{}}}}]},"spec":{"resourceClassName":"gpu.nvidia.com","allocationMode":"WaitForFirstConsumer"},"status":{}},"Class":{"metadata":{"name":"gpu.nvidia.com","uid":"8f15e155-a794-4369-a31c-214b81ecf539","resourceVersion":"925634","creationTimestamp":"2024-04-15T08:28:28Z","labels":{"app.kubernetes.io/managed-by":"Helm"},"annotations":{"meta.helm.sh/release-name":"nvidia","meta.helm.sh/release-namespace":"nvidia-dra-driver"},"managedFields":[{"manager":"helm","operation":"Update","apiVersion":"resource.k8s.io/v1alpha2","time":"2024-04-15T08:28:28Z","fieldsType":"FieldsV1","fieldsV1":{"f:driverName":{},"f:metadata":{"f:annotations":{".":{},"f:meta.helm.sh/release-name":{},"f:meta.helm.sh/release-namespace":{}},"f:labels":{".":{},"f:app.kubernetes.io/managed-by":{}}}}}]},"driverName":"gpu.resource.nvidia.com"},"ClaimParameters":{"count":1},"ClassParameters":{"sharable":true},"UnsuitableNodes":["gpu-master","gpu2","gpu3"],"Allocation":null,"Error":null}] selectedNode="gpu2"
I0415 08:55:27.620880       1 controller.go:799] "skipping allocation for unsuitable selected node" logger="resource controller" key="schedulingCtx:gpu-test1/pod1" node="gpu2"
I0415 08:55:27.621087       1 controller.go:851] "Updating pod scheduling with modified unsuitable nodes" logger="resource controller" key="schedulingCtx:gpu-test1/pod1" podSchedulingCtx="&PodSchedulingContext{ObjectMeta:{pod1  gpu-test1  b52bbfa1-d9cd-4c11-aa4f-f4f4f4a42aff 929186 0 2024-04-15 08:55:27 +0000 UTC <nil> <nil> map[] map[] [{v1 Pod pod1 024aa85b-c07a-4e26-b32d-5d05ae650519 0xc0005d5470 0xc0005d5471}] [] [{kube-scheduler Update resource.k8s.io/v1alpha2 2024-04-15 08:55:27 +0000 UTC FieldsV1 {\"f:metadata\":{\"f:ownerReferences\":{\".\":{},\"k:{\\\"uid\\\":\\\"024aa85b-c07a-4e26-b32d-5d05ae650519\\\"}\":{}}},\"f:spec\":{\"f:potentialNodes\":{},\"f:selectedNode\":{}}} }]},Spec:PodSchedulingContextSpec{SelectedNode:gpu2,PotentialNodes:[gpu-master gpu2 gpu3],},Status:PodSchedulingContextStatus{ResourceClaims:[]ResourceClaimSchedulingStatus{ResourceClaimSchedulingStatus{Name:gpu,UnsuitableNodes:[gpu-master gpu2 gpu3],},},},}"
I0415 08:55:27.622008       1 round_trippers.go:553] GET https://10.96.0.1:443/apis/nas.gpu.resource.nvidia.com/v1alpha1/namespaces/nvidia-dra-driver/nodeallocationstates/gpu2 200 OK in 6 milliseconds
I0415 08:55:27.625117       1 round_trippers.go:553] PUT https://10.96.0.1:443/apis/resource.k8s.io/v1alpha2/namespaces/gpu-test1/podschedulingcontexts/pod1/status 200 OK in 3 milliseconds
I0415 08:55:27.625367       1 controller.go:383] "recheck periodically" logger="resource controller" key="schedulingCtx:gpu-test1/pod1"
I0415 08:55:27.626906       1 round_trippers.go:553] GET https://10.96.0.1:443/apis/nas.gpu.resource.nvidia.com/v1alpha1/namespaces/nvidia-dra-driver/nodeallocationstates/gpu3 200 OK in 4 milliseconds
I0415 08:55:27.627580       1 controller.go:788] "pending pod claims" logger="resource controller" key="schedulingCtx:gpu-test1/pod2" claims=[{"PodClaimName":"gpu","Claim":{"metadata":{"name":"pod2-gpu-99zjz","generateName":"pod2-gpu-","namespace":"gpu-test1","uid":"d34546e7-2cf4-4061-8e4b-e1837ff3abf4","resourceVersion":"929182","creationTimestamp":"2024-04-15T08:55:27Z","annotations":{"resource.kubernetes.io/pod-claim-name":"gpu"},"ownerReferences":[{"apiVersion":"v1","kind":"Pod","name":"pod2","uid":"1b090750-72aa-4352-b7ca-c03017c17bc6","controller":true,"blockOwnerDeletion":true}],"managedFields":[{"manager":"kube-controller-manager","operation":"Update","apiVersion":"resource.k8s.io/v1alpha2","time":"2024-04-15T08:55:27Z","fieldsType":"FieldsV1","fieldsV1":{"f:metadata":{"f:annotations":{".":{},"f:resource.kubernetes.io/pod-claim-name":{}},"f:generateName":{},"f:ownerReferences":{".":{},"k:{\"uid\":\"1b090750-72aa-4352-b7ca-c03017c17bc6\"}":{}}},"f:spec":{"f:allocationMode":{},"f:resourceClassName":{}}}}]},"spec":{"resourceClassName":"gpu.nvidia.com","allocationMode":"WaitForFirstConsumer"},"status":{}},"Class":{"metadata":{"name":"gpu.nvidia.com","uid":"8f15e155-a794-4369-a31c-214b81ecf539","resourceVersion":"925634","creationTimestamp":"2024-04-15T08:28:28Z","labels":{"app.kubernetes.io/managed-by":"Helm"},"annotations":{"meta.helm.sh/release-name":"nvidia","meta.helm.sh/release-namespace":"nvidia-dra-driver"},"managedFields":[{"manager":"helm","operation":"Update","apiVersion":"resource.k8s.io/v1alpha2","time":"2024-04-15T08:28:28Z","fieldsType":"FieldsV1","fieldsV1":{"f:driverName":{},"f:metadata":{"f:annotations":{".":{},"f:meta.helm.sh/release-name":{},"f:meta.helm.sh/release-namespace":{}},"f:labels":{".":{},"f:app.kubernetes.io/managed-by":{}}}}}]},"driverName":"gpu.resource.nvidia.com"},"ClaimParameters":{"count":1},"ClassParameters":{"sharable":true},"UnsuitableNodes":["gpu-master","gpu2","gpu3"],"Allocation":null,"Error":null}] selectedNode="gpu3"
I0415 08:55:27.627631       1 controller.go:799] "skipping allocation for unsuitable selected node" logger="resource controller" key="schedulingCtx:gpu-test1/pod2" node="gpu3"
I0415 08:55:27.627733       1 controller.go:302] "Adding updated work item" logger="resource controller" type="PodSchedulingContext" object="{\"metadata\":{\"name\":\"pod1\",\"namespace\":\"gpu-test1\",\"uid\":\"b52bbfa1-d9cd-4c11-aa4f-f4f4f4a42aff\",\"resourceVersion\":\"929192\",\"creationTimestamp\":\"2024-04-15T08:55:27Z\",\"ownerReferences\":[{\"apiVersion\":\"v1\",\"kind\":\"Pod\",\"name\":\"pod1\",\"uid\":\"024aa85b-c07a-4e26-b32d-5d05ae650519\",\"controller\":true,\"blockOwnerDeletion\":true}],\"managedFields\":[{\"manager\":\"kube-scheduler\",\"operation\":\"Update\",\"apiVersion\":\"resource.k8s.io/v1alpha2\",\"time\":\"2024-04-15T08:55:27Z\",\"fieldsType\":\"FieldsV1\",\"fieldsV1\":{\"f:metadata\":{\"f:ownerReferences\":{\".\":{},\"k:{\\\"uid\\\":\\\"024aa85b-c07a-4e26-b32d-5d05ae650519\\\"}\":{}}},\"f:spec\":{\"f:potentialNodes\":{},\"f:selectedNode\":{}}}},{\"manager\":\"nvidia-dra-controller\",\"operation\":\"Update\",\"apiVersion\":\"resource.k8s.io/v1alpha2\",\"time\":\"2024-04-15T08:55:27Z\",\"fieldsType\":\"FieldsV1\",\"fieldsV1\":{\"f:status\":{\"f:resourceClaims\":{\".\":{},\"k:{\\\"name\\\":\\\"gpu\\\"}\":{\".\":{},\"f:name\":{},\"f:unsuitableNodes\":{}}}}},\"subresource\":\"status\"}]},\"spec\":{\"selectedNode\":\"gpu2\",\"potentialNodes\":[\"gpu-master\",\"gpu2\",\"gpu3\"]},\"status\":{\"resourceClaims\":[{\"name\":\"gpu\",\"unsuitableNodes\":[\"gpu-master\",\"gpu2\",\"gpu3\"]}]}}" diff=<
          &v1alpha2.PodSchedulingContext{
                TypeMeta: {},
                ObjectMeta: v1.ObjectMeta{
                        ... // 3 identical fields
                        SelfLink:          "",
                        UID:               "b52bbfa1-d9cd-4c11-aa4f-f4f4f4a42aff",
        -               ResourceVersion:   "929186",
        +               ResourceVersion:   "929192",
                        Generation:        0,
                        CreationTimestamp: {Time: s"2024-04-15 08:55:27 +0000 UTC"},
                        ... // 4 identical fields
                        OwnerReferences: {{APIVersion: "v1", Kind: "Pod", Name: "pod1", UID: "024aa85b-c07a-4e26-b32d-5d05ae650519", ...}},
                        Finalizers:      nil,
                        ManagedFields: []v1.ManagedFieldsEntry{
                                {Manager: "kube-scheduler", Operation: "Update", APIVersion: "resource.k8s.io/v1alpha2", Time: s"2024-04-15 08:55:27 +0000 UTC", ...},
        +                       {
        +                               Manager:     "nvidia-dra-controller",
        +                               Operation:   "Update",
        +                               APIVersion:  "resource.k8s.io/v1alpha2",
        +                               Time:        s"2024-04-15 08:55:27 +0000 UTC",
        +                               FieldsType:  "FieldsV1",
        +                               FieldsV1:    s`{"f:status":{"f:resourceClaims":{".":{},"k:{\"name\":\"gpu\"}":{".":{},"f:name":{},"f:unsuitableNodes":{}}}}}`,
        +                               Subresource: "status",
        +                       },
                        },
                },
                Spec:   {SelectedNode: "gpu2", PotentialNodes: {"gpu-master", "gpu2", "gpu3"}},
        -       Status: v1alpha2.PodSchedulingContextStatus{},
        +       Status: v1alpha2.PodSchedulingContextStatus{
        +               ResourceClaims: []v1alpha2.ResourceClaimSchedulingStatus{{Name: "gpu", UnsuitableNodes: []string{"gpu-master", "gpu2", "gpu3"}}},
        +       },
          }
 > key="schedulingCtx:gpu-test1/pod1"
I0415 08:55:27.627769       1 controller.go:851] "Updating pod scheduling with modified unsuitable nodes" logger="resource controller" key="schedulingCtx:gpu-test1/pod2" podSchedulingCtx="&PodSchedulingContext{ObjectMeta:{pod2  gpu-test1  d4679e65-6a2b-4c25-a58c-bbae310cd358 929189 0 2024-04-15 08:55:27 +0000 UTC <nil> <nil> map[] map[] [{v1 Pod pod2 1b090750-72aa-4352-b7ca-c03017c17bc6 0xc000544130 0xc000544131}] [] [{kube-scheduler Update resource.k8s.io/v1alpha2 2024-04-15 08:55:27 +0000 UTC FieldsV1 {\"f:metadata\":{\"f:ownerReferences\":{\".\":{},\"k:{\\\"uid\\\":\\\"1b090750-72aa-4352-b7ca-c03017c17bc6\\\"}\":{}}},\"f:spec\":{\"f:potentialNodes\":{},\"f:selectedNode\":{}}} }]},Spec:PodSchedulingContextSpec{SelectedNode:gpu3,PotentialNodes:[gpu-master gpu2 gpu3],},Status:PodSchedulingContextStatus{ResourceClaims:[]ResourceClaimSchedulingStatus{ResourceClaimSchedulingStatus{Name:gpu,UnsuitableNodes:[gpu-master gpu2 gpu3],},},},}"
I0415 08:55:27.627805       1 controller.go:373] "processing" logger="resource controller" key="schedulingCtx:gpu-test1/pod1"
I0415 08:55:27.631657       1 round_trippers.go:553] PUT https://10.96.0.1:443/apis/resource.k8s.io/v1alpha2/namespaces/gpu-test1/podschedulingcontexts/pod2/status 200 OK in 3 milliseconds
I0415 08:55:27.633188       1 round_trippers.go:553] GET https://10.96.0.1:443/api/v1/namespaces/gpu-test1/pods/pod1 200 OK in 5 milliseconds
I0415 08:55:27.634101       1 controller.go:383] "recheck periodically" logger="resource controller" key="schedulingCtx:gpu-test1/pod2"
I0415 08:55:27.636443       1 round_trippers.go:553] GET https://10.96.0.1:443/apis/nas.gpu.resource.nvidia.com/v1alpha1/namespaces/nvidia-dra-driver/nodeallocationstates/gpu-master 404 Not Found in 2 milliseconds
I0415 08:55:27.636822       1 controller.go:302] "Adding updated work item" logger="resource controller" type="PodSchedulingContext" object="{\"metadata\":{\"name\":\"pod2\",\"namespace\":\"gpu-test1\",\"uid\":\"d4679e65-6a2b-4c25-a58c-bbae310cd358\",\"resourceVersion\":\"929193\",\"creationTimestamp\":\"2024-04-15T08:55:27Z\",\"ownerReferences\":[{\"apiVersion\":\"v1\",\"kind\":\"Pod\",\"name\":\"pod2\",\"uid\":\"1b090750-72aa-4352-b7ca-c03017c17bc6\",\"controller\":true,\"blockOwnerDeletion\":true}],\"managedFields\":[{\"manager\":\"kube-scheduler\",\"operation\":\"Update\",\"apiVersion\":\"resource.k8s.io/v1alpha2\",\"time\":\"2024-04-15T08:55:27Z\",\"fieldsType\":\"FieldsV1\",\"fieldsV1\":{\"f:metadata\":{\"f:ownerReferences\":{\".\":{},\"k:{\\\"uid\\\":\\\"1b090750-72aa-4352-b7ca-c03017c17bc6\\\"}\":{}}},\"f:spec\":{\"f:potentialNodes\":{},\"f:selectedNode\":{}}}},{\"manager\":\"nvidia-dra-controller\",\"operation\":\"Update\",\"apiVersion\":\"resource.k8s.io/v1alpha2\",\"time\":\"2024-04-15T08:55:27Z\",\"fieldsType\":\"FieldsV1\",\"fieldsV1\":{\"f:status\":{\"f:resourceClaims\":{\".\":{},\"k:{\\\"name\\\":\\\"gpu\\\"}\":{\".\":{},\"f:name\":{},\"f:unsuitableNodes\":{}}}}},\"subresource\":\"status\"}]},\"spec\":{\"selectedNode\":\"gpu3\",\"potentialNodes\":[\"gpu-master\",\"gpu2\",\"gpu3\"]},\"status\":{\"resourceClaims\":[{\"name\":\"gpu\",\"unsuitableNodes\":[\"gpu-master\",\"gpu2\",\"gpu3\"]}]}}" diff=<
          &v1alpha2.PodSchedulingContext{
                TypeMeta: {},
                ObjectMeta: v1.ObjectMeta{
                        ... // 3 identical fields
                        SelfLink:          "",
                        UID:               "d4679e65-6a2b-4c25-a58c-bbae310cd358",
        -               ResourceVersion:   "929189",
        +               ResourceVersion:   "929193",
                        Generation:        0,
                        CreationTimestamp: {Time: s"2024-04-15 08:55:27 +0000 UTC"},
                        ... // 4 identical fields
                        OwnerReferences: {{APIVersion: "v1", Kind: "Pod", Name: "pod2", UID: "1b090750-72aa-4352-b7ca-c03017c17bc6", ...}},
                        Finalizers:      nil,
                        ManagedFields: []v1.ManagedFieldsEntry{
                                {Manager: "kube-scheduler", Operation: "Update", APIVersion: "resource.k8s.io/v1alpha2", Time: s"2024-04-15 08:55:27 +0000 UTC", ...},
        +                       {
        +                               Manager:     "nvidia-dra-controller",
        +                               Operation:   "Update",
        +                               APIVersion:  "resource.k8s.io/v1alpha2",
        +                               Time:        s"2024-04-15 08:55:27 +0000 UTC",
        +                               FieldsType:  "FieldsV1",
        +                               FieldsV1:    s`{"f:status":{"f:resourceClaims":{".":{},"k:{\"name\":\"gpu\"}":{".":{},"f:name":{},"f:unsuitableNodes":{}}}}}`,
        +                               Subresource: "status",
        +                       },
                        },
                },
                Spec:   {SelectedNode: "gpu3", PotentialNodes: {"gpu-master", "gpu2", "gpu3"}},
        -       Status: v1alpha2.PodSchedulingContextStatus{},
        +       Status: v1alpha2.PodSchedulingContextStatus{
        +               ResourceClaims: []v1alpha2.ResourceClaimSchedulingStatus{{Name: "gpu", UnsuitableNodes: []string{"gpu-master", "gpu2", "gpu3"}}},
        +       },
          }
 > key="schedulingCtx:gpu-test1/pod2"
I0415 08:55:27.636880       1 controller.go:373] "processing" logger="resource controller" key="schedulingCtx:gpu-test1/pod2"
I0415 08:55:27.639643       1 round_trippers.go:553] GET https://10.96.0.1:443/api/v1/namespaces/gpu-test1/pods/pod2 200 OK in 2 milliseconds
I0415 08:55:27.641066       1 round_trippers.go:553] GET https://10.96.0.1:443/apis/nas.gpu.resource.nvidia.com/v1alpha1/namespaces/nvidia-dra-driver/nodeallocationstates/gpu2 200 OK in 4 milliseconds
I0415 08:55:27.645926       1 round_trippers.go:553] GET https://10.96.0.1:443/apis/nas.gpu.resource.nvidia.com/v1alpha1/namespaces/nvidia-dra-driver/nodeallocationstates/gpu-master 404 Not Found in 5 milliseconds
I0415 08:55:27.649437       1 round_trippers.go:553] GET https://10.96.0.1:443/apis/nas.gpu.resource.nvidia.com/v1alpha1/namespaces/nvidia-dra-driver/nodeallocationstates/gpu3 200 OK in 7 milliseconds
I0415 08:55:27.650187       1 controller.go:788] "pending pod claims" logger="resource controller" key="schedulingCtx:gpu-test1/pod1" claims=[{"PodClaimName":"gpu","Claim":{"metadata":{"name":"pod1-gpu-h7hr9","generateName":"pod1-gpu-","namespace":"gpu-test1","uid":"c0b909c3-f033-460b-927d-1612a79fa8ab","resourceVersion":"929183","creationTimestamp":"2024-04-15T08:55:27Z","annotations":{"resource.kubernetes.io/pod-claim-name":"gpu"},"ownerReferences":[{"apiVersion":"v1","kind":"Pod","name":"pod1","uid":"024aa85b-c07a-4e26-b32d-5d05ae650519","controller":true,"blockOwnerDeletion":true}],"managedFields":[{"manager":"kube-controller-manager","operation":"Update","apiVersion":"resource.k8s.io/v1alpha2","time":"2024-04-15T08:55:27Z","fieldsType":"FieldsV1","fieldsV1":{"f:metadata":{"f:annotations":{".":{},"f:resource.kubernetes.io/pod-claim-name":{}},"f:generateName":{},"f:ownerReferences":{".":{},"k:{\"uid\":\"024aa85b-c07a-4e26-b32d-5d05ae650519\"}":{}}},"f:spec":{"f:allocationMode":{},"f:resourceClassName":{}}}}]},"spec":{"resourceClassName":"gpu.nvidia.com","allocationMode":"WaitForFirstConsumer"},"status":{}},"Class":{"metadata":{"name":"gpu.nvidia.com","uid":"8f15e155-a794-4369-a31c-214b81ecf539","resourceVersion":"925634","creationTimestamp":"2024-04-15T08:28:28Z","labels":{"app.kubernetes.io/managed-by":"Helm"},"annotations":{"meta.helm.sh/release-name":"nvidia","meta.helm.sh/release-namespace":"nvidia-dra-driver"},"managedFields":[{"manager":"helm","operation":"Update","apiVersion":"resource.k8s.io/v1alpha2","time":"2024-04-15T08:28:28Z","fieldsType":"FieldsV1","fieldsV1":{"f:driverName":{},"f:metadata":{"f:annotations":{".":{},"f:meta.helm.sh/release-name":{},"f:meta.helm.sh/release-namespace":{}},"f:labels":{".":{},"f:app.kubernetes.io/managed-by":{}}}}}]},"driverName":"gpu.resource.nvidia.com"},"ClaimParameters":{"count":1},"ClassParameters":{"sharable":true},"UnsuitableNodes":["gpu-master","gpu2","gpu3"],"Allocation":null,"Error":null}] selectedNode="gpu2"
I0415 08:55:27.650240       1 controller.go:799] "skipping allocation for unsuitable selected node" logger="resource controller" key="schedulingCtx:gpu-test1/pod1" node="gpu2"
I0415 08:55:27.650273       1 controller.go:383] "recheck periodically" logger="resource controller" key="schedulingCtx:gpu-test1/pod1"
I0415 08:55:27.800920       1 request.go:629] Waited for 154.560813ms due to client-side throttling, not priority and fairness, request: GET:https://10.96.0.1:443/apis/nas.gpu.resource.nvidia.com/v1alpha1/namespaces/nvidia-dra-driver/nodeallocationstates/gpu2
I0415 08:55:27.807136       1 round_trippers.go:553] GET https://10.96.0.1:443/apis/nas.gpu.resource.nvidia.com/v1alpha1/namespaces/nvidia-dra-driver/nodeallocationstates/gpu2 200 OK in 5 milliseconds
I0415 08:55:28.000763       1 request.go:629] Waited for 192.781479ms due to client-side throttling, not priority and fairness, request: GET:https://10.96.0.1:443/apis/nas.gpu.resource.nvidia.com/v1alpha1/namespaces/nvidia-dra-driver/nodeallocationstates/gpu3
I0415 08:55:28.007269       1 round_trippers.go:553] GET https://10.96.0.1:443/apis/nas.gpu.resource.nvidia.com/v1alpha1/namespaces/nvidia-dra-driver/nodeallocationstates/gpu3 200 OK in 6 milliseconds
I0415 08:55:28.008367       1 controller.go:788] "pending pod claims" logger="resource controller" key="schedulingCtx:gpu-test1/pod2" claims=[{"PodClaimName":"gpu","Claim":{"metadata":{"name":"pod2-gpu-99zjz","generateName":"pod2-gpu-","namespace":"gpu-test1","uid":"d34546e7-2cf4-4061-8e4b-e1837ff3abf4","resourceVersion":"929182","creationTimestamp":"2024-04-15T08:55:27Z","annotations":{"resource.kubernetes.io/pod-claim-name":"gpu"},"ownerReferences":[{"apiVersion":"v1","kind":"Pod","name":"pod2","uid":"1b090750-72aa-4352-b7ca-c03017c17bc6","controller":true,"blockOwnerDeletion":true}],"managedFields":[{"manager":"kube-controller-manager","operation":"Update","apiVersion":"resource.k8s.io/v1alpha2","time":"2024-04-15T08:55:27Z","fieldsType":"FieldsV1","fieldsV1":{"f:metadata":{"f:annotations":{".":{},"f:resource.kubernetes.io/pod-claim-name":{}},"f:generateName":{},"f:ownerReferences":{".":{},"k:{\"uid\":\"1b090750-72aa-4352-b7ca-c03017c17bc6\"}":{}}},"f:spec":{"f:allocationMode":{},"f:resourceClassName":{}}}}]},"spec":{"resourceClassName":"gpu.nvidia.com","allocationMode":"WaitForFirstConsumer"},"status":{}},"Class":{"metadata":{"name":"gpu.nvidia.com","uid":"8f15e155-a794-4369-a31c-214b81ecf539","resourceVersion":"925634","creationTimestamp":"2024-04-15T08:28:28Z","labels":{"app.kubernetes.io/managed-by":"Helm"},"annotations":{"meta.helm.sh/release-name":"nvidia","meta.helm.sh/release-namespace":"nvidia-dra-driver"},"managedFields":[{"manager":"helm","operation":"Update","apiVersion":"resource.k8s.io/v1alpha2","time":"2024-04-15T08:28:28Z","fieldsType":"FieldsV1","fieldsV1":{"f:driverName":{},"f:metadata":{"f:annotations":{".":{},"f:meta.helm.sh/release-name":{},"f:meta.helm.sh/release-namespace":{}},"f:labels":{".":{},"f:app.kubernetes.io/managed-by":{}}}}}]},"driverName":"gpu.resource.nvidia.com"},"ClaimParameters":{"count":1},"ClassParameters":{"sharable":true},"UnsuitableNodes":["gpu-master","gpu2","gpu3"],"Allocation":null,"Error":null}] selectedNode="gpu3"
I0415 08:55:28.008455       1 controller.go:799] "skipping allocation for unsuitable selected node" logger="resource controller" key="schedulingCtx:gpu-test1/pod2" node="gpu3"
I0415 08:55:28.008533       1 controller.go:383] "recheck periodically" logger="resource controller" key="schedulingCtx:gpu-test1/pod2"
I0415 08:55:57.626522       1 controller.go:373] "processing" logger="resource controller" key="schedulingCtx:gpu-test1/pod1"
I0415 08:55:57.632402       1 round_trippers.go:553] GET https://10.96.0.1:443/api/v1/namespaces/gpu-test1/pods/pod1 200 OK in 5 milliseconds
I0415 08:55:57.634748       1 controller.go:373] "processing" logger="resource controller" key="schedulingCtx:gpu-test1/pod2"
I0415 08:55:57.636517       1 round_trippers.go:553] GET https://10.96.0.1:443/apis/nas.gpu.resource.nvidia.com/v1alpha1/namespaces/nvidia-dra-driver/nodeallocationstates/gpu-master 404 Not Found in 3 milliseconds
I0415 08:55:57.638464       1 round_trippers.go:553] GET https://10.96.0.1:443/api/v1/namespaces/gpu-test1/pods/pod2 200 OK in 3 milliseconds
I0415 08:55:57.641907       1 round_trippers.go:553] GET https://10.96.0.1:443/apis/nas.gpu.resource.nvidia.com/v1alpha1/namespaces/nvidia-dra-driver/nodeallocationstates/gpu-master 404 Not Found in 2 milliseconds
I0415 08:55:57.642886       1 round_trippers.go:553] GET https://10.96.0.1:443/apis/nas.gpu.resource.nvidia.com/v1alpha1/namespaces/nvidia-dra-driver/nodeallocationstates/gpu2 200 OK in 6 milliseconds
I0415 08:55:57.648389       1 round_trippers.go:553] GET https://10.96.0.1:443/apis/nas.gpu.resource.nvidia.com/v1alpha1/namespaces/nvidia-dra-driver/nodeallocationstates/gpu3 200 OK in 4 milliseconds
I0415 08:55:57.649308       1 round_trippers.go:553] GET https://10.96.0.1:443/apis/nas.gpu.resource.nvidia.com/v1alpha1/namespaces/nvidia-dra-driver/nodeallocationstates/gpu2 200 OK in 5 milliseconds
I0415 08:55:57.649550       1 controller.go:788] "pending pod claims" logger="resource controller" key="schedulingCtx:gpu-test1/pod1" claims=[{"PodClaimName":"gpu","Claim":{"metadata":{"name":"pod1-gpu-h7hr9","generateName":"pod1-gpu-","namespace":"gpu-test1","uid":"c0b909c3-f033-460b-927d-1612a79fa8ab","resourceVersion":"929183","creationTimestamp":"2024-04-15T08:55:27Z","annotations":{"resource.kubernetes.io/pod-claim-name":"gpu"},"ownerReferences":[{"apiVersion":"v1","kind":"Pod","name":"pod1","uid":"024aa85b-c07a-4e26-b32d-5d05ae650519","controller":true,"blockOwnerDeletion":true}],"managedFields":[{"manager":"kube-controller-manager","operation":"Update","apiVersion":"resource.k8s.io/v1alpha2","time":"2024-04-15T08:55:27Z","fieldsType":"FieldsV1","fieldsV1":{"f:metadata":{"f:annotations":{".":{},"f:resource.kubernetes.io/pod-claim-name":{}},"f:generateName":{},"f:ownerReferences":{".":{},"k:{\"uid\":\"024aa85b-c07a-4e26-b32d-5d05ae650519\"}":{}}},"f:spec":{"f:allocationMode":{},"f:resourceClassName":{}}}}]},"spec":{"resourceClassName":"gpu.nvidia.com","allocationMode":"WaitForFirstConsumer"},"status":{}},"Class":{"metadata":{"name":"gpu.nvidia.com","uid":"8f15e155-a794-4369-a31c-214b81ecf539","resourceVersion":"925634","creationTimestamp":"2024-04-15T08:28:28Z","labels":{"app.kubernetes.io/managed-by":"Helm"},"annotations":{"meta.helm.sh/release-name":"nvidia","meta.helm.sh/release-namespace":"nvidia-dra-driver"},"managedFields":[{"manager":"helm","operation":"Update","apiVersion":"resource.k8s.io/v1alpha2","time":"2024-04-15T08:28:28Z","fieldsType":"FieldsV1","fieldsV1":{"f:driverName":{},"f:metadata":{"f:annotations":{".":{},"f:meta.helm.sh/release-name":{},"f:meta.helm.sh/release-namespace":{}},"f:labels":{".":{},"f:app.kubernetes.io/managed-by":{}}}}}]},"driverName":"gpu.resource.nvidia.com"},"ClaimParameters":{"count":1},"ClassParameters":{"sharable":true},"UnsuitableNodes":["gpu-master","gpu2","gpu3"],"Allocation":null,"Error":null}] selectedNode="gpu2"
I0415 08:55:57.649638       1 controller.go:799] "skipping allocation for unsuitable selected node" logger="resource controller" key="schedulingCtx:gpu-test1/pod1" node="gpu2"
I0415 08:55:57.649687       1 controller.go:383] "recheck periodically" logger="resource controller" key="schedulingCtx:gpu-test1/pod1"
I0415 08:55:57.660120       1 round_trippers.go:553] GET https://10.96.0.1:443/apis/nas.gpu.resource.nvidia.com/v1alpha1/namespaces/nvidia-dra-driver/nodeallocationstates/gpu3 200 OK in 9 milliseconds
I0415 08:55:57.660754       1 controller.go:788] "pending pod claims" logger="resource controller" key="schedulingCtx:gpu-test1/pod2" claims=[{"PodClaimName":"gpu","Claim":{"metadata":{"name":"pod2-gpu-99zjz","generateName":"pod2-gpu-","namespace":"gpu-test1","uid":"d34546e7-2cf4-4061-8e4b-e1837ff3abf4","resourceVersion":"929182","creationTimestamp":"2024-04-15T08:55:27Z","annotations":{"resource.kubernetes.io/pod-claim-name":"gpu"},"ownerReferences":[{"apiVersion":"v1","kind":"Pod","name":"pod2","uid":"1b090750-72aa-4352-b7ca-c03017c17bc6","controller":true,"blockOwnerDeletion":true}],"managedFields":[{"manager":"kube-controller-manager","operation":"Update","apiVersion":"resource.k8s.io/v1alpha2","time":"2024-04-15T08:55:27Z","fieldsType":"FieldsV1","fieldsV1":{"f:metadata":{"f:annotations":{".":{},"f:resource.kubernetes.io/pod-claim-name":{}},"f:generateName":{},"f:ownerReferences":{".":{},"k:{\"uid\":\"1b090750-72aa-4352-b7ca-c03017c17bc6\"}":{}}},"f:spec":{"f:allocationMode":{},"f:resourceClassName":{}}}}]},"spec":{"resourceClassName":"gpu.nvidia.com","allocationMode":"WaitForFirstConsumer"},"status":{}},"Class":{"metadata":{"name":"gpu.nvidia.com","uid":"8f15e155-a794-4369-a31c-214b81ecf539","resourceVersion":"925634","creationTimestamp":"2024-04-15T08:28:28Z","labels":{"app.kubernetes.io/managed-by":"Helm"},"annotations":{"meta.helm.sh/release-name":"nvidia","meta.helm.sh/release-namespace":"nvidia-dra-driver"},"managedFields":[{"manager":"helm","operation":"Update","apiVersion":"resource.k8s.io/v1alpha2","time":"2024-04-15T08:28:28Z","fieldsType":"FieldsV1","fieldsV1":{"f:driverName":{},"f:metadata":{"f:annotations":{".":{},"f:meta.helm.sh/release-name":{},"f:meta.helm.sh/release-namespace":{}},"f:labels":{".":{},"f:app.kubernetes.io/managed-by":{}}}}}]},"driverName":"gpu.resource.nvidia.com"},"ClaimParameters":{"count":1},"ClassParameters":{"sharable":true},"UnsuitableNodes":["gpu-master","gpu2","gpu3"],"Allocation":null,"Error":null}] selectedNode="gpu3"
I0415 08:55:57.660789       1 controller.go:799] "skipping allocation for unsuitable selected node" logger="resource controller" key="schedulingCtx:gpu-test1/pod2" node="gpu3"
I0415 08:55:57.660819       1 controller.go:383] "recheck periodically" logger="resource controller" key="schedulingCtx:gpu-test1/pod2"
I0415 08:56:27.650837       1 controller.go:373] "processing" logger="resource controller" key="schedulingCtx:gpu-test1/pod1"
I0415 08:56:27.657417       1 round_trippers.go:553] GET https://10.96.0.1:443/api/v1/namespaces/gpu-test1/pods/pod1 200 OK in 6 milliseconds
I0415 08:56:27.660475       1 round_trippers.go:553] GET https://10.96.0.1:443/apis/nas.gpu.resource.nvidia.com/v1alpha1/namespaces/nvidia-dra-driver/nodeallocationstates/gpu-master 404 Not Found in 2 milliseconds
I0415 08:56:27.661652       1 controller.go:373] "processing" logger="resource controller" key="schedulingCtx:gpu-test1/pod2"
I0415 08:56:27.664824       1 round_trippers.go:553] GET https://10.96.0.1:443/apis/nas.gpu.resource.nvidia.com/v1alpha1/namespaces/nvidia-dra-driver/nodeallocationstates/gpu2 200 OK in 4 milliseconds
I0415 08:56:27.665031       1 round_trippers.go:553] GET https://10.96.0.1:443/api/v1/namespaces/gpu-test1/pods/pod2 200 OK in 3 milliseconds
I0415 08:56:27.668577       1 round_trippers.go:553] GET https://10.96.0.1:443/apis/nas.gpu.resource.nvidia.com/v1alpha1/namespaces/nvidia-dra-driver/nodeallocationstates/gpu-master 404 Not Found in 2 milliseconds
I0415 08:56:27.671700       1 round_trippers.go:553] GET https://10.96.0.1:443/apis/nas.gpu.resource.nvidia.com/v1alpha1/namespaces/nvidia-dra-driver/nodeallocationstates/gpu3 200 OK in 6 milliseconds
I0415 08:56:27.672257       1 controller.go:788] "pending pod claims" logger="resource controller" key="schedulingCtx:gpu-test1/pod1" claims=[{"PodClaimName":"gpu","Claim":{"metadata":{"name":"pod1-gpu-h7hr9","generateName":"pod1-gpu-","namespace":"gpu-test1","uid":"c0b909c3-f033-460b-927d-1612a79fa8ab","resourceVersion":"929183","creationTimestamp":"2024-04-15T08:55:27Z","annotations":{"resource.kubernetes.io/pod-claim-name":"gpu"},"ownerReferences":[{"apiVersion":"v1","kind":"Pod","name":"pod1","uid":"024aa85b-c07a-4e26-b32d-5d05ae650519","controller":true,"blockOwnerDeletion":true}],"managedFields":[{"manager":"kube-controller-manager","operation":"Update","apiVersion":"resource.k8s.io/v1alpha2","time":"2024-04-15T08:55:27Z","fieldsType":"FieldsV1","fieldsV1":{"f:metadata":{"f:annotations":{".":{},"f:resource.kubernetes.io/pod-claim-name":{}},"f:generateName":{},"f:ownerReferences":{".":{},"k:{\"uid\":\"024aa85b-c07a-4e26-b32d-5d05ae650519\"}":{}}},"f:spec":{"f:allocationMode":{},"f:resourceClassName":{}}}}]},"spec":{"resourceClassName":"gpu.nvidia.com","allocationMode":"WaitForFirstConsumer"},"status":{}},"Class":{"metadata":{"name":"gpu.nvidia.com","uid":"8f15e155-a794-4369-a31c-214b81ecf539","resourceVersion":"925634","creationTimestamp":"2024-04-15T08:28:28Z","labels":{"app.kubernetes.io/managed-by":"Helm"},"annotations":{"meta.helm.sh/release-name":"nvidia","meta.helm.sh/release-namespace":"nvidia-dra-driver"},"managedFields":[{"manager":"helm","operation":"Update","apiVersion":"resource.k8s.io/v1alpha2","time":"2024-04-15T08:28:28Z","fieldsType":"FieldsV1","fieldsV1":{"f:driverName":{},"f:metadata":{"f:annotations":{".":{},"f:meta.helm.sh/release-name":{},"f:meta.helm.sh/release-namespace":{}},"f:labels":{".":{},"f:app.kubernetes.io/managed-by":{}}}}}]},"driverName":"gpu.resource.nvidia.com"},"ClaimParameters":{"count":1},"ClassParameters":{"sharable":true},"UnsuitableNodes":["gpu-master","gpu2","gpu3"],"Allocation":null,"Error":null}] selectedNode="gpu2"
I0415 08:56:27.672293       1 controller.go:799] "skipping allocation for unsuitable selected node" logger="resource controller" key="schedulingCtx:gpu-test1/pod1" node="gpu2"
I0415 08:56:27.672311       1 controller.go:383] "recheck periodically" logger="resource controller" key="schedulingCtx:gpu-test1/pod1"
I0415 08:56:27.673552       1 round_trippers.go:553] GET https://10.96.0.1:443/apis/nas.gpu.resource.nvidia.com/v1alpha1/namespaces/nvidia-dra-driver/nodeallocationstates/gpu2 200 OK in 4 milliseconds
I0415 08:56:27.677424       1 round_trippers.go:553] GET https://10.96.0.1:443/apis/nas.gpu.resource.nvidia.com/v1alpha1/namespaces/nvidia-dra-driver/nodeallocationstates/gpu3 200 OK in 3 milliseconds
I0415 08:56:27.678065       1 controller.go:788] "pending pod claims" logger="resource controller" key="schedulingCtx:gpu-test1/pod2" claims=[{"PodClaimName":"gpu","Claim":{"metadata":{"name":"pod2-gpu-99zjz","generateName":"pod2-gpu-","namespace":"gpu-test1","uid":"d34546e7-2cf4-4061-8e4b-e1837ff3abf4","resourceVersion":"929182","creationTimestamp":"2024-04-15T08:55:27Z","annotations":{"resource.kubernetes.io/pod-claim-name":"gpu"},"ownerReferences":[{"apiVersion":"v1","kind":"Pod","name":"pod2","uid":"1b090750-72aa-4352-b7ca-c03017c17bc6","controller":true,"blockOwnerDeletion":true}],"managedFields":[{"manager":"kube-controller-manager","operation":"Update","apiVersion":"resource.k8s.io/v1alpha2","time":"2024-04-15T08:55:27Z","fieldsType":"FieldsV1","fieldsV1":{"f:metadata":{"f:annotations":{".":{},"f:resource.kubernetes.io/pod-claim-name":{}},"f:generateName":{},"f:ownerReferences":{".":{},"k:{\"uid\":\"1b090750-72aa-4352-b7ca-c03017c17bc6\"}":{}}},"f:spec":{"f:allocationMode":{},"f:resourceClassName":{}}}}]},"spec":{"resourceClassName":"gpu.nvidia.com","allocationMode":"WaitForFirstConsumer"},"status":{}},"Class":{"metadata":{"name":"gpu.nvidia.com","uid":"8f15e155-a794-4369-a31c-214b81ecf539","resourceVersion":"925634","creationTimestamp":"2024-04-15T08:28:28Z","labels":{"app.kubernetes.io/managed-by":"Helm"},"annotations":{"meta.helm.sh/release-name":"nvidia","meta.helm.sh/release-namespace":"nvidia-dra-driver"},"managedFields":[{"manager":"helm","operation":"Update","apiVersion":"resource.k8s.io/v1alpha2","time":"2024-04-15T08:28:28Z","fieldsType":"FieldsV1","fieldsV1":{"f:driverName":{},"f:metadata":{"f:annotations":{".":{},"f:meta.helm.sh/release-name":{},"f:meta.helm.sh/release-namespace":{}},"f:labels":{".":{},"f:app.kubernetes.io/managed-by":{}}}}}]},"driverName":"gpu.resource.nvidia.com"},"ClaimParameters":{"count":1},"ClassParameters":{"sharable":true},"UnsuitableNodes":["gpu-master","gpu2","gpu3"],"Allocation":null,"Error":null}] selectedNode="gpu3"
I0415 08:56:27.678116       1 controller.go:799] "skipping allocation for unsuitable selected node" logger="resource controller" key="schedulingCtx:gpu-test1/pod2" node="gpu3"
I0415 08:56:27.678147       1 controller.go:383] "recheck periodically" logger="resource controller" key="schedulingCtx:gpu-test1/pod2"
I0415 08:56:57.673622       1 controller.go:373] "processing" logger="resource controller" key="schedulingCtx:gpu-test1/pod1"
I0415 08:56:57.678882       1 controller.go:373] "processing" logger="resource controller" key="schedulingCtx:gpu-test1/pod2"
I0415 08:56:57.679164       1 round_trippers.go:553] GET https://10.96.0.1:443/api/v1/namespaces/gpu-test1/pods/pod1 200 OK in 5 milliseconds
I0415 08:56:57.682129       1 round_trippers.go:553] GET https://10.96.0.1:443/api/v1/namespaces/gpu-test1/pods/pod2 200 OK in 3 milliseconds
I0415 08:56:57.683684       1 round_trippers.go:553] GET https://10.96.0.1:443/apis/nas.gpu.resource.nvidia.com/v1alpha1/namespaces/nvidia-dra-driver/nodeallocationstates/gpu-master 404 Not Found in 3 milliseconds
I0415 08:56:57.686815       1 round_trippers.go:553] GET https://10.96.0.1:443/apis/nas.gpu.resource.nvidia.com/v1alpha1/namespaces/nvidia-dra-driver/nodeallocationstates/gpu-master 404 Not Found in 2 milliseconds
I0415 08:56:57.689502       1 round_trippers.go:553] GET https://10.96.0.1:443/apis/nas.gpu.resource.nvidia.com/v1alpha1/namespaces/nvidia-dra-driver/nodeallocationstates/gpu2 200 OK in 5 milliseconds
I0415 08:56:57.695452       1 round_trippers.go:553] GET https://10.96.0.1:443/apis/nas.gpu.resource.nvidia.com/v1alpha1/namespaces/nvidia-dra-driver/nodeallocationstates/gpu3 200 OK in 4 milliseconds
I0415 08:56:57.695767       1 round_trippers.go:553] GET https://10.96.0.1:443/apis/nas.gpu.resource.nvidia.com/v1alpha1/namespaces/nvidia-dra-driver/nodeallocationstates/gpu2 200 OK in 5 milliseconds
I0415 08:56:57.696380       1 controller.go:788] "pending pod claims" logger="resource controller" key="schedulingCtx:gpu-test1/pod1" claims=[{"PodClaimName":"gpu","Claim":{"metadata":{"name":"pod1-gpu-h7hr9","generateName":"pod1-gpu-","namespace":"gpu-test1","uid":"c0b909c3-f033-460b-927d-1612a79fa8ab","resourceVersion":"929183","creationTimestamp":"2024-04-15T08:55:27Z","annotations":{"resource.kubernetes.io/pod-claim-name":"gpu"},"ownerReferences":[{"apiVersion":"v1","kind":"Pod","name":"pod1","uid":"024aa85b-c07a-4e26-b32d-5d05ae650519","controller":true,"blockOwnerDeletion":true}],"managedFields":[{"manager":"kube-controller-manager","operation":"Update","apiVersion":"resource.k8s.io/v1alpha2","time":"2024-04-15T08:55:27Z","fieldsType":"FieldsV1","fieldsV1":{"f:metadata":{"f:annotations":{".":{},"f:resource.kubernetes.io/pod-claim-name":{}},"f:generateName":{},"f:ownerReferences":{".":{},"k:{\"uid\":\"024aa85b-c07a-4e26-b32d-5d05ae650519\"}":{}}},"f:spec":{"f:allocationMode":{},"f:resourceClassName":{}}}}]},"spec":{"resourceClassName":"gpu.nvidia.com","allocationMode":"WaitForFirstConsumer"},"status":{}},"Class":{"metadata":{"name":"gpu.nvidia.com","uid":"8f15e155-a794-4369-a31c-214b81ecf539","resourceVersion":"925634","creationTimestamp":"2024-04-15T08:28:28Z","labels":{"app.kubernetes.io/managed-by":"Helm"},"annotations":{"meta.helm.sh/release-name":"nvidia","meta.helm.sh/release-namespace":"nvidia-dra-driver"},"managedFields":[{"manager":"helm","operation":"Update","apiVersion":"resource.k8s.io/v1alpha2","time":"2024-04-15T08:28:28Z","fieldsType":"FieldsV1","fieldsV1":{"f:driverName":{},"f:metadata":{"f:annotations":{".":{},"f:meta.helm.sh/release-name":{},"f:meta.helm.sh/release-namespace":{}},"f:labels":{".":{},"f:app.kubernetes.io/managed-by":{}}}}}]},"driverName":"gpu.resource.nvidia.com"},"ClaimParameters":{"count":1},"ClassParameters":{"sharable":true},"UnsuitableNodes":["gpu-master","gpu2","gpu3"],"Allocation":null,"Error":null}] selectedNode="gpu2"
I0415 08:56:57.696443       1 controller.go:799] "skipping allocation for unsuitable selected node" logger="resource controller" key="schedulingCtx:gpu-test1/pod1" node="gpu2"
I0415 08:56:57.696469       1 controller.go:383] "recheck periodically" logger="resource controller" key="schedulingCtx:gpu-test1/pod1"
I0415 08:56:57.701443       1 round_trippers.go:553] GET https://10.96.0.1:443/apis/nas.gpu.resource.nvidia.com/v1alpha1/namespaces/nvidia-dra-driver/nodeallocationstates/gpu3 200 OK in 4 milliseconds
I0415 08:56:57.706155       1 controller.go:788] "pending pod claims" logger="resource controller" key="schedulingCtx:gpu-test1/pod2" claims=[{"PodClaimName":"gpu","Claim":{"metadata":{"name":"pod2-gpu-99zjz","generateName":"pod2-gpu-","namespace":"gpu-test1","uid":"d34546e7-2cf4-4061-8e4b-e1837ff3abf4","resourceVersion":"929182","creationTimestamp":"2024-04-15T08:55:27Z","annotations":{"resource.kubernetes.io/pod-claim-name":"gpu"},"ownerReferences":[{"apiVersion":"v1","kind":"Pod","name":"pod2","uid":"1b090750-72aa-4352-b7ca-c03017c17bc6","controller":true,"blockOwnerDeletion":true}],"managedFields":[{"manager":"kube-controller-manager","operation":"Update","apiVersion":"resource.k8s.io/v1alpha2","time":"2024-04-15T08:55:27Z","fieldsType":"FieldsV1","fieldsV1":{"f:metadata":{"f:annotations":{".":{},"f:resource.kubernetes.io/pod-claim-name":{}},"f:generateName":{},"f:ownerReferences":{".":{},"k:{\"uid\":\"1b090750-72aa-4352-b7ca-c03017c17bc6\"}":{}}},"f:spec":{"f:allocationMode":{},"f:resourceClassName":{}}}}]},"spec":{"resourceClassName":"gpu.nvidia.com","allocationMode":"WaitForFirstConsumer"},"status":{}},"Class":{"metadata":{"name":"gpu.nvidia.com","uid":"8f15e155-a794-4369-a31c-214b81ecf539","resourceVersion":"925634","creationTimestamp":"2024-04-15T08:28:28Z","labels":{"app.kubernetes.io/managed-by":"Helm"},"annotations":{"meta.helm.sh/release-name":"nvidia","meta.helm.sh/release-namespace":"nvidia-dra-driver"},"managedFields":[{"manager":"helm","operation":"Update","apiVersion":"resource.k8s.io/v1alpha2","time":"2024-04-15T08:28:28Z","fieldsType":"FieldsV1","fieldsV1":{"f:driverName":{},"f:metadata":{"f:annotations":{".":{},"f:meta.helm.sh/release-name":{},"f:meta.helm.sh/release-namespace":{}},"f:labels":{".":{},"f:app.kubernetes.io/managed-by":{}}}}}]},"driverName":"gpu.resource.nvidia.com"},"ClaimParameters":{"count":1},"ClassParameters":{"sharable":true},"UnsuitableNodes":["gpu-master","gpu2","gpu3"],"Allocation":null,"Error":null}] selectedNode="gpu3"
I0415 08:56:57.706263       1 controller.go:799] "skipping allocation for unsuitable selected node" logger="resource controller" key="schedulingCtx:gpu-test1/pod2" node="gpu3"
I0415 08:56:57.706312       1 controller.go:383] "recheck periodically" logger="resource controller" key="schedulingCtx:gpu-test1/pod2"
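
The controller keeps reporting "UnsuitableNodes":["gpu-master","gpu2","gpu3"] for both claims, even though the NodeAllocationState reads for gpu2 and gpu3 return 200, so allocation never happens and the pods stay Pending. Two hedged next steps to narrow it down, using only the standard resource.k8s.io/v1alpha2 objects already visible in the log:

$ kubectl get resourceclaims -n gpu-test1
$ kubectl get podschedulingcontexts -n gpu-test1 -o yaml

If the claims remain unallocated in WaitForFirstConsumer mode while every node is listed as unsuitable, the CRD/plugin schema mismatch flagged by the "unknown field" warnings earlier in the thread is a plausible culprit and is worth ruling out first.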