NVIDIA / gpu-operator

NVIDIA GPU Operator creates/configures/manages GPUs atop Kubernetes
Apache License 2.0

Cannot find nvidia-smi in $PATH in toolkit-validation #226

Open mastier opened 3 years ago

mastier commented 3 years ago

The following installation will fail with "Cannot find nvidia-smi in $PATH"

helm install -n gpu-operator gpu-operator nvidia/gpu-operator --version=v1.7.1 \
  --set driver.version=460.32.03 \
  --set toolkit.version=1.5.0-ubuntu18.04 \
  --set operator.defaultRuntime=containerd \
  --set toolkit.env[0].name=CONTAINERD_CONFIG \
  --set toolkit.env[0].value=/etc/containerd/config.toml \
  --set toolkit.env[1].name=CONTAINERD_SOCKET \
  --set toolkit.env[1].value=/run/containerd/containerd.sock \
  --set toolkit.env[2].name=WITH_WAIT \
  --set-string toolkit.env[2].value=true
 root@kubernetes-master-1:~# kubectl get po -A
NAMESPACE                         NAME                                                          READY   STATUS                  RESTARTS   AGE
default                           csi-rbdplugin-ds4h9                                           3/3     Running                 0          5d22h
default                           csi-rbdplugin-g7t66                                           3/3     Running                 0          5d22h
default                           csi-rbdplugin-gxxf9                                           3/3     Running                 3          5d22h
default                           csi-rbdplugin-j2r5d                                           3/3     Running                 0          5d22h
default                           csi-rbdplugin-provisioner-549c6b54c6-f4dm5                    6/6     Running                 56         5d23h
default                           csi-rbdplugin-provisioner-549c6b54c6-mjbxh                    6/6     Running                 41         5d23h
default                           csi-rbdplugin-provisioner-549c6b54c6-mv4t4                    6/6     Running                 82         5d23h
gpu-operator-resources            gpu-feature-discovery-7lhw8                                   0/1     Init:0/1                0          11m
gpu-operator-resources            gpu-feature-discovery-dn6pg                                   0/1     Init:0/1                0          11m
gpu-operator-resources            nvidia-container-toolkit-daemonset-qwk9r                      1/1     Running                 0          11m
gpu-operator-resources            nvidia-container-toolkit-daemonset-w9mtf                      1/1     Running                 0          11m
gpu-operator-resources            nvidia-dcgm-exporter-54vwm                                    0/1     Init:0/1                0          11m
gpu-operator-resources            nvidia-dcgm-exporter-rqvws                                    0/1     Init:0/1                0          11m
gpu-operator-resources            nvidia-device-plugin-daemonset-cdl2k                          0/1     Init:0/1                0          11m
gpu-operator-resources            nvidia-device-plugin-daemonset-pc6f9                          0/1     Init:0/1                0          11m
gpu-operator-resources            nvidia-driver-daemonset-czvfn                                 1/1     Running                 0          11m
gpu-operator-resources            nvidia-driver-daemonset-mzj7g                                 1/1     Running                 0          11m
gpu-operator-resources            nvidia-operator-validator-n48xq                               0/1     Init:CrashLoopBackOff   6          11m
gpu-operator-resources            nvidia-operator-validator-tv4gq                               0/1     Init:CrashLoopBackOff   6          11m
gpu-operator                      gpu-operator-6b5666bb8b-hrsk5                                 1/1     Running                 0          11m
gpu-operator                      gpu-operator-node-feature-discovery-master-58d884d5cc-vlf54   1/1     Running                 0          11m
gpu-operator                      gpu-operator-node-feature-discovery-worker-2lpdc              1/1     Running                 0          11m
gpu-operator                      gpu-operator-node-feature-discovery-worker-8m78m              1/1     Running                 0          11m
gpu-operator                      gpu-operator-node-feature-discovery-worker-h77md              1/1     Running                 0          11m
gpu-operator                      gpu-operator-node-feature-discovery-worker-jpxkk              1/1     Running                 0          11m
ingress-nginx-kubernetes-worker   default-http-backend-kubernetes-worker-cd9b77777-bwpqk        1/1     Running                 34         5d22h
ingress-nginx-kubernetes-worker   nginx-ingress-controller-kubernetes-worker-6vrkp              1/1     Running                 0          5d22h
ingress-nginx-kubernetes-worker   nginx-ingress-controller-kubernetes-worker-j2hsh              1/1     Running                 1          5d22h
ingress-nginx-kubernetes-worker   nginx-ingress-controller-kubernetes-worker-jjv2b              1/1     Running                 0          5d22h
ingress-nginx-kubernetes-worker   nginx-ingress-controller-kubernetes-worker-nj65t              1/1     Running                 0          5d22h
kube-system                       calico-kube-controllers-575b755f5b-kf2tq                      1/1     Running                 0          5d23h
kube-system                       coredns-6f867cd986-x664l                                      1/1     Running                 0          5d23h
kube-system                       kube-state-metrics-7799879d89-4wnqn                           1/1     Running                 0          5d23h
kube-system                       metrics-server-v0.3.6-f6cf867b4-4z97c                         2/2     Running                 0          5d10h
kubernetes-dashboard              dashboard-metrics-scraper-8458d7fdf6-vhldv                    1/1     Running                 0          5d23h
kubernetes-dashboard              kubernetes-dashboard-5784589f96-cct2m                         1/1     Running                 2          5d23h
root@kubernetes-master-1:~# kubectl logs -n gpu-operator-resources -f nvidia-operator-validator-n48xq -c toolkit-validation
toolkit is not ready
time="2021-07-21T17:59:17Z" level=info msg="Error: error validating toolkit installation: exec: \"nvidia-smi\": executable file not found in $PATH"

I have two possible suspects. Here the validator chroots into /run/nvidia/driver to run the nvidia-smi command, but /run/nvidia/driver is not actually mounted in the toolkit-validation init container: https://github.com/NVIDIA/gpu-operator/blob/0a4bfd2119a98dca38cff0ba900ad38417b4a85b/validator/main.go#L384-L394

It seems like this container doesn't mount /run/nvidia/driver, and the chroot /run/nvidia/driver nvidia-smi in the main.go code also looks incorrect.
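
One way to confirm which mounts the toolkit-validation init container actually gets, as a more targeted alternative to the full kubectl describe output below (pod name taken from the output above; the jsonpath filter is just a convenience):

kubectl -n gpu-operator-resources get pod nvidia-operator-validator-n48xq \
  -o jsonpath='{.spec.initContainers[?(@.name=="toolkit-validation")].volumeMounts}'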

  toolkit-validation:
    Container ID:  containerd://56783b834b1ab48ca3652d8890db1e7c26285207f8f903d6209c49be5c733158
    Image:         nvcr.io/nvidia/cloud-native/gpu-operator-validator:v1.7.1
    Image ID:      nvcr.io/nvidia/cloud-native/gpu-operator-validator@sha256:aa1f7bd526ae132c46f3ebe6ecfabe675889e240776ccc2155e31e0c48cc659e
    Port:          <none>
    Host Port:     <none>
    Command:
      sh
      -c
    Args:
      nvidia-validator
    State:          Waiting
      Reason:       CrashLoopBackOff
    Last State:     Terminated
      Reason:       Error
      Exit Code:    1
      Started:      Wed, 21 Jul 2021 18:19:45 +0000
      Finished:     Wed, 21 Jul 2021 18:19:45 +0000
    Ready:          False
    Restart Count:  9
    Environment:
      WITH_WAIT:  false
      COMPONENT:  toolkit
    Mounts:
      /run/nvidia/validations from run-nvidia-validations (rw)
      /var/run/secrets/kubernetes.io/serviceaccount from kube-api-access-t2vbd (ro)

List of events:

Events:
  Type     Reason                  Age                   From               Message
  ----     ------                  ----                  ----               -------
  Normal   Scheduled               19m                   default-scheduler  Successfully assigned gpu-operator-resources/nvidia-operator-validator-n48xq to hpc-k8s-phy-wrk-g01
  Warning  FailedCreatePodSandBox  19m                   kubelet            Failed to create pod sandbox: rpc error: code = Unknown desc = failed to get sandbox runtime: no runtime for "nvidia" is configured
  Warning  FailedCreatePodSandBox  19m                   kubelet            Failed to create pod sandbox: rpc error: code = Unavailable desc = connection error: desc = "transport: Error while dialing dial unix /var/run/containerd/containerd.sock: connect: connection refused"
  Normal   Pulled                  19m                   kubelet            Container image "nvcr.io/nvidia/cloud-native/gpu-operator-validator:v1.7.1" already present on machine
  Normal   Created                 19m                   kubelet            Created container driver-validation
  Normal   Started                 19m                   kubelet            Started container driver-validation
  Normal   Created                 17m (x4 over 17m)     kubelet            Created container toolkit-validation
  Normal   Started                 17m (x4 over 17m)     kubelet            Started container toolkit-validation
  Normal   Pulled                  16m (x5 over 17m)     kubelet            Container image "nvcr.io/nvidia/cloud-native/gpu-operator-validator:v1.7.1" already present on machine
  Warning  BackOff                 4m44s (x62 over 17m)  kubelet            Back-off restarting failed container
elezar commented 3 years ago

@mastier I have seen something like this in testing before. There seems to be some kind of race condition on the /run/nvidia/driver mount shared between the driver container and the validator pod. I found that deleting the validator pod and allowing it to be recreated generally mounts the populated /run/nvidia/driver mount into these containers and allows validation to continue.

mastier commented 3 years ago

@elezar thanks for that feedback. However, after I delete that pod the DaemonSet recreates it automatically, it ends up in Init:CrashLoopBackOff again, and the mounts are still the same:

root@kubernetes-master-1:~# kubectl delete po -n gpu-operator-resources nvidia-operator-validator-n48xq
pod "nvidia-operator-validator-n48xq" deleted

root@kubernetes-master-1:~# kubectl delete po -n gpu-operator-resources nvidia-operator-validator-tv4gq
pod "nvidia-operator-validator-tv4gq" deleted

For toolkit-validation, from kubectl describe po -n gpu-operator-resources nvidia-operator-validator-n48xq:

...
      VALIDATOR_RUNTIMECLASS:       nvidia
    Mounts:
      /run/nvidia/validations from run-nvidia-validations (rw)
      /var/run/secrets/kubernetes.io/serviceaccount from kube-api-access-rs5xl (ro)

These mounts are defined in the DaemonSet, so I tried editing it with root@kubernetes-master-1:~# kubectl edit daemonsets nvidia-operator-validator -n gpu-operator-resources and adding to the container:

      - args:
        - nvidia-validator
        command:
        - sh
        - -c
        env:
        - name: WITH_WAIT
          value: "false"
        - name: COMPONENT
          value: toolkit
        image: nvcr.io/nvidia/cloud-native/gpu-operator-validator:v1.7.1
        imagePullPolicy: IfNotPresent
        name: toolkit-validation
        resources: {}
        securityContext:
          privileged: true
        terminationMessagePath: /dev/termination-log
        terminationMessagePolicy: File
        volumeMounts:
        - mountPath: /run/nvidia/validations
          mountPropagation: Bidirectional
          name: run-nvidia-validations

plus an additional mount, like the one the driver-validation container has:

        - mountPath: /run/nvidia/driver
          mountPropagation: Bidirectional
          name: driver-install-path

I removed the pod again; the mounts are there now, but it fails the same way. kubectl describe po -n gpu-operator-resources nvidia-operator-validator-6hl24:

  toolkit-validation:
    Container ID:  containerd://5e1ae908d770c409130c4d6d81fe10841867e935f21e4ea877fd909d11a16ced
    Image:         nvcr.io/nvidia/cloud-native/gpu-operator-validator:v1.7.1
    Image ID:      nvcr.io/nvidia/cloud-native/gpu-operator-validator@sha256:aa1f7bd526ae132c46f3ebe6ecfabe675889e240776ccc2155e31e0c48cc659e
    Port:          <none>
    Host Port:     <none>
    Command:
      sh
      -c
    Args:
      nvidia-validator
    State:          Waiting
      Reason:       CrashLoopBackOff
    Last State:     Terminated
      Reason:       Error
      Exit Code:    1
      Started:      Wed, 21 Jul 2021 21:59:52 +0000
      Finished:     Wed, 21 Jul 2021 21:59:52 +0000
    Ready:          False
    Restart Count:  2
    Environment:
      WITH_WAIT:  false
      COMPONENT:  toolkit
    Mounts:
      /run/nvidia/driver from driver-install-path (rw)
      /run/nvidia/validations from run-nvidia-validations (rw)
      /var/run/secrets/kubernetes.io/serviceaccount from kube-api-access-hkm9t (ro)
root@kubernetes-master-1:~# kubectl logs -n gpu-operator-resources -f nvidia-operator-validator-6hl24 -c toolkit-validation
toolkit is not ready
time="2021-07-21T22:02:24Z" level=info msg="Error: error validating toolkit installation: exec: \"nvidia-smi\": executable file not found in $PATH"

So it's not only that. I see the validation part in main.go in nvidia-validator trying 'chroot /run/nvidia/driver nvidia-smi'. In my opinion that chroot will always fail unless PATH contains the '.' directory or something like that. I am puzzled.

shivamerla commented 3 years ago

@mastier toolkit validation doesn't use "chroot", but directly invokes nvidia-smi, as we expect the toolkit to inject these files automatically. Hence the mount of /run/nvidia/driver is not required for this container. The code is here: https://github.com/NVIDIA/gpu-operator/blob/master/validator/main.go#L436
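
For reference, rough shell equivalents of the two checks (paraphrased, not the actual main.go source):

chroot /run/nvidia/driver nvidia-smi   # driver-validation: needs the driver rootfs mounted at /run/nvidia/driver
nvidia-smi                             # toolkit-validation: relies on the toolkit/nvidia runtime injecting nvidia-smi into the container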

I think the toolkit is not functioning as expected and failing to inject these files. Can you add the "debug" fields below to the toolkit config at /usr/local/nvidia/toolkit/.config/nvidia-container-runtime/config.toml, restart the validator pod, and attach those log files?

disable-require = false

[nvidia-container-cli]
  debug = "/var/log/nvidia-container-toolkit.log"
  environment = []
  ldconfig = "@/run/nvidia/driver/sbin/ldconfig.real"
  load-kmods = true
  path = "/usr/local/nvidia/toolkit/nvidia-container-cli"
  root = "/run/nvidia/driver"

[nvidia-container-runtime]
  debug = "/var/log/nvidia-container-runtime.log"
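
Concretely, the flow would be something like this (paths from the config above; deleting the validator pod by label is an assumption, deleting it by name works just as well):

# 1. Add the debug lines to the config the toolkit installed on the host:
sudo vi /usr/local/nvidia/toolkit/.config/nvidia-container-runtime/config.toml
# 2. Recreate the validator pod so the runtime gets exercised again:
kubectl -n gpu-operator-resources delete pod -l app=nvidia-operator-validator
# 3. Watch for the logs on the host:
sudo tail -f /var/log/nvidia-container-toolkit.log /var/log/nvidia-container-runtime.log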
mastier commented 3 years ago

@shivamerla ok, thank you, you are right about the code in main.go. But how can I add that to the toolkit? The container is dead and the pod keeps restarting it. How do you expect me to test it? Run the toolkit container manually?

root@kubernetes-master-1:~# kubectl run -it --rm    --image=nvcr.io/nvidia/cloud-native/gpu-operator-validator:v1.7.1 --restart=Never -- sh
If you don't see a command prompt, try pressing enter.
[root@sh /]# which nvidia-validator
/usr/bin/nvidia-validator
[root@sh /]# dpkg -S nvidia-validator
bash: dpkg: command not found
[root@sh /]# ls -la /usr/local/nvidia
ls: cannot access '/usr/local/nvidia': No such file or directory
[root@sh /]#
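
(Note: a plain kubectl run pod uses the cluster's default runtime, so the toolkit won't inject anything into it. To approximate what the validator init container sees, the pod would need the nvidia runtime class; a sketch, assuming the runtimeClass is named nvidia as configured by the operator:)

kubectl run smi-test -it --rm --restart=Never \
  --image=nvcr.io/nvidia/cloud-native/gpu-operator-validator:v1.7.1 \
  --overrides='{"spec":{"runtimeClassName":"nvidia"}}' -- nvidia-smi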
shivamerla commented 3 years ago

If you add the debug fields I mentioned earlier to the toolkit config file, we should see those logs under "/var/log". Also, can you attach /etc/containerd/config.toml as well?

aym-frikha commented 3 years ago

Hello @shivamerla, I did what you mentioned in your last comment, but /var/log/nvidia-container-toolkit.log and /var/log/nvidia-container-runtime.log are empty / not created. You can find /etc/containerd/config.toml attached: config.txt

aym-frikha commented 3 years ago

These are the toolkit daemonset logs I have in the environment: toolkit-daemonset-logs.txt

shivamerla commented 3 years ago

@aym-frikha after applying the change, can you delete the gpu-operator-validator pod, so that when it runs again the toolkit generates logs? Also, please don't delete the toolkit container after the change, as that will revert it again. Finally, can you run "sudo chroot /run/nvidia/driver nvidia-smi" from the worker node to see if all devices were initialized correctly by the driver? /var/log/syslog will have any errors from initialization.
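
On the worker node, that check would look something like this (assuming the driver container has populated /run/nvidia/driver):

sudo chroot /run/nvidia/driver nvidia-smi
# driver initialization errors, if any, should show up in the host syslog:
sudo grep -i nvidia /var/log/syslog | tail -n 50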

kpouget commented 3 years ago

can you delete gpu-operator-validator pod

not the Pod but the DaemonSet ;-)

shivamerla commented 3 years ago

@kpouget we just need to restart the container so that the toolkit tries to inject the files again, so just deleting the pod should do it.

asviel commented 2 years ago

If you add the debug fields I mentioned earlier to the toolkit config file, we should see those logs under "/var/log". Also, can you attach /etc/containerd/config.toml as well?

I encountered the same problem and went through this issue trying to figure out how to see the debug logs.

As far as I understand, they should be under /var/log in the nvidia-operator-validator Pod, in the toolkit-validation container, but since it goes straight into CrashLoopBackOff, I can't (don't have time to) exec in and see what is in the logs.

Can you help me @shivamerla?

UPDATED: I had no problem installing Chart 1.7.1 (driver 460.73.01). GPU - T4.

shivamerla commented 2 years ago

@asviel If you add the debug fields as mentioned in https://github.com/NVIDIA/gpu-operator/issues/226#issuecomment-884535327, you should see logs under /var/log on the host itself. Can you please share the install command or the ClusterPolicy spec? Logs from the container-toolkit pod will help as well.
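
For example (the Helm release name is a placeholder and the toolkit pod label is an assumption; adjust to your install):

helm get values -n gpu-operator <release-name>
kubectl get clusterpolicies -o yaml
kubectl -n gpu-operator-resources logs -l app=nvidia-container-toolkit-daemonset --all-containers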

patrickshan commented 2 years ago

@shivamerla I had the same issue with the nvidia-smi binary inside the toolkit-validation init container. I also tried the config change you mentioned here, which didn't generate any logs under /var/log/.

We installed it using helm with these parameters:

helm install --wait --generate-name -n gpu-operator --create-namespace nvidia/gpu-operator \
  --set psp.enabled=true \
  --set toolkit.version=v1.8.0-ubuntu18.04 \
  --set operator.defaultRuntime=containerd

We are running this on AWS EKS v1.21.5-eks-bc4871b with nodes running Ubuntu 20.04, and this is the ClusterPolicy we are using:

apiVersion: nvidia.com/v1
kind: ClusterPolicy
metadata:
  annotations:
    meta.helm.sh/release-name: gpu-operator-1644198634
    meta.helm.sh/release-namespace: gpu-operator
  labels:
    app.kubernetes.io/component: gpu-operator
    app.kubernetes.io/managed-by: Helm
  name: cluster-policy
spec:
  daemonsets:
    priorityClassName: system-node-critical
    tolerations:
    - effect: NoSchedule
      key: nvidia.com/gpu
      operator: Exists
  dcgm:
    enabled: false
    hostPort: 5555
    image: dcgm
    imagePullPolicy: IfNotPresent
    repository: nvcr.io/nvidia/cloud-native
    version: 2.3.1-ubuntu20.04
  dcgmExporter:
    env:
    - name: DCGM_EXPORTER_LISTEN
      value: :9400
    - name: DCGM_EXPORTER_KUBERNETES
      value: "true"
    - name: DCGM_EXPORTER_COLLECTORS
      value: /etc/dcgm-exporter/dcp-metrics-included.csv
    image: dcgm-exporter
    imagePullPolicy: IfNotPresent
    repository: nvcr.io/nvidia/k8s
    version: 2.3.1-2.6.1-ubuntu20.04
  devicePlugin:
    env:
    - name: PASS_DEVICE_SPECS
      value: "true"
    - name: FAIL_ON_INIT_ERROR
      value: "true"
    - name: DEVICE_LIST_STRATEGY
      value: envvar
    - name: DEVICE_ID_STRATEGY
      value: uuid
    - name: NVIDIA_VISIBLE_DEVICES
      value: all
    - name: NVIDIA_DRIVER_CAPABILITIES
      value: all
    image: k8s-device-plugin
    imagePullPolicy: IfNotPresent
    repository: nvcr.io/nvidia
    securityContext:
      privileged: true
    version: v0.10.0-ubi8
  driver:
    certConfig:
      name: ""
    enabled: true
    image: driver
    imagePullPolicy: IfNotPresent
    licensingConfig:
      configMapName: ""
      nlsEnabled: false
    manager:
      env:
      - name: ENABLE_AUTO_DRAIN
        value: "true"
      - name: DRAIN_USE_FORCE
        value: "false"
      - name: DRAIN_POD_SELECTOR_LABEL
        value: ""
      - name: DRAIN_TIMEOUT_SECONDS
        value: 0s
      - name: DRAIN_DELETE_EMPTYDIR_DATA
        value: "false"
      image: k8s-driver-manager
      imagePullPolicy: IfNotPresent
      repository: nvcr.io/nvidia/cloud-native
      version: v0.2.0
    rdma:
      enabled: false
      useHostMofed: false
    repoConfig:
      configMapName: ""
    repository: nvcr.io/nvidia
    securityContext:
      privileged: true
      seLinuxOptions:
        level: s0
    version: 470.82.01
    virtualTopology:
      config: ""
  gfd:
    env:
    - name: GFD_SLEEP_INTERVAL
      value: 60s
    - name: GFD_FAIL_ON_INIT_ERROR
      value: "true"
    image: gpu-feature-discovery
    imagePullPolicy: IfNotPresent
    repository: nvcr.io/nvidia
    version: v0.4.1
  mig:
    strategy: single
  migManager:
    config:
      name: ""
    enabled: true
    env:
    - name: WITH_REBOOT
      value: "false"
    gpuClientsConfig:
      name: ""
    image: k8s-mig-manager
    imagePullPolicy: IfNotPresent
    repository: nvcr.io/nvidia/cloud-native
    securityContext:
      privileged: true
    version: v0.2.0-ubuntu20.04
  nodeStatusExporter:
    enabled: false
    image: gpu-operator-validator
    imagePullPolicy: IfNotPresent
    repository: nvcr.io/nvidia/cloud-native
    version: v1.9.1
  operator:
    defaultRuntime: docker
    initContainer:
      image: cuda
      imagePullPolicy: IfNotPresent
      repository: nvcr.io/nvidia
      version: 11.4.2-base-ubi8
    runtimeClass: nvidia
  psp:
    enabled: false
  toolkit:
    enabled: true
    image: container-toolkit
    imagePullPolicy: IfNotPresent
    repository: nvcr.io/nvidia/k8s
    securityContext:
      privileged: true
      seLinuxOptions:
        level: s0
    version: v1.8.0-ubuntu18.04
  validator:
    image: gpu-operator-validator
    imagePullPolicy: IfNotPresent
    plugin:
      env:
      - name: WITH_WORKLOAD
        value: "true"
    repository: nvcr.io/nvidia/cloud-native
    securityContext:
      privileged: true
      seLinuxOptions:
        level: s0
    version: v1.9.1