NVIDIA / k8s-device-plugin

NVIDIA device plugin for Kubernetes
Apache License 2.0

[k0s] `libnvidia-ml.so.1` missing in the pod #826

Closed · EKami closed this issue 2 months ago

EKami commented 2 months ago

1. Quick Debug Information

2. Issue or feature description

I'm having issues installing the plugin on my k0s node and would appreciate any guidance as to what I might be doing wrong. It boils down to this error from my nvdp-nvidia-device-plugin pod:

Detected non-NVML platform: could not load NVML library: libnvidia-ml.so.1: cannot open shared object file: No such file or directory

Here are the steps I followed to install the plugin on my GPU node. My containerd.toml:

version = 2

[plugins]

  [plugins."io.containerd.grpc.v1.cri"]
    cdi_spec_dirs = ["/etc/cdi", "/var/run/cdi"]
    device_ownership_from_security_context = false
    disable_apparmor = false
    disable_cgroup = false
    disable_hugetlb_controller = true
    disable_proc_mount = false
    disable_tcp_service = true
    drain_exec_sync_io_timeout = "0s"
    enable_cdi = false
    enable_selinux = false
    enable_tls_streaming = false
    enable_unprivileged_icmp = false
    enable_unprivileged_ports = false
    ignore_deprecation_warnings = []
    ignore_image_defined_volumes = false
    image_pull_progress_timeout = "5m0s"
    image_pull_with_sync_fs = false
    max_concurrent_downloads = 3
    max_container_log_line_size = 16384
    netns_mounts_under_state_dir = false
    restrict_oom_score_adj = false
    sandbox_image = "registry.k8s.io/pause:3.9"
    selinux_category_range = 1024
    stats_collect_period = 10
    stream_idle_timeout = "4h0m0s"
    stream_server_address = "127.0.0.1"
    stream_server_port = "0"
    systemd_cgroup = false
    tolerate_missing_hugetlb_controller = true
    unset_seccomp_profile = ""

    [plugins."io.containerd.grpc.v1.cri".cni]
      bin_dir = "/opt/cni/bin"
      conf_dir = "/etc/cni/net.d"
      conf_template = ""
      ip_pref = ""
      max_conf_num = 1
      setup_serially = false

    [plugins."io.containerd.grpc.v1.cri".containerd]
      default_runtime_name = "runc"
      disable_snapshot_annotations = true
      discard_unpacked_layers = false
      ignore_blockio_not_enabled_errors = false
      ignore_rdt_not_enabled_errors = false
      no_pivot = false
      snapshotter = "overlayfs"

      [plugins."io.containerd.grpc.v1.cri".containerd.default_runtime]
        base_runtime_spec = ""
        cni_conf_dir = ""
        cni_max_conf_num = 0
        container_annotations = []
        pod_annotations = []
        privileged_without_host_devices = false
        privileged_without_host_devices_all_devices_allowed = false
        runtime_engine = ""
        runtime_path = ""
        runtime_root = ""
        runtime_type = ""
        sandbox_mode = ""
        snapshotter = ""

        [plugins."io.containerd.grpc.v1.cri".containerd.default_runtime.options]

      [plugins."io.containerd.grpc.v1.cri".containerd.runtimes]

        [plugins."io.containerd.grpc.v1.cri".containerd.runtimes.nvidia]
          base_runtime_spec = ""
          cni_conf_dir = ""
          cni_max_conf_num = 0
          container_annotations = []
          pod_annotations = []
          privileged_without_host_devices = false
          privileged_without_host_devices_all_devices_allowed = false
          runtime_engine = ""
          runtime_path = ""
          runtime_root = ""
          runtime_type = "io.containerd.runc.v2"
          sandbox_mode = "podsandbox"
          snapshotter = ""

          [plugins."io.containerd.grpc.v1.cri".containerd.runtimes.nvidia.options]
            BinaryName = "/usr/bin/nvidia-container-runtime"
            CriuImagePath = ""
            CriuPath = ""
            CriuWorkPath = ""
            IoGid = 0
            IoUid = 0
            NoNewKeyring = false
            NoPivotRoot = false
            Root = ""
            ShimCgroup = ""
            SystemdCgroup = false

        [plugins."io.containerd.grpc.v1.cri".containerd.runtimes.runc]
          base_runtime_spec = ""
          cni_conf_dir = ""
          cni_max_conf_num = 0
          container_annotations = []
          pod_annotations = []
          privileged_without_host_devices = false
          privileged_without_host_devices_all_devices_allowed = false
          runtime_engine = ""
          runtime_path = ""
          runtime_root = ""
          runtime_type = "io.containerd.runc.v2"
          sandbox_mode = "podsandbox"
          snapshotter = ""

          [plugins."io.containerd.grpc.v1.cri".containerd.runtimes.runc.options]
            BinaryName = ""
            CriuImagePath = ""
            CriuPath = ""
            CriuWorkPath = ""
            IoGid = 0
            IoUid = 0
            NoNewKeyring = false
            NoPivotRoot = false
            Root = ""
            ShimCgroup = ""
            SystemdCgroup = false

      [plugins."io.containerd.grpc.v1.cri".containerd.untrusted_workload_runtime]
        base_runtime_spec = ""
        cni_conf_dir = ""
        cni_max_conf_num = 0
        container_annotations = []
        pod_annotations = []
        privileged_without_host_devices = false
        privileged_without_host_devices_all_devices_allowed = false
        runtime_engine = ""
        runtime_path = ""
        runtime_root = ""
        runtime_type = ""
        sandbox_mode = ""
        snapshotter = ""

        [plugins."io.containerd.grpc.v1.cri".containerd.untrusted_workload_runtime.options]

    [plugins."io.containerd.grpc.v1.cri".image_decryption]
      key_model = "node"

    [plugins."io.containerd.grpc.v1.cri".registry]
      config_path = ""

      [plugins."io.containerd.grpc.v1.cri".registry.auths]

      [plugins."io.containerd.grpc.v1.cri".registry.configs]

      [plugins."io.containerd.grpc.v1.cri".registry.headers]

      [plugins."io.containerd.grpc.v1.cri".registry.mirrors]

    [plugins."io.containerd.grpc.v1.cri".x509_key_pair_streaming]
      tls_cert_file = ""
      tls_key_file = ""

With this configuration in place (located at /etc/k0s/containerd.toml for k0s), I started my worker node with:

k0s install worker --taints nvidia.com/gpu=true:NoSchedule --labels "gpu-memory-MiB=15360,nvidia.com/gpu.present=true" --token-file /etc/k0s/worker-token

Then I installed the plugin with:

helm upgrade -i nvdp nvdp/nvidia-device-plugin \
    --version=0.15.1 \
    --namespace nvidia-device-plugin \
    --create-namespace \
    --set-file config.map.config=/tmp/nvidia-config.yaml

where /tmp/nvidia-config.yaml content is:

version: v1
flags:
  migStrategy: "none"
  failOnInitError: true
  nvidiaDriverRoot: "/"
  plugin:
    passDeviceSpecs: false
    deviceListStrategy: envvar
    deviceIDStrategy: uuid

After installing with Helm, the pod ends up in an error state:

❯ k get pods -n nvidia-device-plugin
NAME                              READY   STATUS   RESTARTS      AGE
nvdp-nvidia-device-plugin-rkq99   1/2     Error    1 (16s ago)   19s

Digging into the container logs, I get:

❯ kubectl logs nvdp-nvidia-device-plugin-rkq99 -n nvidia-device-plugin -c nvidia-device-plugin-ctr
I0716 09:55:45.480204      85 main.go:178] Starting FS watcher.
I0716 09:55:45.480277      85 main.go:185] Starting OS watcher.
I0716 09:55:45.480600      85 main.go:200] Starting Plugins.
I0716 09:55:45.480637      85 main.go:257] Loading configuration.
I0716 09:55:45.481464      85 main.go:265] Updating config with default resource matching patterns.
I0716 09:55:45.481525      85 main.go:276]
Running with config:
{
  "version": "v1",
  "flags": {
    "migStrategy": "none",
    "failOnInitError": true,
    "mpsRoot": "/run/nvidia/mps",
    "nvidiaDriverRoot": "/",
    "gdsEnabled": false,
    "mofedEnabled": false,
    "useNodeFeatureAPI": null,
    "plugin": {
      "passDeviceSpecs": false,
      "deviceListStrategy": [
        "envvar"
      ],
      "deviceIDStrategy": "uuid",
      "cdiAnnotationPrefix": "cdi.k8s.io/",
      "nvidiaCTKPath": "/usr/bin/nvidia-ctk",
      "containerDriverRoot": "/driver-root"
    }
  },
  "resources": {
    "gpus": [
      {
        "pattern": "*",
        "name": "nvidia.com/gpu"
      }
    ]
  },
  "sharing": {
    "timeSlicing": {}
  }
}
I0716 09:55:45.481539      85 main.go:279] Retrieving plugins.
W0716 09:55:45.481601      85 factory.go:31] No valid resources detected, creating a null CDI handler
I0716 09:55:45.481643      85 factory.go:104] Detected non-NVML platform: could not load NVML library: libnvidia-ml.so.1: cannot open shared object file: No such file or directory
I0716 09:55:45.481676      85 factory.go:104] Detected non-Tegra platform: /sys/devices/soc0/family file not found
E0716 09:55:45.481686      85 factory.go:112] Incompatible platform detected
E0716 09:55:45.481689      85 factory.go:113] If this is a GPU node, did you configure the NVIDIA Container Toolkit?
E0716 09:55:45.481695      85 factory.go:114] You can check the prerequisites at: https://github.com/NVIDIA/k8s-device-plugin#prerequisites
E0716 09:55:45.481701      85 factory.go:115] You can learn how to set the runtime at: https://github.com/NVIDIA/k8s-device-plugin#quick-start
E0716 09:55:45.481706      85 factory.go:116] If this is not a GPU node, you should set up a toleration or nodeSelector to only deploy this plugin on GPU nodes
E0716 09:55:45.481812      85 main.go:132] error starting plugins: error creating plugin manager: unable to create plugin manager: platform detection failed

I've been trying to solve this for days. I think the embedded containerd in k0s is properly configured, since I'm able to run a GPU job through it directly:

$ sudo k0s ctr images pull nvcr.io/nvidia/k8s/cuda-sample:vectoradd-cuda10.2
nvcr.io/nvidia/k8s/cuda-sample:vectoradd-cuda10.2:                                resolved
elapsed: 0.5 s                                                                    total:   0.0 B (0.0 B/s)
unpacking linux/amd64 sha256:4593078cdb8e786d35566faa2b84da1123acea42f0d4099e84e2af0448724af1...
done: 13.07654ms
$ sudo k0s ctr run --gpus 0 --rm -t nvcr.io/nvidia/k8s/cuda-sample:vectoradd-cuda10.2 vectoradd
[Vector addition of 50000 elements]
Copy input data from the host memory to the CUDA device
CUDA kernel launch with 196 blocks of 256 threads
Copy output data from the CUDA device to the host memory
Test PASSED
Done

The only part that I'm missing is this libnvidia-ml.so.1. I'm not sure how to solve this one.
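
For what it's worth, one way to narrow this down is to compare what the host's dynamic linker knows about the library with what the plugin container can actually see. A rough diagnostic sketch (assuming the container stays up long enough to exec into; the pod and container names are the ones from the logs above):

# On the GPU host: confirm the driver's NVML library is registered with the dynamic linker
ldconfig -p | grep libnvidia-ml

# Inside the plugin container: check whether the runtime injected the same library
kubectl exec -n nvidia-device-plugin nvdp-nvidia-device-plugin-rkq99 \
    -c nvidia-device-plugin-ctr -- \
    sh -c 'ldconfig -p | grep libnvidia-ml || echo "libnvidia-ml not visible in container"'

If the library is present on the host but not in the container, the container was not started with the NVIDIA runtime.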

3. Information to attach (optional if deemed irrelevant)

Common error checking:

==============NVSMI LOG==============

Timestamp                                 : Tue Jul 16 09:57:31 2024
Driver Version                            : 555.52.04
CUDA Version                              : 12.5

Attached GPUs                             : 1
GPU 00000000:00:1E.0
    Product Name                          : Tesla T4
    Product Brand                         : NVIDIA
    Product Architecture                  : Turing
    Display Mode                          : Enabled
    Display Active                        : Disabled
    Persistence Mode                      : Disabled
    Addressing Mode                       : None
    ...

 - [x] The k8s-device-plugin container logs
Provided above
 - [x] NVIDIA packages version from `dpkg -l '*nvidia*'` _or_ `rpm -qa '*nvidia*'`

$ dpkg -l '*nvidia*'
Desired=Unknown/Install/Remove/Purge/Hold
| Status=Not/Inst/Conf-files/Unpacked/halF-conf/Half-inst/trig-aWait/Trig-pend
|/ Err?=(none)/Reinst-required (Status,Err: uppercase=bad)
||/ Name                            Version                        Architecture Description
+++-===============================-==============================-============-=========================================================
un  libgldispatch0-nvidia                                                        (no description available)
ii  libnvidia-cfg1-555:amd64        555.52.04-0ubuntu0~gpu24.04.1  amd64        NVIDIA binary OpenGL/GLX configuration library
un  libnvidia-cfg1-any                                                           (no description available)
un  libnvidia-common                                                             (no description available)
ii  libnvidia-common-555            555.52.04-0ubuntu0~gpu24.04.1  all          Shared files used by the NVIDIA libraries
un  libnvidia-compute                                                            (no description available)
ii  libnvidia-compute-555:amd64     555.52.04-0ubuntu0~gpu24.04.1  amd64        NVIDIA libcompute package
ii  libnvidia-container-tools       1.16.0-1                       amd64        NVIDIA container runtime library (command-line tools)
ii  libnvidia-container1:amd64      1.16.0-1                       amd64        NVIDIA container runtime library
un  libnvidia-decode                                                             (no description available)
ii  libnvidia-decode-555:amd64      555.52.04-0ubuntu0~gpu24.04.1  amd64        NVIDIA Video Decoding runtime libraries
un  libnvidia-encode                                                             (no description available)
ii  libnvidia-encode-555:amd64      555.52.04-0ubuntu0~gpu24.04.1  amd64        NVENC Video Encoding runtime library
un  libnvidia-extra                                                              (no description available)
ii  libnvidia-extra-555:amd64       555.52.04-0ubuntu0~gpu24.04.1  amd64        Extra libraries for the NVIDIA driver
un  libnvidia-fbc1                                                               (no description available)
ii  libnvidia-fbc1-555:amd64        555.52.04-0ubuntu0~gpu24.04.1  amd64        NVIDIA OpenGL-based Framebuffer Capture runtime library
un  libnvidia-gl                                                                 (no description available)
ii  libnvidia-gl-555:amd64          555.52.04-0ubuntu0~gpu24.04.1  amd64        NVIDIA OpenGL/GLX/EGL/GLES GLVND libraries and Vulkan ICD
un  libnvidia-ml.so.1                                                            (no description available)
un  nvidia-384                                                                   (no description available)
un  nvidia-390                                                                   (no description available)
un  nvidia-compute-utils                                                         (no description available)
ii  nvidia-compute-utils-555        555.52.04-0ubuntu0~gpu24.04.1  amd64        NVIDIA compute utilities
un  nvidia-container-runtime                                                     (no description available)
un  nvidia-container-runtime-hook                                                (no description available)
ii  nvidia-container-toolkit        1.16.0-1                       amd64        NVIDIA Container toolkit
ii  nvidia-container-toolkit-base   1.16.0-1                       amd64        NVIDIA Container Toolkit Base
ii  nvidia-dkms-555                 555.52.04-0ubuntu0~gpu24.04.1  amd64        NVIDIA DKMS package

 - [x] NVIDIA container library version from `nvidia-container-cli -V`

$ nvidia-container-cli -V
cli-version: 1.16.0
lib-version: 1.16.0
build date: 2024-07-15T13:41+00:00
build revision: 4c2494f16573b585788a42e9c7bee76ecd48c73d
build compiler: x86_64-linux-gnu-gcc-7 7.5.0
build platform: x86_64
build flags: -D_GNU_SOURCE -D_FORTIFY_SOURCE=2 -DNDEBUG -std=gnu11 -O2 -g -fdata-sections -ffunction-sections -fplan9-extensions -fstack-protector -fno-strict-aliasing -fvisibility=hidden -Wall -Wextra -Wcast-align -Wpointer-arith -Wmissing-prototypes -Wnonnull -Wwrite-strings -Wlogical-op -Wformat=2 -Wmissing-format-attribute -Winit-self -Wshadow -Wstrict-prototypes -Wunreachable-code -Wconversion -Wsign-conversion -Wno-unknown-warning-option -Wno-format-extra-args -Wno-gnu-alignof-expression -Wl,-zrelro -Wl,-znow -Wl,-zdefs -Wl,--gc-sections

- [x] Description of the node:

❯ k describe node ip-172-31-10-219
Name:               ip-172-31-10-219
Roles:
Labels:             beta.kubernetes.io/arch=amd64
                    beta.kubernetes.io/os=linux
                    gpu-memory-MiB=15360
                    kubernetes.io/arch=amd64
                    kubernetes.io/hostname=ip-172-31-10-219
                    kubernetes.io/os=linux
                    nvidia.com/gpu.present=true
Annotations:        node.alpha.kubernetes.io/ttl: 0
                    volumes.kubernetes.io/controller-managed-attach-detach: true
CreationTimestamp:  Tue, 16 Jul 2024 18:00:09 +0900
Taints:             nvidia.com/gpu=true:NoSchedule
Unschedulable:      false
Lease:
  HolderIdentity:   ip-172-31-10-219
  AcquireTime:
  RenewTime:        Tue, 16 Jul 2024 19:05:57 +0900
Conditions:
  Type             Status  LastHeartbeatTime                 LastTransitionTime                Reason                       Message
  MemoryPressure   False   Tue, 16 Jul 2024 19:01:30 +0900   Tue, 16 Jul 2024 18:00:09 +0900   KubeletHasSufficientMemory   kubelet has sufficient memory available
  DiskPressure     False   Tue, 16 Jul 2024 19:01:30 +0900   Tue, 16 Jul 2024 18:00:09 +0900   KubeletHasNoDiskPressure     kubelet has no disk pressure
  PIDPressure      False   Tue, 16 Jul 2024 19:01:30 +0900   Tue, 16 Jul 2024 18:00:09 +0900   KubeletHasSufficientPID      kubelet has sufficient PID available
  Ready            True    Tue, 16 Jul 2024 19:01:30 +0900   Tue, 16 Jul 2024 18:00:19 +0900   KubeletReady                 kubelet is posting ready status
Addresses:
  InternalIP:  172.31.10.219
  Hostname:    ip-172-31-10-219
Capacity:
  cpu:                4
  ephemeral-storage:  100476656Ki
  hugepages-1Gi:      0
  hugepages-2Mi:      0
  memory:             16167172Ki
  pods:               110
Allocatable:
  cpu:                4
  ephemeral-storage:  92599286017
  hugepages-1Gi:      0
  hugepages-2Mi:      0
  memory:             16064772Ki
  pods:               110
System Info:
  Machine ID:                 ec234d677095858fadc478a73216d0cd
  System UUID:                ec234d67-7095-858f-adc4-78a73216d0cd
  Boot ID:                    8a0f75e9-7ed9-4d37-87c6-c1732b461aac
  Kernel Version:             6.8.0-1010-aws
  OS Image:                   Ubuntu 24.04 LTS
  Operating System:           linux
  Architecture:               amd64
  Container Runtime Version:  containerd://1.7.18
  Kubelet Version:            v1.30.2+k0s
  Kube-Proxy Version:         v1.30.2+k0s
PodCIDR:                      10.244.3.0/24
PodCIDRs:                     10.244.3.0/24
Non-terminated Pods:          (4 in total)
  Namespace             Name                              CPU Requests  CPU Limits  Memory Requests  Memory Limits  Age
  kube-system           konnectivity-agent-bthh5          0 (0%)        0 (0%)      0 (0%)           0 (0%)         65m
  kube-system           kube-proxy-wcbz6                  0 (0%)        0 (0%)      0 (0%)           0 (0%)         65m
  kube-system           kube-router-vt5f9                 250m (6%)     0 (0%)      16Mi (0%)        0 (0%)         65m
  nvidia-device-plugin  nvdp-nvidia-device-plugin-rkq99   0 (0%)        0 (0%)      0 (0%)           0 (0%)         11m
Allocated resources:
  (Total limits may be over 100 percent, i.e., overcommitted.)
  Resource           Requests    Limits
  cpu                250m (6%)   0 (0%)
  memory             16Mi (0%)   0 (0%)
  ephemeral-storage  0 (0%)      0 (0%)
  hugepages-1Gi      0 (0%)      0 (0%)
  hugepages-2Mi      0 (0%)      0 (0%)
Events:

elezar commented 2 months ago

Note that your default containerd runtime is set to:

default_runtime_name = "runc"

This means that unless you trigger the device plugin containers to use the nvidia runtime, they will not have the required access to the devices and driver libraries.
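
If you do want to make nvidia the default runtime instead, a minimal sketch of the change in the containerd.toml posted above would be the following (it relies on the runtimes.nvidia entry that is already defined in that config):

[plugins."io.containerd.grpc.v1.cri".containerd]
  # route all containers through the nvidia runtime defined under ...runtimes.nvidia
  default_runtime_name = "nvidia"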

The simplest way to address this (if you don't want to update the default runtime to nvidia) is to use a runtime class. First create one on your cluster:

kubectl apply -f - <<EOF
apiVersion: node.k8s.io/v1
handler: nvidia
kind: RuntimeClass
metadata:
  name: nvidia
EOF

Then instruct the device plugin to use this runtime class when installing it:

helm upgrade -i nvdp nvdp/nvidia-device-plugin \
    --version=0.15.1 \
    --set runtimeClassName=nvidia \
    --namespace nvidia-device-plugin \
    --create-namespace \
    --set-file config.map.config=/tmp/nvidia-config.yaml
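
Once the plugin pod restarts cleanly, the GPU should be advertised as an allocatable resource on the node. A quick check might look like this (node name taken from the node description above):

kubectl get pods -n nvidia-device-plugin
# nvidia.com/gpu: 1 should appear under Allocatable once the plugin has registered
kubectl describe node ip-172-31-10-219 | grep -A 8 Allocatable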

Note that workloads also need to use the nvidia runtime in this case.
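
For example, a minimal GPU workload under this setup might look like the sketch below (the pod name is a placeholder, the image is the cuda-sample vectoradd image used earlier in this issue, and the toleration matches the taint set on the node):

apiVersion: v1
kind: Pod
metadata:
  name: cuda-vectoradd            # hypothetical name
spec:
  runtimeClassName: nvidia        # required so the nvidia runtime injects the driver libraries
  restartPolicy: OnFailure
  tolerations:
    - key: nvidia.com/gpu
      operator: Exists
      effect: NoSchedule
  containers:
    - name: cuda-vectoradd
      image: nvcr.io/nvidia/k8s/cuda-sample:vectoradd-cuda10.2
      resources:
        limits:
          nvidia.com/gpu: 1       # consumes the resource advertised by the device plugin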

EKami commented 2 months ago

Thanks a bunch, setting nvidia as the default runtime indeed solved my problem :)