NVIDIA / k8s-device-plugin

NVIDIA device plugin for Kubernetes
Apache License 2.0
2.86k stars 634 forks source link

Why doesn't my Kubernetes node recognize the GPU after successfully installing my drivers and Containerd? #788

Open xiaoxiaoboyyds opened 5 months ago

xiaoxiaoboyyds commented 5 months ago

The template below is mostly useful for bug reports and support questions. Feel free to remove anything which doesn't apply to you and add more information where it makes sense.

Important Note: NVIDIA AI Enterprise customers can get support from NVIDIA Enterprise support. Please open a case here.

1. Quick Debug Information

2. Issue or feature description

Why doesn't my Kubernetes node recognize the GPU after successfully installing my drivers and Containerd? This is the content of /etc/containerd/config.toml.

disabled_plugins = []
imports = []
oom_score = 0
plugin_dir = ""
required_plugins = []
root = "/var/lib/containerd"
state = "/run/containerd"
temp = ""
version = 2

[cgroup]
  path = ""

[debug]
  address = ""
  format = ""
  gid = 0
  level = ""
  uid = 0

[grpc]
  address = "/run/containerd/containerd.sock"
  gid = 0
  max_recv_message_size = 16777216
  max_send_message_size = 16777216
  tcp_address = ""
  tcp_tls_ca = ""
  tcp_tls_cert = ""
  tcp_tls_key = ""
  uid = 0

[metrics]
  address = ""
  grpc_histogram = false

[plugins]

  [plugins."io.containerd.gc.v1.scheduler"]
    deletion_threshold = 0
    mutation_threshold = 100
    pause_threshold = 0.02
    schedule_delay = "0s"
    startup_delay = "100ms"

  [plugins."io.containerd.grpc.v1.cri"]
    cdi_spec_dirs = ["/etc/cdi", "/var/run/cdi"]
    device_ownership_from_security_context = false
    disable_apparmor = false
    disable_cgroup = false
    disable_hugetlb_controller = true
    disable_proc_mount = false
    disable_tcp_service = true
    drain_exec_sync_io_timeout = "0s"
    enable_cdi = false
    enable_selinux = false
    enable_tls_streaming = false
    enable_unprivileged_icmp = false
    enable_unprivileged_ports = false
    ignore_image_defined_volumes = false
    image_pull_progress_timeout = "1m0s"
    max_concurrent_downloads = 3
    max_container_log_line_size = 16384
    netns_mounts_under_state_dir = false
    restrict_oom_score_adj = false
    sandbox_image = "registry.aliyuncs.com/google_containers/pause:3.8"
    selinux_category_range = 1024
    stats_collect_period = 10
    stream_idle_timeout = "4h0m0s"
    stream_server_address = "127.0.0.1"
    stream_server_port = "0"
    systemd_cgroup = false
    tolerate_missing_hugetlb_controller = true
    unset_seccomp_profile = ""

    [plugins."io.containerd.grpc.v1.cri".cni]
      bin_dir = "/opt/cni/bin"
      conf_dir = "/etc/cni/net.d"
      conf_template = ""
      ip_pref = ""
      max_conf_num = 1
      setup_serially = false

    [plugins."io.containerd.grpc.v1.cri".containerd]
      default_runtime_name = "runc"
      disable_snapshot_annotations = true
      discard_unpacked_layers = false
      ignore_blockio_not_enabled_errors = false
      ignore_rdt_not_enabled_errors = false
      no_pivot = false
      snapshotter = "overlayfs"

      [plugins."io.containerd.grpc.v1.cri".containerd.default_runtime]
        base_runtime_spec = ""
        cni_conf_dir = ""
        cni_max_conf_num = 0
        container_annotations = []
        pod_annotations = []
        privileged_without_host_devices = false
        privileged_without_host_devices_all_devices_allowed = false
        runtime_engine = ""
        runtime_path = ""
        runtime_root = ""
        runtime_type = ""
        sandbox_mode = ""
        snapshotter = ""

        [plugins."io.containerd.grpc.v1.cri".containerd.default_runtime.options]

      [plugins."io.containerd.grpc.v1.cri".containerd.runtimes]

        [plugins."io.containerd.grpc.v1.cri".containerd.runtimes.nvidia]
          base_runtime_spec = ""
          cni_conf_dir = ""
          cni_max_conf_num = 0
          container_annotations = []
          pod_annotations = []
          privileged_without_host_devices = false
          privileged_without_host_devices_all_devices_allowed = false
          runtime_engine = ""
          runtime_path = ""
          runtime_root = ""
          runtime_type = "io.containerd.runc.v2"
          sandbox_mode = "podsandbox"
          snapshotter = ""

          [plugins."io.containerd.grpc.v1.cri".containerd.runtimes.nvidia.options]
            BinaryName = "/usr/bin/nvidia-container-runtime"
            CriuImagePath = ""
            CriuPath = ""
            CriuWorkPath = ""
            IoGid = 0
            IoUid = 0
            NoNewKeyring = false
            NoPivotRoot = false
            Root = ""
            ShimCgroup = ""

        [plugins."io.containerd.grpc.v1.cri".containerd.runtimes.runc]
          base_runtime_spec = ""
          cni_conf_dir = ""
          cni_max_conf_num = 0
          container_annotations = []
          pod_annotations = []
          privileged_without_host_devices = false
          privileged_without_host_devices_all_devices_allowed = false
          runtime_engine = ""
          runtime_path = ""
          runtime_root = ""
          runtime_type = "io.containerd.runc.v2"
          sandbox_mode = "podsandbox"
          snapshotter = ""

          [plugins."io.containerd.grpc.v1.cri".containerd.runtimes.runc.options]
            BinaryName = ""
            CriuImagePath = ""
            CriuPath = ""
            CriuWorkPath = ""
            IoGid = 0
            IoUid = 0
            NoNewKeyring = false
            NoPivotRoot = false
            Root = ""
            ShimCgroup = ""

      [plugins."io.containerd.grpc.v1.cri".containerd.untrusted_workload_runtime]
        base_runtime_spec = ""
        cni_conf_dir = ""
        cni_max_conf_num = 0
        container_annotations = []
        pod_annotations = []
        privileged_without_host_devices = false
        privileged_without_host_devices_all_devices_allowed = false
        runtime_engine = ""
        runtime_path = ""
        runtime_root = ""
        runtime_type = ""
        sandbox_mode = ""
        snapshotter = ""

        [plugins."io.containerd.grpc.v1.cri".containerd.untrusted_workload_runtime.options]

    [plugins."io.containerd.grpc.v1.cri".image_decryption]
      key_model = "node"

    [plugins."io.containerd.grpc.v1.cri".registry]
      config_path = "/etc/containerd/certs.d"

      [plugins."io.containerd.grpc.v1.cri".registry.auths]

      [plugins."io.containerd.grpc.v1.cri".registry.configs]

      [plugins."io.containerd.grpc.v1.cri".registry.headers]

      [plugins."io.containerd.grpc.v1.cri".registry.mirrors]

    [plugins."io.containerd.grpc.v1.cri".x509_key_pair_streaming]
      tls_cert_file = ""
      tls_key_file = ""

  [plugins."io.containerd.internal.v1.opt"]
    path = "/opt/containerd"

  [plugins."io.containerd.internal.v1.restart"]
    interval = "10s"

  [plugins."io.containerd.internal.v1.tracing"]
    sampling_ratio = 1.0
    service_name = "containerd"

  [plugins."io.containerd.metadata.v1.bolt"]
    content_sharing_policy = "shared"

  [plugins."io.containerd.monitor.v1.cgroups"]
    no_prometheus = false

  [plugins."io.containerd.nri.v1.nri"]
    disable = true
    disable_connections = false
    plugin_config_path = "/etc/nri/conf.d"
    plugin_path = "/opt/nri/plugins"
    plugin_registration_timeout = "5s"
    plugin_request_timeout = "2s"
    socket_path = "/var/run/nri/nri.sock"

  [plugins."io.containerd.runtime.v1.linux"]
    no_shim = false
    runtime = "runc"
    runtime_root = ""
    shim = "containerd-shim"
    shim_debug = false

  [plugins."io.containerd.runtime.v2.task"]
    platforms = ["linux/amd64"]
    sched_core = false

  [plugins."io.containerd.service.v1.diff-service"]
    default = ["walking"]

  [plugins."io.containerd.service.v1.tasks-service"]
    blockio_config_file = ""
    rdt_config_file = ""

  [plugins."io.containerd.snapshotter.v1.aufs"]
    root_path = ""

  [plugins."io.containerd.snapshotter.v1.btrfs"]
    root_path = ""

  [plugins."io.containerd.snapshotter.v1.devmapper"]
    async_remove = false
    base_image_size = ""
    discard_blocks = false
    fs_options = ""
    fs_type = ""
    pool_name = ""
    root_path = ""

  [plugins."io.containerd.snapshotter.v1.native"]
    root_path = ""

  [plugins."io.containerd.snapshotter.v1.overlayfs"]
    root_path = ""
    upperdir_label = false

  [plugins."io.containerd.snapshotter.v1.zfs"]
    root_path = ""

  [plugins."io.containerd.tracing.processor.v1.otlp"]
    endpoint = ""
    insecure = false
    protocol = ""

  [plugins."io.containerd.transfer.v1.local"]
    config_path = ""
    max_concurrent_downloads = 3
    max_concurrent_uploaded_layers = 3

    [[plugins."io.containerd.transfer.v1.local".unpack_config]]
      differ = ""
      platform = "linux/amd64"
      snapshotter = "overlayfs"

[proxy_plugins]

[stream_processors]

  [stream_processors."io.containerd.ocicrypt.decoder.v1.tar"]
    accepts = ["application/vnd.oci.image.layer.v1.tar+encrypted"]
    args = ["--decryption-keys-path", "/etc/containerd/ocicrypt/keys"]
    env = ["OCICRYPT_KEYPROVIDER_CONFIG=/etc/containerd/ocicrypt/ocicrypt_keyprovider.conf"]
    path = "ctd-decoder"
    returns = "application/vnd.oci.image.layer.v1.tar"

  [stream_processors."io.containerd.ocicrypt.decoder.v1.tar.gzip"]
    accepts = ["application/vnd.oci.image.layer.v1.tar+gzip+encrypted"]
    args = ["--decryption-keys-path", "/etc/containerd/ocicrypt/keys"]
    env = ["OCICRYPT_KEYPROVIDER_CONFIG=/etc/containerd/ocicrypt/ocicrypt_keyprovider.conf"]
    path = "ctd-decoder"
    returns = "application/vnd.oci.image.layer.v1.tar+gzip"

[timeouts]
  "io.containerd.timeout.bolt.open" = "0s"
  "io.containerd.timeout.metrics.shimstats" = "2s"
  "io.containerd.timeout.shim.cleanup" = "5s"
  "io.containerd.timeout.shim.load" = "5s"
  "io.containerd.timeout.shim.shutdown" = "3s"
  "io.containerd.timeout.task.state" = "2s"

[ttrpc]
  address = ""
  gid = 0
  uid = 0

This is the output of nvidia-smi:

Wed Jun 26 11:10:27 2024
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 390.157                Driver Version: 390.157                   |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|===============================+======================+======================|
|   0  Tesla V100-SXM2...  Off  | 00000000:00:08.0 Off |                    0 |
| N/A   31C    P0    21W / 300W |      0MiB / 32510MiB |      0%      Default |
+-------------------------------+----------------------+----------------------+

+-----------------------------------------------------------------------------+
| Processes:                                                       GPU Memory |
|  GPU       PID   Type   Process name                             Usage      |
|=============================================================================|
|  No running processes found                                                 |
+-----------------------------------------------------------------------------+

but

Name:               edgenode-4a9b
Roles:              agent,edge
Labels:             beta.kubernetes.io/arch=amd64
                    beta.kubernetes.io/os=linux
                    kubernetes.io/arch=amd64
                    kubernetes.io/hostname=edgenode-4a9b
                    kubernetes.io/os=linux
                    node-role.kubernetes.io/agent=
                    node-role.kubernetes.io/edge=
Annotations:        node.alpha.kubernetes.io/ttl: 0
                    volumes.kubernetes.io/controller-managed-attach-detach: true
CreationTimestamp:  Wed, 26 Jun 2024 10:48:04 +0800
Taints:             node-role.kubernetes.io/edge:NoSchedule
Unschedulable:      false
Lease:
  HolderIdentity:  edgenode-4a9b
  AcquireTime:     <unset>
  RenewTime:       Wed, 26 Jun 2024 10:48:25 +0800
Conditions:
  Type             Status  LastHeartbeatTime                 LastTransitionTime                Reason                       Message
  ----             ------  -----------------                 ------------------                ------                       -------
  MemoryPressure   False   Wed, 26 Jun 2024 10:48:04 +0800   Wed, 26 Jun 2024 10:48:03 +0800   KubeletHasSufficientMemory   kubelet has sufficient memory available
  DiskPressure     False   Wed, 26 Jun 2024 10:48:04 +0800   Wed, 26 Jun 2024 10:48:03 +0800   KubeletHasNoDiskPressure     kubelet has no disk pressure
  PIDPressure      False   Wed, 26 Jun 2024 10:48:04 +0800   Wed, 26 Jun 2024 10:48:03 +0800   KubeletHasSufficientPID      kubelet has sufficient PID available
  Ready            True    Wed, 26 Jun 2024 10:48:04 +0800   Wed, 26 Jun 2024 10:48:03 +0800   EdgeReady                    edge is posting ready status. AppArmor enabled
Addresses:
  InternalIP:  10.206.16.8
  Hostname:    edgenode-4a9b
Capacity:
  cpu:                10
  ephemeral-storage:  1056753416Ki
  hugepages-1Gi:      0
  hugepages-2Mi:      0
  memory:             40284132Ki
  pods:               110
Allocatable:
  cpu:                10
  ephemeral-storage:  973903946574
  hugepages-1Gi:      0
  hugepages-2Mi:      0
  memory:             40181732Ki
  pods:               110
System Info:
  Machine ID:                 f969dca6f9284efcbd52866be3f39259
  System UUID:                f969dca6-f928-4efc-bd52-866be3f39259
  Boot ID:                    d91faf3a-fe03-4d3c-9873-44cba386a20c
  Kernel Version:             5.15.0-107-generic
  OS Image:                   Ubuntu 22.04 LTS
  Operating System:           linux
  Architecture:               amd64
  Container Runtime Version:  containerd://1.7.12
  Kubelet Version:            v1.23.15-kubeedge-v1.13.0
  Kube-Proxy Version:         v0.0.0-master+$Format:%H$
PodCIDR:                      192.168.7.0/24
PodCIDRs:                     192.168.7.0/24
Non-terminated Pods:          (0 in total)
  Namespace                   Name    CPU Requests  CPU Limits  Memory Requests  Memory Limits  Age
  ---------                   ----    ------------  ----------  ---------------  -------------  ---
Allocated resources:
  (Total limits may be over 100 percent, i.e., overcommitted.)
  Resource           Requests  Limits
  --------           --------  ------
  cpu                0 (0%)    0 (0%)
  memory             0 (0%)    0 (0%)
  ephemeral-storage  0 (0%)    0 (0%)
  hugepages-1Gi      0 (0%)    0 (0%)
  hugepages-2Mi      0 (0%)    0 (0%)
Events:              <none>
root@master01:/home/ubuntu# kubectl describe  nodes edgenode-4a9b
Name:               edgenode-4a9b
Roles:              agent,edge
Labels:             beta.kubernetes.io/arch=amd64
                    beta.kubernetes.io/os=linux
                    kubernetes.io/arch=amd64
                    kubernetes.io/hostname=edgenode-4a9b
                    kubernetes.io/os=linux
                    node-role.kubernetes.io/agent=
                    node-role.kubernetes.io/edge=
Annotations:        node.alpha.kubernetes.io/ttl: 0
                    volumes.kubernetes.io/controller-managed-attach-detach: true
CreationTimestamp:  Wed, 26 Jun 2024 10:48:04 +0800
Taints:             node-role.kubernetes.io/edge:NoSchedule
Unschedulable:      false
Lease:
  HolderIdentity:  edgenode-4a9b
  AcquireTime:     <unset>
  RenewTime:       Wed, 26 Jun 2024 10:53:53 +0800
Conditions:
  Type             Status  LastHeartbeatTime                 LastTransitionTime                Reason                       Message
  ----             ------  -----------------                 ------------------                ------                       -------
  MemoryPressure   False   Wed, 26 Jun 2024 10:53:11 +0800   Wed, 26 Jun 2024 10:48:03 +0800   KubeletHasSufficientMemory   kubelet has sufficient memory available
  DiskPressure     False   Wed, 26 Jun 2024 10:53:11 +0800   Wed, 26 Jun 2024 10:48:03 +0800   KubeletHasNoDiskPressure     kubelet has no disk pressure
  PIDPressure      False   Wed, 26 Jun 2024 10:53:11 +0800   Wed, 26 Jun 2024 10:48:03 +0800   KubeletHasSufficientPID      kubelet has sufficient PID available
  Ready            True    Wed, 26 Jun 2024 10:53:11 +0800   Wed, 26 Jun 2024 10:48:03 +0800   EdgeReady                    edge is posting ready status. AppArmor enabled
Addresses:
  InternalIP:  10.206.16.8
  Hostname:    edgenode-4a9b
Capacity:
  cpu:                10
  ephemeral-storage:  1056753416Ki
  hugepages-1Gi:      0
  hugepages-2Mi:      0
  memory:             40284132Ki
  pods:               110
Allocatable:
  cpu:                10
  ephemeral-storage:  973903946574
  hugepages-1Gi:      0
  hugepages-2Mi:      0
  memory:             40181732Ki
  pods:               110
System Info:
  Machine ID:                 f969dca6f9284efcbd52866be3f39259
  System UUID:                f969dca6-f928-4efc-bd52-866be3f39259
  Boot ID:                    d91faf3a-fe03-4d3c-9873-44cba386a20c
  Kernel Version:             5.15.0-107-generic
  OS Image:                   Ubuntu 22.04 LTS
  Operating System:           linux
  Architecture:               amd64
  Container Runtime Version:  containerd://1.7.12
  Kubelet Version:            v1.23.15-kubeedge-v1.13.0
  Kube-Proxy Version:         v0.0.0-master+$Format:%H$
PodCIDR:                      192.168.7.0/24
PodCIDRs:                     192.168.7.0/24
Non-terminated Pods:          (0 in total)
  Namespace                   Name    CPU Requests  CPU Limits  Memory Requests  Memory Limits  Age
  ---------                   ----    ------------  ----------  ---------------  -------------  ---
Allocated resources:
  (Total limits may be over 100 percent, i.e., overcommitted.)
  Resource           Requests  Limits
  --------           --------  ------
  cpu                0 (0%)    0 (0%)
  memory             0 (0%)    0 (0%)
  ephemeral-storage  0 (0%)    0 (0%)
  hugepages-1Gi      0 (0%)    0 (0%)
  hugepages-2Mi      0 (0%)    0 (0%)
Events:              <none>
### Tasks
ZYWNB666 commented 5 months ago

First, the value of default_runtime_name in containerd should be nvidia. After setting the value, you need to follow the documentation to enable GPU Support in Kubernetes

Just one command.

$ kubectl create -f  https://raw.githubusercontent.com/NVIDIA/k8s-device-plugin/v0.15.0/deployments/static/nvidia-device-plugin.yml

Remember to restart containerd and kubelet

andrewmeyer commented 4 months ago

Same error with a Tesla card running a Rancher deployment:

root@nvidia-device-plugin-daemonset-nvbr7:/# nvidia-device-plugin 
2024/07/21 15:56:05 Starting FS watcher.
2024/07/21 15:56:05 Starting OS watcher.
2024/07/21 15:56:05 Starting Plugins.
2024/07/21 15:56:05 Loading configuration.
2024/07/21 15:56:05 Updating config with default resource matching patterns.
2024/07/21 15:56:05 
Running with config:
{
  "version": "v1",
  "flags": {
    "migStrategy": "none",
    "failOnInitError": false,
    "nvidiaDriverRoot": "/",
    "gdsEnabled": false,
    "mofedEnabled": false,
    "plugin": {
      "passDeviceSpecs": false,
      "deviceListStrategy": "envvar",
      "deviceIDStrategy": "uuid"
    }
  },
  "resources": {
    "gpus": [
      {
        "pattern": "*",
        "name": "nvidia.com/gpu"
      }
    ]
  },
  "sharing": {
    "timeSlicing": {}
  }
}
2024/07/21 15:56:05 Retreiving plugins.
2024/07/21 15:56:05 Detected non-NVML platform: could not load NVML: libnvidia-ml.so.1: cannot open shared object file: No such file or directory
2024/07/21 15:56:05 Detected non-Tegra platform: /sys/devices/soc0/family file not found
2024/07/21 15:56:05 Incompatible platform detected
2024/07/21 15:56:05 If this is a GPU node, did you configure the NVIDIA Container Toolkit?
2024/07/21 15:56:05 You can check the prerequisites at: https://github.com/NVIDIA/k8s-device-plugin#prerequisites
2024/07/21 15:56:05 You can learn how to set the runtime at: https://github.com/NVIDIA/k8s-device-plugin#quick-start
2024/07/21 15:56:05 If this is not a GPU node, you should set up a toleration or nodeSelector to only deploy this plugin on GPU nodes
2024/07/21 15:56:05 No devices found. Waiting indefinitely.

EDIT:

After reviewing more documentation and fixing some issues within my config file, I get a different error:

}
I0721 17:00:46.190858      46 main.go:317] Retrieving plugins.
E0721 17:00:46.191062      46 factory.go:87] Incompatible strategy detected auto
E0721 17:00:46.191076      46 factory.go:88] If this is a GPU node, did you configure the NVIDIA Container Toolkit?
E0721 17:00:46.191084      46 factory.go:89] You can check the prerequisites at: https://github.com/NVIDIA/k8s-device-plugin#prerequisites
E0721 17:00:46.191093      46 factory.go:90] You can learn how to set the runtime at: https://github.com/NVIDIA/k8s-device-plugin#quick-start
E0721 17:00:46.191101      46 factory.go:91] If this is not a GPU node, you should set up a toleration or nodeSelector to only deploy this plugin on GPU nodes
I0721 17:00:46.191112      46 main.go:346] No devices found. Waiting indefinitely.
MasonXon commented 2 months ago

May I ask whether this problem has been solved?

harshsavasil commented 1 month ago

Can someone share any findings on this issue? I've spent the entire last weekend to get this working. But can't seem to make it work.

MasonXon commented 1 month ago

I patched daemonset nvdp-nvidia-device-plugin with following command:

kubectl -n nvidia-device-plugin patch ds nvdp-nvidia-device-plugin \
  --type='json' \
  -p='[{"op": "add", "path": "/spec/template/spec/containers/0/args", "value": ["--device-discovery-strategy=tegra"]}]'

This is equivalent to manually specifying the detection strategy to tegra. My GPU is 4090.

harshsavasil commented 1 month ago

Thanks @MasonXon ! It worked.

mattlgy commented 1 month ago

I had to set the default_runtime_name to nvidia, as @ZYWNB666 recommended. nvidia-ctk runtime configure --runtime=containerd added all the runtime configs for the CTK, but did not change that line.

After manually editing /etc/containerd/config.toml, restarting containerd via systemctl, and restarting the deamonset pod it worked!

klueska commented 1 month ago

Adding --set-as-default would have given you what you want:

nvidia-ctk runtime configure --runtime=containerd --set-as-default