google / gvisor

Application Kernel for Containers
https://gvisor.dev
Apache License 2.0

Starting a container with gVisor and an NVIDIA GPU hangs #10997

Closed PSKP-95 closed 3 weeks ago

PSKP-95 commented 1 month ago

Description

Using a g4dn.xlarge EC2 instance on AWS, which has a Tesla T4. Installed NVIDIA driver 550.90.07, which is supported by nvproxy (checked by running runsc nvproxy list-supported-drivers).

(screenshot: output of runsc nvproxy list-supported-drivers)
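For reference, a check along these lines (assuming runsc is installed at /usr/local/bin) confirms the installed driver version appears in nvproxy's supported list:

# Compare the installed driver version against runsc's supported-driver list (sketch).
DRIVER=$(nvidia-smi --query-gpu=driver_version --format=csv,noheader | head -n1)
echo "installed driver: ${DRIVER}"
sudo /usr/local/bin/runsc nvproxy list-supported-drivers | grep -F "${DRIVER}"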

Then installed the nvidia-container-toolkit using the commands below and configured it for containerd. This added the runtime to /etc/containerd/config.toml.

curl -s -L https://nvidia.github.io/libnvidia-container/stable/rpm/nvidia-container-toolkit.repo | \
  sudo tee /etc/yum.repos.d/nvidia-container-toolkit.repo
sudo yum install -y nvidia-container-toolkit

sudo nvidia-ctk runtime configure --runtime=containerd
sudo systemctl restart containerd

As per the gVisor documentation, created a bash script with the content below and put it on PATH (/usr/local/bin/runscgpu).

#!/bin/bash
exec /usr/local/bin/runsc --nvproxy --debug-log=/tmp/runsc/ --debug --strace "$@"
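The wrapper has to be executable for the runtime to pick it up; assuming the path above:

sudo chmod +x /usr/local/bin/runscgpu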

Here is the nvidia-container-runtime configuration (/etc/nvidia-container-runtime/config.toml). Added runscgpu as a runtime.

#accept-nvidia-visible-devices-as-volume-mounts = false
#accept-nvidia-visible-devices-envvar-when-unprivileged = true
disable-require = false
supported-driver-capabilities = "compat32,compute,display,graphics,ngx,utility,video"
#swarm-resource = "DOCKER_RESOURCE_GPU"

[nvidia-container-cli]
#debug = "/var/log/nvidia-container-toolkit.log"
environment = []
#ldcache = "/etc/ld.so.cache"
ldconfig = "@/sbin/ldconfig"
load-kmods = true
#no-cgroups = false
#path = "/usr/bin/nvidia-container-cli"
#root = "/run/nvidia/driver"
#user = "root:video"

[nvidia-container-runtime]
debug = "/var/log/nvidia-container-runtime.log"
log-level = "info"
mode = "legacy"
runtimes = ["runscgpu", "runc"]

[nvidia-container-runtime.modes]

[nvidia-container-runtime.modes.cdi]
annotation-prefixes = ["cdi.k8s.io/"]
default-kind = "nvidia.com/gpu"
spec-dirs = ["/etc/cdi", "/var/run/cdi"]

[nvidia-container-runtime.modes.csv]
mount-spec-path = "/etc/nvidia-container-runtime/host-files-for-container.d"

[nvidia-container-runtime-hook]
path = "nvidia-container-runtime-hook"
skip-mode-detection = false

[nvidia-ctk]
path = "nvidia-ctk"

Steps to reproduce

pod.json

{
    "metadata": {
        "name": "nvidia-sandbox",
        "namespace": "default",
        "attempt": 1,
        "uid": "4dis4d93djaidwnduwk28bcsb"
    },
    "linux": {
    },
    "log_directory": "/tmp",
    "port_mappings": [
        {
          "protocol": 0,
          "container_port": 80,
          "host_port": 8081
        }
    ]
}

container.json

{
  "metadata": {
      "name": "nginx-demo"
    },
  "image":{
      "image": "nvidia/cuda:12.4.1-base-ubuntu22.04"
    },
  "command": ["nvidia-smi"],
  "log_path":"nvidismi.0.log",
  "linux": {
  },
  "port": [
    {
      "container_port": 80,
      "host_port": 8080,
      "protocol": "TCP"
    }
  ]
}
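For completeness, the pod and container can be started with crictl roughly as below (a sketch; the --runtime handler name must match the containerd config, "nvidia" here):

# Reproduce with crictl (sketch).
POD_ID=$(sudo crictl runp --runtime=nvidia pod.json)
CTR_ID=$(sudo crictl create "${POD_ID}" container.json pod.json)
sudo crictl start "${CTR_ID}"     # this is the step that hangs
sudo crictl logs "${CTR_ID}"      # expected: nvidia-smi output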

runsc version

runsc version release-20240930.0
spec: 1.1.0-rc.1

docker version (if using docker)

containerd 1.7.22

uname

Linux ip-172-21-148-200.eu-central-1.compute.internal 6.1.109-118.189.amzn2023.x86_64 #1 SMP PREEMPT_DYNAMIC Tue Sep 10 08:59:12 UTC 2024 x86_64 x86_64 x86_64 GNU/Linux

kubectl (if using Kubernetes)

No response

repo state (if built from source)

No response

runsc debug logs (if available)

[runsc.log.20241006-163925.472943.create.txt](https://github.com/user-attachments/files/17270241/runsc.log.20241006-163925.472943.create.txt)
[runsc.log.20241006-163925.472943.gofer.txt](https://github.com/user-attachments/files/17270242/runsc.log.20241006-163925.472943.gofer.txt)
[runsc.log.20241006-163925.472943.boot.txt](https://github.com/user-attachments/files/17270243/runsc.log.20241006-163925.472943.boot.txt)
[runsc.log.20241006-163925.872791.start.txt](https://github.com/user-attachments/files/17270244/runsc.log.20241006-163925.872791.start.txt)
[runsc.log.20241006-163952.772808.create.txt](https://github.com/user-attachments/files/17270245/runsc.log.20241006-163952.772808.create.txt)
[runsc.log.20241006-163952.832728.gofer.txt](https://github.com/user-attachments/files/17270246/runsc.log.20241006-163952.832728.gofer.txt)
[runsc.log.20241006-163952.832728.start.txt](https://github.com/user-attachments/files/17270247/runsc.log.20241006-163952.832728.start.txt)

PSKP-95 commented 1 month ago

From host

(screenshot: nvidia-smi output on the host)

With runc

(screenshot: nvidia-smi output in the container with runc)

PSKP-95 commented 1 month ago
Logs from containerd
Oct 06 17:12:11 ip-172-21-148-200.eu-central-1.compute.internal containerd[3328]: time="2024-10-06T17:12:11.657866697Z" level=info msg="Connect containerd service"
Oct 06 17:12:11 ip-172-21-148-200.eu-central-1.compute.internal containerd[3328]: time="2024-10-06T17:12:11.657899997Z" level=info msg="using legacy CRI server"
Oct 06 17:12:11 ip-172-21-148-200.eu-central-1.compute.internal containerd[3328]: time="2024-10-06T17:12:11.657907197Z" level=info msg="using experimental NRI integration - disable nri plugin to prevent this"
Oct 06 17:12:11 ip-172-21-148-200.eu-central-1.compute.internal containerd[3328]: time="2024-10-06T17:12:11.658022427Z" level=info msg="Get image filesystem path \"/var/lib/containerd/io.containerd.snapshotter.v1.overlayfs\""
Oct 06 17:12:11 ip-172-21-148-200.eu-central-1.compute.internal containerd[3328]: time="2024-10-06T17:12:11.658620264Z" level=info msg="Start subscribing containerd event"
Oct 06 17:12:11 ip-172-21-148-200.eu-central-1.compute.internal containerd[3328]: time="2024-10-06T17:12:11.658667068Z" level=info msg="Start recovering state"
Oct 06 17:12:11 ip-172-21-148-200.eu-central-1.compute.internal containerd[3328]: time="2024-10-06T17:12:11.659069824Z" level=info msg=serving... address=/run/containerd/containerd.sock.ttrpc
Oct 06 17:12:11 ip-172-21-148-200.eu-central-1.compute.internal containerd[3328]: time="2024-10-06T17:12:11.659124575Z" level=info msg=serving... address=/run/containerd/containerd.sock
Oct 06 17:12:11 ip-172-21-148-200.eu-central-1.compute.internal containerd[3328]: time="2024-10-06T17:12:11.685451404Z" level=info msg="Start event monitor"
Oct 06 17:12:11 ip-172-21-148-200.eu-central-1.compute.internal containerd[3328]: time="2024-10-06T17:12:11.685486448Z" level=info msg="Start snapshots syncer"
Oct 06 17:12:11 ip-172-21-148-200.eu-central-1.compute.internal containerd[3328]: time="2024-10-06T17:12:11.685500663Z" level=info msg="Start cni network conf syncer for default"
Oct 06 17:12:11 ip-172-21-148-200.eu-central-1.compute.internal containerd[3328]: time="2024-10-06T17:12:11.685510377Z" level=info msg="Start streaming server"
Oct 06 17:12:11 ip-172-21-148-200.eu-central-1.compute.internal containerd[3328]: time="2024-10-06T17:12:11.685822080Z" level=info msg="containerd successfully booted in 0.054443s"
Oct 06 17:12:11 ip-172-21-148-200.eu-central-1.compute.internal systemd[1]: Started containerd.service - containerd container runtime.
Oct 06 17:12:15 ip-172-21-148-200.eu-central-1.compute.internal containerd[3328]: time="2024-10-06T17:12:15.404185461Z" level=warning msg="CRI API v1alpha2 is deprecated since containerd v1.7 and removed in containerd v2.0. Use CRI API v1 instead."
Oct 06 17:12:15 ip-172-21-148-200.eu-central-1.compute.internal containerd[3328]: time="2024-10-06T17:12:15.404245703Z" level=info msg="RunPodSandbox for name:\"nvidia-sandbox\"  uid:\"4dis4d93djaidwnduwk28bcsb\"  namespace:\"default\"  attempt:1"
Oct 06 17:12:15 ip-172-21-148-200.eu-central-1.compute.internal containerd[3328]: time="2024-10-06T17:12:15.468335678Z" level=info msg="loading plugin \"io.containerd.event.v1.publisher\"..." runtime=io.containerd.runc.v2 type=io.containerd.event.v1
Oct 06 17:12:15 ip-172-21-148-200.eu-central-1.compute.internal containerd[3328]: time="2024-10-06T17:12:15.468429310Z" level=info msg="loading plugin \"io.containerd.internal.v1.shutdown\"..." runtime=io.containerd.runc.v2 type=io.containerd.internal.v1
Oct 06 17:12:15 ip-172-21-148-200.eu-central-1.compute.internal containerd[3328]: time="2024-10-06T17:12:15.468477556Z" level=info msg="loading plugin \"io.containerd.ttrpc.v1.task\"..." runtime=io.containerd.runc.v2 type=io.containerd.ttrpc.v1
Oct 06 17:12:15 ip-172-21-148-200.eu-central-1.compute.internal containerd[3328]: time="2024-10-06T17:12:15.468581488Z" level=info msg="loading plugin \"io.containerd.ttrpc.v1.pause\"..." runtime=io.containerd.runc.v2 type=io.containerd.ttrpc.v1
Oct 06 17:12:16 ip-172-21-148-200.eu-central-1.compute.internal containerd[3328]: time="2024-10-06T17:12:16.237013494Z" level=info msg="RunPodSandbox for name:\"nvidia-sandbox\"  uid:\"4dis4d93djaidwnduwk28bcsb\"  namespace:\"default\"  attempt:1 returns sandbox id \"66b21837154c0ed4bdbac1787f81e77aaf54cbd394c3817dced34f09a25fe288\""
Oct 06 17:12:21 ip-172-21-148-200.eu-central-1.compute.internal containerd[3328]: time="2024-10-06T17:12:21.115194308Z" level=info msg="CreateContainer within sandbox \"66b21837154c0ed4bdbac1787f81e77aaf54cbd394c3817dced34f09a25fe288\" for container name:\"nginx-demo\""
Oct 06 17:12:21 ip-172-21-148-200.eu-central-1.compute.internal containerd[3328]: time="2024-10-06T17:12:21.131078206Z" level=info msg="CreateContainer within sandbox \"66b21837154c0ed4bdbac1787f81e77aaf54cbd394c3817dced34f09a25fe288\" for name:\"nginx-demo\" returns container id \"c1f5075342edf730d86bb047d7e42acaa51bbd322e6176e18918908dc1f33342\""
Oct 06 17:12:28 ip-172-21-148-200.eu-central-1.compute.internal containerd[3328]: time="2024-10-06T17:12:28.083554423Z" level=info msg="StartContainer for \"c1f5075342edf730d86bb047d7e42acaa51bbd322e6176e18918908dc1f33342\""
<<<<<--------------STUCK HERE. CTRL+C PRINTED BELOW LOGS -------------->>>>>>>>
Oct 06 17:18:37 ip-172-21-148-200.eu-central-1.compute.internal containerd[3328]: time="2024-10-06T17:18:37.484904434Z" level=error msg="Failed to delete containerd task \"c1f5075342edf730d86bb047d7e42acaa51bbd322e6176e18918908dc1f33342\"" error="context deadline exceeded: unknown"
Oct 06 17:18:37 ip-172-21-148-200.eu-central-1.compute.internal containerd[3328]: time="2024-10-06T17:18:37.486735164Z" level=error msg="StartContainer for \"c1f5075342edf730d86bb047d7e42acaa51bbd322e6176e18918908dc1f33342\" failed" error="failed to start containerd task \"c1f5075342edf730d86bb047d7e42acaa51bbd322e6176e18918908dc1f33342\": context canceled: unknown"

Stuck case screenshot

(screenshot: the stuck container start)

ayushr2 commented 1 month ago

I can see from the logs that you have a multi-container setup. The first container is a pause container; the second is the nvidia-smi container. Both have the nvidia-container-runtime-hook prestart hook defined, but only the second container has GPU access (via the NVIDIA_VISIBLE_DEVICES env var). However, while starting the second container, runsc fails with:

D1006 16:39:54.841380       1 controller.go:425] containerManager.StartSubcontainer failed, cid: 36d8cb46d788933393d1fbf9c79a673534e306e43b3c243c1a38905a8bd09e44, args: &{Spec:0xc0003bc750 Conf:0xc000460608 CID:36d8cb46d788933393d1fbf9c79a673534e306e43b3c243c1a38905a8bd09e44 NumGoferFilestoreFDs:1 IsDevIoFilePresent:true GoferMountConfs:[lisafs:self lisafs:none lisafs:none lisafs:none lisafs:none] FilePayload:{Files:[0xc000406030 0xc000406040 0xc000406048 0xc000406050 0xc000406058 0xc000406060 0xc000406068 0xc000406070 0xc000406078 0xc000406080]}}, err: failed to create device files: dev gofer client not found in context

I think https://github.com/google/gvisor/pull/10998 should fix this issue. Let me know if it works.
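If it helps, a runsc binary containing that change can be built from source roughly like this (a sketch following the repo's standard build flow; Docker/Bazel prerequisites apply):

# Build and install runsc from a checkout that includes the fix (sketch).
git clone https://github.com/google/gvisor.git && cd gvisor
# check out the branch/commit with the fix for #10998 before building
mkdir -p bin
make copy TARGETS=runsc DESTDIR=bin/
sudo cp ./bin/runsc /usr/local/bin/runsc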

PSKP-95 commented 1 month ago

@ayushr2 It isn't getting stuck now. Thanks 👍🏼. One container started successfully, but after that all containers remained in the Running state; they should exit after displaying the output of the nvidia-smi command.

(screenshot: containers still shown as running)

gVisor logs

There are some unsupported syscalls and some CPU architecture-related warnings, but it ran successfully once. If you look at runsc.log.20241007-111003.778099.boot.txt, it performs the write syscall with the expected output from nvidia-smi.

runsc.log.20241007-111033.311499.gofer.txt runsc.log.20241007-111033.311499.start.txt runsc.log.20241007-111033.221455.create.txt runsc.log.20241007-111003.778099.create.txt runsc.log.20241007-111003.778099.gofer.txt runsc.log.20241007-111004.411550.start.txt runsc.log.20241007-111003.778099.boot.txt

containerd logs showing the container started successfully

Oct 07 11:08:41 ip-172-21-148-200.eu-central-1.compute.internal containerd[1883]: time="2024-10-07T11:08:41.469777343Z" level=info msg="Connect containerd service"
Oct 07 11:08:41 ip-172-21-148-200.eu-central-1.compute.internal containerd[1883]: time="2024-10-07T11:08:41.469831689Z" level=info msg="using legacy CRI server"
Oct 07 11:08:41 ip-172-21-148-200.eu-central-1.compute.internal containerd[1883]: time="2024-10-07T11:08:41.469842574Z" level=info msg="using experimental NRI integration - disable nri plugin to prevent this"
Oct 07 11:08:41 ip-172-21-148-200.eu-central-1.compute.internal containerd[1883]: time="2024-10-07T11:08:41.479172909Z" level=info msg="Get image filesystem path \"/var/lib/containerd/io.containerd.snapshotter.v1.overlayfs\""
Oct 07 11:08:41 ip-172-21-148-200.eu-central-1.compute.internal containerd[1883]: time="2024-10-07T11:08:41.485851774Z" level=info msg="Start subscribing containerd event"
Oct 07 11:08:41 ip-172-21-148-200.eu-central-1.compute.internal containerd[1883]: time="2024-10-07T11:08:41.485938670Z" level=info msg="Start recovering state"
Oct 07 11:08:41 ip-172-21-148-200.eu-central-1.compute.internal containerd[1883]: time="2024-10-07T11:08:41.486448575Z" level=info msg=serving... address=/run/containerd/containerd.sock.ttrpc
Oct 07 11:08:41 ip-172-21-148-200.eu-central-1.compute.internal containerd[1883]: time="2024-10-07T11:08:41.486510303Z" level=info msg=serving... address=/run/containerd/containerd.sock
Oct 07 11:08:41 ip-172-21-148-200.eu-central-1.compute.internal containerd[1883]: time="2024-10-07T11:08:41.580993470Z" level=info msg="Start event monitor"
Oct 07 11:08:41 ip-172-21-148-200.eu-central-1.compute.internal containerd[1883]: time="2024-10-07T11:08:41.581038642Z" level=info msg="Start snapshots syncer"
Oct 07 11:08:41 ip-172-21-148-200.eu-central-1.compute.internal containerd[1883]: time="2024-10-07T11:08:41.581053527Z" level=info msg="Start cni network conf syncer for default"
Oct 07 11:08:41 ip-172-21-148-200.eu-central-1.compute.internal containerd[1883]: time="2024-10-07T11:08:41.581067104Z" level=info msg="Start streaming server"
Oct 07 11:08:41 ip-172-21-148-200.eu-central-1.compute.internal containerd[1883]: time="2024-10-07T11:08:41.581204031Z" level=info msg="containerd successfully booted in 0.327821s"
Oct 07 11:08:41 ip-172-21-148-200.eu-central-1.compute.internal systemd[1]: Started containerd.service - containerd container runtime.
Oct 07 11:09:04 ip-172-21-148-200.eu-central-1.compute.internal containerd[1883]: time="2024-10-07T11:09:04.505604212Z" level=warning msg="CRI API v1alpha2 is deprecated since containerd v1.7 and removed in containerd v2.0. Use CRI API v1 instead."
Oct 07 11:09:33 ip-172-21-148-200.eu-central-1.compute.internal containerd[1883]: time="2024-10-07T11:09:33.394751064Z" level=info msg="RemovePodSandbox for \"53\""
Oct 07 11:09:33 ip-172-21-148-200.eu-central-1.compute.internal containerd[1883]: time="2024-10-07T11:09:33.394797418Z" level=info msg="Forcibly stopping sandbox \"5378e79be3110f9f35c45eac2dfc3d4046ab289b18511dd1ac834b3fc37043d1\""
Oct 07 11:09:33 ip-172-21-148-200.eu-central-1.compute.internal containerd[1883]: time="2024-10-07T11:09:33.394891114Z" level=info msg="Container to stop \"35a9c8e02065355cc208c3fe4b8d5f1414c41580f42dc9707f441bd551bd4a57\" must be in running or unknown state, current state \"CONTAINER_EXITED\""
Oct 07 11:09:33 ip-172-21-148-200.eu-central-1.compute.internal containerd[1883]: time="2024-10-07T11:09:33.491107559Z" level=info msg="TearDown network for sandbox \"5378e79be3110f9f35c45eac2dfc3d4046ab289b18511dd1ac834b3fc37043d1\" successfully"
Oct 07 11:09:33 ip-172-21-148-200.eu-central-1.compute.internal containerd[1883]: time="2024-10-07T11:09:33.498772834Z" level=warning msg="Failed to get podSandbox status for container event for sandboxID \"5378e79be3110f9f35c45eac2dfc3d4046ab289b18511dd1ac834b3fc37043d1\": an error occurred when try to find sandbox: not found. Sending the event with nil podSandboxStatus."
Oct 07 11:09:33 ip-172-21-148-200.eu-central-1.compute.internal containerd[1883]: time="2024-10-07T11:09:33.498820561Z" level=info msg="RemovePodSandbox \"53\" returns successfully"
Oct 07 11:10:03 ip-172-21-148-200.eu-central-1.compute.internal containerd[1883]: time="2024-10-07T11:10:03.062126416Z" level=info msg="RunPodSandbox for name:\"nvidia-pod\"  uid:\"4dis4d93djbjdwnduwk28bcsb\"  namespace:\"default\"  attempt:1"
Oct 07 11:10:03 ip-172-21-148-200.eu-central-1.compute.internal containerd[1883]: time="2024-10-07T11:10:03.500039611Z" level=info msg="loading plugin \"io.containerd.event.v1.publisher\"..." runtime=io.containerd.runc.v2 type=io.containerd.event.v1
Oct 07 11:10:03 ip-172-21-148-200.eu-central-1.compute.internal containerd[1883]: time="2024-10-07T11:10:03.500095379Z" level=info msg="loading plugin \"io.containerd.internal.v1.shutdown\"..." runtime=io.containerd.runc.v2 type=io.containerd.internal.v1
Oct 07 11:10:03 ip-172-21-148-200.eu-central-1.compute.internal containerd[1883]: time="2024-10-07T11:10:03.500105916Z" level=info msg="loading plugin \"io.containerd.ttrpc.v1.task\"..." runtime=io.containerd.runc.v2 type=io.containerd.ttrpc.v1
Oct 07 11:10:03 ip-172-21-148-200.eu-central-1.compute.internal containerd[1883]: time="2024-10-07T11:10:03.500672555Z" level=info msg="loading plugin \"io.containerd.ttrpc.v1.pause\"..." runtime=io.containerd.runc.v2 type=io.containerd.ttrpc.v1
Oct 07 11:10:04 ip-172-21-148-200.eu-central-1.compute.internal containerd[1883]: time="2024-10-07T11:10:04.461415044Z" level=info msg="RunPodSandbox for name:\"nvidia-pod\"  uid:\"4dis4d93djbjdwnduwk28bcsb\"  namespace:\"default\"  attempt:1 returns sandbox id \"cb1089a47df105217103af9878d001a1826b9d0b0f7b76b55e75e948124d0b0e\""
Oct 07 11:10:13 ip-172-21-148-200.eu-central-1.compute.internal containerd[1883]: time="2024-10-07T11:10:13.248518350Z" level=info msg="CreateContainer within sandbox \"cb1089a47df105217103af9878d001a1826b9d0b0f7b76b55e75e948124d0b0e\" for container name:\"nvidia-demo\""
Oct 07 11:10:13 ip-172-21-148-200.eu-central-1.compute.internal containerd[1883]: time="2024-10-07T11:10:13.272489408Z" level=info msg="CreateContainer within sandbox \"cb1089a47df105217103af9878d001a1826b9d0b0f7b76b55e75e948124d0b0e\" for name:\"nvidia-demo\" returns container id \"a79ba5124dfbefd5e18262183a60d656c655452836a6b9bb1b7d8d22cdf89e03\""
Oct 07 11:10:33 ip-172-21-148-200.eu-central-1.compute.internal containerd[1883]: time="2024-10-07T11:10:33.136682656Z" level=info msg="StartContainer for \"a79ba5124dfbefd5e18262183a60d656c655452836a6b9bb1b7d8d22cdf89e03\""
Oct 07 11:10:37 ip-172-21-148-200.eu-central-1.compute.internal containerd[1883]: time="2024-10-07T11:10:37.455718428Z" level=info msg="StartContainer for \"a79ba5124dfbefd5e18262183a60d656c655452836a6b9bb1b7d8d22cdf89e03\" returns successfully"
ayushr2 commented 1 month ago

Can you show all the log files? Were there no runsc.log.*.wait.txt or runsc.log.*.delete.txt log files? Usually containerd checks if the container has stopped via runsc wait, which will update the container state to STOPPED.

With GPU containers, there is a "device gofer connection". This connection is only cleaned up when the container is destroyed: https://github.com/google/gvisor/blob/cbbd0b4e18b72d5cb4866cd5db8b6ab6ebbedbd1/runsc/boot/loader.go#L1289-L1290

The container is destroyed with the runsc delete command, but I don't see that runsc delete a79ba5124dfbefd5e18262183a60d656c655452836a6b9bb1b7d8d22cdf89e03 was ever called. Because of this, the gofer process for the second container is still running. I am not sure if this is why the container is reported as RUNNING.

The boot logs show that the nvidia-smi binary ran successfully and exited with code 0:

I1007 11:10:37.451600       1 strace.go:561] [   2:   2] nvidia-smi E exit_group(0x0)
I1007 11:10:37.451610       1 strace.go:599] [   2:   2] nvidia-smi X exit_group(0x0) = 0 (0x0) (1.271µs)
D1007 11:10:37.451615       1 task_exit.go:214] [   2:   2] Transitioning from exit state TaskExitNone to TaskExitInitiated

Does this issue not happen without gVisor? i.e. try using just runc. Is the container status = STOPPED without calling runc wait or runc delete?
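A couple of quick checks may help narrow this down (a sketch; adjust to your setup):

# What containerd/CRI reports for the container state:
sudo crictl ps -a
# Whether runsc sandbox/gofer processes are still alive after nvidia-smi exited:
ps aux | grep runsc | grep -v grep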

ayushr2 commented 1 month ago

Because of this, the gofer process for the second container is still running. I am not sure if this is the reason the container is reported as RUNNING.

If somehow this is the case, then https://github.com/google/gvisor/pull/11003 should fix the issue. Could you give that a shot as well?

PSKP-95 commented 1 month ago

#11003 did not work.

There were no runsc.log.*.wait.txt or runsc.log.*.delete.txt log files generated. The logs below do include a runsc.log.*.delete.txt, probably because I tried to delete the pod and that hung as well (sudo crictl rmp $SANDBOX_ID).

runsc debug logs after #11003

runsc.log.20241008-131757.184866.start.txt runsc.log.20241008-131757.124916.create.txt runsc.log.20241008-131757.184866.gofer.txt runsc.log.20241008-131740.351540.gofer.txt runsc.log.20241008-131740.796128.start.txt runsc.log.20241008-131740.351540.boot.txt runsc.log.20241008-131740.351540.create.txt runsc.log.20241008-131727.790706.gofer.txt runsc.log.20241008-131727.984895.delete.txt runsc.log.20241008-131727.790706.boot.txt runsc.log.20241008-131727.790706.create.txt

With runc

(screenshot: container run with runc)

Containerd logs when using runc

Sandbox Id = 6e54242964a56c2c6c03c406ad8f11c82025d54304115db484f5789672a42712 Container Id = f93ae8ba765f450ee38b734a787a9b6e4987cf0efe945a5ddbafb162114d58bb

Oct 08 13:23:03 ip-172-21-148-200.eu-central-1.compute.internal containerd[1881]: time="2024-10-08T13:23:03.585746535Z" level=info msg="Connect containerd service"
Oct 08 13:23:03 ip-172-21-148-200.eu-central-1.compute.internal containerd[1881]: time="2024-10-08T13:23:03.586346086Z" level=info msg="using legacy CRI server"
Oct 08 13:23:03 ip-172-21-148-200.eu-central-1.compute.internal containerd[1881]: time="2024-10-08T13:23:03.586367007Z" level=info msg="using experimental NRI integration - disable nri plugin to prevent this"
Oct 08 13:23:03 ip-172-21-148-200.eu-central-1.compute.internal containerd[1881]: time="2024-10-08T13:23:03.589678127Z" level=info msg="Get image filesystem path \"/var/lib/containerd/io.containerd.snapshotter.v1.overlayfs\""
Oct 08 13:23:03 ip-172-21-148-200.eu-central-1.compute.internal containerd[1881]: time="2024-10-08T13:23:03.600121640Z" level=info msg="Start subscribing containerd event"
Oct 08 13:23:03 ip-172-21-148-200.eu-central-1.compute.internal containerd[1881]: time="2024-10-08T13:23:03.600172737Z" level=info msg="Start recovering state"
Oct 08 13:23:03 ip-172-21-148-200.eu-central-1.compute.internal containerd[1881]: time="2024-10-08T13:23:03.603789630Z" level=info msg=serving... address=/run/containerd/containerd.sock.ttrpc
Oct 08 13:23:03 ip-172-21-148-200.eu-central-1.compute.internal containerd[1881]: time="2024-10-08T13:23:03.603893767Z" level=info msg=serving... address=/run/containerd/containerd.sock
Oct 08 13:23:03 ip-172-21-148-200.eu-central-1.compute.internal containerd[1881]: time="2024-10-08T13:23:03.723953278Z" level=info msg="Start event monitor"
Oct 08 13:23:03 ip-172-21-148-200.eu-central-1.compute.internal containerd[1881]: time="2024-10-08T13:23:03.724161304Z" level=info msg="Start snapshots syncer"
Oct 08 13:23:03 ip-172-21-148-200.eu-central-1.compute.internal containerd[1881]: time="2024-10-08T13:23:03.724253137Z" level=info msg="Start cni network conf syncer for default"
Oct 08 13:23:03 ip-172-21-148-200.eu-central-1.compute.internal containerd[1881]: time="2024-10-08T13:23:03.724310411Z" level=info msg="Start streaming server"
Oct 08 13:23:03 ip-172-21-148-200.eu-central-1.compute.internal systemd[1]: Started containerd.service - containerd container runtime.
Oct 08 13:23:03 ip-172-21-148-200.eu-central-1.compute.internal containerd[1881]: time="2024-10-08T13:23:03.725714699Z" level=info msg="containerd successfully booted in 0.338804s"
Oct 08 13:23:58 ip-172-21-148-200.eu-central-1.compute.internal containerd[1881]: time="2024-10-08T13:23:58.562010731Z" level=warning msg="CRI API v1alpha2 is deprecated since containerd v1.7 and removed in containerd v2.0. Use CRI API v1 instead."
Oct 08 13:23:58 ip-172-21-148-200.eu-central-1.compute.internal containerd[1881]: time="2024-10-08T13:23:58.562067037Z" level=info msg="RunPodSandbox for name:\"nvidia-pod\"  uid:\"4dis4d93djbjdwnduwk28bcsb\"  namespace:\"default\"  attempt:1"
Oct 08 13:23:58 ip-172-21-148-200.eu-central-1.compute.internal containerd[1881]: time="2024-10-08T13:23:58.564331540Z" level=error msg="RunPodSandbox for name:\"nvidia-pod\"  uid:\"4dis4d93djbjdwnduwk28bcsb\"  namespace:\"default\"  attempt:1 failed, error" error="failed to reserve sandbox name \"nvidia-pod_default_4dis4d93djbjdwnduwk28bcsb_1\": name \"nvidia-pod_default_4dis4d93djbjdwnduwk28bcsb_1\" is reserved for \"164309930963e856ba50d3c7ad95451e437a9ac7008e5520226dbd0cd5ad5fad\""
Oct 08 13:24:14 ip-172-21-148-200.eu-central-1.compute.internal containerd[1881]: time="2024-10-08T13:24:14.976067023Z" level=info msg="RemovePodSandbox for \"16\""
Oct 08 13:24:14 ip-172-21-148-200.eu-central-1.compute.internal containerd[1881]: time="2024-10-08T13:24:14.976104137Z" level=info msg="Forcibly stopping sandbox \"164309930963e856ba50d3c7ad95451e437a9ac7008e5520226dbd0cd5ad5fad\""
Oct 08 13:24:14 ip-172-21-148-200.eu-central-1.compute.internal containerd[1881]: time="2024-10-08T13:24:14.976174086Z" level=info msg="Container to stop \"a807e352ee84ba5bb274c3433370567f6f7c29c3e37fb99f5975a17ac5bb2051\" must be in running or unknown state, current state \"CONTAINER_EXITED\""
Oct 08 13:24:15 ip-172-21-148-200.eu-central-1.compute.internal containerd[1881]: time="2024-10-08T13:24:15.065769322Z" level=info msg="TearDown network for sandbox \"164309930963e856ba50d3c7ad95451e437a9ac7008e5520226dbd0cd5ad5fad\" successfully"
Oct 08 13:24:15 ip-172-21-148-200.eu-central-1.compute.internal containerd[1881]: time="2024-10-08T13:24:15.074044833Z" level=warning msg="Failed to get podSandbox status for container event for sandboxID \"164309930963e856ba50d3c7ad95451e437a9ac7008e5520226dbd0cd5ad5fad\": an error occurred when try to find sandbox: not found. Sending the event with nil podSandboxStatus."
Oct 08 13:24:15 ip-172-21-148-200.eu-central-1.compute.internal containerd[1881]: time="2024-10-08T13:24:15.074086677Z" level=info msg="RemovePodSandbox \"16\" returns successfully"
Oct 08 13:24:28 ip-172-21-148-200.eu-central-1.compute.internal containerd[1881]: time="2024-10-08T13:24:28.746934343Z" level=info msg="RunPodSandbox for name:\"nvidia-pod\"  uid:\"4dis4d93djbjdwnduwk28bcsb\"  namespace:\"default\"  attempt:1"
Oct 08 13:24:29 ip-172-21-148-200.eu-central-1.compute.internal containerd[1881]: time="2024-10-08T13:24:29.109218272Z" level=info msg="loading plugin \"io.containerd.event.v1.publisher\"..." runtime=io.containerd.runc.v2 type=io.containerd.event.v1
Oct 08 13:24:29 ip-172-21-148-200.eu-central-1.compute.internal containerd[1881]: time="2024-10-08T13:24:29.109269962Z" level=info msg="loading plugin \"io.containerd.internal.v1.shutdown\"..." runtime=io.containerd.runc.v2 type=io.containerd.internal.v1
Oct 08 13:24:29 ip-172-21-148-200.eu-central-1.compute.internal containerd[1881]: time="2024-10-08T13:24:29.109282534Z" level=info msg="loading plugin \"io.containerd.ttrpc.v1.task\"..." runtime=io.containerd.runc.v2 type=io.containerd.ttrpc.v1
Oct 08 13:24:29 ip-172-21-148-200.eu-central-1.compute.internal containerd[1881]: time="2024-10-08T13:24:29.110738783Z" level=info msg="loading plugin \"io.containerd.ttrpc.v1.pause\"..." runtime=io.containerd.runc.v2 type=io.containerd.ttrpc.v1
Oct 08 13:24:29 ip-172-21-148-200.eu-central-1.compute.internal containerd[1881]: time="2024-10-08T13:24:29.381836526Z" level=info msg="RunPodSandbox for name:\"nvidia-pod\"  uid:\"4dis4d93djbjdwnduwk28bcsb\"  namespace:\"default\"  attempt:1 returns sandbox id \"6e54242964a56c2c6c03c406ad8f11c82025d54304115db484f5789672a42712\""
Oct 08 13:24:38 ip-172-21-148-200.eu-central-1.compute.internal containerd[1881]: time="2024-10-08T13:24:38.639403176Z" level=info msg="CreateContainer within sandbox \"6e54242964a56c2c6c03c406ad8f11c82025d54304115db484f5789672a42712\" for container name:\"nvidia-demo\""
Oct 08 13:24:38 ip-172-21-148-200.eu-central-1.compute.internal containerd[1881]: time="2024-10-08T13:24:38.665571706Z" level=info msg="CreateContainer within sandbox \"6e54242964a56c2c6c03c406ad8f11c82025d54304115db484f5789672a42712\" for name:\"nvidia-demo\" returns container id \"f93ae8ba765f450ee38b734a787a9b6e4987cf0efe945a5ddbafb162114d58bb\""
Oct 08 13:24:44 ip-172-21-148-200.eu-central-1.compute.internal containerd[1881]: time="2024-10-08T13:24:44.516655104Z" level=info msg="StartContainer for \"f93ae8ba765f450ee38b734a787a9b6e4987cf0efe945a5ddbafb162114d58bb\""
Oct 08 13:24:46 ip-172-21-148-200.eu-central-1.compute.internal containerd[1881]: time="2024-10-08T13:24:46.784472753Z" level=info msg="StartContainer for \"f93ae8ba765f450ee38b734a787a9b6e4987cf0efe945a5ddbafb162114d58bb\" returns successfully"
Oct 08 13:24:48 ip-172-21-148-200.eu-central-1.compute.internal containerd[1881]: time="2024-10-08T13:24:48.824507883Z" level=info msg="shim disconnected" id=f93ae8ba765f450ee38b734a787a9b6e4987cf0efe945a5ddbafb162114d58bb namespace=k8s.io
Oct 08 13:24:48 ip-172-21-148-200.eu-central-1.compute.internal containerd[1881]: time="2024-10-08T13:24:48.824660218Z" level=warning msg="cleaning up after shim disconnected" id=f93ae8ba765f450ee38b734a787a9b6e4987cf0efe945a5ddbafb162114d58bb namespace=k8s.io
Oct 08 13:24:48 ip-172-21-148-200.eu-central-1.compute.internal containerd[1881]: time="2024-10-08T13:24:48.824680627Z" level=info msg="cleaning up dead shim" namespace=k8s.io
ayushr2 commented 1 month ago

I tried to delete the pod and it got hanged for delete pod as well (sudo crictl rmp $SANDBOX_ID)

I think that was for a different container; the container ID and sandbox ID in the delete.txt file do not match the other files. There were no delete or wait commands on the second container a807e352ee84ba5bb274c3433370567f6f7c29c3e37fb99f5975a17ac5bb2051. In k8s setups, I have seen that containerd usually issues a wait command on the application container; when that returns, it issues delete commands on the application container and the pause container, which ends up cleaning up the sandbox. I don't know why there are no wait commands in this setup.

ayushr2 commented 1 month ago

@PSKP-95 Does this issue not occur with runc (without gVisor)?

PSKP-95 commented 1 month ago

Yes, it does not occur with runc. I added a screenshot of it in my previous comment, along with containerd logs when using runc.

zkoopmans commented 1 month ago

@PSKP-95 : Have you considered using this setup instead: https://gvisor.dev/docs/user_guide/containerd/quick_start/ ?

Our own k8s setup on GKE uses a shim for containerd (containerd-shim-runsc-v1), which translates commands from containerd into commands for runsc. This may be why some commands that @ayushr2 references don't appear in the logs.

Your setup is not something I've seen before.

PSKP-95 commented 4 weeks ago

Hey @zkoopmans, I followed that same doc to get gVisor and containerd up and running, and it works fine so far, but when I try to use it with the GPU I get an error. It only happens with the GPU.

For GPUs, I have to use the NVIDIA shim; that's what I understood from the docs.

zkoopmans commented 4 weeks ago

@PSKP-95 : You shouldn't need to use the NVIDIA shim.

My guess is that the error you're running into with the gVisor shim is that the container crashes rather than hangs? This is because the sandboxed application tries to reach out to the GPU resource, but the sandbox doesn't have the nvproxy flag set in its configs, so the calls to the device are denied.

I suspect you need to set flags in the gVisor shim configuration, usually /etc/containerd/runsc.toml: https://gvisor.dev/docs/user_guide/containerd/configuration/

You want the nvproxy flag set in there.
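For example, a minimal /etc/containerd/runsc.toml along these lines (a sketch; see the configuration doc above for the full set of options):

# Write a minimal runsc shim config that enables nvproxy (sketch).
cat <<'EOF' | sudo tee /etc/containerd/runsc.toml
[runsc_config]
  nvproxy = "true"
EOF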

ayushr2 commented 4 weeks ago

@zkoopmans From the debug logs provided above, it looks like the nvproxy flag is set.

zkoopmans commented 4 weeks ago

@zkoopmans From the debug logs provided above, it looks like the nvproxy flag is set.

I think that's with the NVIDIA shim method he was using, no?

If he's using the gVisor shim, he'll need to set the flags in the runsc config instead of the custom runsc call he has above.

ayushr2 commented 3 weeks ago

I think that's with the NVIDIA shim method he was using, no?

Not sure if there is an nvidia shim involved here. He's using nvidia-container-runtime, which in turn uses runsc under the hood, as per the /etc/nvidia-container-runtime/config.toml configuration posted in the description (see the runtimes = field in that file). But you might be correct that the runsc shim is not being invoked. I have a hypothesis as to why.

@PSKP-95 I mostly have the environment set up. I am able to run non-GPU pods with crictl runp --runtime runsc. I can confirm that containers do exit correctly and containerd is issuing wait and delete commands.

But in your case, you are using --runtime=nvidia, which is configured in containerd with sudo nvidia-ctk runtime configure --runtime=containerd. Could you show the contents of your /etc/containerd/config.toml file? I noticed that nvidia-ctk runtime configure set up this file with:

        [plugins."io.containerd.grpc.v1.cri".containerd.runtimes.nvidia]
          runtime_type = "io.containerd.runc.v2"

You might need to update that to runtime_type = "io.containerd.runsc.v1" so that the nvidia runtime ends up using the runsc shim. Right now containerd thinks that the "nvidia" runtime uses runc under the hood, but as per your configuration it will actually end up using runscgpu.

P.S. I have still not been able to configure the nvidia runtime correctly in my setup. My /etc/containerd/config.toml file is:

version = 2

[plugins]

  [plugins."io.containerd.grpc.v1.cri"]

    [plugins."io.containerd.grpc.v1.cri".containerd]

      [plugins."io.containerd.grpc.v1.cri".containerd.runtimes]

        [plugins."io.containerd.grpc.v1.cri".containerd.runtimes.nvidia]
          runtime_type = "io.containerd.runsc.v1"

          [plugins."io.containerd.grpc.v1.cri".containerd.runtimes.nvidia.options]
            BinaryName = "/usr/bin/nvidia-container-runtime"

        [plugins."io.containerd.grpc.v1.cri".containerd.runtimes.runc]
          runtime_type = "io.containerd.runc.v2"

        [plugins."io.containerd.grpc.v1.cri".containerd.runtimes.runsc]
          runtime_type = "io.containerd.runsc.v1"

  [plugins."io.containerd.runtime.v1.linux"]
    shim_debug = true

Even though I fixed the runtime_type for the nvidia runtime, my sudo crictl runp --runtime nvidia pod.json commands fail with run pod sandbox failed: rpc error: code = Unknown desc = failed to reserve sandbox name. I don't understand what's going on, but it seems like your setup doesn't have this problem, so I am hoping that just fixing the runtime_type field will fix the issue for you.

ayushr2 commented 3 weeks ago

Okay I was able to fix my setup (the issue was there was some lingering pod in the background somehow).

I can confirm that fixing /etc/containerd/config.toml to specify runtime_type = "io.containerd.runsc.v1" under [plugins."io.containerd.grpc.v1.cri".containerd.runtimes.nvidia] fixes this issue. The issue was that containerd was trying to invoke runc shim while nvidia runtime ran the pod with runsc.

I will update our docs to mention this. This is not obvious. Unfortunately, sudo nvidia-ctk runtime configure assumes that runc is being used.

ayushr2 commented 3 weeks ago

Unfortunately, sudo nvidia-ctk runtime configure assumes that runc is being used.

I found the nvidia-container-toolkit pointers for this:

Hence, the only way to get containerd to use the runsc shim correctly alongside nvidia-container-runtime right now is to manually modify /etc/containerd/config.toml after running nvidia-ctk runtime configure, updating runtime_type under plugins."io.containerd.grpc.v1.cri".containerd.runtimes.nvidia to "io.containerd.runsc.v1".
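After making that edit, a quick sanity check before retrying (sketch):

# Confirm the nvidia runtime entry now points at the runsc shim, then restart containerd.
grep -A 2 'runtimes.nvidia]' /etc/containerd/config.toml
sudo systemctl restart containerd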

PSKP-95 commented 3 weeks ago

Hi @ayushr2, are you able to get output from nvidia-smi? Did you add the exec script in /etc/nvidia-container-runtime/config.toml? I tried changing runtime_type in /etc/containerd/config.toml and got:

FATA[0003] Starting the container "cda53658e7ac6e52128fd85700da99f8e20cfea10fea1175f01f920c4d524ba0" failed: 
rpc error: code = Unknown desc = failed to start containerd task "cda53658e7ac6e52128fd85700da99f8e20cfea10fea1175f01f920c4d524ba0": 
OCI runtime start failed: starting container: starting sub-container [nvidia-smi]: 
error finding executable "nvidia-smi" in PATH [/usr/local/nvidia/bin /usr/local/cuda/bin 
/usr/local/sbin /usr/local/bin /usr/sbin /usr/bin /sbin /bin]: no such file or directory: unknown

I was hoping it would mount the necessary devices and executables like it does in the case of runc.

ayushr2 commented 3 weeks ago

Ah, I see what's going on. In my setup, I am also passing the --nvproxy-docker flag. This is a deprecated flag which (for backward compatibility) forces the nvidia prestart hook to be added.

When I remove that flag, I get the same error you are getting. Upon inspecting the OCI spec being passed to runsc, I don't see the prestart hook in the OCI spec of the nvidia-smi container. nvidia-container-runtime is supposed to add the prestart hook in legacy mode, and from what I can tell /etc/nvidia-container-runtime/config.toml is configured correctly to enable legacy mode.

Let me try to figure out what's going wrong. Until then, you can use --nvproxy-docker as a workaround.
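Concretely, the workaround is just adding the flag wherever runsc is invoked, e.g. in the runscgpu wrapper from the description (sketch):

#!/bin/bash
# Workaround: also pass the deprecated --nvproxy-docker flag so the nvidia prestart hook is applied.
exec /usr/local/bin/runsc --nvproxy --nvproxy-docker --debug-log=/tmp/runsc/ --debug --strace "$@"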

ayushr2 commented 3 weeks ago

Okay I figured it out. nvidia-ctk runtime configure adds the following lines:

          [plugins."io.containerd.grpc.v1.cri".containerd.runtimes.nvidia.options]
            BinaryName = "/usr/bin/nvidia-container-runtime"

This instructs the runtime shim to invoke /usr/bin/nvidia-container-runtime instead of the runsc/runc binary, which allows nvidia-container-runtime to inject the necessary fields into the OCI spec before invoking runsc/runc.

However, this format of options is not understood by runsc shim. To specify options for the runsc shim, we need to use the format described in https://gvisor.dev/docs/user_guide/containerd/configuration/.

Here is what my setup looks like and I can confirm that your demo works:

- `/etc/containerd/config.toml`:

[plugins]

  [plugins."io.containerd.grpc.v1.cri"]

    [plugins."io.containerd.grpc.v1.cri".containerd]

      [plugins."io.containerd.grpc.v1.cri".containerd.runtimes]

        [plugins."io.containerd.grpc.v1.cri".containerd.runtimes.nvidia]
          runtime_type = "io.containerd.runsc.v1"

          [plugins."io.containerd.grpc.v1.cri".containerd.runtimes.nvidia.options]
            TypeUrl = "io.containerd.runsc.v1.options"
            ConfigPath = "/etc/containerd/runsc.toml"

        [plugins."io.containerd.grpc.v1.cri".containerd.runtimes.runc]
          runtime_type = "io.containerd.runc.v2"

        [plugins."io.containerd.grpc.v1.cri".containerd.runtimes.runsc]
          runtime_type = "io.containerd.runsc.v1"

  [plugins."io.containerd.runtime.v1.linux"]
    shim_debug = true

- `/etc/containerd/runsc.toml`:

log_path = "/var/log/runsc/%ID%/shim.log"
log_level = "debug"
binary_name = "/usr/bin/nvidia-container-runtime"

[runsc_config]
  debug = "true"
  strace = "true"
  debug-log = "/var/log/runsc/%ID%/gvisor.%COMMAND%.log"
  nvproxy = "true"

- `/etc/nvidia-container-runtime/config.toml`:

#accept-nvidia-visible-devices-as-volume-mounts = false
#accept-nvidia-visible-devices-envvar-when-unprivileged = true
disable-require = false
supported-driver-capabilities = "compat32,compute,display,graphics,ngx,utility,video"
#swarm-resource = "DOCKER_RESOURCE_GPU"

[nvidia-container-cli]
#debug = "/var/log/nvidia-container-toolkit.log"
environment = []
#ldcache = "/etc/ld.so.cache"
ldconfig = "@/sbin/ldconfig"
load-kmods = true
#no-cgroups = false
#path = "/usr/bin/nvidia-container-cli"
#root = "/run/nvidia/driver"
#user = "root:video"

[nvidia-container-runtime]
debug = "/var/log/nvidia-container-runtime.log"
log-level = "debug"
mode = "legacy"
runtimes = ["runsc", "runc"]

[nvidia-container-runtime.modes]

[nvidia-container-runtime.modes.cdi]
annotation-prefixes = ["cdi.k8s.io/"]
default-kind = "nvidia.com/gpu"
spec-dirs = ["/etc/cdi", "/var/run/cdi"]

[nvidia-container-runtime.modes.csv]
mount-spec-path = "/etc/nvidia-container-runtime/host-files-for-container.d"

[nvidia-container-runtime-hook]
path = "nvidia-container-runtime-hook"
skip-mode-detection = false

[nvidia-ctk]
path = "nvidia-ctk"



I will update https://github.com/google/gvisor/pull/11060 to mention this.

PSKP-95 commented 2 weeks ago

Hi @ayushr2, thank you so much! It's working now. I have a question though: don't you think the binary_name in /etc/containerd/runsc.toml could also affect non-GPU workloads? Shouldn't it be in /etc/containerd/config.toml instead?

PSKP-95 commented 2 weeks ago

Got it now. Keeping BinaryName in /etc/containerd/config.toml doesn't seem to work. Perhaps maintaining a separate gVisor configuration for GPU workloads is the solution.

ayushr2 commented 2 weeks ago

Yeah, you can have multiple TOML config files for runsc. Here is how you could set up containerd:

version = 2

[plugins]
  [plugins."io.containerd.grpc.v1.cri"]
    [plugins."io.containerd.grpc.v1.cri".containerd]
      [plugins."io.containerd.grpc.v1.cri".containerd.runtimes]

        [plugins."io.containerd.grpc.v1.cri".containerd.runtimes.nvidia]
          runtime_type = "io.containerd.runsc.v1"
          [plugins."io.containerd.grpc.v1.cri".containerd.runtimes.nvidia.options]
            TypeUrl = "io.containerd.runsc.v1.options"
            ConfigPath = "/etc/containerd/runsc_gpu.toml"

        [plugins."io.containerd.grpc.v1.cri".containerd.runtimes.runsc]
          runtime_type = "io.containerd.runsc.v1"
          [plugins."io.containerd.grpc.v1.cri".containerd.runtimes.runsc.options]
            TypeUrl = "io.containerd.runsc.v1.options"
            ConfigPath = "/etc/containerd/runsc.toml"

        [plugins."io.containerd.grpc.v1.cri".containerd.runtimes.runc]
          runtime_type = "io.containerd.runc.v2"

  [plugins."io.containerd.runtime.v1.linux"]
    shim_debug = true

I also realized that you don't even need nvidia-container-runtime. Just use the runsc runtime directly (instead of nvidia) and pass the --nvproxy-docker flag. nvidia-container-runtime just adds a prestart hook, and the --nvproxy-docker flag does the same thing.
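In that variant, the GPU runtime's shim config (e.g. the /etc/containerd/runsc_gpu.toml referenced above) would not need the binary_name override at all; a minimal sketch:

# GPU-specific runsc shim config without nvidia-container-runtime (sketch).
cat <<'EOF' | sudo tee /etc/containerd/runsc_gpu.toml
[runsc_config]
  nvproxy = "true"
  nvproxy-docker = "true"
EOF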