Closed fsmv closed 1 year ago
What's the driver version? GPU model?
I have nvidia-driver version 525.116.03.
The GPU is an RTX 3060.
Does /dev/nvidia1 actually exist?
It does.
% ls -l /compat/linux/dev/nvidia1
crw-rw-rw- 1 root wheel 0x71 Aug 24 23:10 /compat/linux/dev/nvidia1
% ls -l /dev/nvidia1
crw-rw-rw- 1 root wheel 0x71 Aug 24 23:10 /dev/nvidia1
Would you mind checking d545115c27d19300f2f0ef6ba5c95f02af6648f4?
Too bad I got busy and couldn't check the patch earlier. Is this supposed to be fixed in the 20230916 tag? I've just installed that version and I got the same output (previously I had 20230629 apparently).
I checked the work file and I do have the code in the commit you mentioned and I'm still seeing no device found. @shkhln I should have more time to help debug now.
Here's the new log but I think it's the same:
% SHIM_DEBUG=1 nv-sglrun nvidia-smi
shim init
[10059:144091] shim_getpid()
[10059:144091] shim_getpid -> 10059
[10059:144091] shim_getenv("__NVML_DBG_LVL")
[10059:144091] shim_getenv -> 0x0
[10059:144091] shim_getenv("__NVML_DBG_APPEND")
[10059:144091] shim_getenv -> 0x0
[10059:144091] shim_getenv("__NVML_DBG_FILE")
[10059:144091] shim_getenv -> 0x0
[10059:144091] shim_gettimeofday(0x82d0bd410, 0x0)
[10059:144091] shim_gettimeofday -> 0
[10059:144091] shim_memset(0x82c4c5ba0, 0, 12509464)
[10059:144091] shim_memset -> 0x82c4c5ba0
[10059:144091] shim_getenv("__NVML_CRAY_PSTATE")
[10059:144091] shim_getenv -> 0x0
[10059:144091] shim_getenv("__NVIDIA_NVML_3373")
[10059:144091] shim_getenv -> 0x0
[10059:144091] shim_getenv("__NVML_ONLY_DAEMON_PERSISTENCE_MODE")
[10059:144091] shim_getenv -> 0x0
[10059:144091] shim_getenv("__RM_ENABLE_VERBOSE_OUTPUT")
[10059:144091] shim_getenv -> 0x0
[10059:144091] shim_fopen("/proc/modules", "r")
[10059:144091] shim_fopen -> 0x0
[10059:144091] shim___xstat(1, "/sys/bus/pci/devices", 0x820870950)
[10059:144091] shim___xstat -> -1
[10059:144091] shim___errno_location()
[10059:144091] shim___errno_location -> 0x822c70e10
[10059:144091] shim_geteuid()
[10059:144091] shim_geteuid -> 1001
[10059:144091] shim_getenv("__RM_ENABLE_VERBOSE_OUTPUT")
[10059:144091] shim_getenv -> 0x0
[10059:144091] shim___xstat(1, "/usr/bin/nvidia-modprobe", 0x820870e70)
[10059:144091] shim___xstat -> -1
[10059:144091] shim_fopen("/proc/driver/nvidia/params", "r")
[10059:144091] shim_fopen -> 0x822c7c4f0
[10059:144091] shim___isoc99_fscanf(0x822c7c4f0, "%31[^:]: %u
", ...)
[10059:144091] shim___isoc99_fscanf -> 2
[10059:144091] shim___isoc99_fscanf(0x822c7c4f0, "%31[^:]: %u
", ...)
[10059:144091] shim___isoc99_fscanf -> 1
[10059:144091] shim_fclose(0x822c7c4f0)
[10059:144091] shim_fclose -> 0
[10059:144091] shim_snprintf(0x820870bb0, 128, "/dev/char/%d:%d", ...)
[10059:144091] shim_snprintf -> 17
[10059:144091] shim___xstat(1, "/dev/nvidiactl", 0x820870d40)
[10059:144091] shim___xstat -> 0
[10059:144091] shim_snprintf(0x820870c30, 128, "../%s", ...)
[10059:144091] shim_snprintf -> 12
[10059:144091] shim_remove("/dev/char/195:255")
[10059:144091] shim_remove -> -1
[10059:144091] shim_symlink("../nvidiactl", "/dev/char/195:255")
[10059:144091] shim_symlink -> -1
[10059:144091] shim___xstat(1, "/dev/char/195:255", 0x820870cb0)
[10059:144091] shim___xstat -> -1
[10059:144091] shim___errno_location()
[10059:144091] shim___errno_location -> 0x822c70e10
[10059:144091] shim_snprintf(0x820870ee0, 32, "-c=%d", ...)
[10059:144091] shim_snprintf -> 6
[10059:144091] shim_getenv("__RM_ENABLE_VERBOSE_OUTPUT")
[10059:144091] shim_getenv -> 0x0
[10059:144091] shim___xstat(1, "/usr/bin/nvidia-modprobe", 0x820870e30)
[10059:144091] shim___xstat -> -1
[10059:144091] shim_fopen("/proc/driver/nvidia/params", "r")
[10059:144091] shim_fopen -> 0x822c7c4f0
[10059:144091] shim___isoc99_fscanf(0x822c7c4f0, "%31[^:]: %u
", ...)
[10059:144091] shim___isoc99_fscanf -> 2
[10059:144091] shim___isoc99_fscanf(0x822c7c4f0, "%31[^:]: %u
", ...)
[10059:144091] shim___isoc99_fscanf -> 1
[10059:144091] shim_fclose(0x822c7c4f0)
[10059:144091] shim_fclose -> 0
[10059:144091] shim___xstat(1, "/dev/nvidiactl", 0x820870d70)
[10059:144091] shim___xstat -> 0
[10059:144091] shim_open64("/dev/nvidiactl", 2, ...)
[10059:144091] shim_open64 -> 5
[10059:144091] shim_fcntl(5, 2, ...)
[10059:144091] shim_fcntl_impl: cmd = F_SETFD, arg = 0x1
[10059:144091] shim_fcntl -> 0
[10059:144091] shim_getenv("__RM_NO_VERSION_CHECK")
[10059:144091] shim_getenv -> 0x0
[10059:144091] shim_ioctl(5, 0xc04846d2, ...)
[10059:144091] shim_ioctl -> 0
[10059:144091] shim_open("/sys/devices/system/memory/block_size_bytes", 0, ...)
[10059:144091] shim_open -> -1
[10059:144091] shim___errno_location()
[10059:144091] shim___errno_location -> 0x822c70e10
[10059:144091] shim_ioctl(5, 0xc90046c8, ...)
[10059:144091] shim_ioctl -> 0
[10059:144091] shim_time(0x0)
[10059:144091] shim_time -> 1695078816
[10059:144091] shim_ioctl(5, 0xc020462b, ...)
[10059:144091] shim_ioctl -> 0
[10059:144091] shim_fopen("/proc/devices", "r")
[10059:144091] shim_fopen -> 0x0
[10059:144091] shim_snprintf(0x8208707e0, 260, "-f=%s", ...)
[10059:144091] shim_snprintf -> 46
[10059:144091] shim___xstat(1, "/usr/bin/nvidia-modprobe", 0x820870600)
[10059:144091] shim___xstat -> -1
[10059:144091] shim_fopen("/proc/devices", "r")
[10059:144091] shim_fopen -> 0x0
[10059:144091] shim_fopen("/proc/driver/nvidia/capabilities/mig/config", "r")
[10059:144091] shim_fopen -> 0x0
[10059:144091] shim___xstat(1, "", 0x820870530)
[10059:144091] shim___xstat -> -1
[10059:144091] shim_fopen("/proc/devices", "r")
[10059:144091] shim_fopen -> 0x0
[10059:144091] shim_snprintf(0x8208707e0, 260, "-f=%s", ...)
[10059:144091] shim_snprintf -> 47
[10059:144091] shim___xstat(1, "/usr/bin/nvidia-modprobe", 0x820870600)
[10059:144091] shim___xstat -> -1
[10059:144091] shim_fopen("/proc/devices", "r")
[10059:144091] shim_fopen -> 0x0
[10059:144091] shim_fopen("/proc/driver/nvidia/capabilities/mig/monitor", "r")
[10059:144091] shim_fopen -> 0x0
[10059:144091] shim___xstat(1, "", 0x820870530)
[10059:144091] shim___xstat -> -1
[10059:144091] shim_time(0x0)
[10059:144091] shim_time -> 1695078816
[10059:144091] shim_ioctl(5, 0xc020462a, ...)
[10059:144091] shim_ioctl -> 0
[10059:144091] shim_time(0x0)
[10059:144091] shim_time -> 1695078816
[10059:144091] shim_ioctl(5, 0xc020462a, ...)
[10059:144091] shim_ioctl -> 0
[10059:144091] shim_qsort(0x8208712a0, 1, 12, 0x82c1c55e0)
[10059:144091] shim_qsort -> void
[10059:144091] shim_time(0x0)
[10059:144091] shim_time -> 1695078816
[10059:144091] shim_ioctl(5, 0xc020462a, ...)
[10059:144091] shim_ioctl -> 0
[10059:144091] shim_time(0x0)
[10059:144091] shim_time -> 1695078816
[10059:144091] shim_ioctl(5, 0xc020462a, ...)
[10059:144091] shim_ioctl -> 0
[10059:144091] shim_snprintf(0x820870450, 128, "/dev/nvidia%d", ...)
[10059:144091] shim_snprintf -> 12
[10059:144091] shim_fopen("/proc/driver/nvidia/params", "r")
[10059:144091] shim_fopen -> 0x822c7c4f0
[10059:144091] shim___isoc99_fscanf(0x822c7c4f0, "%31[^:]: %u
", ...)
[10059:144091] shim___isoc99_fscanf -> 2
[10059:144091] shim___isoc99_fscanf(0x822c7c4f0, "%31[^:]: %u
", ...)
[10059:144091] shim___isoc99_fscanf -> 1
[10059:144091] shim_fclose(0x822c7c4f0)
[10059:144091] shim_fclose -> 0
[10059:144091] shim_snprintf(0x8208701c0, 128, "/dev/char/%d:%d", ...)
[10059:144091] shim_snprintf -> 15
[10059:144091] shim___xstat(1, "/dev/nvidia1", 0x820870350)
[10059:144091] shim___xstat -> 0
[10059:144091] shim_snprintf(0x820870240, 128, "../%s", ...)
[10059:144091] shim_snprintf -> 10
[10059:144091] shim_remove("/dev/char/195:1")
[10059:144091] shim_remove -> -1
[10059:144091] shim_symlink("../nvidia1", "/dev/char/195:1")
[10059:144091] shim_symlink -> -1
[10059:144091] shim___xstat(1, "/dev/char/195:1", 0x8208702c0)
[10059:144091] shim___xstat -> -1
[10059:144091] shim___errno_location()
[10059:144091] shim___errno_location -> 0x822c70e10
[10059:144091] shim_snprintf(0x8208704f0, 32, "-c=%d", ...)
[10059:144091] shim_snprintf -> 4
[10059:144091] shim_getenv("__RM_ENABLE_VERBOSE_OUTPUT")
[10059:144091] shim_getenv -> 0x0
[10059:144091] shim___xstat(1, "/usr/bin/nvidia-modprobe", 0x820870440)
[10059:144091] shim___xstat -> -1
[10059:144091] shim_snprintf(0x820870450, 128, "/dev/nvidia%d", ...)
[10059:144091] shim_snprintf -> 12
[10059:144091] shim_fopen("/proc/driver/nvidia/params", "r")
[10059:144091] shim_fopen -> 0x822c7c4f0
[10059:144091] shim___isoc99_fscanf(0x822c7c4f0, "%31[^:]: %u
", ...)
[10059:144091] shim___isoc99_fscanf -> 2
[10059:144091] shim___isoc99_fscanf(0x822c7c4f0, "%31[^:]: %u
", ...)
[10059:144091] shim___isoc99_fscanf -> 1
[10059:144091] shim_fclose(0x822c7c4f0)
[10059:144091] shim_fclose -> 0
[10059:144091] shim___xstat(1, "/dev/nvidia1", 0x820870380)
[10059:144091] shim___xstat -> 0
[10059:144091] shim_getenv("__RM_ENABLE_VERBOSE_OUTPUT")
[10059:144091] shim_getenv -> 0x0
[10059:144091] shim_memset(0x82086ee80, 0, 8576)
[10059:144091] shim_memset -> 0x82086ee80
[10059:144091] shim_time(0x0)
[10059:144091] shim_time -> 1695078816
[10059:144091] shim_ioctl(5, 0xc020462a, ...)
[10059:144091] shim_ioctl -> 0
[10059:144091] shim_qsort(0x82086ed00, 0, 12, 0x82c1c55e0)
[10059:144091] shim_qsort -> void
[10059:144091] shim_calloc(1544, 1)
[10059:144091] shim_calloc -> 0x827eb9700
[10059:144091] shim_getpid()
[10059:144091] shim_getpid -> 10059
No devices were found
[10059:144091] shim_getpid()
[10059:144091] shim_getpid -> 10059
[10059:144091] shim_free(0x827eb9700)
[10059:144091] shim_free -> void
[10059:144091] shim_time(0x0)
[10059:144091] shim_time -> 1695078816
[10059:144091] shim_ioctl(5, 0xc020462a, ...)
[10059:144091] shim_ioctl -> 0
[10059:144091] shim_time(0x0)
[10059:144091] shim_time -> 1695078816
[10059:144091] shim_ioctl(5, 0xc0104629, ...)
[10059:144091] shim_ioctl -> 0
[10059:144091] shim_close(5)
[10059:144091] shim_close -> 0
[10059:144091] shim_memset(0x82c4c5ba0, 0, 12509464)
[10059:144091] shim_memset -> 0x82c4c5ba0
[10059:144091] shim___cxa_finalize(0x82c4c5600)
[10059:144091] shim___cxa_finalize -> void
Is this supposed to be fixed in the 20230916 tag? I've just installed that version
There is no installation procedure/script in the repo.
% SHIM_DEBUG=1 nv-sglrun nvidia-smi
Is the proper version actually in $PATH?
I installed it by editing the makefile in /usr/ports/. I changed the version to that tag and updated the checksums and did make install.
Proper version of nv-sglrun or something else?
% which nv-sglrun
/usr/local/bin/nv-sglrun
% pkg which /usr/local/bin/nv-sglrun
/usr/local/bin/nv-sglrun was installed by package libc6-shim-20230916
Perhaps we need different ids for different nvidia%d nodes: make_dev_id(195, 0) for nvidia0, make_dev_id(195, 1) for nvidia1, and so on. Want to try your hand at patching that?
It worked! Awesome! You just need to add an atoi call to this to parse the device ID.
This is the patch I used
--- src/libc/sys/stat.c.orig 2023-09-19 04:38:42 UTC
+++ src/libc/sys/stat.c
@@ -114,7 +114,7 @@ static uint64_t make_dev_id(uint32_t major, uint32_t m
switch (path[sizeof("/dev/nvidia") - 1]) { \
case 'c': stat_buf->st_rdev = make_dev_id(195, 255); break; \
case '-': stat_buf->st_rdev = make_dev_id(195, 254); break; \
- default: stat_buf->st_rdev = make_dev_id(195, 0); \
+ default: stat_buf->st_rdev = make_dev_id(195, 1); \
} \
}
ππππππ
% nv-sglrun nvidia-smi
shim init
Mon Sep 18 21:41:49 2023
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 525.116.03 Driver Version: 525.116.03 CUDA Version: 12.0 |
|-------------------------------+----------------------+----------------------+
| GPU Name Persistence-M| Bus-Id Disp.A | Volatile Uncorr. ECC |
| Fan Temp Perf Pwr:Usage/Cap| Memory-Usage | GPU-Util Compute M. |
| | | MIG M. |
|===============================+======================+======================|
| 0 NVIDIA GeForce ... Off | 00000000:65:00.0 Off | N/A |
| 53% 41C P0 33W / 170W | 0MiB / 12288MiB | 1% Default |
| | | N/A |
+-------------------------------+----------------------+----------------------+
+-----------------------------------------------------------------------------+
| Processes: |
| GPU GI CI PID Type Process name GPU Memory |
| ID ID Usage |
|=============================================================================|
| No running processes found |
+-----------------------------------------------------------------------------+
Oh well, fixed by 3764b213b76826abd77bc275afd1faff3eb89236.
I'm able to use https://gist.github.com/shkhln/40ef290463e78fb2b0000c60f4ad797e to load pytorch via linux-miniconda-installer, and when I run /compat/linux/bin/nvidia-smi it finds CUDA, but when I run nvidia-smi with nv-sglrun it finds no devices (plain nvidia-smi also works, but with no CUDA).
I'm using version 20230629 and I'm running FreeBSD 13.2-RELEASE-p2
Here's the log: