Closed awarm closed 5 years ago
If you're going to run it that way, you must remove `libnvidia-ml.so.418.87` from the `/work/cuda/nvml_fix/` directory; otherwise it will load the stub and fail as you are seeing. Also, you shouldn't need the `CFLAGS="-I ..."` if you're running the latest version, since `nvml_v9.h` is included.
root@work-pc-linux:/work/cuda/nvml_fix# make TARGET_VER=418.87
gcc -shared -fPIC -s empty.c -o libnvidia-ml.so.418.87
gcc -Wl,--no-as-needed -shared -fPIC -s -o libnvidia-ml.so.1 -DNVML_PATCH_418 -DNVML_VERSION=\"418.87\" libnvidia-ml.so.418.87 nvml_fix.c
root@work-pc-linux:/work/cuda/nvml_fix# rm libnvidia-ml.so.418.87
root@work-pc-linux:/work/cuda/nvml_fix# ls
empty.c Makefile nvml_v3.h README.md
libnvidia-ml.so.1 nvml_fix.c nvml_v9.h
root@work-pc-linux:/work/cuda/nvml_fix# LD_LIBRARY_PATH=`pwd` nvidia-smi
NVIDIA-SMI couldn't find libnvidia-ml.so library in your system. Please make sure that the NVIDIA Display Driver is properly installed and present in your system.
Please also try adding directory that contains libnvidia-ml.so to your system PATH.
Where is `libnvidia-ml.so.418.87` installed on your system? Are you using Ubuntu-provided or NVIDIA-provided drivers?
Did you already try `make install TARGET_VER=418.87`?
What is the output of: LD_LIBRARY_PATH=`pwd` strace nvidia-smi 2>&1|fgrep libnvidia ?
OK... NVIDIA-SMI 418.87.00 Driver Version: 418.87.00 CUDA Version: 10.1
root@work-pc-linux:/work/cuda/nvml_fix# make TARGET_VER=418.87.00
gcc -shared -fPIC -s empty.c -o libnvidia-ml.so.418.87.00
gcc -Wl,--no-as-needed -shared -fPIC -s -o libnvidia-ml.so.1 -DNVML_PATCH_418 -DNVML_VERSION=\"418.87.00\" libnvidia-ml.so.418.87.00 nvml_fix.c
root@work-pc-linux:/work/cuda/nvml_fix# rm libnvidia-ml.so.
libnvidia-ml.so.1 libnvidia-ml.so.418.87.00
root@work-pc-linux:/work/cuda/nvml_fix# rm libnvidia-ml.so.418.87.00
root@work-pc-linux:/work/cuda/nvml_fix# LD_LIBRARY_PATH=`pwd` nvidia-smi
Tue Sep 3 11:46:18 2019
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 418.87.00 Driver Version: 418.87.00 CUDA Version: 10.1 |
|-------------------------------+----------------------+----------------------+
| GPU Name Persistence-M| Bus-Id Disp.A | Volatile Uncorr. ECC |
| Fan Temp Perf Pwr:Usage/Cap| Memory-Usage | GPU-Util Compute M. |
|===============================+======================+======================|
| 0 GeForce GT 730 On | 00000000:01:00.0 On | N/A |
| 40% 42C P8 N/A / N/A | 723MiB / 1998MiB | 0% Default |
+-------------------------------+----------------------+----------------------+
+-----------------------------------------------------------------------------+
| Processes: GPU Memory |
| GPU PID Type Process name Usage |
|=============================================================================|
| 0 Not Supported |
+-----------------------------------------------------------------------------+
root@work-pc-linux:/work/cuda/nvml_fix# LD_LIBRARY_PATH=`pwd` strace nvidia-smi 2>&1|fgrep libnvidia
openat(AT_FDCWD, "/work/cuda/nvml_fix/libnvidia-ml.so.1", O_RDONLY|O_CLOEXEC) = 3
openat(AT_FDCWD, "/work/cuda/nvml_fix/libnvidia-ml.so.418.87.00", O_RDONLY|O_CLOEXEC) = -1 ENOENT (No such file or directory)
openat(AT_FDCWD, "/lib/x86_64-linux-gnu/tls/x86_64/x86_64/libnvidia-ml.so.418.87.00", O_RDONLY|O_CLOEXEC) = -1 ENOENT (No such file or directory)
openat(AT_FDCWD, "/lib/x86_64-linux-gnu/tls/x86_64/libnvidia-ml.so.418.87.00", O_RDONLY|O_CLOEXEC) = -1 ENOENT (No such file or directory)
openat(AT_FDCWD, "/lib/x86_64-linux-gnu/tls/x86_64/libnvidia-ml.so.418.87.00", O_RDONLY|O_CLOEXEC) = -1 ENOENT (No such file or directory)
openat(AT_FDCWD, "/lib/x86_64-linux-gnu/tls/libnvidia-ml.so.418.87.00", O_RDONLY|O_CLOEXEC) = -1 ENOENT (No such file or directory)
openat(AT_FDCWD, "/lib/x86_64-linux-gnu/x86_64/x86_64/libnvidia-ml.so.418.87.00", O_RDONLY|O_CLOEXEC) = -1 ENOENT (No such file or directory)
openat(AT_FDCWD, "/lib/x86_64-linux-gnu/x86_64/libnvidia-ml.so.418.87.00", O_RDONLY|O_CLOEXEC) = -1 ENOENT (No such file or directory)
openat(AT_FDCWD, "/lib/x86_64-linux-gnu/x86_64/libnvidia-ml.so.418.87.00", O_RDONLY|O_CLOEXEC) = -1 ENOENT (No such file or directory)
openat(AT_FDCWD, "/lib/x86_64-linux-gnu/libnvidia-ml.so.418.87.00", O_RDONLY|O_CLOEXEC) = -1 ENOENT (No such file or directory)
openat(AT_FDCWD, "/usr/lib/x86_64-linux-gnu/tls/x86_64/x86_64/libnvidia-ml.so.418.87.00", O_RDONLY|O_CLOEXEC) = -1 ENOENT (No such file or directory)
openat(AT_FDCWD, "/usr/lib/x86_64-linux-gnu/tls/x86_64/libnvidia-ml.so.418.87.00", O_RDONLY|O_CLOEXEC) = -1 ENOENT (No such file or directory)
openat(AT_FDCWD, "/usr/lib/x86_64-linux-gnu/tls/x86_64/libnvidia-ml.so.418.87.00", O_RDONLY|O_CLOEXEC) = -1 ENOENT (No such file or directory)
openat(AT_FDCWD, "/usr/lib/x86_64-linux-gnu/tls/libnvidia-ml.so.418.87.00", O_RDONLY|O_CLOEXEC) = -1 ENOENT (No such file or directory)
openat(AT_FDCWD, "/usr/lib/x86_64-linux-gnu/x86_64/x86_64/libnvidia-ml.so.418.87.00", O_RDONLY|O_CLOEXEC) = -1 ENOENT (No such file or directory)
openat(AT_FDCWD, "/usr/lib/x86_64-linux-gnu/x86_64/libnvidia-ml.so.418.87.00", O_RDONLY|O_CLOEXEC) = -1 ENOENT (No such file or directory)
openat(AT_FDCWD, "/usr/lib/x86_64-linux-gnu/x86_64/libnvidia-ml.so.418.87.00", O_RDONLY|O_CLOEXEC) = -1 ENOENT (No such file or directory)
openat(AT_FDCWD, "/usr/lib/x86_64-linux-gnu/libnvidia-ml.so.418.87.00", O_RDONLY|O_CLOEXEC) = 3
openat(AT_FDCWD, "/work/cuda/nvml_fix/libnvidia-fatbinaryloader.so.418.87.00", O_RDONLY|O_CLOEXEC) = -1 ENOENT (No such file or directory)
openat(AT_FDCWD, "/usr/lib/x86_64-linux-gnu/libnvidia-fatbinaryloader.so.418.87.00", O_RDONLY|O_CLOEXEC) = 7
After `make install`, the output does not change.
some versions of gt730 use an nvidia GF108 chip, which is fermi-based. if that is the case then the card itself does not support power reporting. do you know the exact model number of your card?
GK208B [GeForce GT 730] version a1 bus 64 bits
GIGABYTE GV-N730-2GI
ok i think that should be capable, not certain though.
can you get this info?
With nvml_fix disabled/uninstalled (e.g.: `rm /usr/lib/x86_64-linux-gnu/libnvidia-ml.so.1; ln -s libnvidia-ml.so.418.87.00 /usr/lib/x86_64-linux-gnu/libnvidia-ml.so.1`),
try: `nvidia-smi -q > before.txt`
then reinstall (e.g. `make install TARGET_VER=418.87.00`),
then: `nvidia-smi -q > after.txt`
and then: `diff -uN before.txt after.txt`.
Are there any differences between the two? Can you paste the `diff` output please?
--- before.txt 2019-09-03 12:47:40.253359210 +0300
+++ after.txt 2019-09-03 12:50:19.945515288 +0300
@@ -1,19 +1,19 @@
==============NVSMI LOG==============
-Timestamp : Tue Sep 3 12:47:40 2019
+Timestamp : Tue Sep 3 12:50:19 2019
Driver Version : 418.87.00
CUDA Version : 10.1
Attached GPUs : 1
GPU 00000000:01:00.0
Product Name : GeForce GT 730
- Product Brand : GeForce
- Display Mode : N/A
- Display Active : N/A
+ Product Brand : Quadro
+ Display Mode : Enabled
+ Display Active : Enabled
Persistence Mode : Enabled
- Accounting Mode : N/A
- Accounting Mode Buffer Size : N/A
+ Accounting Mode : Disabled
+ Accounting Mode Buffer Size : 4000
Driver Model
Current : N/A
Pending : N/A
@@ -21,8 +21,8 @@
GPU UUID : GPU-5cc24a5e-4216-4b85-96b0-bcaf313a16d7
Minor Number : 0
VBIOS Version : 80.28.78.00.26
- MultiGPU Board : N/A
- Board ID : N/A
+ MultiGPU Board : No
+ Board ID : 0x100
GPU Part Number : N/A
Inforom Version
Image Version : N/A
@@ -33,7 +33,7 @@
Current : N/A
Pending : N/A
GPU Virtualization Mode
- Virtualization mode : N/A
+ Virtualization mode : None
IBMNPU
Relaxed Ordering Mode : N/A
PCI
@@ -45,11 +45,11 @@
Sub System Id : 0x36661458
GPU Link Info
PCIe Generation
- Max : N/A
- Current : N/A
+ Max : 2
+ Current : 1
Link Width
- Max : N/A
- Current : N/A
+ Max : 8x
+ Current : 4x
Bridge Chip
Type : N/A
Firmware : N/A
@@ -58,30 +58,39 @@
Tx Throughput : N/A
Rx Throughput : N/A
Fan Speed : 40 %
- Performance State : P0
- Clocks Throttle Reasons : N/A
+ Performance State : P8
+ Clocks Throttle Reasons
+ Idle : Not Active
+ Applications Clocks Setting : Not Active
+ SW Power Cap : Not Active
+ HW Slowdown : Not Active
+ HW Thermal Slowdown : N/A
+ HW Power Brake Slowdown : N/A
+ Sync Boost : Not Active
+ SW Thermal Slowdown : Not Active
+ Display Clock Setting : Not Active
FB Memory Usage
Total : 1998 MiB
- Used : 777 MiB
- Free : 1221 MiB
+ Used : 778 MiB
+ Free : 1220 MiB
BAR1 Memory Usage
- Total : N/A
- Used : N/A
- Free : N/A
+ Total : 128 MiB
+ Used : 5 MiB
+ Free : 123 MiB
Compute Mode : Default
Utilization
- Gpu : N/A
- Memory : N/A
- Encoder : N/A
- Decoder : N/A
+ Gpu : 24 %
+ Memory : 33 %
+ Encoder : 0 %
+ Decoder : 0 %
Encoder Stats
- Active Sessions : N/A
- Average FPS : N/A
- Average Latency : N/A
+ Active Sessions : 0
+ Average FPS : 0
+ Average Latency : 0
FBC Stats
- Active Sessions : N/A
- Average FPS : N/A
- Average Latency : N/A
+ Active Sessions : 0
+ Average FPS : 0
+ Average Latency : 0
Ecc Mode
Current : N/A
Pending : N/A
@@ -129,9 +138,9 @@
Double Bit ECC : N/A
Pending Page Blacklist : N/A
Temperature
- GPU Current Temp : 44 C
- GPU Shutdown Temp : N/A
- GPU Slowdown Temp : N/A
+ GPU Current Temp : 43 C
+ GPU Shutdown Temp : 102 C
+ GPU Slowdown Temp : 97 C
GPU Max Operating Temp : N/A
Memory Current Temp : N/A
Memory Max Operating Temp : N/A
@@ -144,21 +153,21 @@
Min Power Limit : N/A
Max Power Limit : N/A
Clocks
+ Graphics : 193 MHz
+ SM : 193 MHz
+ Memory : 405 MHz
+ Video : 405 MHz
+ Applications Clocks
Graphics : N/A
- SM : N/A
Memory : N/A
- Video : N/A
- Applications Clocks
- Graphics : 901 MHz
- Memory : 900 MHz
Default Applications Clocks
- Graphics : 901 MHz
- Memory : 900 MHz
- Max Clocks
Graphics : N/A
- SM : N/A
Memory : N/A
- Video : N/A
+ Max Clocks
+ Graphics : 901 MHz
+ SM : 901 MHz
+ Memory : 900 MHz
+ Video : 540 MHz
Max Customer Boost Clocks
Graphics : N/A
Clock Policy
thanks. so nvml_fix is "working" in that it is successfully telling the driver/nvidia-smi that the card is a "quadro" and supports power management etc. i believe the failure here is one or more of the nvidia chip, the card's vbios, or the driver, not returning power information. unfortunately i do not think there is much that can be done within the scope of the nvml_fix shim :( sorry.
I need the process list. Power information is not important to me.
ah, well, the same applies for that, i think.
One last thing to try is to run something that'll use NVIDIA OpenGL. I found that after first rebooting, I would not be able to see power utilization and processes, but as soon as I opened Chrome (or even `glxgears` worked), I would see the processes and power.
Unfortunately, the process list always shows 'Not Supported' :(
root@work-pc-linux:/work/cuda# git clone https://github.com/CFSworks/nvml_fix/
Клонирование в «nvml_fix»…
remote: Enumerating objects: 29, done.
remote: Counting objects: 100% (29/29), done.
remote: Compressing objects: 100% (17/17), done.
remote: Total 96 (delta 15), reused 25 (delta 12), pack-reused 67
Распаковка объектов: 100% (96/96), готово.
root@work-pc-linux:/work/cuda# cd nvml_fix
root@work-pc-linux:/work/cuda/nvml_fix# make TARGET_VER=418.87 CFLAGS="-I /usr/local/cuda-10.1/include/"
gcc -I /usr/local/cuda-10.1/include/ -shared -fPIC -s empty.c -o libnvidia-ml.so.418.87
gcc -I /usr/local/cuda-10.1/include/ -Wl,--no-as-needed -shared -fPIC -s -o libnvidia-ml.so.1 -DNVML_PATCH_418 -DNVML_VERSION=\"418.87\" libnvidia-ml.so.418.87 nvml_fix.c
root@work-pc-linux:/work/cuda/nvml_fix# LD_LIBRARY_PATH=`pwd` nvidia-smi
Failed to initialize NVML: Unknown Error
Failed to properly shut down NVML: Function Not Found