Closed realies closed 5 years ago
Can you run and paste the output of this:
/usr/bin/nvidia-smi --format=noheader,nounits,csv --query-gpu=fan.speed,memory.total,memory.used,memory.free,pstate,temperature.gpu,name,uuid,compute_mode,utilization.gpu,utilization.memory,index,power.draw,pcie.link.gen.current,pcie.link.width.current,encoder.stats.sessionCount,encoder.stats.averageFps,encoder.stats.averageLatency,clocks.current.graphics,clocks.current.sm,clocks.current.memory,clocks.current.video
# /usr/bin/nvidia-smi --format=noheader,nounits,csv --query-gpu=fan.speed,memory.total,memory.used,memory.free,pstate,temperature.gpu,name,uuid,compute_mode,utilization.gpu,utilization.memory,index,power.draw,pcie.link.gen.current,pcie.link.width.current,encoder.stats.sessionCount,encoder.stats.averageFps,encoder.stats.averageLatency,clocks.current.graphics,clocks.current.sm,clocks.current.memory,clocks.current.video
0, 4040, 0, 4040, P0, 53, GeForce GTX 1050 Ti, GPU-9083fca5-d6f0-3cab-6ae9-15d6e49d5b87, Default, 0, 0, 0, [Not Supported], 3, 8, 0, 0, 0, 1366, 1366, 3504, 1240
Strange that it's reported as `[Not Supported]`; running `nvidia-smi stats` updates `pwrDraw` regularly.
I wonder if it is in the XML output. Can you run `nvidia-smi -q -x` and attach the output?
# nvidia-smi -q -x
<?xml version="1.0" ?>
<!DOCTYPE nvidia_smi_log SYSTEM "nvsmi_device_v10.dtd">
<nvidia_smi_log>
<timestamp>Tue Aug 27 05:05:21 2019</timestamp>
<driver_version>430.14</driver_version>
<cuda_version>10.2</cuda_version>
<attached_gpus>1</attached_gpus>
<gpu id="00000000:2D:00.0">
<product_name>GeForce GTX 1050 Ti</product_name>
<product_brand>GeForce</product_brand>
<display_mode>Disabled</display_mode>
<display_active>Disabled</display_active>
<persistence_mode>Disabled</persistence_mode>
<accounting_mode>Disabled</accounting_mode>
<accounting_mode_buffer_size>4000</accounting_mode_buffer_size>
<driver_model>
<current_dm>N/A</current_dm>
<pending_dm>N/A</pending_dm>
</driver_model>
<serial>N/A</serial>
<uuid>GPU-9083fca5-d6f0-3cab-6ae9-15d6e49d5b87</uuid>
<minor_number>0</minor_number>
<vbios_version>86.07.39.00.54</vbios_version>
<multigpu_board>No</multigpu_board>
<board_id>0x2d00</board_id>
<gpu_part_number>N/A</gpu_part_number>
<inforom_version>
<img_version>G001.0000.01.04</img_version>
<oem_object>1.1</oem_object>
<ecc_object>N/A</ecc_object>
<pwr_object>N/A</pwr_object>
</inforom_version>
<gpu_operation_mode>
<current_gom>N/A</current_gom>
<pending_gom>N/A</pending_gom>
</gpu_operation_mode>
<gpu_virtualization_mode>
<virtualization_mode>None</virtualization_mode>
</gpu_virtualization_mode>
<ibmnpu>
<relaxed_ordering_mode>N/A</relaxed_ordering_mode>
</ibmnpu>
<pci>
<pci_bus>2D</pci_bus>
<pci_device>00</pci_device>
<pci_domain>0000</pci_domain>
<pci_device_id>1C8210DE</pci_device_id>
<pci_bus_id>00000000:2D:00.0</pci_bus_id>
<pci_sub_system_id>372A1458</pci_sub_system_id>
<pci_gpu_link_info>
<pcie_gen>
<max_link_gen>3</max_link_gen>
<current_link_gen>3</current_link_gen>
</pcie_gen>
<link_widths>
<max_link_width>16x</max_link_width>
<current_link_width>8x</current_link_width>
</link_widths>
</pci_gpu_link_info>
<pci_bridge_chip>
<bridge_chip_type>N/A</bridge_chip_type>
<bridge_chip_fw>N/A</bridge_chip_fw>
</pci_bridge_chip>
<replay_counter>0</replay_counter>
<replay_rollover_counter>0</replay_rollover_counter>
<tx_util>0 KB/s</tx_util>
<rx_util>0 KB/s</rx_util>
</pci>
<fan_speed>0 %</fan_speed>
<performance_state>P0</performance_state>
<clocks_throttle_reasons>
<clocks_throttle_reason_gpu_idle>Not Active</clocks_throttle_reason_gpu_idle>
<clocks_throttle_reason_applications_clocks_setting>Not Active</clocks_throttle_reason_applications_clocks_setting>
<clocks_throttle_reason_sw_power_cap>Not Active</clocks_throttle_reason_sw_power_cap>
<clocks_throttle_reason_hw_slowdown>Not Active</clocks_throttle_reason_hw_slowdown>
<clocks_throttle_reason_hw_thermal_slowdown>Not Active</clocks_throttle_reason_hw_thermal_slowdown>
<clocks_throttle_reason_hw_power_brake_slowdown>Not Active</clocks_throttle_reason_hw_power_brake_slowdown>
<clocks_throttle_reason_sync_boost>Not Active</clocks_throttle_reason_sync_boost>
<clocks_throttle_reason_sw_thermal_slowdown>Not Active</clocks_throttle_reason_sw_thermal_slowdown>
<clocks_throttle_reason_display_clocks_setting>Not Active</clocks_throttle_reason_display_clocks_setting>
</clocks_throttle_reasons>
<fb_memory_usage>
<total>4040 MiB</total>
<used>0 MiB</used>
<free>4040 MiB</free>
</fb_memory_usage>
<bar1_memory_usage>
<total>256 MiB</total>
<used>2 MiB</used>
<free>254 MiB</free>
</bar1_memory_usage>
<compute_mode>Default</compute_mode>
<utilization>
<gpu_util>2 %</gpu_util>
<memory_util>0 %</memory_util>
<encoder_util>0 %</encoder_util>
<decoder_util>0 %</decoder_util>
</utilization>
<encoder_stats>
<session_count>0</session_count>
<average_fps>0</average_fps>
<average_latency>0</average_latency>
</encoder_stats>
<fbc_stats>
<session_count>0</session_count>
<average_fps>0</average_fps>
<average_latency>0</average_latency>
</fbc_stats>
<ecc_mode>
<current_ecc>N/A</current_ecc>
<pending_ecc>N/A</pending_ecc>
</ecc_mode>
<ecc_errors>
<volatile>
<single_bit>
<device_memory>N/A</device_memory>
<register_file>N/A</register_file>
<l1_cache>N/A</l1_cache>
<l2_cache>N/A</l2_cache>
<texture_memory>N/A</texture_memory>
<texture_shm>N/A</texture_shm>
<cbu>N/A</cbu>
<total>N/A</total>
</single_bit>
<double_bit>
<device_memory>N/A</device_memory>
<register_file>N/A</register_file>
<l1_cache>N/A</l1_cache>
<l2_cache>N/A</l2_cache>
<texture_memory>N/A</texture_memory>
<texture_shm>N/A</texture_shm>
<cbu>N/A</cbu>
<total>N/A</total>
</double_bit>
</volatile>
<aggregate>
<single_bit>
<device_memory>N/A</device_memory>
<register_file>N/A</register_file>
<l1_cache>N/A</l1_cache>
<l2_cache>N/A</l2_cache>
<texture_memory>N/A</texture_memory>
<texture_shm>N/A</texture_shm>
<cbu>N/A</cbu>
<total>N/A</total>
</single_bit>
<double_bit>
<device_memory>N/A</device_memory>
<register_file>N/A</register_file>
<l1_cache>N/A</l1_cache>
<l2_cache>N/A</l2_cache>
<texture_memory>N/A</texture_memory>
<texture_shm>N/A</texture_shm>
<cbu>N/A</cbu>
<total>N/A</total>
</double_bit>
</aggregate>
</ecc_errors>
<retired_pages>
<multiple_single_bit_retirement>
<retired_count>N/A</retired_count>
<retired_pagelist>N/A</retired_pagelist>
</multiple_single_bit_retirement>
<double_bit_retirement>
<retired_count>N/A</retired_count>
<retired_pagelist>N/A</retired_pagelist>
</double_bit_retirement>
<pending_blacklist>N/A</pending_blacklist>
</retired_pages>
<temperature>
<gpu_temp>56 C</gpu_temp>
<gpu_temp_max_threshold>102 C</gpu_temp_max_threshold>
<gpu_temp_slow_threshold>99 C</gpu_temp_slow_threshold>
<gpu_temp_max_gpu_threshold>N/A</gpu_temp_max_gpu_threshold>
<memory_temp>N/A</memory_temp>
<gpu_temp_max_mem_threshold>N/A</gpu_temp_max_mem_threshold>
</temperature>
<power_readings>
<power_state>P0</power_state>
<power_management>Supported</power_management>
<power_draw>N/A</power_draw>
<power_limit>120.00 W</power_limit>
<default_power_limit>120.00 W</default_power_limit>
<enforced_power_limit>120.00 W</enforced_power_limit>
<min_power_limit>52.50 W</min_power_limit>
<max_power_limit>150.00 W</max_power_limit>
</power_readings>
<clocks>
<graphics_clock>1366 MHz</graphics_clock>
<sm_clock>1366 MHz</sm_clock>
<mem_clock>3504 MHz</mem_clock>
<video_clock>1240 MHz</video_clock>
</clocks>
<applications_clocks>
<graphics_clock>N/A</graphics_clock>
<mem_clock>N/A</mem_clock>
</applications_clocks>
<default_applications_clocks>
<graphics_clock>N/A</graphics_clock>
<mem_clock>N/A</mem_clock>
</default_applications_clocks>
<max_clocks>
<graphics_clock>1987 MHz</graphics_clock>
<sm_clock>1987 MHz</sm_clock>
<mem_clock>3504 MHz</mem_clock>
<video_clock>1708 MHz</video_clock>
</max_clocks>
<max_customer_boost_clocks>
<graphics_clock>N/A</graphics_clock>
</max_customer_boost_clocks>
<clock_policy>
<auto_boost>N/A</auto_boost>
<auto_boost_default>N/A</auto_boost_default>
</clock_policy>
<supported_clocks>N/A</supported_clocks>
<processes>
</processes>
<accounted_processes>
</accounted_processes>
</gpu>
</nvidia_smi_log>
I looked around and I think you might be running into this issue:
Sounds like power measurements for `nvidia-smi` aren't going to be enabled anytime soon. Could there be a workaround for 1050 Ti cards that reads `pwrDraw` from `nvidia-smi stats`?
I don't think we would want to run two commands, especially if the actual data is somewhat unreliable. However, it would be fairly easy to create a script for the exec plugin that adds this data.
I'm going to close this issue since I don't think we will take any action in Telegraf.
Running Telegraf in container based on latest https://hub.docker.com/_/telegraf/
Relevant telegraf.conf:
System info:
unraid 6.7.2 with nvidia driver version 430.14
Steps to reproduce:
Expected behavior:
Have a power_draw field
Actual behavior:
Not having a power_draw field
Additional info:
But nvidia-smi stats contains info about pwrDraw :/