ilovepancakes95 / idrac_snmp-grafana

SNMP Based Dashboard to Monitor Dell Hosts via iDRAC
https://grafana.com/grafana/dashboards/12106
Other
139 stars 36 forks source link

gathering table idrac-hosts: performing bulk walk for field bios-version: request timeout - IDRAC9 #27

Closed jeevadotnet closed 1 year ago

jeevadotnet commented 1 year ago

I've installed the dashboard as per the readme file and getting the following issues: 1) Dashboard is empty 2) tailing the journalctl -u telegraf -f shows: gathering table idrac-hosts: performing bulk walk for field bios-version: request timeout

Sep 26 13:27:40 A-08-32-mon telegraf[775207]: 2022-09-26T11:27:40Z E! [inputs.snmp] Error in plugin: agent 10.102.72.53:161: gathering table idrac-hosts: performing bulk walk for field bios-version: request timeout (after 3 retries)
Sep 26 13:27:40 A-08-32-mon telegraf[775207]: 2022-09-26T11:27:40Z E! [inputs.snmp] Error in plugin: agent 10.102.72.38:161: gathering table idrac-hosts: performing bulk walk for field bios-version: request timeout (after 3 retries)

awk '!/^ *#/ && NF' telegraf.conf

[agent]
  interval = "180s"
  round_interval = true
  metric_batch_size = 1000
  metric_buffer_limit = 10000
  collection_jitter = "0s"
  flush_interval = "180s"
  flush_jitter = "0s"
  precision = "0s"
  hostname = ""
  omit_hostname = false
[[outputs.influxdb]]
[[processors.regex]]
  [[processors.regex.fields]]
    key = "log-dates"
    pattern = "^(?P<YYYY>\\d{4})(?P<MM>\\d{2})(?P<DD>\\d{2})(?P<HH>\\d{2})(?P<mm>\\d{2})(?P<ss>\\d{2})\\.(?P<SSSSSS>\\d{6})(?P<ZZ>[-+]\\d{3,4})$"
    replacement = "${YYYY}-${MM}-${DD} ${HH}:${mm}:${ss}"
[[inputs.snmp]]
  agents = [ "10.102.72.38:161" , "10.102.72.53:161" , "10.102.72.55:161" , "10.102.72.44:161" ]
  version = 1
  community = "public"
  name = "idrac-hosts"
  [[inputs.snmp.field]]
     name = "system-name"
     oid  = ".1.3.6.1.2.1.1.5.0"
     is_tag = true
  [[inputs.snmp.field]]
     name = "system-osname"
     oid  = ".1.3.6.1.4.1.674.10892.5.1.3.6.0"
  [[inputs.snmp.field]]
     name = "system-osversion"
     oid  = ".1.3.6.1.4.1.674.10892.5.1.3.14.0"
  [[inputs.snmp.field]]
     name = "system-model"
     oid  = ".1.3.6.1.4.1.674.10892.5.1.3.12.0"
  [[inputs.snmp.field]]
     name = "idrac-url"
     oid  = ".1.3.6.1.4.1.674.10892.5.1.1.6.0"
  [[inputs.snmp.field]]
     name = "power-state"
     oid  = ".1.3.6.1.4.1.674.10892.5.2.4.0"
  [[inputs.snmp.field]]
     name = "system-uptime"
     oid  = ".1.3.6.1.4.1.674.10892.5.2.5.0"
  [[inputs.snmp.field]]
     name = "system-servicetag"
     oid  = ".1.3.6.1.4.1.674.10892.5.1.3.2.0"
  [[inputs.snmp.field]]
     name = "system-globalstatus"
     oid  = ".1.3.6.1.4.1.674.10892.5.2.1.0"
  [[inputs.snmp.table]]
     name = "idrac-hosts"
     inherit_tags = [ "system-name" , "disks-name" ]
    [[inputs.snmp.table.field]]
       name = "bios-version"
       oid = ".1.3.6.1.4.1.674.10892.5.4.300.50.1.8"
    [[inputs.snmp.table.field]]
       name = "raid-batterystate"
       oid = ".1.3.6.1.4.1.674.10892.5.5.1.20.130.15.1.4"
    [[inputs.snmp.table.field]]
       name = "intrusion-sensor"
       oid = ".1.3.6.1.4.1.674.10892.5.4.300.70.1.6"
    [[inputs.snmp.table.field]]
       name = "disks-mediatype"
       oid = ".1.3.6.1.4.1.674.10892.5.5.1.20.130.4.1.35"
    [[inputs.snmp.table.field]]
       name = "disks-state"
       oid = ".1.3.6.1.4.1.674.10892.5.5.1.20.130.4.1.4"
    [[inputs.snmp.table.field]]
       name = "disks-predictivefail"
       oid = ".1.3.6.1.4.1.674.10892.5.5.1.20.130.4.1.31"
    [[inputs.snmp.table.field]]
       name = "disks-capacity"
       oid = ".1.3.6.1.4.1.674.10892.5.5.1.20.130.4.1.11"
    [[inputs.snmp.table.field]]
       name = "disks-name"
       oid = ".1.3.6.1.4.1.674.10892.5.5.1.20.130.4.1.2"
       is_tag = true
    [[inputs.snmp.table.field]]
       name = "memory-status"
       oid = ".1.3.6.1.4.1.674.10892.5.4.200.10.1.27"
    [[inputs.snmp.table.field]]
       name = "storage-status"
       oid = ".1.3.6.1.4.1.674.10892.5.2.3"
    [[inputs.snmp.table.field]]
       name = "temp-status"
       oid = ".1.3.6.1.4.1.674.10892.5.4.200.10.1.63"
    [[inputs.snmp.table.field]]
       name = "psu-status"
       oid = ".1.3.6.1.4.1.674.10892.5.4.200.10.1.9"
    [[inputs.snmp.table.field]]
       name = "log-dates"
       oid = ".1.3.6.1.4.1.674.10892.5.4.300.40.1.8"
    [[inputs.snmp.table.field]]
       name = "log-entry"
       oid = ".1.3.6.1.4.1.674.10892.5.4.300.40.1.5"
    [[inputs.snmp.table.field]]
       name = "log-severity"
       oid = ".1.3.6.1.4.1.674.10892.5.4.300.40.1.7"
    [[inputs.snmp.table.field]]
       name = "log-number"
       oid = ".1.3.6.1.4.1.674.10892.5.4.300.40.1.2"
       is_tag = true
    [[inputs.snmp.table.field]]
       name = "nic-name"
       oid = ".1.3.6.1.4.1.674.10892.5.4.1100.90.1.30"
       is_tag = true
    [[inputs.snmp.table.field]]
       name = "nic-vendor"
       oid = ".1.3.6.1.4.1.674.10892.5.4.1100.90.1.7"
    [[inputs.snmp.table.field]]
       name = "nic-status"
       oid = ".1.3.6.1.4.1.674.10892.5.4.1100.90.1.4"
    [[inputs.snmp.table.field]]
       name = "nic-current_mac"
       oid = ".1.3.6.1.4.1.674.10892.5.4.1100.90.1.15"
       conversion = "hwaddr"
  [[inputs.snmp.field]]
     name = "fan1-speed"
     oid  = ".1.3.6.1.4.1.674.10892.5.4.700.12.1.6.1.1"
  [[inputs.snmp.field]]
     name = "fan2-speed"
     oid  = ".1.3.6.1.4.1.674.10892.5.4.700.12.1.6.1.2"
  [[inputs.snmp.field]]
     name = "fan3-speed"
     oid  = ".1.3.6.1.4.1.674.10892.5.4.700.12.1.6.1.3"
  [[inputs.snmp.field]]
     name = "fan4-speed"
     oid  = ".1.3.6.1.4.1.674.10892.5.4.700.12.1.6.1.4"
  [[inputs.snmp.field]]
     name = "fan5-speed"
     oid  = ".1.3.6.1.4.1.674.10892.5.4.700.12.1.6.1.5"
  [[inputs.snmp.field]]
     name = "fan6-speed"
     oid  = ".1.3.6.1.4.1.674.10892.5.4.700.12.1.6.1.6"
  [[inputs.snmp.field]]
     name = "inlet-temp"
     oid  = ".1.3.6.1.4.1.674.10892.5.4.700.20.1.6.1.1"
  [[inputs.snmp.field]]
     name = "exhaust-temp"
     oid  = ".1.3.6.1.4.1.674.10892.5.4.700.20.1.6.1.2"
  [[inputs.snmp.field]]
     name = "cpu1-temp"
     oid  = ".1.3.6.1.4.1.674.10892.5.4.700.20.1.6.1.3"
  [[inputs.snmp.field]]
     name = "cpu2-temp"
     oid  = ".1.3.6.1.4.1.674.10892.5.4.700.20.1.6.1.4"
  [[inputs.snmp.field]]
     name = "cmos-batterystate"
     oid  = ".1.3.6.1.4.1.674.10892.5.4.600.50.1.6.1.1"
  [[inputs.snmp.field]]
     name = "system-watts"
     oid  = ".1.3.6.1.4.1.674.10892.5.4.600.30.1.6.1.3"
[[inputs.cpu]]
  percpu = true
  totalcpu = true
  collect_cpu_time = false
  report_active = false
  core_tags = false
[[inputs.disk]]
  ignore_fs = ["tmpfs", "devtmpfs", "devfs", "iso9660", "overlay", "aufs", "squashfs"]
[[inputs.mem]]
[[inputs.net]]
[[inputs.netstat]]
[[inputs.swap]]
[[inputs.system]]

Influx Version: InfluxDB v1.8.10 Telegraf Version: Telegraf 1.24.1 Operating System: Ubuntu 20.04.4 LTS Grafana: 9.06 IDRAC: 9

ilovepancakes95 commented 1 year ago

Hello, first off, I see you are using iDRAC v9. I have not tested with v9 ever so not even sure if the SNMP fields and OIDs are the same. Although, it seems your error is simply telegraph not being able to connect to those 2 specific IPs and ports. Are you sure you have the "public" SNMP enabled on those iDRACs and available at those IPs and port? Is there a firewall or something blocking communication perhaps?

jeevadotnet commented 1 year ago

Hello, first off, I see you are using iDRAC v9. I have not tested with v9 ever so not even sure if the SNMP fields and OIDs are the same. Although, it seems your error is simply telegraph not being able to connect to those 2 specific IPs and ports. Are you sure you have the "public" SNMP enabled on those iDRACs and available at those IPs and port? Is there a firewall or something blocking communication perhaps?

Hi, I came right in the meantime.

We had a network misconfiguration between our monitoring box and the IDRAC VLAN, so that sorted out the timeouts. Because snmpwalk & snmpget works fine but the udp://161 wasn't translating through.

Regarding IDRAC9, I've been systematically been working through the Version 10 reference guide to match up the OID's with the GEN15. A couple of items missing in the latest version such as system-watts which is the main reason why I came across your dashboard & telegraf/influxdb setup. Still playing around with that to see how I can accurately get out wattage. for e.g. I have 2x exactly the same servers, one will work correctly with your @ilovepancakes95 system-watts OID, the other one not. Which is a bit baffling.