AnalogJ / scrutiny

Hard Drive S.M.A.R.T Monitoring, Historical Trends & Real World Failure Thresholds
MIT License
5.07k stars 165 forks source link

[BUG] scrutiny collector incorrectly identifies a drive as not having smart capabilities and ignores the metrics smart data #640

Open sn3ak opened 4 months ago

sn3ak commented 4 months ago

Describe the bug I have scrutiny working well on multiple hosts. On this host it sends the nvme information to create an entry in the dashboard but claims the drive doesn't have smart support and appears to ignore the smart data that is sent afterwards. The dashboard has no information for the nvme drive.

Expected behavior smart data to be published for the drive

Screenshots Screenshot 2024-05-15 143537

Log Files /opt/scrutiny/bin/scrutiny-collector-metrics-linux-amd64 run --api-endpoint "http://10.0.0.47:8080" --config /opt/scrutiny/bin/collector.yaml --debug

2024/05/15 14:38:23 No configuration file found at /opt/scrutiny/config/collector.yaml. Using Defaults.

 ___   ___  ____  __  __  ____  ____  _  _  _  _
/ __) / __)(  _ \(  )(  )(_  _)(_  _)( \( )( \/ )
\__ \( (__  )   / )(__)(   )(   _)(_  )  (  \  /
(___/ \___)(_)\_)(______) (__) (____)(_)\_) (__)
AnalogJ/scrutiny/metrics                        linux.amd64-0.8.1

2024/05/15 14:38:23 Loading configuration file: /opt/scrutiny/bin/collector.yaml
DEBU[0000] {
        "api": {
                "endpoint": "http://10.0.0.47:8080/"
        },
        "commands": {
                "metrics_info_args": "--info --json",
                "metrics_scan_args": "--scan --json",
                "metrics_smart_args": "--xall --json",
                "metrics_smartctl_bin": "smartctl"
        },
        "devices": [],
        "host": {
                "id": "dockerhost"
        },
        "log": {
                "file": "",
                "level": "DEBUG"
        },
        "version": 1
}<nil>  type=metrics
INFO[0000] Verifying required tools                      type=metrics
INFO[0000] Executing command: smartctl --scan --json     type=metrics
{
  "json_format_version": [
    1,
    0
  ],
  "smartctl": {
    "version": [
      7,
      3
    ],
    "svn_revision": "5338",
    "platform_info": "x86_64-linux-6.5.13-5-pve",
    "build_info": "(local build)",
    "argv": [
      "smartctl",
      "--scan",
      "--json"
    ],
    "exit_status": 0
  },
  "devices": [
    {
      "name": "/dev/nvme0",
      "info_name": "/dev/nvme0",
      "type": "nvme",
      "protocol": "NVMe"
    }
  ]
}
INFO[0000] Executing command: smartctl --info --json --device nvme /dev/nvme0  type=metrics
{
  "json_format_version": [
    1,
    0
  ],
  "smartctl": {
    "version": [
      7,
      3
    ],
    "svn_revision": "5338",
    "platform_info": "x86_64-linux-6.5.13-5-pve",
    "build_info": "(local build)",
    "argv": [
      "smartctl",
      "--info",
      "--json",
      "--device",
      "nvme",
      "/dev/nvme0"
    ],
    "exit_status": 0
  },
  "local_time": {
    "time_t": 1715809104,
    "asctime": "Wed May 15 14:38:24 2024 PDT"
  },
  "device": {
    "name": "/dev/nvme0",
    "info_name": "/dev/nvme0",
    "type": "nvme",
    "protocol": "NVMe"
  },
  "model_name": "SK hynix PC401 HFS256GD9TNG-62A0A",
  "serial_number": "EI82N045010803D5J",
  "firmware_version": "80000E00",
  "nvme_pci_vendor": {
    "id": 7260,
    "subsystem_id": 7260
  },
  "nvme_ieee_oui_identifier": 11330606,
  "nvme_controller_id": 1,
  "nvme_version": {
    "string": "1.2.1",
    "value": 66049
  },
  "nvme_number_of_namespaces": 1,
  "nvme_namespaces": [
    {
      "id": 1,
      "size": {
        "blocks": 500118192,
        "bytes": 256060514304
      },
      "capacity": {
        "blocks": 500118192,
        "bytes": 256060514304
      },
      "utilization": {
        "blocks": 82316912,
        "bytes": 42146258944
      },
      "formatted_lba_size": 512,
      "eui64": {
        "oui": 11330606,
        "ext_id": 555930308083
      }
    }
  ],
  "user_capacity": {
    "blocks": 500118192,
    "bytes": 256060514304
  },
  "logical_block_size": 512,
  "smart_support": {
    "available": true,
    "enabled": true
  }
}
INFO[0000] Using WWN Fallback                            type=metrics
DEBU[0000] WWN is empty, falling back to serial number: EI82N045010803D5J  type=metrics
INFO[0000] Sending detected devices to API, for filtering & validation  type=metrics
DEBU[0000] Detected devices: [{"wwn":"ei82n045010803d5j","device_name":"nvme0","device_uuid":"","device_serial_id":"","device_label":"","manufacturer":"","model_name":"SK hynix PC401 HFS256GD9TNG-62A0A","interface_type":"","interface_speed":"","serial_number":"EI82N045010803D5J","firmware":"80000E00","rotational_speed":0,"capacity":256060514304,"form_factor":"","smart_support":false,"device_protocol":"NVMe","device_type":"nvme","label":"","host_id":"dockerhost"}]  type=metrics
DEBU[0000] &{true [] [{ei82n045010803d5j nvme0     SK hynix PC401 HFS256GD9TNG-62A0A   EI82N045010803D5J 80000E00 0 256060514304  false NVMe nvme  dockerhost}]}  type=metrics
INFO[0000] Collecting smartctl results for nvme0         type=metrics
INFO[0000] Executing command: smartctl --xall --json --device nvme /dev/nvme0  type=metrics
{
  "json_format_version": [
    1,
    0
  ],
  "smartctl": {
    "version": [
      7,
      3
    ],
    "svn_revision": "5338",
    "platform_info": "x86_64-linux-6.5.13-5-pve",
    "build_info": "(local build)",
    "argv": [
      "smartctl",
      "--xall",
      "--json",
      "--device",
      "nvme",
      "/dev/nvme0"
    ],
    "uint128_precision_bits": 128,
    "exit_status": 0
  },
  "local_time": {
    "time_t": 1715809104,
    "asctime": "Wed May 15 14:38:24 2024 PDT"
  },
  "device": {
    "name": "/dev/nvme0",
    "info_name": "/dev/nvme0",
    "type": "nvme",
    "protocol": "NVMe"
  },
  "model_name": "SK hynix PC401 HFS256GD9TNG-62A0A",
  "serial_number": "EI82N045010803D5J",
  "firmware_version": "80000E00",
  "nvme_pci_vendor": {
    "id": 7260,
    "subsystem_id": 7260
  },
  "nvme_ieee_oui_identifier": 11330606,
  "nvme_controller_id": 1,
  "nvme_version": {
    "string": "1.2.1",
    "value": 66049
  },
  "nvme_number_of_namespaces": 1,
  "nvme_namespaces": [
    {
      "id": 1,
      "size": {
        "blocks": 500118192,
        "bytes": 256060514304
      },
      "capacity": {
        "blocks": 500118192,
        "bytes": 256060514304
      },
      "utilization": {
        "blocks": 82316912,
        "bytes": 42146258944
      },
      "formatted_lba_size": 512,
      "eui64": {
        "oui": 11330606,
        "ext_id": 555930308083
      }
    }
  ],
  "user_capacity": {
    "blocks": 500118192,
    "bytes": 256060514304
  },
  "logical_block_size": 512,
  "smart_support": {
    "available": true,
    "enabled": true
  },
  "smart_status": {
    "passed": true,
    "nvme": {
      "value": 0
    }
  },
  "nvme_smart_health_information_log": {
    "critical_warning": 0,
    "temperature": 38,
    "available_spare": 96,
    "available_spare_threshold": 5,
    "percentage_used": 1,
    "data_units_read": 2941826,
    "data_units_written": 5163533,
    "host_reads": 25369949,
    "host_writes": 37221747,
    "controller_busy_time": 6044,
    "power_cycles": 613,
    "power_on_hours": 256,
    "unsafe_shutdowns": 32,
    "media_errors": 79228162514264337593543950348,
    "media_errors_s": "79228162514264337593543950348",
    "media_errors_le": [
      12,
      0,
      0,
      0,
      0,
      0,
      0,
      0,
      0,
      0,
      0,
      0,
      1
    ],
    "num_err_log_entries": 0,
    "warning_temp_time": 0,
    "critical_comp_time": 0,
    "temperature_sensors": [
      38,
      43
    ]
  },
  "temperature": {
    "current": 38
  },
  "power_cycle_count": 613,
  "power_on_time": {
    "hours": 256
  }
}
INFO[0000] Publishing smartctl results for ei82n045010803d5j  type=metrics
INFO[0000] Main: Completed                               type=metrics

Here I trimmed the output to what I believe the problem is:

  "smart_support": {
    "available": true,
    "enabled": true
  }
}
INFO[0000] Using WWN Fallback                            type=metrics
DEBU[0000] WWN is empty, falling back to serial number: EI82N045010803D5J  type=metrics
INFO[0000] Sending detected devices to API, for filtering & validation  type=metrics
DEBU[0000] Detected devices: [{"wwn":"ei82n045010803d5j","device_name":"nvme0","device_uuid":"","device_serial_id":"","device_label":"","manufacturer":"","model_name":"SK hynix PC401 HFS256GD9TNG-62A0A","interface_type":"","interface_speed":"","serial_number":"EI82N045010803D5J","firmware":"80000E00","rotational_speed":0,"capacity":256060514304,"form_factor":"","smart_support":false,

smartctl is correctly returning that the nvme has smart available and that it is enabled. However, scrutiny sets smart_support to false. It then appears to ignore the next part, which includes the smart data for the drive.

sn3ak commented 4 months ago

Here are the logs from the server:

/opt/scrutiny/bin/scrutiny-web-freebsd-amd64 start --debug

2024/05/15 15:30:54 Loading configuration file: /opt/scrutiny/config/scrutiny.yaml

 ___   ___  ____  __  __  ____  ____  _  _  _  _
/ __) / __)(  _ \(  )(  )(_  _)(_  _)( \( )( \/ )
\__ \( (__  )   / )(__)(   )(   _)(_  )  (  \  /
(___/ \___)(_)\_)(______) (__) (____)(_)\_) (__)
github.com/AnalogJ/scrutiny                   freebsd.amd64-0.8.1

Start the scrutiny server
DEBU[0000] {"log":{"file":"","level":"DEBUG"},"notify":{"urls":["generic://ha.mydomain.net/api/webhook/nope?template=json","pushover://shouldrrr:nope/"]},"version":1,"web":{"database":{"location":"/opt/scrutiny/config/scrutiny.db"},"influxdb":{"bucket":"metrics","host":"10.0.0.48","init_password":"nope","init_username":"admin","org":"scrutiny","port":8086,"retention_policy":true,"scheme":"http","tls":{"insecure_skip_verify":false},"token":"scrutiny-default-admin-token"},"listen":{"basepath":"","host":"0.0.0.0","port":"8080"},"src":{"frontend":{"path":"/opt/scrutiny/web"}}}}<nil>  type=web
[GIN-debug] [WARNING] Running in "debug" mode. Switch to "release" mode in production.
 - using env:   export GIN_MODE=release
 - using code:  gin.SetMode(gin.ReleaseMode)

INFO[0000] Trying to connect to scrutiny sqlite db: /opt/scrutiny/config/scrutiny.db  type=web
INFO[0000] Successfully connected to scrutiny sqlite db: /opt/scrutiny/config/scrutiny.db  type=web
DEBU[0000] InfluxDB url: http://10.0.0.48:8086           type=web
INFO[0000] InfluxDB certificate verification: true       type=web
DEBU[0000] Determine Influxdb setup status...            type=web
INFO[0000] Database migration starting. Please wait, this process may take a long time....  type=web
INFO[0000] Database migration completed successfully     type=web
INFO[0000] SQLite global configuration migrations starting. Please wait....  type=web
INFO[0000] SQLite global configuration migrations completed successfully  type=web
DEBU[0000] basepath:                                     type=web
[GIN-debug] GET    /api/health               --> github.com/analogj/scrutiny/webapp/backend/pkg/web/handler.HealthCheck (5 handlers)
[GIN-debug] POST   /api/health/notify        --> github.com/analogj/scrutiny/webapp/backend/pkg/web/handler.SendTestNotification (5 handlers)
[GIN-debug] POST   /api/devices/register     --> github.com/analogj/scrutiny/webapp/backend/pkg/web/handler.RegisterDevices (5 handlers)
[GIN-debug] GET    /api/summary              --> github.com/analogj/scrutiny/webapp/backend/pkg/web/handler.GetDevicesSummary (5 handlers)
[GIN-debug] GET    /api/summary/temp         --> github.com/analogj/scrutiny/webapp/backend/pkg/web/handler.GetDevicesSummaryTempHistory (5 handlers)
[GIN-debug] POST   /api/device/:wwn/smart    --> github.com/analogj/scrutiny/webapp/backend/pkg/web/handler.UploadDeviceMetrics (5 handlers)
[GIN-debug] POST   /api/device/:wwn/selftest --> github.com/analogj/scrutiny/webapp/backend/pkg/web/handler.UploadDeviceSelfTests (5 handlers)
[GIN-debug] GET    /api/device/:wwn/details  --> github.com/analogj/scrutiny/webapp/backend/pkg/web/handler.GetDeviceDetails (5 handlers)
[GIN-debug] DELETE /api/device/:wwn          --> github.com/analogj/scrutiny/webapp/backend/pkg/web/handler.DeleteDevice (5 handlers)
[GIN-debug] GET    /api/settings             --> github.com/analogj/scrutiny/webapp/backend/pkg/web/handler.GetSettings (5 handlers)
[GIN-debug] POST   /api/settings             --> github.com/analogj/scrutiny/webapp/backend/pkg/web/handler.SaveSettings (5 handlers)
[GIN-debug] GET    /web/*filepath            --> github.com/gin-gonic/gin.(*RouterGroup).createStaticHandler.func1 (5 handlers)
[GIN-debug] HEAD   /web/*filepath            --> github.com/gin-gonic/gin.(*RouterGroup).createStaticHandler.func1 (5 handlers)
[GIN-debug] GET    /                         --> github.com/analogj/scrutiny/webapp/backend/pkg/web.(*AppEngine).Setup.func1 (5 handlers)
[GIN-debug] Listening and serving HTTP on 0.0.0.0:8080
INFO[0004] 10.0.0.3 - scrutiny.nope.lcl [15/May/2024:15:30:59 -0700] "POST /api/devices/register" 200 581 "" "Go-http-client/1.1" (3ms)  clientIP=10.0.0.3 hostname=scrutiny.nope.lcl latency=3 method=POST path=/api/devices/register referer= respLength=581 statusCode=200 type=web userAgent=Go-http-client/1.1
DEBU[0004] {"data":[{"wwn":"ei82n045010803d5j","device_name":"nvme0","device_uuid":"","device_serial_id":"","device_label":"","manufacturer":"","model_name":"SK hynix PC401 HFS256GD9TNG-62A0A","interface_type":"","interface_speed":"","serial_number":"EI82N045010803D5J","firmware":"80000E00","rotational_speed":0,"capacity":256060514304,"form_factor":"","smart_support":false,"device_protocol":"NVMe","device_type":"nvme","label":"","host_id":"dockerhost"}]}  bodyType=request clientIP=10.0.0.3 hostname=scrutiny.nope.lcl latency=3 method=POST path=/api/devices/register referer= respLength=581 statusCode=200 type=web userAgent=Go-http-client/1.1
DEBU[0004] {"success":true,"errors":null,"data":[{"CreatedAt":"0001-01-01T00:00:00Z","UpdatedAt":"0001-01-01T00:00:00Z","DeletedAt":null,"wwn":"ei82n045010803d5j","device_name":"nvme0","device_uuid":"","device_serial_id":"","device_label":"","manufacturer":"","model_name":"SK hynix PC401 HFS256GD9TNG-62A0A","interface_type":"","interface_speed":"","serial_number":"EI82N045010803D5J","firmware":"80000E00","rotational_speed":0,"capacity":256060514304,"form_factor":"","smart_support":false,"device_protocol":"NVMe","device_type":"nvme","label":"","host_id":"dockerhost","device_status":0}]}  bodyType=response clientIP=10.0.0.3 hostname=scrutiny.nope.lcl latency=3 method=POST path=/api/devices/register referer= respLength=581 statusCode=200 type=web userAgent=Go-http-client/1.1
ERRO[0004] Cannot parse SMART data json: cannot unmarshal number 79228162514264337593543950348 into Go struct field NvmeSmartHealthInformationLog.nvme_smart_health_information_log.media_errors of type int64  type=web
[GIN-debug] [WARNING] Headers were already written. Wanted to override status code 400 with 500
ERRO[0004]                                               clientIP=10.0.0.3 hostname=scrutiny.nope.lcl latency=4 method=POST path=/api/device/ei82n045010803d5j/smart referer= respLength=17 statusCode=500 type=web userAgent=Go-http-client/1.1
DEBU[0004] {
  "json_format_version": [
    1,
    0
  ],
  "smartctl": {
    "version": [
      7,
      3
    ],
    "svn_revision": "5338",
    "platform_info": "x86_64-linux-6.5.13-5-pve",
    "build_info": "(local build)",
    "argv": [
      "smartctl",
      "--xall",
      "--json",
      "--device",
      "nvme",
      "/dev/nvme0"
    ],
    "uint128_precision_bits": 128,
    "exit_status": 0
  },
  "local_time": {
    "time_t": 1715812259,
    "asctime": "Wed May 15 15:30:59 2024 PDT"
  },
  "device": {
    "name": "/dev/nvme0",
    "info_name": "/dev/nvme0",
    "type": "nvme",
    "protocol": "NVMe"
  },
  "model_name": "SK hynix PC401 HFS256GD9TNG-62A0A",
  "serial_number": "EI82N045010803D5J",
  "firmware_version": "80000E00",
  "nvme_pci_vendor": {
    "id": 7260,
    "subsystem_id": 7260
  },
  "nvme_ieee_oui_identifier": 11330606,
  "nvme_controller_id": 1,
  "nvme_version": {
    "string": "1.2.1",
    "value": 66049
  },
  "nvme_number_of_namespaces": 1,
  "nvme_namespaces": [
    {
      "id": 1,
      "size": {
        "blocks": 500118192,
        "bytes": 256060514304
      },
      "capacity": {
        "blocks": 500118192,
        "bytes": 256060514304
      },
      "utilization": {
        "blocks": 82316912,
        "bytes": 42146258944
      },
      "formatted_lba_size": 512,
      "eui64": {
        "oui": 11330606,
        "ext_id": 555930308083
      }
    }
  ],
  "user_capacity": {
    "blocks": 500118192,
    "bytes": 256060514304
  },
  "logical_block_size": 512,
  "smart_support": {
    "available": true,
    "enabled": true
  },
  "smart_status": {
    "passed": true,
    "nvme": {
      "value": 0
    }
  },
  "nvme_smart_health_information_log": {
    "critical_warning": 0,
    "temperature": 38,
    "available_spare": 96,
    "available_spare_threshold": 5,
    "percentage_used": 1,
    "data_units_read": 2941827,
    "data_units_written": 5164988,
    "host_reads": 25369949,
    "host_writes": 37221747,
    "controller_busy_time": 6046,
    "power_cycles": 613,
    "power_on_hours": 256,
    "unsafe_shutdowns": 32,
    "media_errors": 79228162514264337593543950348,
    "media_errors_s": "79228162514264337593543950348",
    "media_errors_le": [
      12,
      0,
      0,
      0,
      0,
      0,
      0,
      0,
      0,
      0,
      0,
      0,
      1
    ],
    "num_err_log_entries": 0,
    "warning_temp_time": 0,
    "critical_comp_time": 0,
    "temperature_sensors": [
      38,
      44
    ]
  },
  "temperature": {
    "current": 38
  },
  "power_cycle_count": 613,
  "power_on_time": {
    "hours": 256
  }
}  bodyType=request clientIP=10.0.0.3 hostname=scrutiny.nope.lcl latency=4 method=POST path=/api/device/ei82n045010803d5j/smart referer= respLength=17 statusCode=500 type=web userAgent=Go-http-client/1.1
DEBU[0004] {"success":false}                             bodyType=response clientIP=10.0.0.3 hostname=scrutiny.nope.lcl latency=4 method=POST path=/api/device/ei82n045010803d5j/smart referer= respLength=17 statusCode=500 type=web userAgent=Go-http-client/1.1

So, it seems to be failing because the media_errors value (79228162514264337593543950348) is too large to fit in the int64 field the server tries to parse it into?