canonical / grafana-agent-k8s-operator

https://charmhub.io/grafana-agent-k8s
Apache License 2.0
8 stars 18 forks source link

Add alert rules for disk utilization #186

Closed rgildein closed 1 year ago

rgildein commented 1 year ago

Context

Moving disk NRPE checks from charm-nrpe.

Testing Instructions

Tested with the following promtool unit-test file (test_disk.yaml):

# promtool unit-test definition for the disk alert rules.
# Rule file(s) loaded for every test case below.
rule_files:
  - disk.rules

# How often the loaded rules are evaluated while a test runs.
evaluation_interval: 1m

# One list entry per scenario; each scenario brings its own input series.
tests:
  # disk usage prediction
  # Feeds `used_disk_space` directly as a staircase (10, 20, ... 100) and
  # checks both the raw series and the linear prediction built on top of it.
  - interval: 1m
    input_series:
      # Each 'VxN' term expands to V followed by N repeats (N+1 samples), so
      # with a 1m interval every plateau lasts slightly more than one hour.
      - series: 'used_disk_space{instance="test-model_1234_test-app_test-app/0", mountpoint="/"}'
        values: '10x60 20x60 30x60 40x60 50x60 60x60 70x60 80x60 90x60 100x60'
    promql_expr_test:
      # Sanity checks on the staircase: still on the first plateau at 1h ...
      - expr: used_disk_space
        eval_time: 1h
        exp_samples:
          - labels: '{__name__="used_disk_space", instance="test-model_1234_test-app_test-app/0", mountpoint="/"}'
            value: 10
      # ... and on the 90 plateau at 9h.
      - expr: used_disk_space
        eval_time: 9h
        exp_samples:
          - labels: '{__name__="used_disk_space", instance="test-model_1234_test-app_test-app/0", mountpoint="/"}'
            value: 90
      # Prediction 6 hours ahead (6*3600 s), fitted over a 6h range.
      - expr: predict_linear(used_disk_space{mountpoint=~"/"}[6h], 6*3600)
        eval_time: 4h
        exp_samples:
          - labels: '{instance="test-model_1234_test-app_test-app/0", mountpoint="/"}'
            value: 99.11525667844037
    alert_rule_test:
      # After 1h the alert must not be active.
      - eval_time: 1h
        alertname: HostDiskSpaceFillsUp
        exp_alerts: []  # no alert
      # After 4h the prediction (~99) is expected to fire the alert.
      - eval_time: 4h
        alertname: HostDiskSpaceFillsUp
        exp_alerts:
          - exp_labels:
              severity: warning
              mountpoint: /
              instance: test-model_1234_test-app_test-app/0
            # The annotation text below must match the rule output exactly.
            # NOTE(review): the description mentions "the last 60 minutes of
            # data" while the expression tested above uses a [6h] range;
            # confirm which window disk.rules really uses and align the wording
            # there (this expectation has to follow the rule verbatim).
            exp_annotations:
              summary: "[Prediction] Host filesystem '/' is using 99% of the total space (instance test-model_1234_test-app_test-app/0)"
              description: >-
                Host filesystem '/' usage can potentially reach 99% of the total space.
                  VALUE = 99.11525667844037
                  LABELS = map[instance:test-model_1234_test-app_test-app/0 mountpoint:/]
                The 6-hour-ahead prediction is made as a linear regression from the last 60 minutes of data.
  # disk usage
  # Drives `used_disk_space` from raw filesystem series: free space drops from
  # 50 to 5 (of a constant size of 100) and recovers again.
  - interval: 1m
    input_series:
      - series: 'node_filesystem_free_bytes{instance="test-model_1234_test-app_test-app/0", mountpoint="/"}'
        values: '50 25 5 25 50'
      # NOTE(review): `node_filesystem_size_bytes_bytes` looks like a doubled
      # `_bytes` suffix (presumably `node_filesystem_size_bytes` was meant).
      # The name must match whatever the rule in disk.rules references, so
      # verify there before renaming it here.
      - series: 'node_filesystem_size_bytes_bytes{instance="test-model_1234_test-app_test-app/0", mountpoint="/"}'
        values: '100x5'
    promql_expr_test:
      # Expected usage is 50 / 75 / 95 for free values 50 / 25 / 5.
      - expr: used_disk_space
        eval_time: 0m
        exp_samples:
          - labels: '{__name__="used_disk_space", instance="test-model_1234_test-app_test-app/0", mountpoint="/"}'
            value: 50
      - expr: used_disk_space
        eval_time: 1m
        exp_samples:
          - labels: '{__name__="used_disk_space", instance="test-model_1234_test-app_test-app/0", mountpoint="/"}'
            value: 75
      - expr: used_disk_space
        eval_time: 2m
        exp_samples:
          - labels: '{__name__="used_disk_space", instance="test-model_1234_test-app_test-app/0", mountpoint="/"}'
            value: 95
    alert_rule_test:
      # 75% used: no alert expected.
      - eval_time: 1m
        alertname: HostDiskSpace
        exp_alerts: []  # no alert
      # 95% used: critical alert expected.
      - eval_time: 2m
        alertname: HostDiskSpace
        exp_alerts:
          - exp_labels:
              severity: critical
              mountpoint: /
              instance: test-model_1234_test-app_test-app/0
            # The annotation text below must match the rule output exactly.
            exp_annotations:
              summary: Host filesystem '/' is using 95% of the total space (instance test-model_1234_test-app_test-app/0)
              description: >-
                Host filesystem '/' is using 95% of the total space.
                  VALUE = 95
                  LABELS = map[__name__:used_disk_space instance:test-model_1234_test-app_test-app/0 mountpoint:/]

  # read only filesystem
  # Two mountpoints on the same instance: a snap mount that is read-only for
  # the whole run and the root filesystem that flips to read-only at 10m only.
  - interval: 1m
    input_series:
      # Constantly read-only; no alert is expected for this mountpoint
      # (presumably the rule filters it out - verify in disk.rules).
      - series: 'node_filesystem_readonly{instance="test-model_1234_test-app_test-app/0", mountpoint="/snap/core22"}'
        values: '1x15'
      # '0x9 1 0x5' = 0 from 0m through 9m, 1 at 10m, 0 afterwards.
      - series: 'node_filesystem_readonly{instance="test-model_1234_test-app_test-app/0", mountpoint="/"}'
        values: '0x9 1 0x5'
    promql_expr_test:
      # At 5m only the snap mount reports read-only.
      - expr: node_filesystem_readonly
        eval_time: 5m
        exp_samples:
          - labels: 'node_filesystem_readonly{instance="test-model_1234_test-app_test-app/0", mountpoint="/snap/core22"}'
            value: 1
          - labels: 'node_filesystem_readonly{instance="test-model_1234_test-app_test-app/0", mountpoint="/"}'
            value: 0
      # At 10m both mountpoints report read-only.
      - expr: node_filesystem_readonly
        eval_time: 10m
        exp_samples:
          - labels: 'node_filesystem_readonly{instance="test-model_1234_test-app_test-app/0", mountpoint="/snap/core22"}'
            value: 1
          - labels: 'node_filesystem_readonly{instance="test-model_1234_test-app_test-app/0", mountpoint="/"}'
            value: 1
    alert_rule_test:
      # Root filesystem still writable: nothing fires.
      - eval_time: 5m
        alertname: HostReadonlyFilesystem
        exp_alerts: []  # no alert
      # Root filesystem read-only: exactly one alert, for '/' only.
      - eval_time: 10m
        alertname: HostReadonlyFilesystem
        exp_alerts:
          - exp_labels:
              severity: warning
              mountpoint: /
              instance: test-model_1234_test-app_test-app/0
            # The annotation text below must match the rule output exactly.
            exp_annotations:
              summary: Host filesystem '/' is readonly (instance test-model_1234_test-app_test-app/0)
              description: >-
                Host filesystem '/' is readonly.
                  VALUE = 1
                  LABELS = map[__name__:node_filesystem_readonly instance:test-model_1234_test-app_test-app/0 mountpoint:/]

  # XFS device error
  # Both an xfs and an ext4 device report a device error at 1m; only the xfs
  # one is expected to raise HostXFSError.
  - interval: 1m
    input_series:
      - series: 'node_filesystem_device_error{instance="test-model_1234_test-app_test-app/0", mountpoint="/var/data", fstype="xfs", device="/dev/mapper/vg01_xfs-lv01_xfs"}'
        values: '0 1 0'
      # Same error pattern on a non-xfs filesystem; acts as a negative control.
      - series: 'node_filesystem_device_error{instance="test-model_1234_test-app_test-app/0", mountpoint="/var/data", fstype="ext4", device="/dev/root"}'
        values: '0 1 0'
    alert_rule_test:
      # No device error yet.
      - eval_time: 0m
        alertname: HostXFSError
        exp_alerts: []  # no alert
      # Error flag set on both devices: a single alert, for the xfs device.
      - eval_time: 1m
        alertname: HostXFSError
        exp_alerts:
          - exp_labels:
              severity: critical
              mountpoint: /var/data
              fstype: xfs
              device: /dev/mapper/vg01_xfs-lv01_xfs
              instance: test-model_1234_test-app_test-app/0
            # The annotation text below must match the rule output exactly.
            exp_annotations:
              summary: XFS error found for device '/dev/mapper/vg01_xfs-lv01_xfs' (instance test-model_1234_test-app_test-app/0)
              description: >-
                XFS error found for device '/dev/mapper/vg01_xfs-lv01_xfs'.
                  VALUE = 1
                  LABELS = map[__name__:node_filesystem_device_error device:/dev/mapper/vg01_xfs-lv01_xfs fstype:xfs instance:test-model_1234_test-app_test-app/0 mountpoint:/var/data]
  # high read/write rate
  # Read and write byte counters stay flat at 0 and then grow by
  # 3774873600 bytes per 1m sample, i.e. 60 * 1024 * 1024 bytes per second.
  - interval: 1m
    input_series:
      - series: 'node_disk_read_bytes_total{instance="test-model_1234_test-app_test-app/0", device="sdb"}'
        values: '0x10 3774873600+3774873600x10'  # idle, then a steady 60 MiB/s
      - series: 'node_disk_written_bytes_total{instance="test-model_1234_test-app_test-app/0", device="sdb"}'
        values: '0x10 3774873600+3774873600x10'  # idle, then a steady 60 MiB/s
    promql_expr_test:
      # irate over the last two samples, converted from bytes/s to MiB/s.
      - expr: irate(node_disk_read_bytes_total[2m]) / 1024 / 1024
        eval_time: 15m
        exp_samples:
          - labels: '{instance="test-model_1234_test-app_test-app/0", device="sdb"}'
            value: 60  # MiB/s
      - expr: irate(node_disk_written_bytes_total[2m]) / 1024 / 1024
        eval_time: 15m
        exp_samples:
          - labels: '{instance="test-model_1234_test-app_test-app/0", device="sdb"}'
            value: 60  # MiB/s
    alert_rule_test:
      # The rate is already 60 at 15m but the alert only fires at 16m
      # (presumably because of a `for:` hold period in the rule - verify in
      # disk.rules).
      - eval_time: 15m
        alertname: HostHighDiskReadRate
        exp_alerts: []  # no alert
      - eval_time: 16m
        alertname: HostHighDiskReadRate
        exp_alerts:
          - exp_labels:
              severity: warning
              device: sdb
              instance: test-model_1234_test-app_test-app/0
            # The annotation text below must match the rule output exactly.
            exp_annotations:
              summary: Host high disk 'sdb' read rate (instance test-model_1234_test-app_test-app/0)
              description: >-
                Host disk 'sdb' is probably reading too much data (60 > 50 MB/s) for last 5m.
                  VALUE = 60
                  LABELS = map[device:sdb instance:test-model_1234_test-app_test-app/0]
      # Same timing expectations for the write-rate alert.
      - eval_time: 15m
        alertname: HostHighDiskWriteRate
        exp_alerts: []  # no alert
      - eval_time: 16m
        alertname: HostHighDiskWriteRate
        exp_alerts:
          - exp_labels:
              severity: warning
              device: sdb
              instance: test-model_1234_test-app_test-app/0
            # The annotation text below must match the rule output exactly.
            exp_annotations:
              summary: Host high disk 'sdb' write rate (instance test-model_1234_test-app_test-app/0)
              description: >-
                Host disk 'sdb' is probably writing too much data (60 > 50 MB/s) for last 5m.
                  VALUE = 60
                  LABELS = map[device:sdb instance:test-model_1234_test-app_test-app/0]

and ran it with promtool:

x1:➜  prometheus_alert_rules git:(nrpe/disk-aler-rules) ✗ promtool test rules ./test_disk.yaml
Unit Testing:  ./test_disk.yaml
  SUCCESS
                                                                                                  [0.47s]

Release Notes