influxdata / kapacitor

Open source framework for processing, monitoring, and alerting on time series data
MIT License
2.32k stars 492 forks source link

Kapacitor alert collects false alerts. #2722

Open subodhdharma opened 2 years ago

subodhdharma commented 2 years ago

$ kapacitor version

Kapacitor OSS 1.6.4 (git: HEAD dfdea23b82343fca1976358b9d98cd8ec42e09df)

I am using the following TICKscript to determine the uptime of a node.

uptime.tick: |
    var period = 1m
    var every = 1m
    var warn = 300 // seconds
    var warnReset = 600 // seconds
    var node_down = stream
        |from()
            .measurement('uptime')
            .groupBy('*')
            .where(lambda: "type" == 'node')
        |deadman(0.0, 5m)
            .message('Node {{ index .Tags "nodename" }} is down')
            .stateChangesOnly()
            .email()
            .log('/var/lib/kapacitor/logs/node_down.log')
            .mode(0644)
    var uptime = stream
        |from()
            .measurement('uptime')
            .groupBy('nodename')
            .where(lambda: "type" == 'node')
        |window()
            .period(period)
            .every(every)
        |eval(lambda: ceil(float("value") / 1000.0))
            .as('uptime')
    var trigger = uptime
        |alert()
            .message('{{ .Level }} / Node {{ index .Tags "nodename" }} was rebooted')
            .warn(lambda: "uptime" < warn)
            .warnReset(lambda: "uptime" > warnReset)
            .stateChangesOnly()
            .details('''
    <b>{{ .Message }}</b>
    <p>Level: {{ .Level }}</p>
    <p>Nodename: {{ index .Tags "nodename" }}</p>
    <p>Uptime: {{ index .Fields "uptime" }} sec</p>
    ''')
            .email()
            .log('/var/lib/kapacitor/logs/uptime.log')
            .mode(0644)

The uptime alerts are triggered even when the nodes are not rebooted at all.

/ $ uptime
 04:36:24 up  5:20,  0 users,  load average: 1.33, 1.02, 0.64
/ $ kapacitor list topics
ID                                Level     Collected
main:docker:alert4                OK                0
main:etcd:alert11                 OK                0
main:etcd:alert12                 OK                0
main:etcd:alert4                  OK                0
main:etcd_latency_batch:alert8    OK                0
main:filesystem:alert16           OK                0
main:filesystem:alert8            OK                0
main:high_cpu:alert9              OK                0
main:high_memory:alert9           OK                0
main:influxdb_health_batch:alert5 OK                0
main:kubernetes_node:alert4       OK                0
main:networking_params:alert12    OK                0
main:networking_params:alert7     OK                0
main:networking_params:alert8     OK                0
main:systemd:alert4               OK                0
main:systemd:alert8               OK                0
main:uptime:alert5                OK                0
main:uptime:alert9                WARNING           5

/ $ kapacitor show-topic main:uptime:alert9
ID: main:uptime:alert9
Level: WARNING
Collected: 7
Handlers: []
Events:
Event                      Level    Message                                Date                   
uptime:nodename=10.1.0.124 WARNING  WARNING / Node 10.1.0.124 was rebooted 27 Sep 22 04:42 UTC

/ $ kapacitor show uptime
ID: uptime
Error: 
Template: 
Type: stream
Status: enabled
Executing: true
Created: 26 Sep 22 23:30 UTC
Modified: 26 Sep 22 23:30 UTC
LastEnabled: 26 Sep 22 23:30 UTC
Databases Retention Policies: ["k8s"."default"]
TICKscript:
var period = 1m

var every = 1m

var warn = 300

// seconds
var warnReset = 600

// seconds
var node_down = stream
    |from()
        .measurement('uptime')
        .groupBy('*')
        .where(lambda: "type" == 'node')
    |deadman(0.0, 5m)
        .message('Node {{ index .Tags "nodename" }} is down')
        .stateChangesOnly()
        .email()
        .log('/var/lib/kapacitor/logs/node_down.log')
        .mode(0644)

var uptime = stream
    |from()
        .measurement('uptime')
        .groupBy('nodename')
        .where(lambda: "type" == 'node')
    |window()
        .period(period)
        .every(every)
    |eval(lambda: ceil(float("value") / 1000.0))
        .as('uptime')

var trigger = uptime
    |alert()
        .message('{{ .Level }} / Node {{ index .Tags "nodename" }} was rebooted')
        .warn(lambda: "uptime" < warn)
        .warnReset(lambda: "uptime" > warnReset)
        .stateChangesOnly()
        .details('''
<b>{{ .Message }}</b>
<p>Level: {{ .Level }}</p>
<p>Nodename: {{ index .Tags "nodename" }}</p>
<p>Uptime: {{ index .Fields "uptime" }} sec</p>
''')
        .email()
        .log('/var/lib/kapacitor/logs/uptime.log')
        .mode(0644)

DOT:
digraph uptime {
graph [throughput="0.00 points/s"];

stream0 [avg_exec_time_ns="0s" errors="0" working_cardinality="0" ];
stream0 -> from6 [processed="43012"];
stream0 -> from1 [processed="43012"];

from6 [avg_exec_time_ns="5.072285ms" errors="0" working_cardinality="0" ];
from6 -> window7 [processed="314"];

window7 [avg_exec_time_ns="2.547µs" errors="0" working_cardinality="1" ];
window7 -> eval8 [processed="313"];

eval8 [avg_exec_time_ns="14.706µs" errors="0" working_cardinality="1" ];
eval8 -> alert9 [processed="313"];

alert9 [alerts_inhibited="0" alerts_triggered="7" avg_exec_time_ns="36.021µs" crits_triggered="0" errors="0" infos_triggered="0" oks_triggered="3" warns_triggered="4" working_cardinality="1" ];

from1 [avg_exec_time_ns="4.329µs" errors="0" working_cardinality="0" ];
from1 -> noop3 [processed="314"];

noop3 [avg_exec_time_ns="0s" errors="0" working_cardinality="0" ];

stats2 [avg_exec_time_ns="45.939µs" errors="0" working_cardinality="0" ];
stats2 -> derivative4 [processed="62"];

derivative4 [avg_exec_time_ns="4.158µs" errors="0" working_cardinality="1" ];
derivative4 -> alert5 [processed="61"];

alert5 [alerts_inhibited="0" alerts_triggered="0" avg_exec_time_ns="47.009µs" crits_triggered="0" errors="0" infos_triggered="0" oks_triggered="0" warns_triggered="0" working_cardinality="1" ];
}
/ $ 
docmerlin commented 2 years ago

@subodhdharma How are you creating the value being sent to Kapacitor? It looks like you are alerting when that value is less than your specified threshold. Could the value you are sending be something other than what you expect?