influxdata / kapacitor

Open source framework for processing, monitoring, and alerting on time series data
MIT License
2.32k stars 492 forks source link

Kapacitor restarting because of OOM #2157

Open shubham7saxena opened 5 years ago

shubham7saxena commented 5 years ago

our kapacitor keeps on restarting frequently because of out of memory issues. we have managed to understand that it is because of our join queries but cannot truly understand as to wherere and hence are unable to fix it. I see that this is a common problem with kapacitor from the already open issues but could not find anything that could help me out. Here's one of our scripts.

// alert_type = batch
var crit = 20
var warn = 15
var period = 3m
var every = 10s
var fill = 0.01
var production_database string

// Dataframe
var client_error = batch
  |query('''SELECT mean("value") from "''' + production_database  + '''"."autogen"./dummy_service.dummy.callback-.*.client-error.1MinuteRate/ where "actor" =~ /./ AND "kubernetes_cluster_name" !~ /./''')
    .period(period)
    .every(every)
    .groupBy(time(30s), 'actor')
    .fill(0)
  |where(lambda: "actor" =~ /dummy_service/ )

var response_timed_out = batch
  |query('''SELECT mean("value") from "''' + production_database  + '''"."autogen"./dummy_service.dummy.callback-.*.response-timed-out.1MinuteRate/ where "actor" =~ /./ AND "kubernetes_cluster_name" !~ /./ ''')
    .period(period)
    .every(every)
    .groupBy(time(30s), 'actor')
    .fill(0)
  |where(lambda: "actor" =~ /dummy_service/ )

var failed_execution = batch
  |query('''SELECT mean("value") from "''' + production_database  + '''"."autogen"./dummy_service.dummy.callback-.*.failed-execution.1MinuteRate/ where "actor" =~ /./ AND "kubernetes_cluster_name" !~ /./ ''')
    .period(period)
    .every(every)
    .groupBy(time(30s), 'actor')
    .fill(0)
  |where(lambda: "actor" =~ /dummy_service/ )

var circuit_open = batch
  |query('''SELECT mean("value") from "''' + production_database  + '''"."autogen"./dummy_service.dummy.callback-.*.circuit-open.1MinuteRate/ where "actor" =~ /./ AND "kubernetes_cluster_name" !~ /./ ''')
    .period(period)
    .every(every)
    .groupBy(time(30s), 'actor')
    .fill(0)
  |where(lambda: "actor" =~ /dummy_service/ )

var success = batch
  |query('''SELECT mean("value") from "''' + production_database  + '''"."autogen"./dummy_service.dummy.callback-.*.success.1MinuteRate/ where "actor" =~ /./ AND "kubernetes_cluster_name" !~ /./ ''')
    .period(period)
    .every(every)
    .groupBy(time(30s), 'actor')
    .fill(fill)
  |where(lambda: "actor" =~ /dummy_service/ )

// Client Error Rate Alert
success
  |join(client_error, response_timed_out, failed_execution, circuit_open)
    .as('success', 'client_error', 'response_timed_out', 'failed_execution', 'circuit_open')
  |eval(lambda: ("client_error.mean" / ("success.mean" + "client_error.mean" + "response_timed_out.mean" + "failed_execution.mean" + "circuit_open.mean"))*100.0)
    .as('error_rate')
  |influxDBOut()
    .create()
    .precision('us')
    .database(production_database)
    .measurement('dummy-service.client-error-percentage')

var client_error_rate_data = batch
  |query('''SELECT mean("error_rate") as stat from "''' + production_database  + '''"."autogen"."dummy-service.client-error-percentage" where "actor" =~ /./ AND "kubernetes_cluster_name" !~ /./ ''')
    .period(period)
    .every(every)
    .groupBy('actor')
  |where(lambda: "actor" =~ /dummy_service/ )

// Thresholds
var client_error_rate_alert = client_error_rate_data
  |alert()
    .id('{{ index .Tags "actor"}}/{{ .Name }}')
    .message('{{ .ID }}:{{ index .Fields "stat" }}')
    .warn(lambda: "stat" >= warn)
    .crit(lambda: "stat" >= crit)

// Alert
client_error_rate_alert
  .log('/tmp/error_rate_alert_log.txt')
  .post('')

// Response Timed Out Rate Alert
success
  |join(client_error, response_timed_out, failed_execution, circuit_open)
    .as('success', 'client_error', 'response_timed_out', 'failed_execution', 'circuit_open')
  |eval(lambda: ("response_timed_out.mean" / ("success.mean" + "client_error.mean" + "response_timed_out.mean" + "failed_execution.mean" + "circuit_open.mean"))*100.0)
    .as('error_rate')
  |influxDBOut()
    .create()
    .precision('us')
    .database(production_database)
    .measurement('dummy-service.response-timed-out-percentage')

var response_timed_out_rate_data = batch
  |query('''SELECT mean("error_rate") as stat from "''' + production_database  + '''"."autogen"."dummy-service.response-timed-out-percentage" where "actor" =~ /./ AND "kubernetes_cluster_name" !~ /./ ''')
    .period(period)
    .every(every)
    .groupBy('actor')
  |where(lambda: "actor" =~ /dummy_service/ )

// Thresholds
var response_timed_out_rate_alert = response_timed_out_rate_data
  |alert()
    .id('{{ index .Tags "actor"}}/{{ .Name }}')
    .message('{{ .ID }}:{{ index .Fields "stat" }}')
    .warn(lambda: "stat" >= warn)
    .crit(lambda: "stat" >= crit)

// Alert
response_timed_out_rate_alert
  .log('/tmp/error_rate_alert_log.txt')
  .post('some url')

// Failed Execution Rate Alert
success
  |join(client_error, response_timed_out, failed_execution, circuit_open)
    .as('success', 'client_error', 'response_timed_out', 'failed_execution', 'circuit_open')
  |eval(lambda: ("failed_execution.mean" / ("success.mean" + "client_error.mean" + "response_timed_out.mean" + "failed_execution.mean" + "circuit_open.mean"))*100.0)
    .as('error_rate')
  |influxDBOut()
    .create()
    .precision('us')
    .database(production_database)
    .measurement('dummy-service.failed-execution-percentage')

var failed_execution_rate_data = batch
  |query('''SELECT mean("error_rate") as stat from "''' + production_database  + '''"."autogen"."dummy-service.failed-execution-percentage" where "actor" =~ /./ AND "kubernetes_cluster_name" !~ /./ ''')
    .period(period)
    .every(every)
    .groupBy('actor')
  |where(lambda: "actor" =~ /dummy_service/ )

// Thresholds
var failed_execution_rate_alert = failed_execution_rate_data
  |alert()
    .id('{{ index .Tags "actor"}}/{{ .Name }}')
    .message('{{ .ID }}:{{ index .Fields "stat" }}')
    .warn(lambda: "stat" >= warn)
    .crit(lambda: "stat" >= crit)

// Alert
failed_execution_rate_alert
  .log('/tmp/error_rate_alert_log.txt')

// Circuit Open Rate Alert
success
  |join(client_error, response_timed_out, failed_execution, circuit_open)
    .as('success', 'client_error', 'response_timed_out', 'failed_execution', 'circuit_open')
  |eval(lambda: ("circuit_open.mean" / ("success.mean" + "client_error.mean" + "response_timed_out.mean" + "failed_execution.mean" + "circuit_open.mean"))*100.0)
    .as('error_rate')
  |influxDBOut()
    .create()
    .flushInterval(10s)
    .precision('us')
    .database(production_database)
    .measurement('dummy-service.circuit-open-percentage')

var circuit_open_rate_data = batch
  |query('''SELECT mean("error_rate") as stat from "''' + production_database  + '''"."autogen"."dummy-service.circuit-open-percentage" where "actor" =~ /./ AND "kubernetes_cluster_name" !~ /./ ''')
    .period(period)
    .every(every)
    .groupBy('actor')
  |where(lambda: "actor" =~ /dummy_service/ )

// Thresholds
var circuit_open_rate_alert = circuit_open_rate_data
  |alert()
    .id('{{ index .Tags "actor"}}/{{ .Name }}')
    .message('{{ .ID }}:{{ index .Fields "stat" }}')
    .details('''{{json .}}|''' + production_database)
    .warn(lambda: "stat" >= warn)
    .crit(lambda: "stat" >= crit)

// Alert
circuit_open_rate_alert
  .log('/tmp/error_rate_alert_log.txt')
shubham7saxena commented 5 years ago

kapacitor version 1.4 running on ubuntu 16.04 with 16 Gigs of RAM.