angadsingh closed this issue 8 years ago
Can you share the configuration of the notification declarations? (No need for real URLs or emails.) I just want to see if there are chains, etc.
Thanks for the quick reply @kylebrandt
Here's the complete config:
##################### basic settings #######################
stateFile = /opt/bosun/bosun.state
redisHost = localhost:6379
redisDb = 0
graphiteHost = http://graphitehost
checkFrequency=5m
#disable dashboard grouping
minGroupSize=10000
hostname=http://hostname
emailFrom = from@xyz.com
smtpHost = localhost:25
##################### Notifications #######################
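# each notification below chains to itself via "next" + "timeout", so it re-fires on that interval until the alert is acked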
notification foo-alerts-email {
email = alerts-group@xyz.com
next = foo-alerts-email
timeout = 2h
runOnActions = false
print = true
}
notification foo-service1-email {
email = alert.this.engineer@xyz.com,alert.that.engineer@xyz.com
next = foo-alerts-email
timeout = 2h
runOnActions = false
print = true
}
notification foo-service2-email {
email = alert.this.engineer@xyz.com,alert.that.engineer@xyz.com
next = foo-alerts-email
timeout = 2h
runOnActions = false
print = true
}
notification foo-service3-email {
email = alert.this.engineer@xyz.com,alert.that.engineer@xyz.com
next = foo-alerts-email
timeout = 2h
runOnActions = false
print = true
}
notification foo-service4-email {
email = alert.this.engineer@xyz.com,alert.that.engineer@xyz.com
next = foo-alerts-email
timeout = 2h
runOnActions = false
print = true
}
notification foo-service5-email {
email = alert.this.engineer@xyz.com,alert.that.engineer@xyz.com
next = foo-alerts-email
timeout = 2h
runOnActions = false
print = true
}
notification foo-infra-email {
email = alert.this.engineer@xyz.com,alert.that.engineer@xyz.com
next = foo-alerts-email
timeout = 2h
runOnActions = false
print = true
}
notification bpb-slack {
timeout = 1h
runOnActions = false
print = true
post = https://hooks.slack.com/services/ASDFASF/ASDFASETC/ASDFASDFASasfaasasf
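# "." in a notification body template is the rendered alert subject; piping it through json quotes it safely for the Slack payload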
body = {"text": {{.|json}}}
next = bpb-slack
}
notification foo-alerts-slack {
timeout = 2h
runOnActions = false
print = true
post = https://hooks.slack.com/services/ASDFASF/ASDFASETC/ASDFASDFASasfaasasf
body = {"text": {{.|json}}}
next = foo-alerts-slack
}
notification foo-alerts-warning-slack {
timeout = 5h
runOnActions = false
print = true
post = https://hooks.slack.com/services/ASDFASF/ASDFASETC/ASDFASDFASasfaasasf
body = {"text": {{.|json}}}
next = foo-alerts-warning-slack
}
notification foo-alerts-test-slack {
timeout = 5h
runOnActions = false
print = true
post = https://hooks.slack.com/services/ASDFASF/ASDFASETC/ASDFASDFASasfaasasf
body = {"text": {{.|json}}}
next = foo-alerts-test-slack
}
# TODO:
# try https://github.com/bosun-monitor/bosun/pull/1101
# use template body in pagerduty as well
notification foo-general-pagerduty {
post = https://events.pagerduty.com/generic/2010-04-15/create_event.json
contentType = application/json
next = foo-general-pagerduty
timeout = 30m
runOnActions = false
print = true
body = `{
"service_key": "sfgasdfasdafsdfa45124fasdfdas",
"incident_key": {{.|json}},
"event_type": "trigger",
"description": {{.|json}},
"client": "Bosun",
"client_url": "http://bosun.server.xyz.com:8070/"
}`
}
notification foo-service-pagerduty {
post = https://events.pagerduty.com/generic/2010-04-15/create_event.json
contentType = application/json
next = foo-service-pagerduty
timeout = 30m
runOnActions = false
print = true
body = `{
"service_key": "sfgasdfasdafsdfa45124fasdfdas",
"incident_key": {{.|json}},
"event_type": "trigger",
"description": {{.|json}},
"client": "Bosun",
"client_url": "http://bosun.server.xyz.com:8070/"
}`
}
notification foo-service3-pagerduty {
post = https://events.pagerduty.com/generic/2010-04-15/create_event.json
contentType = application/json
next = foo-service3-pagerduty
timeout = 30m
runOnActions = false
print = true
body = `{
"service_key": "sfgasdfasdafsdfa45124fasdfdas",
"incident_key": {{.|json}},
"event_type": "trigger",
"description": {{.|json}},
"client": "Bosun",
"client_url": "http://bosun.server.xyz.com:8070/"
}`
}
notification foo-ucm-pagerduty {
post = https://events.pagerduty.com/generic/2010-04-15/create_event.json
contentType = application/json
next = foo-ucm-pagerduty
timeout = 30m
runOnActions = false
print = true
body = `{
"service_key": "sfgasdfasdafsdfa45124fasdfdas",
"incident_key": {{.|json}},
"event_type": "trigger",
"description": {{.|json}},
"client": "Bosun",
"client_url": "http://bosun.server.xyz.com:8070/"
}`
}
##################### Templates #######################
template generic {
body = `{{template "header" .}}
{{template "def" .}}
{{template "tags" .}}
{{template "computation" .}}
{{template "graph" .}}`
subject = [bosun] {{.Last.Status}}: {{replace .Alert.Name "." " " -1}}: {{.Eval .Alert.Vars.q | printf "%.2f"}} {{if .Alert.Vars.unit_string}}{{.Alert.Vars.unit_string}}{{end}} in {{.Group}}
}
template def {
body = `<p><strong>Alert definition:</strong>
<table>
<tr>
<td>Name:</td>
<td>{{replace .Alert.Name "." " " -1}}</td></tr>
<tr>
<td>Warn:</td>
<td>{{.Alert.Warn}} {{if .Alert.Vars.unit_string}}{{.Alert.Vars.unit_string}}{{end}}</td></tr>
<tr>
<td>Crit:</td>
<td>{{.Alert.Crit}} {{if .Alert.Vars.unit_string}}{{.Alert.Vars.unit_string}}{{end}}</td></tr>
</table>`
}
template tags {
body = `<p><strong>Tags</strong>
<table>
{{range $k, $v := .Group}}
{{if eq $k "host"}}
<tr><td>{{$k}}</td><td><a href="{{$.HostView $v}}">{{$v}}</a></td></tr>
{{else}}
<tr><td>{{$k}}</td><td>{{$v}}</td></tr>
{{end}}
{{end}}
</table>`
}
template computation {
body = `
<p><strong>Computation</strong>
<table>
{{range .Computations}}
<tr><td><a href="{{$.Expr .Text}}">{{.Text}}</a></td><td>{{.Value}}</td></tr>
{{end}}
</table>`
}
template graph {
body = `
Graph:
{{if .Alert.Vars.graph}}
{{ .Graph .Alert.Vars.graph }}
{{end}}
`
}
template header {
body = `<p><a href="{{.Ack}}">Acknowledge alert</a>
<p><a href="{{.Rule}}">View the Rule + Template in the Bosun's Rule Page</a>
{{if .Alert.Vars.notes}}
<p>Notes: {{.Alert.Vars.notes}}
{{end}}
{{if .Alert.Vars.dashboard}}
<p><a href="{{.Alert.Vars.dashboard}}&var-colo={{.Group.colo}}&var-server={{.Group.host}}">Grafana link for {{.Alert.Vars.query}}</a>
{{end}}
`
}
##################### Alerts #######################
macro alertdefaults {
ignoreUnknown = true
template = generic
warnNotification = foo-alerts-email,foo-alerts-warning-slack
critNotification = foo-alerts-email,foo-alerts-slack
}
### XYZ COMPONENT ALERTS ###
alert foo.adaptor.write.rate.too.low {
macro = alertdefaults
$notes = Checks write rate of all adaptors and alerts on detecting that the rate is too low
$unit_string = records/minute
$query = '''aliasByNode(transformNull(sumSeriesWithWildcards(grep(prod.*.foo-adaptor-*.app.*.*.__execute-count.processMessage-*, ".*ToKafka.*"),4,7)),1,2,4)'''
$metrictags = "colo.adaptor.stream"
$metric = graphite($query, "15m", "10m", $metrictags)
$graph = graphite($query, "1h", "", $metrictags)
$dashboard = http://grafana.server.xyz.com:3000/dashboard/db/foo-uber-dashboard?
$q = avg($metric)
$num_adaptor_streams = len(t($q,""))
$num_adaptor_streams_down = len(t($q == 0, ""))
warn = $q < 100
crit = $q <= 10
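# note: this appends to (rather than replaces) the macro's critNotification list, so all three notifications fire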
critNotification = foo-service3-pagerduty
}
Which notification(s) are the ones that keep triggering after acknowledgement?
So for "foo.adaptor.write.rate.too.low" alert
, all of its critical
notifications keep triggering, at their own timeout
frequencies:
foo-service3-pagerduty (every 30 minutes) foo-alerts-email (every 2 hours) foo-alerts-slack (every 2 hours)
The original incident was a critical (2 criticals) more than a day ago; it was acked, fixed, and closed, yet it keeps re-triggering notifications.
Oh, and Bosun version?
Here's the complete log from the beginning of the alert instance through the repeat triggers (real log, sed'ed and grepped to mask stuff): issue-1690.txt
I can confirm this. I'm observing the same problem on one of our Bosun instances, which we upgraded to 0.5.0-rc2 yesterday. Emails are sent for non-existent alarms. The OpenTSDB version is 2.2.
@kylebrandt If I can assist in tracking this down by supplying logs or configurations, please let me know.
For now, we are resetting Bosun's state (i.e. clearing the Redis DB and the state file) every 60 minutes via a cron job.
I'm having a hard time reproducing this.
Using a trimmed-down config based on what you provided:
ledisDir = ../ledis_data2
checkFrequency = 30s
httpListen = :8070
template f {
subject = "aaaa"
body = "bbbb"
}
notification foo-alerts-email {
next = foo-alerts-email
timeout = 15s
print = true
}
notification foo-alerts-slack {
timeout = 25s
print = true
next = foo-alerts-slack
}
notification foo-service3-pagerduty {
timeout = 20s
print = true
next = foo-service3-pagerduty
}
alert foo {
crit = 1
template = f
critNotification = foo-alerts-email,foo-alerts-slack
critNotification = foo-service3-pagerduty
}
The notifications will print continuously at the given intervals until I ack it, and then they all stop. I can verify directly in redis that they get purged from the pending notifications bucket as soon as I ack them.
I check that with ZRANGE pendingNotifications 0 -1 WITHSCORES in redis-cli. It should have an entry for each notification while the alert is "active", and it should be an empty set after you ack it.
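For illustration, a session might look like this (the member encoding shown here is made up; the point is just the empty set after acking):

    127.0.0.1:6379> ZRANGE pendingNotifications 0 -1 WITHSCORES
    1) "foo:foo-alerts-email"        <- hypothetical member format
    2) "1467158400"                  <- score: unix time of the next send
    127.0.0.1:6379> ZRANGE pendingNotifications 0 -1 WITHSCORES
    (empty list or set)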
@captncraig I think I have some alerts currently in this state. They're acked, but Bosun still keeps sending emails. Is there anything I can check to help you track down the problem?
After upgrading several of our Bosun installations, including migrating the old state file to ledis, we always observed the same behavior: for a few days we see repeated notifications for already-acked or even closed alarms, and then it fades away.
@vitorboschi, I pushed a small patch to a "stopNotifications" branch. Would you mind running that and seeing if it clears the problem up?
It essentially detects this scenario at the time notifications are sent and skips the send if the alert is closed or acked at that point. Not an optimal fix, but I hope it will clear up the issue.
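In other words, a minimal sketch of that kind of send-time guard (the type and field names are illustrative, not Bosun's actual code):

    package main

    import "fmt"

    // incident is a stand-in for Bosun's incident state; the real types
    // in bosun-monitor/bosun differ.
    type incident struct {
        name          string
        acked, closed bool
    }

    // shouldSend re-checks the incident at send time, mirroring what the
    // stopNotifications patch is described as doing: skip the send if the
    // alert was acked or closed after the notification was queued.
    func shouldSend(i *incident) bool {
        return i != nil && !i.acked && !i.closed
    }

    func main() {
        pending := []*incident{
            {name: "foo.adaptor.write.rate.too.low", acked: true, closed: true},
            {name: "foo.some.other.alert"},
        }
        for _, i := range pending {
            if shouldSend(i) {
                fmt.Println("sending notification for", i.name)
            } else {
                fmt.Println("skipping", i.name, "(acked or closed)")
            }
        }
    }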
I will try it and report back with my results.
I saw this when I was pointing all my Bosun hosts at localhost for redisHost. So I made some changes: I re-configured Redis as master/slave (Redis Sentinel) and pointed Bosun at a VIP instead of localhost. So far, so good.
@erwindwight we were not even using Redis when we hit this issue; we were just using the default ledis DB.
@captncraig Trying out a build from the stopNotifications branch. But isn't that a post-facto check you've put in? The underlying problem might be creating more such (possibly concurrency-related) issues.
@captncraig The patch works and prevents resending notifications for old alarms. Do you have an idea why this happens in the first place?
I've been using the patch for some time and it looks like everything is fine. I haven't spotted any regressions either.
Found something promising. Looks like this occurs when you close an alert without acking it first. The old notification chain does not get cleared. Patch incoming.
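Schematically (a sketch of the bug as described, not the actual Bosun code): only acking purged the queued chain, so a close without a prior ack left it behind.

    package main

    import "fmt"

    // pending is a schematic stand-in for Bosun's pendingNotifications
    // bucket: alert key -> queued notification chain.
    var pending = map[string][]string{
        "foo.adaptor.write.rate.too.low": {
            "foo-alerts-email", "foo-alerts-slack", "foo-service3-pagerduty",
        },
    }

    // ack always cleared the chain, which is why acked-then-closed
    // alerts went quiet.
    func ack(alert string) {
        delete(pending, alert)
    }

    // closeAlert previously did not touch the queue, so closing an
    // unacked alert left its chain in place and it kept firing.
    // The fix is to clear it here as well:
    func closeAlert(alert string) {
        delete(pending, alert)
    }

    func main() {
        closeAlert("foo.adaptor.write.rate.too.low")
        fmt.Println("chains still queued:", len(pending)) // 0 with the fix
    }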
We have a very simple rule file, with 3 notifications (HTTP POST to PagerDuty and Slack, plus email) and a bunch of alert rules which trigger them. We are facing a weird issue wherein the following happens:
- an alert triggers and sends notifications
- we ack the alert
- the alert goes inactive
- we close the alert
- the notifications keep re-triggering anyway

To explain it through logs, quite literally this is what we're seeing: