alertmanager / alert_manager

Splunk Alert Manager with advanced reporting on alerts, workflows (modify assignee, status, severity) and auto-resolve features

Unhandled Exceptions in alert_manager_scheduler.py #141

Closed johnfromthefuture closed 8 years ago

johnfromthefuture commented 8 years ago

So the short story is that when the scheduler checks for alerts/incidents that need auto-resolving, the script does this:

json.loads(stuff)

If no results are returned or something else goes wrong, that call throws an unhandled exception and the script dies, so not all incidents that are meant to be auto-resolved actually get auto-resolved. I included the file below to show where I added simple try/except blocks around those calls.

I believe this solves my issue.
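The pattern I added around each of those json.loads calls boils down to this (a minimal sketch; the real blocks in the file below use the actual variables and log messages):

try:
    alerts = json.loads(serverContent)
except Exception:
    # Empty or malformed response from the REST call; fall back to an empty result and keep going
    log.info('No results returned or the response could not be parsed.')
    alerts = []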

import os
import sys
import urllib
import json
import splunk
import splunk.rest as rest
import splunk.input as input
import splunk.entity as entity
import time
import logging
import logging.handlers
import hashlib
import datetime
import socket

dir = os.path.join(os.environ.get('SPLUNK_HOME'), 'etc', 'apps', 'alert_manager', 'bin', 'lib')
if dir not in sys.path:
    sys.path.append(dir)

from EventHandler import *
from IncidentContext import *
from SuppressionHelper import *

#sys.stdout = open('/tmp/stdout', 'w')
#sys.stderr = open('/tmp/stderr', 'w')

start = time.time()

# Setup logger
log = logging.getLogger('alert_manager_scheduler')
fh     = logging.handlers.RotatingFileHandler(os.environ.get('SPLUNK_HOME') + "/var/log/splunk/alert_manager_scheduler.log", maxBytes=25000000, backupCount=5)
formatter = logging.Formatter("%(asctime)-15s %(levelname)-5s %(message)s")
fh.setFormatter(formatter)
log.addHandler(fh)
log.setLevel(logging.INFO)

sessionKey     = sys.stdin.readline().strip()
splunk.setDefault('sessionKey', sessionKey)

eh = EventHandler(sessionKey=sessionKey)
sh = SuppressionHelper(sessionKey=sessionKey)
#sessionKey     = urllib.unquote(sessionKey[11:]).decode('utf8')

log.debug("Scheduler started. sessionKey=%s" % sessionKey)

#
# Get global settings
#
config = {}
config['index'] = 'alerts'

restconfig = entity.getEntities('configs/alert_manager', count=-1, sessionKey=sessionKey)
if len(restconfig) > 0:
    if 'index' in restconfig['settings']:
        config['index'] = restconfig['settings']['index']

log.debug("Global settings: %s" % config)

# Look for auto_ttl_resolve incidents
uri = '/services/saved/searches?output_mode=json'
serverResponse, serverContent = rest.simpleRequest(uri, sessionKey=sessionKey)
try:
    alerts = json.loads(urllib.unquote(serverContent))
except Exception:
    log.info('Unable to parse the saved searches response or no results were returned. Skipping auto_ttl_resolve checks.')
    alerts = []

if len(alerts) > 0 and 'entry' in alerts:
    for alert in alerts['entry']:
        if ('content' in alert
                and alert['content'].get('action.alert_manager') == "1"
                and alert['content'].get('action.alert_manager.param.auto_ttl_resove') == "1"):
            alert_name = alert['name'].encode('utf8').replace('/', '%2F')
            log.debug('Attempting to query incidents for alert=%s' % alert_name)
            query_incidents = '{ "alert": "' + alert_name + '", "$or": [ { "status": "auto_assigned" }, { "status": "new" } ] }'
            uri = '/servicesNS/nobody/alert_manager/storage/collections/data/incidents?query=%s' % urllib.quote(query_incidents)
            serverResponseIncidents, serverContentIncidents = rest.simpleRequest(uri, sessionKey=sessionKey)
            try:
                incidents = json.loads(serverContentIncidents)
            except Exception:
                log.info('Error loading results or no results returned from server for alert=%s. Skipping...' % alert_name)
                incidents = []

            if len(incidents) > 0:
                log.info("Found %s incidents of alert %s to check for reached ttl..." % (len(incidents), alert['name']))
                for incident in incidents:
                    log.info("Checking incident: %s" % incident['incident_id'])
                    if (incident['alert_time'] + incident['ttl']) <= time.time():
                        log.info("Incident %s (%s) should be resolved. alert_time=%s ttl=%s now=%s" % (incident['incident_id'], incident['_key'], incident['alert_time'], incident['ttl'], time.time()))
                        old_status = incident['status']
                        incident['status'] = 'auto_ttl_resolved'
                        uri = '/servicesNS/nobody/alert_manager/storage/collections/data/incidents/%s' % incident['_key']
                        incidentStr = json.dumps(incident)
                        serverResponse, serverContent = rest.simpleRequest(uri, sessionKey=sessionKey, jsonargs=incidentStr)

                        now = datetime.datetime.now().isoformat()
                        event_id = hashlib.md5(incident['incident_id'] + now).hexdigest()
                        log.debug("event_id=%s now=%s" % (event_id, now))

                        event = 'time=%s severity=INFO origin="alert_manager_scheduler" event_id="%s" user="splunk-system-user" action="auto_ttl_resolve" previous_status="%s" status="auto_ttl_resolved" incident_id="%s"' % (now, event_id, old_status, incident['incident_id'])
                        log.debug("Event will be: %s" % event)
                        input.submit(event, hostname = socket.gethostname(), sourcetype = 'incident_change', source = 'alert_manager_scheduler.py', index = config['index'])
                        ic = IncidentContext(sessionKey, incident["incident_id"])
                        eh.handleEvent(alert=alert["name"], event="incident_auto_ttl_resolved", incident={"owner": incident["owner"]}, context=ic.getContext())
                    else:
                        log.info("Incident %s has not ttl reached yet." % incident['incident_id'])
            else:
                log.info("No incidents of alert %s to check for a reached ttl." % alert['name'])
        else:
            log.debug('Alert "%s" is not configured for auto_ttl_resolve, skipping...' % alert['name'])

# Look for auto_suppress_resolve incidents
query = {}
query['auto_suppress_resolve'] = True
log.debug("Filter: %s" % json.dumps(query))
uri = '/servicesNS/nobody/alert_manager/storage/collections/data/incident_settings?query=%s' % urllib.quote(json.dumps(query))
serverResponse, serverContent = rest.simpleRequest(uri, sessionKey=sessionKey)
try:
    alerts = json.loads(serverContent)
except Exception:
    log.info('Error loading results or no alerts were found with auto_suppress_resolve enabled.')
    alerts = []

if len(alerts) > 0:
    for alert in alerts:
        alert_name = alert['alert'].encode('utf8').replace('/', '%2F')
        query_incidents = '{ "alert": "' + alert_name + '", "$or": [ { "status": "auto_assigned" }, { "status": "new" } ] }'
        uri = '/servicesNS/nobody/alert_manager/storage/collections/data/incidents?query=%s' % urllib.quote(query_incidents)
        serverResponseIncidents, serverContentIncidents = rest.simpleRequest(uri, sessionKey=sessionKey)

        try:
            incidents = json.loads(serverContentIncidents)
        except Exception:
            log.info('An error occurred or no incidents were found for alert=%s. Continuing.' % alert_name)
            incidents = []

        if len(incidents) > 0:
            log.info("Found %s incidents of alert %s to check for suppression..." % (len(incidents), alert['alert']))
            for incident in incidents:
                log.info("Checking incident: %s" % incident['incident_id'])
                ic = IncidentContext(sessionKey, incident["incident_id"])
                context = ic.getContext()
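                # Resolve the incident automatically if it matches an active suppression rule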
                incident_suppressed, rule_names = sh.checkSuppression(alert['alert'], context)
                if incident_suppressed:
                    log.info("Incident %s (%s) matches a suppression rule and should be resolved. alert_time=%s" % (incident['incident_id'], incident['_key'], incident['alert_time']))
                    old_status = incident['status']
                    incident['status'] = 'auto_suppress_resolved'
                    uri = '/servicesNS/nobody/alert_manager/storage/collections/data/incidents/%s' % incident['_key']
                    incidentStr = json.dumps(incident)
                    serverResponse, serverContent = rest.simpleRequest(uri, sessionKey=sessionKey, jsonargs=incidentStr)

                    now = datetime.datetime.now().isoformat()
                    event_id = hashlib.md5(incident['incident_id'] + now).hexdigest()
                    log.debug("event_id=%s now=%s" % (event_id, now))

                    rules = ' '.join(['suppression_rule="' + rule_name + '"' for rule_name in rule_names])
                    event = 'time=%s severity=INFO origin="alert_manager_scheduler" event_id="%s" user="splunk-system-user" action="auto_suppress_resolve" previous_status="%s" status="auto_suppress_resolved" incident_id="%s" %s' % (now, event_id, old_status, incident['incident_id'], rules)
                    input.submit(event, hostname = socket.gethostname(), sourcetype = 'incident_change', source = 'alert_manager_scheduler.py', index = config['index'])

                    eh.handleEvent(alert=alert['alert'], event="incident_auto_suppress_resolved", incident={"owner": incident['owner']}, context=context)

else:
    log.info("No alert found where auto_suppress_resolve is active.")

end = time.time()
duration = round((end-start), 3)
log.info("Alert manager scheduler finished. duration=%ss" % duration)
simcen commented 8 years ago

Fixed in 899bed0. Thx for the report