medic / cht-watchdog

Configuration for deploying a monitoring/alerting stack for CHT
GNU Affero General Public License v3.0
4 stars 7 forks source link

Treat monitoring API response of `-1` as an error #93

Open mrjones-plip opened 1 year ago

mrjones-plip commented 1 year ago

Right now the monitoring API will return -1 for a bunch of metrics when errors are thrown by the CouchDB queries that retrieve the required values. However, when this happened on a production instance, date.uptime and date.current were still incrementing correctly, so Watchdog didn't think anything was wrong.

Instead, we should fire an alert to indicate that something is wrong.

In the outage itself, the monitoring API returned this:

{
  "version": {
    "app": "",
    "node": "v16.20.0",
    "couchdb": ""
  },
  "couchdb": {
    "medic": {
      "name": "",
      "update_sequence": -1,
      "doc_count": -1,
      "doc_del_count": -1,
      "fragmentation": -1
    },
    "sentinel": {
      "name": "",
      "update_sequence": -1,
      "doc_count": -1,
      "doc_del_count": -1,
      "fragmentation": -1
    },
    "usersmeta": {
      "name": "",
      "update_sequence": -1,
      "doc_count": -1,
      "doc_del_count": -1,
      "fragmentation": -1
    },
    "users": {
      "name": "",
      "update_sequence": -1,
      "doc_count": -1,
      "doc_del_count": -1,
      "fragmentation": -1
    }
  },
  "date": {
    "current": 1698210046488,
    "uptime": 967259.799207221
  },
  "sentinel": {
    "backlog": -1
  },
  "messaging": {
    "outgoing": {
      "total": {
        "due": -1,
        "scheduled": -1,
        "muted": -1,
        "failed": -1,
        "delivered": -1
      },
      "seven_days": {
        "due": -1,
        "scheduled": -1,
        "muted": -1,
        "failed": -1,
        "delivered": -1
      },
      "last_hundred": {
        "pending": {
          "pending": -1,
          "forwarded-to-gateway": -1,
          "received-by-gateway": -1,
          "forwarded-by-gateway": -1
        },
        "final": {
          "sent": -1,
          "delivered": -1,
          "failed": -1
        },
        "muted": {
          "denied": -1,
          "cleared": -1,
          "muted": -1,
          "duplicate": -1
        }
      }
    }
  },
  "outbound_push": {
    "backlog": -1
  },
  "feedback": {
    "count": -1
  },
  "conflict": {
    "count": -1
  },
  "replication_limit": {
    "count": -1
  },
  "connected_users": {
    "count": -1
  }
}

which in turn looked like a value of 0 for everything instead of -1:

image