elastic / ml-cpp

Machine learning C++ code
Other
7 stars 62 forks source link

[ML] high_count categorization anomaly detection job gives anomalies for actual count of zero #2610

Closed peteharverson closed 8 months ago

peteharverson commented 10 months ago

An anomaly detection job running high_count by mlcategory is producing anomalies with an actual count of zero compared to non-zero typical counts. It seems counterintuitive that a high_count job is giving anomalies where the actual count is lower than the typical count.

Reproduced using the it_ops_2019 data set.

Job config:

{
  "job_id": "it_ops_app_logs_high_count",
  "datafeed_config": {
    "datafeed_id": "datafeed-it_ops_app_logs_high_count",
    "job_id": "it_ops_app_logs_high_count",
    "authorization": {
      "roles": [
        "superuser"
      ]
    },
    "query_delay": "78988ms",
    "chunking_config": {
      "mode": "auto"
    },
    "indices_options": {
      "expand_wildcards": [
        "open"
      ],
      "ignore_unavailable": false,
      "allow_no_indices": true,
      "ignore_throttled": true
    },
    "query": {
      "bool": {
        "must": [
          {
            "match_all": {}
          }
        ]
      }
    },
    "indices": [
      "it_ops_app_logs-2019"
    ],
    "scroll_size": 1000,
    "delayed_data_check_config": {
      "enabled": true
    }
  },
  "description": "",
  "analysis_config": {
    "bucket_span": "15m",
    "categorization_field_name": "message",
    "categorization_analyzer": {
      "char_filter": [
        "first_line_with_letters"
      ],
      "tokenizer": "ml_standard",
      "filter": [
        {
          "type": "stop",
          "stopwords": [
            "Monday",
            "Tuesday",
            "Wednesday",
            "Thursday",
            "Friday",
            "Saturday",
            "Sunday",
            "Mon",
            "Tue",
            "Wed",
            "Thu",
            "Fri",
            "Sat",
            "Sun",
            "January",
            "February",
            "March",
            "April",
            "May",
            "June",
            "July",
            "August",
            "September",
            "October",
            "November",
            "December",
            "Jan",
            "Feb",
            "Mar",
            "Apr",
            "May",
            "Jun",
            "Jul",
            "Aug",
            "Sep",
            "Oct",
            "Nov",
            "Dec",
            "GMT",
            "UTC"
          ]
        },
        {
          "type": "limit",
          "max_token_count": "100"
        }
      ]
    },
    "per_partition_categorization": {
      "enabled": false
    },
    "detectors": [
      {
        "detector_description": "high_count by mlcategory",
        "function": "high_count",
        "by_field_name": "mlcategory",
        "detector_index": 0
      }
    ],
    "influencers": [
      "mlcategory"
    ],
    "model_prune_window": "30d"
  },
  "analysis_limits": {
    "model_memory_limit": "61mb",
    "categorization_examples_limit": 4
  },
  "data_description": {
    "time_field": "@timestamp",
    "time_format": "epoch_ms"
  },
  "model_plot_config": {
    "enabled": false,
    "annotations_enabled": false
  }
}

Anomalies:

image
peteharverson commented 8 months ago

@tveasey, following up on our discussion yesterday: I reran this job config and confirmed that all the anomalies with an actual of 0 are multi-bucket anomalies that trail a spike.

So it looks like this is working as expected: the anomalies in the trailing buckets with actuals of 0 are explained by the fact that the 12 buckets taken together are anomalously high.

Are we good to close this issue, @tveasey?

Screenshot 2024-03-06 at 11 25 20
peteharverson commented 8 months ago

Closing as no changes needed here.