elastic / elasticsearch

Free and Open Source, Distributed, RESTful Search Engine
https://www.elastic.co/products/elasticsearch
Other
1.49k stars 24.89k forks source link

Duplicate model change annotations in some QA jobs #58589

Closed przemekwitek closed 4 years ago

przemekwitek commented 4 years ago

Reported by @wwang500: in some jobs, like art_daily_flatmiddle_1592964952_790_2715, we are getting duplicated annotations even though no by_field or partition_field is configured.

Screen Shot 2020-06-24 at 11 00 30 PM Screen Shot 2020-06-24 at 11 00 58 PM
elasticmachine commented 4 years ago

Pinging @elastic/ml-core (:ml)

przemekwitek commented 4 years ago

Here is the config of the job in question:

{
  "job_id": "art_daily_flatmiddle_1592964952_790_2715",
  "job_type": "anomaly_detector",
  "job_version": "7.9.0",
  "groups": [
    "f-metric",
    "numenta",
    "periodic"
  ],
  "description": "Numenta artificialWithAnomaly metric(value) 5m renormalization=0",
  "create_time": 1592979353139,
  "finished_time": 1592979374710,
  "analysis_config": {
    "bucket_span": "5m",
    "detectors": [
      {
        "detector_description": "metric(value)",
        "function": "metric",
        "field_name": "value",
        "detector_index": 0
      }
    ],
    "influencers": []
  },
  "analysis_limits": {
    "model_memory_limit": "10mb",
    "categorization_examples_limit": 4
  },
  "data_description": {
    "format": "delimited",
    "time_field": "timestamp",
    "time_format": "yyyy-MM-dd HH:mm:ss",
    "field_delimiter": ",",
    "quote_character": "\""
  },
  "model_plot_config": {
    "enabled": true,
    "annotations_enabled": true
  },
  "renormalization_window_days": 0,
  "model_snapshot_retention_days": 10,
  "daily_model_snapshot_retention_after_days": 1,
  "model_snapshot_id": "1592979374",
  "results_index_name": "shared",
  "allow_lazy_open": false,
  "data_counts": {
    "job_id": "art_daily_flatmiddle_1592964952_790_2715",
    "processed_record_count": 4032,
    "processed_field_count": 4032,
    "input_bytes": 217854,
    "input_field_count": 4032,
    "invalid_date_count": 0,
    "missing_field_count": 0,
    "out_of_order_timestamp_count": 0,
    "empty_bucket_count": 0,
    "sparse_bucket_count": 0,
    "bucket_count": 4031,
    "earliest_record_timestamp": 1396310400000,
    "latest_record_timestamp": 1397519700000,
    "last_data_time": 1592979365665,
    "input_record_count": 4032,
    "latest_bucket_timestamp": 1397519700000
  },
  "model_size_stats": {
    "job_id": "art_daily_flatmiddle_1592964952_790_2715",
    "result_type": "model_size_stats",
    "model_bytes": 116000,
    "model_bytes_exceeded": 0,
    "model_bytes_memory_limit": 10485760,
    "total_by_field_count": 3,
    "total_over_field_count": 0,
    "total_partition_field_count": 2,
    "bucket_allocation_failures_count": 0,
    "memory_status": "ok",
    "categorized_doc_count": 0,
    "total_category_count": 0,
    "frequent_category_count": 0,
    "rare_category_count": 0,
    "dead_category_count": 0,
    "failed_category_count": 0,
    "categorization_status": "ok",
    "log_time": 1592979374446,
    "timestamp": 1397519400000
  },
  "forecasts_stats": {
    "total": 0,
    "forecasted_jobs": 0
  },
  "state": "closed",
  "timing_stats": {
    "job_id": "art_daily_flatmiddle_1592964952_790_2715",
    "bucket_count": 4032,
    "total_bucket_processing_time_ms": 8388.000000000007,
    "minimum_bucket_processing_time_ms": 0,
    "maximum_bucket_processing_time_ms": 72,
    "average_bucket_processing_time_ms": 2.0803571428571446,
    "exponential_average_bucket_processing_time_ms": 2.74483668012071,
    "exponential_average_bucket_processing_time_per_hour_ms": 27.71019538378873
  },
  "datafeed_config": {
    "datafeed_id": "datafeed-art_daily_flatmiddle_1592964952_790_2715",
    "job_id": "art_daily_flatmiddle_1592964952_790_2715",
    "query_delay": "76549ms",
    "indices": [
      "art_daily_flatmiddle"
    ],
    "query": {
      "match_all": {}
    },
    "scroll_size": 1000,
    "chunking_config": {
      "mode": "auto"
    },
    "delayed_data_check_config": {
      "enabled": true
    },
    "indices_options": {
      "expand_wildcards": [
        "open"
      ],
      "ignore_unavailable": false,
      "allow_no_indices": true,
      "ignore_throttled": true
    },
    "state": "stopped",
    "timing_stats": {
      "job_id": "art_daily_flatmiddle_1592964952_790_2715",
      "search_count": 9,
      "bucket_count": 4031,
      "total_search_time_ms": 41,
      "average_search_time_per_bucket_ms": 0.010171173406102704,
      "exponential_average_search_time_per_hour_ms": 5.166002823361236
    }
  }
}

Indeed, no partition/by/over fields are configured and there is only 1 detector, so there must be another source of the duplicates.

przemekwitek commented 4 years ago

I was able to reproduce this using synthetic data. Here is the script that generates a sine wave with a larger amplitude on weekdays and a smaller amplitude on weekends:

  """Index a synthetic hourly sine wave used to reproduce duplicate annotations.

  The wave has a 24-hour period. Its amplitude is 10 on weekdays and 2 on
  weekends. The script runs unchanged on Python 2 and Python 3.
  """
  from datetime import date, datetime
  from math import pi, sin
  import sys
  import time
  from random import random
  from elasticsearch import Elasticsearch


  def main(argv):
      """Index one year of hourly sine-wave documents into an index.

      Args:
          argv: Command-line arguments without the program name. argv[0] is
              the name of the Elasticsearch index to write to.

      Exits with status 1 after printing a usage message when no index name
      is given.
      """
      if len(argv) < 1:
          # Single-argument print() behaves the same on Python 2 and 3.
          print("Usage:\npython generate_duplicate_annotations.py <INDEX>\n")
          # sys.exit is always available; the bare exit() builtin is only
          # injected by the site module.
          sys.exit(1)

      es = Elasticsearch(['localhost'], port=9200, http_auth=('elastic', 'password'))
      index = argv[0]

      start_date = datetime(2020, 2, 3)  # It is a Monday
      # time.mktime interprets the time tuple as local time and returns
      # seconds; the documents carry epoch milliseconds.
      start_timestamp = int(time.mktime(start_date.timetuple())) * 1000

      n = 365 * 24  # number of hourly documents
      for i in range(n):
          timestamp = start_timestamp + i * 60 * 60 * 1000
          # Floor division gives the day number since the Monday start, so
          # day-of-week indices 5 and 6 are Saturday and Sunday. Plain "/"
          # would be true division on Python 3 and break this test.
          is_weekend = (i // 24) % 7 in (5, 6)
          a = 2 if is_weekend else 10
          y = a * sin(2 * pi * i / 24.0)
          es.index(index=index, id=i, body={'timestamp': timestamp, 'y': y})


  if __name__ == "__main__":
      main(sys.argv[1:])

And here are the annotations with the added description (local change):

GET .ml-annotations*/_search | jq
{
  "took": 1,
  "timed_out": false,
  "_shards": {
    "total": 1,
    "successful": 1,
    "skipped": 0,
    "failed": 0
  },
  "hits": {
    "total": {
      "value": 7,
      "relation": "eq"
    },
    "max_score": 1,
    "hits": [
      {
        "_index": ".ml-annotations-6",
        "_id": "momzD3MB7_gcmuQ3e0-T",
        "_score": 1,
        "_source": {
          "annotation": "Detected trend",
          "create_time": 1593696287535,
          "create_username": "_xpack",
          "timestamp": 1580742000000,
          "end_timestamp": 1580742000000,
          "job_id": "duplicate-annotations-job",
          "modified_time": 1593696287535,
          "modified_username": "_xpack",
          "type": "annotation",
          "event": "model_change",
          "detector_index": 0
        }
      },
      {
        "_index": ".ml-annotations-6",
        "_id": "oYmzD3MB7_gcmuQ3fE9r",
        "_score": 1,
        "_source": {
          "annotation": "Detected periodicity with period 1d (daily)",
          "create_time": 1593696287819,
          "create_username": "_xpack",
          "timestamp": 1580954400000,
          "end_timestamp": 1580954400000,
          "job_id": "duplicate-annotations-job",
          "modified_time": 1593696287819,
          "modified_username": "_xpack",
          "type": "annotation",
          "event": "model_change",
          "detector_index": 0
        }
      },
      {
        "_index": ".ml-annotations-6",
        "_id": "qImzD3MB7_gcmuQ3fU-C",
        "_score": 1,
        "_source": {
          "annotation": "Detected linear scale by 0.216273",
          "create_time": 1593696288105,
          "create_username": "_xpack",
          "timestamp": 1581224400000,
          "end_timestamp": 1581224400000,
          "job_id": "duplicate-annotations-job",
          "modified_time": 1593696288105,
          "modified_username": "_xpack",
          "type": "annotation",
          "event": "model_change",
          "detector_index": 0
        }
      },
      {
        "_index": ".ml-annotations-6",
        "_id": "romzD3MB7_gcmuQ3gE9D",
        "_score": 1,
        "_source": {
          "annotation": "Detected periodicity with period 1d (weekend daily)",
          "create_time": 1593696288828,
          "create_username": "_xpack",
          "timestamp": 1584324000000,
          "end_timestamp": 1584324000000,
          "job_id": "duplicate-annotations-job",
          "modified_time": 1593696288828,
          "modified_username": "_xpack",
          "type": "annotation",
          "event": "model_change",
          "detector_index": 0
        }
      },
      {
        "_index": ".ml-annotations-6",
        "_id": "r4mzD3MB7_gcmuQ3gE9D",
        "_score": 1,
        "_source": {
          "annotation": "Detected periodicity with period 1d (weekday daily)",
          "create_time": 1593696288828,
          "create_username": "_xpack",
          "timestamp": 1584324000000,
          "end_timestamp": 1584324000000,
          "job_id": "duplicate-annotations-job",
          "modified_time": 1593696288828,
          "modified_username": "_xpack",
          "type": "annotation",
          "event": "model_change",
          "detector_index": 0
        }
      },
      {
        "_index": ".ml-annotations-6",
        "_id": "sImzD3MB7_gcmuQ3gE9D",
        "_score": 1,
        "_source": {
          "annotation": "Detected periodicity with period 7d (weekend weekly)",
          "create_time": 1593696288828,
          "create_username": "_xpack",
          "timestamp": 1584324000000,
          "end_timestamp": 1584324000000,
          "job_id": "duplicate-annotations-job",
          "modified_time": 1593696288828,
          "modified_username": "_xpack",
          "type": "annotation",
          "event": "model_change",
          "detector_index": 0
        }
      },
      {
        "_index": ".ml-annotations-6",
        "_id": "sYmzD3MB7_gcmuQ3gE9D",
        "_score": 1,
        "_source": {
          "annotation": "Detected periodicity with period 7d (weekday weekly)",
          "create_time": 1593696288828,
          "create_username": "_xpack",
          "timestamp": 1584324000000,
          "end_timestamp": 1584324000000,
          "job_id": "duplicate-annotations-job",
          "modified_time": 1593696288828,
          "modified_username": "_xpack",
          "type": "annotation",
          "event": "model_change",
          "detector_index": 0
        }
      }
    ]
  }
}