Closed przemekwitek closed 4 years ago
Pinging @elastic/ml-core (:ml)
Here is the config of the job in question:
{
"job_id": "art_daily_flatmiddle_1592964952_790_2715",
"job_type": "anomaly_detector",
"job_version": "7.9.0",
"groups": [
"f-metric",
"numenta",
"periodic"
],
"description": "Numenta artificialWithAnomaly metric(value) 5m renormalization=0",
"create_time": 1592979353139,
"finished_time": 1592979374710,
"analysis_config": {
"bucket_span": "5m",
"detectors": [
{
"detector_description": "metric(value)",
"function": "metric",
"field_name": "value",
"detector_index": 0
}
],
"influencers": []
},
"analysis_limits": {
"model_memory_limit": "10mb",
"categorization_examples_limit": 4
},
"data_description": {
"format": "delimited",
"time_field": "timestamp",
"time_format": "yyyy-MM-dd HH:mm:ss",
"field_delimiter": ",",
"quote_character": "\""
},
"model_plot_config": {
"enabled": true,
"annotations_enabled": true
},
"renormalization_window_days": 0,
"model_snapshot_retention_days": 10,
"daily_model_snapshot_retention_after_days": 1,
"model_snapshot_id": "1592979374",
"results_index_name": "shared",
"allow_lazy_open": false,
"data_counts": {
"job_id": "art_daily_flatmiddle_1592964952_790_2715",
"processed_record_count": 4032,
"processed_field_count": 4032,
"input_bytes": 217854,
"input_field_count": 4032,
"invalid_date_count": 0,
"missing_field_count": 0,
"out_of_order_timestamp_count": 0,
"empty_bucket_count": 0,
"sparse_bucket_count": 0,
"bucket_count": 4031,
"earliest_record_timestamp": 1396310400000,
"latest_record_timestamp": 1397519700000,
"last_data_time": 1592979365665,
"input_record_count": 4032,
"latest_bucket_timestamp": 1397519700000
},
"model_size_stats": {
"job_id": "art_daily_flatmiddle_1592964952_790_2715",
"result_type": "model_size_stats",
"model_bytes": 116000,
"model_bytes_exceeded": 0,
"model_bytes_memory_limit": 10485760,
"total_by_field_count": 3,
"total_over_field_count": 0,
"total_partition_field_count": 2,
"bucket_allocation_failures_count": 0,
"memory_status": "ok",
"categorized_doc_count": 0,
"total_category_count": 0,
"frequent_category_count": 0,
"rare_category_count": 0,
"dead_category_count": 0,
"failed_category_count": 0,
"categorization_status": "ok",
"log_time": 1592979374446,
"timestamp": 1397519400000
},
"forecasts_stats": {
"total": 0,
"forecasted_jobs": 0
},
"state": "closed",
"timing_stats": {
"job_id": "art_daily_flatmiddle_1592964952_790_2715",
"bucket_count": 4032,
"total_bucket_processing_time_ms": 8388.000000000007,
"minimum_bucket_processing_time_ms": 0,
"maximum_bucket_processing_time_ms": 72,
"average_bucket_processing_time_ms": 2.0803571428571446,
"exponential_average_bucket_processing_time_ms": 2.74483668012071,
"exponential_average_bucket_processing_time_per_hour_ms": 27.71019538378873
},
"datafeed_config": {
"datafeed_id": "datafeed-art_daily_flatmiddle_1592964952_790_2715",
"job_id": "art_daily_flatmiddle_1592964952_790_2715",
"query_delay": "76549ms",
"indices": [
"art_daily_flatmiddle"
],
"query": {
"match_all": {}
},
"scroll_size": 1000,
"chunking_config": {
"mode": "auto"
},
"delayed_data_check_config": {
"enabled": true
},
"indices_options": {
"expand_wildcards": [
"open"
],
"ignore_unavailable": false,
"allow_no_indices": true,
"ignore_throttled": true
},
"state": "stopped",
"timing_stats": {
"job_id": "art_daily_flatmiddle_1592964952_790_2715",
"search_count": 9,
"bucket_count": 4031,
"total_search_time_ms": 41,
"average_search_time_per_bucket_ms": 0.010171173406102704,
"exponential_average_search_time_per_hour_ms": 5.166002823361236
}
}
}
Indeed, no partition/by/over fields are configured and there is only one detector, so there must be another source of duplicates.
I was able to reproduce this using synthetic data. Here is the script, which generates a sine wave with a larger amplitude on weekdays and a smaller amplitude on weekends:
"""Generate synthetic data that reproduces duplicate ML annotations.

Indexes one year of hourly samples of a daily sine wave whose amplitude is
10 on weekdays and 2 on weekends.

Usage:
    python generate_duplicate_annotations.py <INDEX>
"""
from datetime import date, datetime
from math import pi, sin
import sys
import time
from random import random
from elasticsearch import Elasticsearch


def main(argv):
    """Index the synthetic series into the Elasticsearch index named by argv[0].

    argv -- command-line arguments without the program name; argv[0] is the
            target index name.

    Prints a usage message and exits with status 1 when no index is given.
    """
    if len(argv) < 1:
        # A parenthesised single-argument print behaves the same on Python 2 and 3.
        print("""Usage:
    python generate_duplicate_annotations.py <INDEX>
""")
        # sys.exit instead of the `exit` helper, which is only injected by `site`.
        sys.exit(1)

    es = Elasticsearch(['localhost'], port=9200, http_auth=('elastic', 'password'))
    index = argv[0]

    start_date = datetime(2020, 2, 3)  # It is a Monday
    # Epoch milliseconds; mktime interprets start_date in the local time zone.
    start_timestamp = int(time.mktime(start_date.timetuple())) * 1000

    n = 365 * 24  # number of buckets
    for i in range(n):
        timestamp = start_timestamp + i * 60 * 60 * 1000
        a = 10
        # i // 24 counts whole 24-hour steps since the Monday start, so 5 and 6
        # are Saturday and Sunday. Floor division keeps this an integer on
        # Python 3 as well as Python 2.
        if ((i // 24) % 7) in (5, 6):
            a = 2
        y = a * sin(2 * pi * i / 24.0)
        es.index(index=index, id=i, body={'timestamp': timestamp, 'y': y})


if __name__ == "__main__":
    main(sys.argv[1:])
And here are the annotations with the added description (local change):
GET .ml-annotations*/_search | jq
{
"took": 1,
"timed_out": false,
"_shards": {
"total": 1,
"successful": 1,
"skipped": 0,
"failed": 0
},
"hits": {
"total": {
"value": 7,
"relation": "eq"
},
"max_score": 1,
"hits": [
{
"_index": ".ml-annotations-6",
"_id": "momzD3MB7_gcmuQ3e0-T",
"_score": 1,
"_source": {
"annotation": "Detected trend",
"create_time": 1593696287535,
"create_username": "_xpack",
"timestamp": 1580742000000,
"end_timestamp": 1580742000000,
"job_id": "duplicate-annotations-job",
"modified_time": 1593696287535,
"modified_username": "_xpack",
"type": "annotation",
"event": "model_change",
"detector_index": 0
}
},
{
"_index": ".ml-annotations-6",
"_id": "oYmzD3MB7_gcmuQ3fE9r",
"_score": 1,
"_source": {
"annotation": "Detected periodicity with period 1d (daily)",
"create_time": 1593696287819,
"create_username": "_xpack",
"timestamp": 1580954400000,
"end_timestamp": 1580954400000,
"job_id": "duplicate-annotations-job",
"modified_time": 1593696287819,
"modified_username": "_xpack",
"type": "annotation",
"event": "model_change",
"detector_index": 0
}
},
{
"_index": ".ml-annotations-6",
"_id": "qImzD3MB7_gcmuQ3fU-C",
"_score": 1,
"_source": {
"annotation": "Detected linear scale by 0.216273",
"create_time": 1593696288105,
"create_username": "_xpack",
"timestamp": 1581224400000,
"end_timestamp": 1581224400000,
"job_id": "duplicate-annotations-job",
"modified_time": 1593696288105,
"modified_username": "_xpack",
"type": "annotation",
"event": "model_change",
"detector_index": 0
}
},
{
"_index": ".ml-annotations-6",
"_id": "romzD3MB7_gcmuQ3gE9D",
"_score": 1,
"_source": {
"annotation": "Detected periodicity with period 1d (weekend daily)",
"create_time": 1593696288828,
"create_username": "_xpack",
"timestamp": 1584324000000,
"end_timestamp": 1584324000000,
"job_id": "duplicate-annotations-job",
"modified_time": 1593696288828,
"modified_username": "_xpack",
"type": "annotation",
"event": "model_change",
"detector_index": 0
}
},
{
"_index": ".ml-annotations-6",
"_id": "r4mzD3MB7_gcmuQ3gE9D",
"_score": 1,
"_source": {
"annotation": "Detected periodicity with period 1d (weekday daily)",
"create_time": 1593696288828,
"create_username": "_xpack",
"timestamp": 1584324000000,
"end_timestamp": 1584324000000,
"job_id": "duplicate-annotations-job",
"modified_time": 1593696288828,
"modified_username": "_xpack",
"type": "annotation",
"event": "model_change",
"detector_index": 0
}
},
{
"_index": ".ml-annotations-6",
"_id": "sImzD3MB7_gcmuQ3gE9D",
"_score": 1,
"_source": {
"annotation": "Detected periodicity with period 7d (weekend weekly)",
"create_time": 1593696288828,
"create_username": "_xpack",
"timestamp": 1584324000000,
"end_timestamp": 1584324000000,
"job_id": "duplicate-annotations-job",
"modified_time": 1593696288828,
"modified_username": "_xpack",
"type": "annotation",
"event": "model_change",
"detector_index": 0
}
},
{
"_index": ".ml-annotations-6",
"_id": "sYmzD3MB7_gcmuQ3gE9D",
"_score": 1,
"_source": {
"annotation": "Detected periodicity with period 7d (weekday weekly)",
"create_time": 1593696288828,
"create_username": "_xpack",
"timestamp": 1584324000000,
"end_timestamp": 1584324000000,
"job_id": "duplicate-annotations-job",
"modified_time": 1593696288828,
"modified_username": "_xpack",
"type": "annotation",
"event": "model_change",
"detector_index": 0
}
}
]
}
}
Reported by @wwang500: in some jobs, such as art_daily_flatmiddle_1592964952_790_2715, we are getting duplicate annotations even though no by_field or partition_field is configured.