Closed danielkasen closed 6 years ago
@danielkasen Can you share with us the recovery stats? You can get it by GET /_recovery
. Thanks.
Pinging @elastic/es-distributed
"test-2018.08.16.20.15-1": {
"shards": [
{
"id": 24,
"type": "PEER",
"stage": "TRANSLOG",
"primary": false,
"start_time": "2018-08-16T23:01:47.323Z",
"start_time_in_millis": 1534460507323,
"total_time": "54.3m",
"total_time_in_millis": 3259899,
"source": {
"id": "Z2zhHgzsRduWQh5TAgXYOA",
"host": "ip-10-0-77.41.ap-southeast-1.compute.internal",
"transport_address": "10.0.77.41:9300",
"ip": "10.0.77.41",
"name": "i-04e5e9a72be7b6d3e"
},
"target": {
"id": "5mhfEJsASVyTBrRslIOOpQ",
"host": "ip-10-0-59-250.ap-southeast-1.compute.internal",
"transport_address": "10.0.59.250:9300",
"ip": "10.0.59.250",
"name": "i-0dc683029ed922ab6"
},
"index": {
"size": {
"total": "33.6gb",
"total_in_bytes": 36097975075,
"reused": "0b",
"reused_in_bytes": 0,
"recovered": "33.6gb",
"recovered_in_bytes": 36097975075,
"percent": "100.0%"
},
"files": {
"total": 243,
"reused": 0,
"recovered": 243,
"percent": "100.0%"
},
"total_time": "23.2m",
"total_time_in_millis": 1395463,
"source_throttle_time": "0s",
"source_throttle_time_in_millis": 0,
"target_throttle_time": "0s",
"target_throttle_time_in_millis": 0
},
"translog": {
"recovered": 4797934,
"total": 29997597,
"percent": "16.0%",
"total_on_start": 1972667,
"total_time": "31m",
"total_time_in_millis": 1864433
},
"verify_index": {
"check_index_time": "0s",
"check_index_time_in_millis": 0,
"total_time": "0s",
"total_time_in_millis": 0
}
}
}
```
"translog": {
"recovered": 4797934,
"total": 29997597,
"percent": "16.0%",
"total_on_start": 1972667,
"total_time": "31m",
"total_time_in_millis": 1864433
}
@danielkasen There might be something wrong with your index. Can you share with us the shard-level stats of the offending shard via GET _stats?level=shards
? Thank you!
@danielkasen Was this index created on v6.3.2 or before?
I trimmed it up a little:
"primaries": {
"docs": {
"count": 4696689817,
"deleted": 0
},
"store": {
"size_in_bytes": 2507392614774
},
"indexing": {
"index_total": 4693250300,
"index_time_in_millis": 1482198324,
"index_current": 0,
"index_failed": 0,
"delete_total": 0,
"delete_time_in_millis": 0,
"delete_current": 0,
"noop_update_total": 0,
"is_throttled": false,
"throttle_time_in_millis": 3624
},
"get": {
"total": 0,
"time_in_millis": 0,
"exists_total": 0,
"exists_time_in_millis": 0,
"missing_total": 0,
"missing_time_in_millis": 0,
"current": 0
},
"search": {
"open_contexts": 0,
"query_total": 19055,
"query_time_in_millis": 771671,
"query_current": 0,
"fetch_total": 6868,
"fetch_time_in_millis": 99170,
"fetch_current": 0,
"scroll_total": 0,
"scroll_time_in_millis": 0,
"scroll_current": 0,
"suggest_total": 0,
"suggest_time_in_millis": 0,
"suggest_current": 0
},
"merges": {
"current": 0,
"current_docs": 0,
"current_size_in_bytes": 0,
"total": 558008,
"total_time_in_millis": 1144008437,
"total_docs": 18842441816,
"total_size_in_bytes": 11946457504271,
"total_stopped_time_in_millis": 663578,
"total_throttled_time_in_millis": 539275685,
"total_auto_throttle_in_bytes": 2474063346
},
"refresh": {
"total": 476923,
"total_time_in_millis": 43048206,
"listeners": 0
},
"flush": {
"total": 1931,
"periodic": 1530,
"total_time_in_millis": 1259204
},
"warmer": {
"current": 0,
"total": 474858,
"total_time_in_millis": 13279
},
"query_cache": {
"memory_size_in_bytes": 551969373,
"total_count": 3945739,
"hit_count": 60815,
"miss_count": 3884924,
"cache_size": 2509,
"cache_count": 4280,
"evictions": 1771
},
"fielddata": {
"memory_size_in_bytes": 181832,
"evictions": 0
},
"completion": {
"size_in_bytes": 0
},
"segments": {
"count": 1646,
"memory_in_bytes": 6838966702,
"terms_memory_in_bytes": 6566645884,
"stored_fields_memory_in_bytes": 192830040,
"term_vectors_memory_in_bytes": 0,
"norms_memory_in_bytes": 432256,
"points_memory_in_bytes": 78520378,
"doc_values_memory_in_bytes": 538144,
"index_writer_memory_in_bytes": 0,
"version_map_memory_in_bytes": 0,
"fixed_bit_set_memory_in_bytes": 0,
"max_unsafe_auto_id_timestamp": 1534460344075,
"file_sizes": {}
},
"translog": {
"operations": 475867744,
"size_in_bytes": 357659107882,
"uncommitted_operations": 0,
"uncommitted_size_in_bytes": 2475,
"earliest_last_modified_age": 0
},
"request_cache": {
"memory_size_in_bytes": 41324,
"evictions": 0,
"hit_count": 0,
"miss_count": 149
},
"recovery": {
"current_as_source": 13,
"current_as_target": 0,
"throttle_time_in_millis": 199
}
},
"total": {
"docs": {
"count": 12733248842,
"deleted": 0
},
"store": {
"size_in_bytes": 6801037835462
},
"indexing": {
"index_total": 12182770492,
"index_time_in_millis": 3681619914,
"index_current": 0,
"index_failed": 0,
"delete_total": 0,
"delete_time_in_millis": 0,
"delete_current": 0,
"noop_update_total": 0,
"is_throttled": false,
"throttle_time_in_millis": 8217
},
"get": {
"total": 0,
"time_in_millis": 0,
"exists_total": 0,
"exists_time_in_millis": 0,
"missing_total": 0,
"missing_time_in_millis": 0,
"current": 0
},
"search": {
"open_contexts": 0,
"query_total": 48636,
"query_time_in_millis": 1845896,
"query_current": 0,
"fetch_total": 17127,
"fetch_time_in_millis": 247028,
"fetch_current": 0,
"scroll_total": 0,
"scroll_time_in_millis": 0,
"scroll_current": 0,
"suggest_total": 0,
"suggest_time_in_millis": 0,
"suggest_current": 0
},
"merges": {
"current": 0,
"current_docs": 0,
"current_size_in_bytes": 0,
"total": 1409139,
"total_time_in_millis": 2886915643,
"total_docs": 49035135361,
"total_size_in_bytes": 31102491157001,
"total_stopped_time_in_millis": 1392859,
"total_throttled_time_in_millis": 1347019754,
"total_auto_throttle_in_bytes": 6533586445
},
"refresh": {
"total": 1150880,
"total_time_in_millis": 104212382,
"listeners": 0
},
"flush": {
"total": 4870,
"periodic": 3896,
"total_time_in_millis": 3353823
},
"warmer": {
"current": 0,
"total": 1145688,
"total_time_in_millis": 32127
},
"query_cache": {
"memory_size_in_bytes": 1508601756,
"total_count": 10708084,
"hit_count": 179335,
"miss_count": 10528749,
"cache_size": 8415,
"cache_count": 15144,
"evictions": 6729
},
"fielddata": {
"memory_size_in_bytes": 548968,
"evictions": 0
},
"completion": {
"size_in_bytes": 0
},
"segments": {
"count": 4429,
"memory_in_bytes": 18519594250,
"terms_memory_in_bytes": 17781776647,
"stored_fields_memory_in_bytes": 522397080,
"term_vectors_memory_in_bytes": 0,
"norms_memory_in_bytes": 1163328,
"points_memory_in_bytes": 212888975,
"doc_values_memory_in_bytes": 1368220,
"index_writer_memory_in_bytes": 0,
"version_map_memory_in_bytes": 0,
"fixed_bit_set_memory_in_bytes": 0,
"max_unsafe_auto_id_timestamp": 1534460344075,
"file_sizes": {}
},
"translog": {
"operations": 532470023,
"size_in_bytes": 400610878412,
"uncommitted_operations": 0,
"uncommitted_size_in_bytes": 6710,
"earliest_last_modified_age": 0
},
"request_cache": {
"memory_size_in_bytes": 107235,
"evictions": 0,
"hit_count": 0,
"miss_count": 363
},
"recovery": {
"current_as_source": 13,
"current_as_target": 0,
"throttle_time_in_millis": 199
}
}
And the shard in question:
``` "24": [
{
"routing": {
"state": "STARTED",
"primary": true,
"node": "Z2zhHgzsRduWQh5TAgXYOA",
"relocating_node": null
},
"docs": {
"count": 104377292,
"deleted": 0
},
"store": {
"size_in_bytes": 55763854612
},
"indexing": {
"index_total": 102655515,
"index_time_in_millis": 28984607,
"index_current": 0,
"index_failed": 0,
"delete_total": 0,
"delete_time_in_millis": 0,
"delete_current": 0,
"noop_update_total": 0,
"is_throttled": false,
"throttle_time_in_millis": 0
},
"get": {
"total": 0,
"time_in_millis": 0,
"exists_total": 0,
"exists_time_in_millis": 0,
"missing_total": 0,
"missing_time_in_millis": 0,
"current": 0
},
"search": {
"open_contexts": 0,
"query_total": 462,
"query_time_in_millis": 22043,
"query_current": 0,
"fetch_total": 111,
"fetch_time_in_millis": 2056,
"fetch_current": 0,
"scroll_total": 0,
"scroll_time_in_millis": 0,
"scroll_current": 0,
"suggest_total": 0,
"suggest_time_in_millis": 0,
"suggest_current": 0
},
"merges": {
"current": 0,
"current_docs": 0,
"current_size_in_bytes": 0,
"total": 12166,
"total_time_in_millis": 24614271,
"total_docs": 408012497,
"total_size_in_bytes": 259381876235,
"total_stopped_time_in_millis": 1129,
"total_throttled_time_in_millis": 12163358,
"total_auto_throttle_in_bytes": 47623853
},
"refresh": {
"total": 9979,
"total_time_in_millis": 879001,
"listeners": 0
},
"flush": {
"total": 41,
"periodic": 34,
"total_time_in_millis": 23972
},
"warmer": {
"current": 0,
"total": 9937,
"total_time_in_millis": 318
},
"query_cache": {
"memory_size_in_bytes": 11974438,
"total_count": 126229,
"hit_count": 1274,
"miss_count": 124955,
"cache_size": 174,
"cache_count": 177,
"evictions": 3
},
"fielddata": {
"memory_size_in_bytes": 0,
"evictions": 0
},
"completion": {
"size_in_bytes": 0
},
"segments": {
"count": 40,
"memory_in_bytes": 153351228,
"terms_memory_in_bytes": 147301823,
"stored_fields_memory_in_bytes": 4282400,
"term_vectors_memory_in_bytes": 0,
"norms_memory_in_bytes": 10560,
"points_memory_in_bytes": 1747621,
"doc_values_memory_in_bytes": 8824,
"index_writer_memory_in_bytes": 0,
"version_map_memory_in_bytes": 0,
"fixed_bit_set_memory_in_bytes": 0,
"max_unsafe_auto_id_timestamp": 1534460340053,
"file_sizes": {}
},
"translog": {
"operations": 36053875,
"size_in_bytes": 27082560372,
"uncommitted_operations": 0,
"uncommitted_size_in_bytes": 55,
"earliest_last_modified_age": 7618269
},
"request_cache": {
"memory_size_in_bytes": 0,
"evictions": 0,
"hit_count": 0,
"miss_count": 1
},
"recovery": {
"current_as_source": 1,
"current_as_target": 0,
"throttle_time_in_millis": 0
},
"commit": {
"id": "NgHyOieMnnjOv1Hw6PFoGw==",
"generation": 48,
"user_data": {
"local_checkpoint": "104377442",
"max_unsafe_auto_id_timestamp": "1534460340053",
"translog_uuid": "Kiaim6eQSdSO9cPzGJYMbQ",
"history_uuid": "K7_gT4mnSeGZ0R3o4irRPw",
"sync_id": "zSY1kxZKTA69fa3lY5mIbQ",
"translog_generation": "1170",
"max_seq_no": "104377442"
},
"num_docs": 104377292
},
"seq_no": {
"max_seq_no": 104377442,
"local_checkpoint": 104377442,
"global_checkpoint": 104377442
},
"shard_path": {
"state_path": "/storage/elasticsearch/nodes/0",
"data_path": "/storage/elasticsearch/nodes/0",
"is_custom_data_path": false
}
},
{
"routing": {
"state": "STARTED",
"primary": false,
"node": "IytAnZXSRe-Np7X6frSnRg",
"relocating_node": null
},
"docs": {
"count": 104377292,
"deleted": 0
},
"store": {
"size_in_bytes": 55808763196
},
"indexing": {
"index_total": 104378207,
"index_time_in_millis": 29069902,
"index_current": 0,
"index_failed": 0,
"delete_total": 0,
"delete_time_in_millis": 0,
"delete_current": 0,
"noop_update_total": 0,
"is_throttled": false,
"throttle_time_in_millis": 0
},
"get": {
"total": 0,
"time_in_millis": 0,
"exists_total": 0,
"exists_time_in_millis": 0,
"missing_total": 0,
"missing_time_in_millis": 0,
"current": 0
},
"search": {
"open_contexts": 0,
"query_total": 509,
"query_time_in_millis": 14858,
"query_current": 0,
"fetch_total": 138,
"fetch_time_in_millis": 2207,
"fetch_current": 0,
"scroll_total": 0,
"scroll_time_in_millis": 0,
"scroll_current": 0,
"suggest_total": 0,
"suggest_time_in_millis": 0,
"suggest_current": 0
},
"merges": {
"current": 0,
"current_docs": 0,
"current_size_in_bytes": 0,
"total": 12982,
"total_time_in_millis": 25564819,
"total_docs": 419646085,
"total_size_in_bytes": 266669436195,
"total_stopped_time_in_millis": 18403,
"total_throttled_time_in_millis": 12422632,
"total_auto_throttle_in_bytes": 43655199
},
"refresh": {
"total": 10690,
"total_time_in_millis": 898581,
"listeners": 0
},
"flush": {
"total": 43,
"periodic": 34,
"total_time_in_millis": 22349
},
"warmer": {
"current": 0,
"total": 10643,
"total_time_in_millis": 274
},
"query_cache": {
"memory_size_in_bytes": 12160499,
"total_count": 89348,
"hit_count": 1488,
"miss_count": 87860,
"cache_size": 100,
"cache_count": 105,
"evictions": 5
},
"fielddata": {
"memory_size_in_bytes": 6800,
"evictions": 0
},
"completion": {
"size_in_bytes": 0
},
"segments": {
"count": 38,
"memory_in_bytes": 152663986,
"terms_memory_in_bytes": 146617163,
"stored_fields_memory_in_bytes": 4275584,
"term_vectors_memory_in_bytes": 0,
"norms_memory_in_bytes": 10048,
"points_memory_in_bytes": 1747951,
"doc_values_memory_in_bytes": 13240,
"index_writer_memory_in_bytes": 0,
"version_map_memory_in_bytes": 0,
"fixed_bit_set_memory_in_bytes": 0,
"max_unsafe_auto_id_timestamp": 1534460340053,
"file_sizes": {}
},
"translog": {
"operations": 726048,
"size_in_bytes": 551815741,
"uncommitted_operations": 0,
"uncommitted_size_in_bytes": 55,
"earliest_last_modified_age": 5854682
},
"request_cache": {
"memory_size_in_bytes": 2383,
"evictions": 0,
"hit_count": 0,
"miss_count": 6
},
"recovery": {
"current_as_source": 0,
"current_as_target": 0,
"throttle_time_in_millis": 0
},
"commit": {
"id": "X9ulEr6/ReWO/YHF6G1skA==",
"generation": 47,
"user_data": {
"local_checkpoint": "104377442",
"max_unsafe_auto_id_timestamp": "1534460340053",
"translog_uuid": "VkIAw8zxRXuHG40LnxDcyw",
"history_uuid": "K7_gT4mnSeGZ0R3o4irRPw",
"sync_id": "zSY1kxZKTA69fa3lY5mIbQ",
"translog_generation": "1172",
"max_seq_no": "104377442"
},
"num_docs": 104377292
},
"seq_no": {
"max_seq_no": 104377442,
"local_checkpoint": 104377442,
"global_checkpoint": 104377442
},
"shard_path": {
"state_path": "/storage/elasticsearch/nodes/0",
"data_path": "/storage/elasticsearch/nodes/0",
"is_custom_data_path": false
}
}
],
@danielkasen Thanks for the stats.
"translog": {
"operations": 36053875,
"size_in_bytes": 27082560372,
"uncommitted_operations": 0,
"uncommitted_size_in_bytes": 55,
"earliest_last_modified_age": 7618269
}
The primary shard is keeping 25GB translog. Did you change the translog retention settings (index.translog.retention.size or index.translog.retention.age)?
yes, we have a rather large throughput on the index and it can generate much more than the default 512mb. Otherwise the shard has to completely rebuild when the node comes back online. I'm ok with having this extra space during this time, but the logs get copied over slowly. Here is the index setting.
"flush_threshold_size": "2048mb",
"sync_interval": "5s",
"retention": {
"size": "50gb",
"age": "30m"
},
"durability": "async"
Could we see the output of
GET /_nodes/_local/hot_threads?threads=1000&ignore_idle_threads=false
on the source node, during the translog phase of such a recovery?
@danielkasen The problem is that your translog retention is too high: "retention": { "size": "50gb"}
. The recovery process has to "replay" 30 million operations for the shard[24]. Replaying operations is much more than just copying over the wire.
we have a rather large throughput on the index and it can generate much more than the default 512mb. Otherwise the shard has to completely rebuild when the node comes back online
This change is not necessary. If a node comes back and the primary does not have the required translog for the sequence-based recovery, we fall back to the file-based recovery. Note that the shard[24] is executing a file-based recovery even with this setting.
I would recommend reducing the retention setting. I am closing this as we use Github for bug reports and feature requests. Feel free to continue the discussion at https://discuss.elastic.co. Thank you!
Sure, but doesn't this then require a completely new copy of the shard to be transferred and then the new translogs need to catch up anyway? With a 50GB shard I usually have to wait 40+ minutes for the recovery to happen and am trying to figure out a way to speed this up by just needing to apply the translogs (figured that was the point of them).
This also doesn't explain why the translog copy process is so slow. Feels like the transport layer could be copying them in parallel.
I ran into a similar throughput issue (assuming that your 8Mbs is actually 8MB/s) on recovery no matter what I set the max_bytes_per_sec (including -1). Turns out I had the transport.tcp.compress set to true which was the throttle. Changing that to false made an instant difference and now the bottleneck is either the max_bytes_per_sec setting or the underlying hardware infrastructure.
I think this warrants some further investigation, although I will reiterate @dnhatn's recommendation to reset the translog retention settings back to their defaults. Ops-based recoveries are quicker because they can sometimes avoid reading and transmitting too much data rather than because they are inherently faster: in fact replaying operations from the translog is quite a bit more expensive than simply copying the segment files over, so there's definitely a point at which a replica has fallen so far behind that it's cheaper to rebuild it from scratch. Replaying the translog is much more than a "copy process". In addition, retaining such a large translog violates some assumptions about things like how far out of order its contents can be, and this affects how things are tuned internally.
Please could you re-run these experiments with default translog retention settings and report back? If this is in a test cluster then you could also set logger.org.elasticsearch.indices.recovery: TRACE
so you can see how long some of the internal steps are taking.
@thomas-tomlinson I'm not convinced this is related to transport.tcp.compress
(this option is off by default and not shown in the settings in the OP) but it is worth noting that throttling takes place before compression.
So, sorry I missed these responses. But I would like to point out I have tcp compression on. It's not a dynamic setting so it's in the elasticsearch.yml file.
Elasticsearch version (
bin/elasticsearch --version
): 6.3.2Plugins installed: []
JVM version (
java -version
): openJDK 8OS version (
uname -a
if on a Unix-like system): Ubuntu 14.04Description of the problem including expected versus actual behavior: Translog during recovery is extremely slow. Watching the network traffic between the primary and the replica catching up the translog is only shipped at about 8mbs. Testing shows that traffic between these hosts can top 300Mbs and no other activity is occuring during this time. Setting the max_bytes_per_sec to a high level doesn't have any effect either.
"max_bytes_per_sec": "2000mb"
Full Cluster Settings: