flant / curator-opensearch

Curator for OpenSearch.
Other
76 stars 28 forks source link

delete_snapshot intermittently fails with `snapshot is in state "IN_PROGRESS"` #10

Open eherot opened 2 years ago

eherot commented 2 years ago

This seems to be a variation of elastic/curator#1491, except in that ticket the offending task has action cluster:admin/snapshot/get whereas what I'm seeing is actually cluster:admin/snapshot/delete showing up in the task list in the debug output:

2022-10-31 05:18:07,919 INFO                  opensearch    log_request_success:270  GET [https://opensearch-host:443/_tasks](https://opensearch-host/_tasks) [status:200 request:0.008s]

2022-10-31 05:18:07,920 DEBUG                 opensearch    log_request_success:273  > None

2022-10-31 05:18:07,920 DEBUG                 opensearch    log_request_success:274  < {"nodes":{"eDduMmwoS_KjdjuDxYXwFQ":{"name":"9c91e0cfc58a9700477875ec387673e5","roles":["master","remote_cluster_client"],"tasks":{"eDduMmwoS_KjdjuDxYXwFQ:40821947":{"node":"eDduMmwoS_KjdjuDxYXwFQ","id":40821947,"type":"transport","action":"cluster:monitor/tasks/lists[n]","start_time_in_millis":1667193487916,"running_time_in_nanos":138748,"cancellable":false,"parent_task_id":"umukR_drRH-VO9O6vPultA:7018216957","headers":{}}}},"vAF4bNlqSQqUPn6R68i49w":{"name":"cf400450d8702901b81867fa8a4f9aa2","roles":["data","ingest","remote_cluster_client"],"tasks":{"vAF4bNlqSQqUPn6R68i49w:7038552612":{"node":"vAF4bNlqSQqUPn6R68i49w","id":7038552612,"type":"direct","action":"indices:admin/seq_no/global_checkpoint_sync[p]","start_time_in_millis":1667193487914,"running_time_in_nanos":1469124,"cancellable":false,"parent_task_id":"vAF4bNlqSQqUPn6R68i49w:7038552611","headers":{}},"vAF4bNlqSQqUPn6R68i49w:7038552613":{"node":"vAF4bNlqSQqUPn6R68i49w","id":7038552613,"type":"transport","action":"cluster:monitor/tasks/lists[n]","start_time_in_millis":1667193487916,"running_time_in_nanos":66765,"cancellable":false,"parent_task_id":"umukR_drRH-VO9O6vPultA:7018216957","headers":{}},"vAF4bNlqSQqUPn6R68i49w:7038552610":{"node":"vAF4bNlqSQqUPn6R68i49w","id":7038552610,"type":"direct","action":"indices:admin/seq_no/global_checkpoint_sync[p]","start_time_in_millis":1667193487914,"running_time_in_nanos":1549810,"cancellable":false,"parent_task_id":"vAF4bNlqSQqUPn6R68i49w:7038552609","headers":{}},"vAF4bNlqSQqUPn6R68i49w:7038552611":{"node":"vAF4bNlqSQqUPn6R68i49w","id":7038552611,"type":"transport","action":"indices:admin/seq_no/global_checkpoint_sync","start_time_in_millis":1667193487914,"running_time_in_nanos":1492986,"cancellable":false,"headers":{}},"vAF4bNlqSQqUPn6R68i49w:7038552609":{"node":"vAF4bNlqSQqUPn6R68i49w","id":7038552609,"type":"transport","action":"indices:admin/seq_no/global_checkpoint_sync","start_time_in_millis":1667193487914,"running_time_in_nanos":1589039,"cancellable":false,"headers":{}},"vAF4bNlqSQqUPn6R68i49w:7038551292":{"node":"vAF4bNlqSQqUPn6R68i49w","id":7038551292,"type":"transport","action":"cluster:admin/snapshot/delete","start_time_in_millis":1667193483644,"running_time_in_nanos":4271393397,"cancellable":false,"headers":{}}}},"uIrwigVjTQ2IsjTrGK1Yog":{"name":"da4bfb27908cf4cae660885b2b580f66","roles":["data","ingest","remote_cluster_client"],"tasks":{"uIrwigVjTQ2IsjTrGK1Yog:2413286951":{"node":"uIrwigVjTQ2IsjTrGK1Yog","id":2413286951,"type":"transport","action":"indices:admin/seq_no/global_checkpoint_sync[r]","start_time_in_millis":1667193487915,"running_time_in_nanos":554491,"cancellable":false,"parent_task_id":"vAF4bNlqSQqUPn6R68i49w:7038552609","headers":{}},"uIrwigVjTQ2IsjTrGK1Yog:2413286950":{"node":"uIrwigVjTQ2IsjTrGK1Yog","id":2413286950,"type":"transport","action":"indices:admin/seq_no/global_checkpoint_sync[r]","start_time_in_millis":1667193487915,"running_time_in_nanos":556961,"cancellable":false,"parent_task_id":"vAF4bNlqSQqUPn6R68i49w:7038552611","headers":{}},"uIrwigVjTQ2IsjTrGK1Yog:2413286952":{"node":"uIrwigVjTQ2IsjTrGK1Yog","id":2413286952,"type":"transport","action":"cluster:monitor/tasks/lists[n]","start_time_in_millis":1667193487916,"running_time_in_nanos":56621,"cancellable":false,"parent_task_id":"umukR_drRH-VO9O6vPultA:7018216957","headers":{}}}},"V3_lE7ZVTvOSTKKXmCkAvw":{"name":"7b8d536da350ca7d95ac8afc6d1ae7b2","roles":["data","ingest","remote_cluster_client"],"tasks":{"V3_lE7ZVTvOSTKKXmCkAvw:104694111":{"node":"V3_lE7ZVTvOSTKKXmCkAvw","id":104694111,"type":"transport","action":"cluster:monitor/tasks/lists[n]","start_time_in_millis":1667193487916,"running_time_in_nanos":129215,"cancellable":false,"parent_task_id":"umukR_drRH-VO9O6vPultA:7018216957","headers":{}}}},"umukR_drRH-VO9O6vPultA":{"name":"906d8b832773e09c1318526f1ad1a04b","roles":["data","ingest","remote_cluster_client"],"tasks":{"umukR_drRH-VO9O6vPultA:7018216951":{"node":"umukR_drRH-VO9O6vPultA","id":7018216951,"type":"transport","action":"indices:admin/seq_no/global_checkpoint_sync","start_time_in_millis":1667193487915,"running_time_in_nanos":497003,"cancellable":false,"headers":{}},"umukR_drRH-VO9O6vPultA:7018216952":{"node":"umukR_drRH-VO9O6vPultA","id":7018216952,"type":"transport","action":"indices:admin/seq_no/global_checkpoint_sync","start_time_in_millis":1667193487915,"running_time_in_nanos":497392,"cancellable":false,"headers":{}},"umukR_drRH-VO9O6vPultA:7018216953":{"node":"umukR_drRH-VO9O6vPultA","id":7018216953,"type":"direct","action":"indices:admin/seq_no/global_checkpoint_sync[p]","start_time_in_millis":1667193487915,"running_time_in_nanos":452946,"cancellable":false,"parent_task_id":"umukR_drRH-VO9O6vPultA:7018216952","headers":{}},"umukR_drRH-VO9O6vPultA:7018216954":{"node":"umukR_drRH-VO9O6vPultA","id":7018216954,"type":"transport","action":"indices:admin/seq_no/global_checkpoint_sync","start_time_in_millis":1667193487915,"running_time_in_nanos":452812,"cancellable":false,"headers":{}},"umukR_drRH-VO9O6vPultA:7018216955":{"node":"umukR_drRH-VO9O6vPultA","id":7018216955,"type":"direct","action":"indices:admin/seq_no/global_checkpoint_sync[p]","start_time_in_millis":1667193487915,"running_time_in_nanos":435351,"cancellable":false,"parent_task_id":"umukR_drRH-VO9O6vPultA:7018216951","headers":{}},"umukR_drRH-VO9O6vPultA:7018216956":{"node":"umukR_drRH-VO9O6vPultA","id":7018216956,"type":"direct","action":"indices:admin/seq_no/global_checkpoint_sync[p]","start_time_in_millis":1667193487915,"running_time_in_nanos":408329,"cancellable":false,"parent_task_id":"umukR_drRH-VO9O6vPultA:7018216954","headers":{}},"umukR_drRH-VO9O6vPultA:7018216957":{"node":"umukR_drRH-VO9O6vPultA","id":7018216957,"type":"transport","action":"cluster:monitor/tasks/lists","start_time_in_millis":1667193487915,"running_time_in_nanos":150571,"cancellable":false,"headers":{}},"umukR_drRH-VO9O6vPultA:7018216958":{"node":"umukR_drRH-VO9O6vPultA","id":7018216958,"type":"direct","action":"cluster:monitor/tasks/lists[n]","start_time_in_millis":1667193487915,"running_time_in_nanos":112112,"cancellable":false,"parent_task_id":"umukR_drRH-VO9O6vPultA:7018216957","headers":{}}}},"vJgc8I8pTGq_DuJWQvOfaA":{"name":"cd60a775310c4c56a8fc75aa5a6cdd54","roles":["data","ingest","remote_cluster_client"],"tasks":{"vJgc8I8pTGq_DuJWQvOfaA:105135769":{"node":"vJgc8I8pTGq_DuJWQvOfaA","id":105135769,"type":"transport","action":"cluster:monitor/tasks/lists[n]","start_time_in_millis":1667193487916,"running_time_in_nanos":117195,"cancellable":false,"parent_task_id":"umukR_drRH-VO9O6vPultA:7018216957","headers":{}}}},"SPRqpic5Rju_3H4vKomVAg":{"name":"ef4d8f3103323d70514624377a3ceb3b","roles":["data","ingest","remote_cluster_client"],"tasks":{"SPRqpic5Rju_3H4vKomVAg:104622906":{"node":"SPRqpic5Rju_3H4vKomVAg","id":104622906,"type":"transport","action":"cluster:monitor/tasks/lists[n]","start_time_in_millis":1667193487916,"running_time_in_nanos":121926,"cancellable":false,"parent_task_id":"umukR_drRH-VO9O6vPultA:7018216957","headers":{}}}},"8mn-rYyvTdmWF-suN6Fe8g":{"name":"00dea730d5639a314ef5a80be1b46234","roles":["master","remote_cluster_client"],"tasks":{"8mn-rYyvTdmWF-suN6Fe8g:116733804":{"node":"8mn-rYyvTdmWF-suN6Fe8g","id":116733804,"type":"transport","action":"cluster:admin/snapshot/delete","start_time_in_millis":1667193483644,"running_time_in_nanos":4271306661,"cancellable":false,"parent_task_id":"vAF4bNlqSQqUPn6R68i49w:7038551292","headers":{}},"8mn-rYyvTdmWF-suN6Fe8g:116733851":{"node":"8mn-rYyvTdmWF-suN6Fe8g","id":116733851,"type":"transport","action":"cluster:monitor/tasks/lists[n]","start_time_in_millis":1667193487916,"running_time_in_nanos":149549,"cancellable":false,"parent_task_id":"umukR_drRH-VO9O6vPultA:7018216957","headers":{}}}},"-BG67HJ-QiOr7sLzW6dITQ":{"name":"abdd3de5e6b4a36ae8c70fec4139da55","roles":["master","remote_cluster_client"],"tasks":{"-BG67HJ-QiOr7sLzW6dITQ:173717986":{"node":"-BG67HJ-QiOr7sLzW6dITQ","id":173717986,"type":"transport","action":"cluster:monitor/tasks/lists[n]","start_time_in_millis":1667193487916,"running_time_in_nanos":171014,"cancellable":false,"parent_task_id":"umukR_drRH-VO9O6vPultA:7018216957","headers":{}}}}}}

2022-10-31 05:18:07,921 DEBUG              curator.utils    find_snapshot_tasks:1205 Snapshot activity detected: cluster:admin/snapshot/delete

2022-10-31 05:18:07,922 DEBUG              curator.utils    find_snapshot_tasks:1205 Snapshot activity detected: cluster:admin/snapshot/delete

2022-10-31 05:18:07,922 INFO               curator.utils           safe_to_snap:1232 Snapshot activity detected in Tasks API

2022-10-31 05:18:07,922 INFO               curator.utils           safe_to_snap:1233 Pausing 120 seconds before retrying...

2022-10-31 05:20:08,021 INFO               curator.utils           safe_to_snap:1236 Retry 1 of 1

2022-10-31 05:20:08,022 ERROR                curator.cli                    run:212  Failed to complete action: delete_snapshots.  <class 'curator.exceptions.FailedExecution'>: Unable to delete snapshot(s) because a snapshot is in state "IN_PROGRESS"

It seems like some call is not waiting for the snapshot delete to finish before proceeding. I deliberately turned retries to 1 in order to force the problem to show up more and now I'm going to try bumping it to 10 as a workaround to see if it goes away entirely.

KlavsKlavsen commented 1 year ago

Did it help with retries set to 10?

eherot commented 1 year ago

@KlavsKlavsen it does indeed seem to have helped