Open huoarter opened 5 years ago
{ "node_id": "VCMPiqWZSYW4hnNDj_NExg", "node_name": "es-1", "transport_address": "127.0.0.1:9300", "node_attributes": { "tag": "warm" }, "node_decision": "no", "store": { "in_sync": true, "allocation_id": "zZHXkwouS_SPJUmLzg3nWQ", "store_exception": { "type": "shard_lock_obtain_failed_exception", "reason": "[test-test][3]: obtaining shard lock timed out after 5000ms", "index_uuid": "HI8Z5vAdTqmM8rfw_JT0Lw", "shard": "3", "index": "test-test" } }, "deciders": [ { "decider": "max_retry", "decision": "NO", "explanation": "shard has exceeded the maximum number of retries [5] on failed allocation attempts - manually call [/_cluster/reroute?retry_failed=true] to retry, [unassigned_info[[reason=ALLOCATION_FAILED], at[2018-04-18T07:09:02.434Z], failed_attempts[10], delayed=false, details[failed to create shard, failure IOException[failed to obtain in-memory shard lock]; nested: ShardLockObtainFailedException[test-test][3]: obtaining shard lock timed out after 5000ms]; ], allocation_status[deciders_no]]]" } ] }
ES 版本: 5.2.1

```
curl http://elasticsearch.aliyuncs.com:9200/_cluster/reroute?retry_failed=true -XPOST
```

步骤:

```
curl localhost:9200/_cat/shards > shards
```

跑脚本:

```
nohup python recovery.py &
```

注意:跑脚本过程会返回大量json,时间较长,请注意放入后台

查看修复shard进度:

```
curl 127.0.0.1:9200/_cat/recovery/你修复shard对应的索引
```

结果: 找到索引对应的shard,看到existing_store done说明已经从本地修复

```
index 19 268ms existing_store done n/a n/a 10.0.58.67 node_name
```

```python
#!/usr/bin/env python
#name: recovery.py
import requests
import json

host = "http://localhost:9200/_cluster/allocation/explain"
s= requests.Session()

def reroute_shard(index,shard,node):
    data = {
        "commands" : [
            {
                "allocate_stale_primary" : {
                    "index" : index, "shard" : shard, "node" : node, "accept_data_loss": True
                }
            }
        ]
    }
    print data
    url = "http://localhost:9200/_cluster/reroute"
    res = s.post(url,json=data)
    print res

def get_node(line):
    if "UNASSIGNED" in line:
        line = line.split()
        index = line[0]
        shard = line[1]
        if line[2] != "p":
            return
        body = {
            "index": index,
            "shard": shard,
            "primary": True
        }
        res = s.get(host, json = body)
        for store in res.json().get("node_allocation_decisions"):
            if store.get("store").get("allocation_id"):
                node_name = store.get("node_name")
                reroute_shard(index,shard,node_name)
    else:
        return

with open("shards", 'rb') as f:
    map(get_node,f)
```

相关文档:

- https://www.elastic.co/guide/en/elasticsearch/reference/5.2/cluster-reroute.html
- https://www.elastic.co/guide/en/elasticsearch/reference/5.2/cluster-allocation-explain.html
{ "node_id": "VCMPiqWZSYW4hnNDj_NExg", "node_name": "es-1", "transport_address": "127.0.0.1:9300", "node_attributes": { "tag": "warm" }, "node_decision": "no", "store": { "in_sync": true, "allocation_id": "zZHXkwouS_SPJUmLzg3nWQ", "store_exception": { "type": "shard_lock_obtain_failed_exception", "reason": "[test-test][3]: obtaining shard lock timed out after 5000ms", "index_uuid": "HI8Z5vAdTqmM8rfw_JT0Lw", "shard": "3", "index": "test-test" } }, "deciders": [ { "decider": "max_retry", "decision": "NO", "explanation": "shard has exceeded the maximum number of retries [5] on failed allocation attempts - manually call [/_cluster/reroute?retry_failed=true] to retry, [unassigned_info[[reason=ALLOCATION_FAILED], at[2018-04-18T07:09:02.434Z], failed_attempts[10], delayed=false, details[failed to create shard, failure IOException[failed to obtain in-memory shard lock]; nested: ShardLockObtainFailedException[test-test][3]: obtaining shard lock timed out after 5000ms]; ], allocation_status[deciders_no]]]" } ] }