prometheus-community / elasticsearch_exporter

Elasticsearch stats exporter for Prometheus
Apache License 2.0
1.94k stars 791 forks source link

Add metric for cluster.max_shards_per_node #271

Closed tetianakravchenko closed 5 years ago

tetianakravchenko commented 5 years ago

Elasticsearch 7.x introduced a soft limit on the number of shards in a cluster (see https://www.elastic.co/guide/en/elasticsearch/reference/7.0/misc-cluster.html#cluster-shard-limit). As mentioned in the documentation, reaching this limit can lead to failures:

If the cluster is already over the limit, due to changes in node membership or setting changes, all operations that create or open indices will fail until either the limit is increased ..., or some indices are closed or deleted to bring the number of shards below the limit.

So it might be useful to have a dashboard/alert that indicates shard usage before actual failures occur.

zwopir commented 5 years ago

Hi @tetianakravchenko,

I agree this is a useful metric. Unfortunately, I don't have a 7.x cluster at hand. Can you send me the output of `curl http://<a_cluster_node_addr>:9200/_cluster/settings\?include_defaults=true | jq .` so I can compare it to the ES 5.x output (which I have at hand)? Until now, the exporter hasn't really distinguished between ES versions. If the output differs significantly, we will have to see how to handle it.

tetianakravchenko commented 5 years ago

Hi @zwopir. The ES version used is 7.1.1; here is the output:

curl http://localhost:9200/_cluster/settings\?include_defaults=true | jq .
{
  "persistent": {},
  "transient": {},
  "defaults": {
    "cluster": {
      "max_voting_config_exclusions": "10",
      "auto_shrink_voting_configuration": "true",
      "election": {
        "duration": "500ms",
        "initial_timeout": "100ms",
        "max_timeout": "10s",
        "back_off_time": "100ms"
      },
      "no_master_block": "write",
      "persistent_tasks": {
        "allocation": {
          "enable": "all",
          "recheck_interval": "30s"
        }
      },
      "blocks": {
        "read_only_allow_delete": "false",
        "read_only": "false"
      },
      "remote": {
        "node": {
          "attr": ""
        },
        "initial_connect_timeout": "30s",
        "connect": "true",
        "connections_per_cluster": "3"
      },
      "follower_lag": {
        "timeout": "90000ms"
      },
      "routing": {
        "use_adaptive_replica_selection": "true",
        "rebalance": {
          "enable": "all"
        },
        "allocation": {
          "node_concurrent_incoming_recoveries": "2",
          "node_initial_primaries_recoveries": "4",
          "same_shard": {
            "host": "false"
          },
          "total_shards_per_node": "-1",
          "type": "balanced",
          "disk": {
            "threshold_enabled": "true",
            "watermark": {
              "low": "85%",
              "flood_stage": "95%",
              "high": "90%"
            },
            "include_relocations": "true",
            "reroute_interval": "60s"
          },
          "awareness": {
            "attributes": []
          },
          "balance": {
            "index": "0.55",
            "threshold": "1.0",
            "shard": "0.45"
          },
          "enable": "all",
          "node_concurrent_outgoing_recoveries": "2",
          "allow_rebalance": "indices_all_active",
          "cluster_concurrent_rebalance": "2",
          "node_concurrent_recoveries": "2"
        }
      },
      "indices": {
        "tombstones": {
          "size": "500"
        },
        "close": {
          "enable": "true"
        }
      },
      "nodes": {
        "reconnect_interval": "10s"
      },
      "service": {
        "slow_task_logging_threshold": "30s"
      },
      "publish": {
        "timeout": "30000ms"
      },
      "name": "search",
      "fault_detection": {
        "leader_check": {
          "interval": "1000ms",
          "timeout": "10000ms",
          "retry_count": "3"
        },
        "follower_check": {
          "interval": "1000ms",
          "timeout": "10000ms",
          "retry_count": "3"
        }
      },
      "join": {
        "timeout": "60000ms"
      },
      "max_shards_per_node": "1000",
      "initial_master_nodes": [],
      "info": {
        "update": {
          "interval": "30s",
          "timeout": "15s"
        }
      }
    },
    "no": {
      "model": {
        "state": {
          "persist": "false"
        }
      }
    },
    "logger": {
      "level": "INFO"
    },
    "bootstrap": {
      "memory_lock": "false",
      "system_call_filter": "true",
      "ctrlhandler": "true"
    },
    "processors": "1",
    "ingest": {
      "geoip": {
        "cache_size": "1000"
      },
      "grok": {
        "watchdog": {
          "max_execution_time": "1s",
          "interval": "1s"
        }
      }
    },
    "network": {
      "host": [
        "0.0.0.0"
      ],
      "tcp": {
        "reuse_address": "true",
        "keep_alive": "true",
        "connect_timeout": "30s",
        "receive_buffer_size": "-1b",
        "no_delay": "true",
        "send_buffer_size": "-1b"
      },
      "bind_host": [
        "0.0.0.0"
      ],
      "server": "true",
      "breaker": {
        "inflight_requests": {
          "limit": "100%",
          "overhead": "2.0"
        }
      },
      "publish_host": [
        "0.0.0.0"
      ]
    },
    "pidfile": "",
    "path": {
      "data": [],
      "logs": "/usr/share/elasticsearch/logs",
      "shared_data": "",
      "home": "/usr/share/elasticsearch",
      "repo": []
    },
    "search": {
      "default_search_timeout": "-1",
      "highlight": {
        "term_vector_multi_value": "true"
      },
      "default_allow_partial_results": "true",
      "max_open_scroll_context": "500",
      "max_buckets": "10000",
      "low_level_cancellation": "false",
      "keep_alive_interval": "1m",
      "remote": {
        "node": {
          "attr": ""
        },
        "initial_connect_timeout": "30s",
        "connect": "true",
        "connections_per_cluster": "3"
      },
      "default_keep_alive": "5m",
      "max_keep_alive": "24h"
    },
    "security": {
      "manager": {
        "filter_bad_defaults": "true"
      }
    },
    "ccr": {
      "wait_for_metadata_timeout": "60s",
      "indices": {
        "recovery": {
          "recovery_activity_timeout": "60s",
          "chunk_size": "1mb",
          "internal_action_timeout": "60s",
          "max_bytes_per_sec": "40mb",
          "max_concurrent_file_chunks": "5"
        }
      },
      "auto_follow": {
        "wait_for_metadata_timeout": "60s"
      }
    },
    "repositories": {
      "fs": {
        "compress": "false",
        "chunk_size": "9223372036854775807b",
        "location": ""
      },
      "url": {
        "supported_protocols": [
          "http",
          "https",
          "ftp",
          "file",
          "jar"
        ],
        "allowed_urls": [],
        "url": "http:"
      }
    },
    "action": {
      "auto_create_index": "true",
      "search": {
        "shard_count": {
          "limit": "9223372036854775807"
        }
      },
      "destructive_requires_name": "false"
    },
    "client": {
      "type": "node",
      "transport": {
        "ignore_cluster_name": "false",
        "nodes_sampler_interval": "5s",
        "sniff": "false",
        "ping_timeout": "5s"
      }
    },
    "xpack": {
      "watcher": {
        "execution": {
          "scroll": {
            "size": "0",
            "timeout": ""
          },
          "default_throttle_period": "5s"
        },
        "internal": {
          "ops": {
            "bulk": {
              "default_timeout": ""
            },
            "index": {
              "default_timeout": ""
            },
            "search": {
              "default_timeout": ""
            }
          }
        },
        "thread_pool": {
          "queue_size": "1000",
          "size": "1"
        },
        "index": {
          "rest": {
            "direct_access": ""
          }
        },
        "history": {
          "cleaner_service": {
            "enabled": "true"
          }
        },
        "trigger": {
          "schedule": {
            "ticker": {
              "tick_interval": "500ms"
            }
          }
        },
        "enabled": "true",
        "input": {
          "search": {
            "default_timeout": ""
          }
        },
        "encrypt_sensitive_data": "false",
        "transform": {
          "search": {
            "default_timeout": ""
          }
        },
        "stop": {
          "timeout": "30s"
        },
        "watch": {
          "scroll": {
            "size": "0"
          }
        },
        "bulk": {
          "concurrent_requests": "0",
          "flush_interval": "1s",
          "size": "1mb",
          "actions": "1"
        },
        "actions": {
          "bulk": {
            "default_timeout": ""
          },
          "index": {
            "default_timeout": ""
          }
        }
      },
      "ilm": {
        "enabled": "true"
      },
      "monitoring": {
        "collection": {
          "cluster": {
            "stats": {
              "timeout": "10s"
            }
          },
          "node": {
            "stats": {
              "timeout": "10s"
            }
          },
          "indices": [],
          "ccr": {
            "stats": {
              "timeout": "10s"
            }
          },
          "index": {
            "stats": {
              "timeout": "10s"
            },
            "recovery": {
              "active_only": "false",
              "timeout": "10s"
            }
          },
          "interval": "10s",
          "enabled": "false",
          "ml": {
            "job": {
              "stats": {
                "timeout": "10s"
              }
            }
          }
        },
        "history": {
          "duration": "168h"
        },
        "elasticsearch": {
          "collection": {
            "enabled": "true"
          }
        },
        "enabled": "true"
      },
      "graph": {
        "enabled": "true"
      },
      "rollup": {
        "enabled": "true",
        "task_thread_pool": {
          "queue_size": "4",
          "size": "4"
        }
      },
      "sql": {
        "enabled": "true"
      },
      "license": {
        "self_generated": {
          "type": "basic"
        }
      },
      "logstash": {
        "enabled": "true"
      },
      "notification": {
        "pagerduty": {
          "default_account": ""
        },
        "email": {
          "default_account": "",
          "html": {
            "sanitization": {
              "allow": [
                "body",
                "head",
                "_tables",
                "_links",
                "_blocks",
                "_formatting",
                "img:embedded"
              ],
              "disallow": [],
              "enabled": "true"
            }
          }
        },
        "reporting": {
          "retries": "40",
          "interval": "15s"
        },
        "jira": {
          "default_account": ""
        },
        "slack": {
          "default_account": ""
        }
      },
      "security": {
        "dls_fls": {
          "enabled": "true"
        },
        "transport": {
          "filter": {
            "allow": [],
            "deny": [],
            "enabled": "true"
          },
          "ssl": {
            "enabled": "false"
          }
        },
        "enabled": "true",
        "filter": {
          "always_allow_bound_address": "true"
        },
        "encryption": {
          "algorithm": "AES/CTR/NoPadding"
        },
        "audit": {
          "enabled": "false",
          "logfile": {
            "emit_node_id": "true",
            "emit_node_host_name": "false",
            "emit_node_name": "false",
            "events": {
              "emit_request_body": "false",
              "include": [
                "ACCESS_DENIED",
                "ACCESS_GRANTED",
                "ANONYMOUS_ACCESS_DENIED",
                "AUTHENTICATION_FAILED",
                "CONNECTION_DENIED",
                "TAMPERED_REQUEST",
                "RUN_AS_DENIED",
                "RUN_AS_GRANTED"
              ],
              "exclude": []
            },
            "emit_node_host_address": "false"
          }
        },
        "authc": {
          "password_hashing": {
            "algorithm": "bcrypt"
          },
          "success_cache": {
            "size": "10000",
            "enabled": "true",
            "expire_after_access": "1h"
          },
          "api_key": {
            "cache": {
              "hash_algo": "ssha256",
              "max_keys": "10000",
              "ttl": "24h"
            },
            "delete": {
              "interval": "24h",
              "timeout": "-1"
            },
            "enabled": "false",
            "hashing": {
              "algorithm": "pbkdf2"
            }
          },
          "anonymous": {
            "authz_exception": "true",
            "roles": [],
            "username": "_anonymous"
          },
          "run_as": {
            "enabled": "true"
          },
          "reserved_realm": {
            "enabled": "true"
          },
          "token": {
            "delete": {
              "interval": "30m",
              "timeout": "-1"
            },
            "enabled": "false",
            "thread_pool": {
              "queue_size": "1000",
              "size": "1"
            },
            "timeout": "20m"
          }
        },
        "fips_mode": {
          "enabled": "false"
        },
        "encryption_key": {
          "length": "128",
          "algorithm": "AES"
        },
        "http": {
          "filter": {
            "allow": [],
            "deny": [],
            "enabled": "true"
          },
          "ssl": {
            "enabled": "false"
          }
        },
        "automata": {
          "max_determinized_states": "100000",
          "cache": {
            "size": "10000",
            "ttl": "48h",
            "enabled": "true"
          }
        },
        "user": null,
        "authz": {
          "store": {
            "roles": {
              "index": {
                "cache": {
                  "ttl": "20m",
                  "max_size": "10000"
                }
              },
              "cache": {
                "max_size": "10000"
              },
              "negative_lookup_cache": {
                "max_size": "10000"
              },
              "field_permissions": {
                "cache": {
                  "max_size_in_bytes": "104857600"
                }
              }
            }
          }
        }
      },
      "ccr": {
        "enabled": "true",
        "ccr_thread_pool": {
          "queue_size": "100",
          "size": "32"
        }
      },
      "http": {
        "default_connection_timeout": "10s",
        "proxy": {
          "host": "",
          "scheme": "",
          "port": "0"
        },
        "whitelist": [
          "*"
        ],
        "default_read_timeout": "10s",
        "max_response_size": "10mb"
      },
      "ml": {
        "utility_thread_pool": {
          "queue_size": "500",
          "size": "80"
        },
        "max_anomaly_records": "500",
        "enable_config_migration": "true",
        "max_open_jobs": "20",
        "min_disk_space_off_heap": "5gb",
        "node_concurrent_job_allocations": "2",
        "max_model_memory_limit": "0b",
        "enabled": "true",
        "max_lazy_ml_nodes": "0",
        "max_machine_memory_percent": "30",
        "autodetect_process": "true",
        "datafeed_thread_pool": {
          "queue_size": "200",
          "size": "20"
        },
        "autodetect_thread_pool": {
          "queue_size": "80",
          "size": "80"
        }
      }
    },
    "rest": {
      "action": {
        "multi": {
          "allow_explicit_index": "true"
        }
      }
    },
    "cache": {
      "recycler": {
        "page": {
          "limit": {
            "heap": "10%"
          },
          "type": "CONCURRENT",
          "weight": {
            "longs": "1.0",
            "ints": "1.0",
            "bytes": "1.0",
            "objects": "0.1"
          }
        }
      }
    },
    "reindex": {
      "remote": {
        "whitelist": []
      }
    },
    "resource": {
      "reload": {
        "enabled": "true",
        "interval": {
          "low": "60s",
          "high": "5s",
          "medium": "30s"
        }
      }
    },
    "thread_pool": {
      "force_merge": {
        "queue_size": "-1",
        "size": "1"
      },
      "fetch_shard_started": {
        "core": "1",
        "max": "2",
        "keep_alive": "5m"
      },
      "listener": {
        "queue_size": "-1",
        "size": "1"
      },
      "refresh": {
        "core": "1",
        "max": "1",
        "keep_alive": "5m"
      },
      "generic": {
        "core": "4",
        "max": "128",
        "keep_alive": "30s"
      },
      "warmer": {
        "core": "1",
        "max": "1",
        "keep_alive": "5m"
      },
      "search": {
        "max_queue_size": "1000",
        "queue_size": "1000",
        "size": "2",
        "auto_queue_frame_size": "2000",
        "target_response_time": "1s",
        "min_queue_size": "1000"
      },
      "fetch_shard_store": {
        "core": "1",
        "max": "2",
        "keep_alive": "5m"
      },
      "flush": {
        "core": "1",
        "max": "1",
        "keep_alive": "5m"
      },
      "management": {
        "core": "1",
        "max": "5",
        "keep_alive": "5m"
      },
      "analyze": {
        "queue_size": "16",
        "size": "1"
      },
      "get": {
        "queue_size": "1000",
        "size": "1"
      },
      "estimated_time_interval": "200ms",
      "write": {
        "queue_size": "200",
        "size": "1"
      },
      "snapshot": {
        "core": "1",
        "max": "1",
        "keep_alive": "5m"
      },
      "search_throttled": {
        "max_queue_size": "100",
        "queue_size": "100",
        "size": "1",
        "auto_queue_frame_size": "200",
        "target_response_time": "1s",
        "min_queue_size": "100"
      }
    },
    "index": {
      "codec": "default",
      "store": {
        "type": "",
        "fs": {
          "fs_lock": "native"
        },
        "preload": []
      }
    },
    "monitor": {
      "jvm": {
        "gc": {
          "enabled": "true",
          "overhead": {
            "warn": "50",
            "debug": "10",
            "info": "25"
          },
          "refresh_interval": "1s"
        },
        "refresh_interval": "1s"
      },
      "process": {
        "refresh_interval": "1s"
      },
      "os": {
        "refresh_interval": "1s"
      },
      "fs": {
        "refresh_interval": "1s"
      }
    },
    "transport": {
      "tcp": {
        "reuse_address": "true",
        "connect_timeout": "30s",
        "compress": "false",
        "port": "9300-9400",
        "no_delay": "true",
        "keep_alive": "true",
        "receive_buffer_size": "-1b",
        "send_buffer_size": "-1b"
      },
      "bind_host": [],
      "connect_timeout": "30s",
      "compress": "false",
      "ping_schedule": "-1",
      "connections_per_node": {
        "recovery": "2",
        "state": "1",
        "bulk": "3",
        "reg": "6",
        "ping": "1"
      },
      "tracer": {
        "include": [],
        "exclude": [
          "internal:discovery/zen/fd*",
          "internal:coordination/fault_detection/*",
          "cluster:monitor/nodes/liveness"
        ]
      },
      "type": "security4",
      "type.default": "netty4",
      "features": {
        "x-pack": "true"
      },
      "port": "9300-9400",
      "host": [],
      "publish_port": "-1",
      "tcp_no_delay": "true",
      "publish_host": [],
      "netty": {
        "receive_predictor_size": "64kb",
        "receive_predictor_max": "64kb",
        "worker_count": "2",
        "receive_predictor_min": "64kb",
        "boss_count": "1"
      }
    },
    "script": {
      "allowed_contexts": [],
      "max_compilations_rate": "75/5m",
      "cache": {
        "max_size": "100",
        "expire": "0ms"
      },
      "painless": {
        "regex": {
          "enabled": "false"
        }
      },
      "max_size_in_bytes": "65535",
      "allowed_types": []
    },
    "node": {
      "data": "false",
      "enable_lucene_segment_infos_trace": "false",
      "local_storage": "true",
      "max_local_storage_nodes": "1",
      "name": "search-client-1",
      "id": {
        "seed": "0"
      },
      "store": {
        "allow_mmap": "true"
      },
      "attr": {
        "xpack": {
          "installed": "true"
        },
        "ml": {
          "machine_memory": "2147483648",
          "max_open_jobs": "20"
        }
      },
      "portsfile": "false",
      "ingest": "false",
      "master": "false",
      "ml": "true"
    },
    "indices": {
      "lifecycle": {
        "poll_interval": "10m"
      },
      "cache": {
        "cleanup_interval": "1m"
      },
      "mapping": {
        "dynamic_timeout": "30s"
      },
      "memory": {
        "interval": "5s",
        "max_index_buffer_size": "-1",
        "shard_inactive_time": "5m",
        "index_buffer_size": "10%",
        "min_index_buffer_size": "48mb"
      },
      "breaker": {
        "request": {
          "limit": "60%",
          "type": "memory",
          "overhead": "1.0"
        },
        "total": {
          "limit": "95%",
          "use_real_memory": "true"
        },
        "accounting": {
          "limit": "100%",
          "overhead": "1.0"
        },
        "fielddata": {
          "limit": "40%",
          "type": "memory",
          "overhead": "1.03"
        },
        "type": "hierarchy"
      },
      "fielddata": {
        "cache": {
          "size": "-1b"
        }
      },
      "query": {
        "bool": {
          "max_clause_count": "1024"
        },
        "query_string": {
          "analyze_wildcard": "false",
          "allowLeadingWildcard": "true"
        }
      },
      "recovery": {
        "recovery_activity_timeout": "1800000ms",
        "retry_delay_network": "5s",
        "internal_action_timeout": "15m",
        "retry_delay_state_sync": "500ms",
        "internal_action_long_timeout": "1800000ms",
        "max_bytes_per_sec": "40mb",
        "max_concurrent_file_chunks": "2"
      },
      "requests": {
        "cache": {
          "size": "1%",
          "expire": "0ms"
        }
      },
      "store": {
        "delete": {
          "shard": {
            "timeout": "30s"
          }
        }
      },
      "analysis": {
        "hunspell": {
          "dictionary": {
            "ignore_case": "false",
            "lazy": "false"
          }
        }
      },
      "queries": {
        "cache": {
          "count": "10000",
          "size": "10%",
          "all_segments": "false"
        }
      }
    },
    "plugin": {
      "mandatory": []
    },
    "discovery": {
      "seed_hosts": [
        "search-master-headless"
      ],
      "unconfigured_bootstrap_timeout": "3s",
      "request_peers_timeout": "3000ms",
      "zen": {
        "commit_timeout": "30s",
        "no_master_block": "write",
        "join_retry_delay": "100ms",
        "join_retry_attempts": "3",
        "ping": {
          "unicast": {
            "concurrent_connects": "10",
            "hosts": [],
            "hosts.resolve_timeout": "5s"
          }
        },
        "master_election": {
          "ignore_non_master_pings": "false",
          "wait_for_joins_timeout": "30000ms"
        },
        "send_leave_request": "true",
        "ping_timeout": "3s",
        "bwc_ping_timeout": "3s",
        "join_timeout": "60000ms",
        "publish_diff": {
          "enable": "true"
        },
        "publish": {
          "max_pending_cluster_states": "25"
        },
        "minimum_master_nodes": "-1",
        "unsafe_rolling_upgrades_enabled": "true",
        "hosts_provider": [],
        "publish_timeout": "30s",
        "fd": {
          "connect_on_network_disconnect": "false",
          "ping_interval": "1s",
          "ping_retries": "3",
          "register_connection_listener": "true",
          "ping_timeout": "30s"
        },
        "max_pings_from_another_master": "3"
      },
      "initial_state_timeout": "30s",
      "cluster_formation_warning_timeout": "10000ms",
      "seed_providers": [],
      "type": "zen",
      "seed_resolver": {
        "max_concurrent_resolvers": "10",
        "timeout": "5s"
      },
      "find_peers_interval": "1000ms"
    },
    "http": {
      "tcp": {
        "reuse_address": "true",
        "keep_alive": "true",
        "receive_buffer_size": "-1b",
        "no_delay": "true",
        "send_buffer_size": "-1b"
      },
      "bind_host": [],
      "cors": {
        "max-age": "1728000",
        "allow-origin": "",
        "allow-headers": "X-Requested-With,Content-Type,Content-Length",
        "allow-credentials": "false",
        "allow-methods": "OPTIONS,HEAD,GET,POST,PUT,DELETE",
        "enabled": "false"
      },
      "max_chunk_size": "8kb",
      "compression_level": "3",
      "reset_cookies": "false",
      "max_initial_line_length": "4kb",
      "max_warning_header_count": "-1",
      "type": "security4",
      "pipelining": {
        "max_events": "10000"
      },
      "max_warning_header_size": "-1b",
      "type.default": "netty4",
      "detailed_errors": {
        "enabled": "true"
      },
      "content_type": {
        "required": "true"
      },
      "port": "9200-9300",
      "host": [],
      "publish_port": "-1",
      "max_header_size": "8kb",
      "tcp_no_delay": "true",
      "compression": "true",
      "read_timeout": "0ms",
      "publish_host": [],
      "max_content_length": "100mb",
      "netty": {
        "receive_predictor_size": "64kb",
        "max_composite_buffer_components": "69905",
        "worker_count": "2"
      }
    },
    "gateway": {
      "recover_after_master_nodes": "0",
      "expected_nodes": "-1",
      "recover_after_data_nodes": "-1",
      "expected_data_nodes": "-1",
      "recover_after_time": "0ms",
      "expected_master_nodes": "-1",
      "recover_after_nodes": "-1"
    }
  }
}
tetianakravchenko commented 5 years ago

@zwopir hi, is it possible to get the PR that fixes this issue released?