Open radykal-com opened 1 year ago
@radykal-com , thanks for reporting this. To help us reproduce this issue, could you please share the checks watchers for your server and client agents?
Sure @huikang
server.json
"watches": [
{
"type": "checks",
"args": ["/server-watches.py"]
}
]
/server-watches.py
#!/usr/bin/env python3
"""Consul server watch handler.

Finds nodes that have health checks in the critical state, resolves them to
EC2 instances by private IP, and marks each instance Unhealthy in its Auto
Scaling group so it gets replaced.
"""
import boto3
import json
import logging
import syslog
from urllib import request

logging.basicConfig(level=logging.INFO)

CONSUL_BASE_URL = "http://127.0.0.1:8500"
CONSUL_HEALTH_PATH = "{}/v1/health/state/critical".format(CONSUL_BASE_URL)
# Fixed: the path previously contained a double slash ("/v1//catalog/nodes").
CONSUL_NODES_PATH = "{}/v1/catalog/nodes".format(CONSUL_BASE_URL)

# All checks currently in the critical state, cluster-wide.
failed_health_nodes = json.loads(request.urlopen(CONSUL_HEALTH_PATH).read())
for check in failed_health_nodes:
    msg = "CheckID [{}] from Node [{}] has status [{}] with output: [{}]".format(
        check["CheckID"], check["Node"], check["Status"], check["Output"])
    logging.info(msg)
    syslog.syslog(syslog.LOG_INFO, msg)

nodes_json_resp = json.loads(request.urlopen(CONSUL_NODES_PATH).read())
# Fixed: build the set of critical node names once instead of re-deriving it
# for every catalog node inside the comprehension (was O(n^2)).
critical_node_names = {check["Node"] for check in failed_health_nodes}
ip_addresses_of_nodes_with_critical_health = {
    node["Address"] for node in nodes_json_resp if node["Node"] in critical_node_names
}

boto_session = boto3.Session(region_name="xxx")
ec2_client = boto_session.client("ec2")
instances_resp = ec2_client.describe_instances(
    Filters=[{
        "Name": "private-ip-address",
        "Values": list(ip_addresses_of_nodes_with_critical_health),
    }]
)
# Flatten reservations -> instances.
instances_with_critical_health = [
    instance
    for reservation in instances_resp["Reservations"]
    for instance in reservation["Instances"]
]

asg_client = boto_session.client("autoscaling")
for instance in instances_with_critical_health:
    msg = "Setting instance [{}] as Unhealthy".format(instance["InstanceId"])
    logging.info(msg)
    syslog.syslog(syslog.LOG_INFO, msg)
    try:
        asg_client.set_instance_health(InstanceId=instance["InstanceId"], HealthStatus="Unhealthy")
    except Exception as e:
        # Best-effort: log and continue with the remaining instances.
        msg = "Setting instance [{}] as Unhealthy failed: {}".format(instance["InstanceId"], e)
        logging.info(msg)
        syslog.syslog(syslog.LOG_INFO, msg)
client.json
{
"watches": [
{
"type": "checks",
"args": ["/client-watches.py"]
}
]
}
/client-watches.py
#!/usr/bin/env python3
"""Consul client watch handler.

Queries the local agent for non-passing checks; if any exist, marks this
EC2 instance Unhealthy in its Auto Scaling group so it gets replaced.
"""
import boto3
import json
import logging
import syslog
from urllib import parse, request

logging.basicConfig(level=logging.INFO)

query = parse.urlencode({'filter': 'Status!=passing'})
response = json.loads(
    request.urlopen('http://127.0.0.1:8500/v1/agent/checks?{}'.format(query)).read())
for check, service_check in response.items():
    msg = "{} check is {}".format(check, service_check["Status"])
    logging.info(msg)
    syslog.syslog(syslog.LOG_INFO, msg)

# Fixed: only mark the instance Unhealthy when at least one non-passing
# check actually exists. The watch also fires when checks recover, in which
# case the filtered response is empty — previously the instance was marked
# Unhealthy unconditionally on every invocation.
if response:
    # EC2 instance-metadata endpoint; returns the instance id as bytes.
    instance_id = request.urlopen('http://169.254.169.254/latest/meta-data/instance-id').read()
    asg_client = boto3.client("autoscaling", region_name="xxx")
    msg = "Setting instance as Unhealthy"
    logging.info(msg)
    syslog.syslog(syslog.LOG_INFO, msg)
    asg_client.set_instance_health(InstanceId=str(instance_id, 'utf-8'), HealthStatus="Unhealthy")
Hi again,
@huikang I've been doing some more research on this issue.
The new setup of our watches is as follows:
Watches scripts (python) configured ONLY in consul servers (3 nodes). The code of those scripts:
#!/usr/bin/env python3
"""Consul server watch handler with Datadog metrics.

Finds nodes that have health checks in the critical state, marks the backing
EC2 instances Unhealthy in their Auto Scaling group, and emits a statsd
metric tagged with the failing check IDs.
"""
import boto3
import datadog
import json
import logging
import syslog
from urllib import request

logging.basicConfig(level=logging.INFO)

CONSUL_BASE_URL = "http://127.0.0.1:8500"
CONSUL_HEALTH_PATH = "{}/v1/health/state/critical".format(CONSUL_BASE_URL)
# Fixed: the path previously contained a double slash ("/v1//catalog/nodes").
CONSUL_NODES_PATH = "{}/v1/catalog/nodes".format(CONSUL_BASE_URL)
METRIC_NAME = "XXX.consul.watches.critical"

# All checks currently in the critical state, cluster-wide.
failed_health_nodes = json.loads(request.urlopen(CONSUL_HEALTH_PATH).read())
for check in failed_health_nodes:
    msg = "CheckID [{}] from Node [{}] has status [{}] with output: [{}]".format(
        check["CheckID"], check["Node"], check["Status"], check["Output"])
    logging.info(msg)
    syslog.syslog(syslog.LOG_INFO, msg)

nodes_json_resp = json.loads(request.urlopen(CONSUL_NODES_PATH).read())
# Build the set of critical node names once (was recomputed per node, O(n^2)).
critical_node_names = {check["Node"] for check in failed_health_nodes}
node_name_to_address = {node["Node"]: node["Address"] for node in nodes_json_resp}
ip_addresses_of_nodes_with_critical_health = {
    node["Address"] for node in nodes_json_resp if node["Node"] in critical_node_names
}
# Fixed: group the failing checks by node IP. The metric-tagging code below
# previously referenced an undefined variable "nodes_with_critical_health",
# raising a NameError that the inner except silently swallowed, so the
# metric was never sent.
nodes_with_critical_health = {}
for check in failed_health_nodes:
    address = node_name_to_address.get(check["Node"])
    if address is not None:
        nodes_with_critical_health.setdefault(address, {"Checks": []})["Checks"].append(check)

boto_session = boto3.Session(region_name="xxx")
ec2_client = boto_session.client("ec2")
instances_resp = ec2_client.describe_instances(
    Filters=[{
        "Name": "private-ip-address",
        "Values": list(ip_addresses_of_nodes_with_critical_health),
    }]
)
statsd_client = datadog.DogStatsd(
    host="XXX",
    port=8125,
    disable_telemetry=True,
)
# Flatten reservations -> instances.
instances_with_critical_health = [
    instance
    for reservation in instances_resp["Reservations"]
    for instance in reservation["Instances"]
]

asg_client = boto_session.client("autoscaling")
for instance in instances_with_critical_health:
    msg = "Setting instance [{}] as Unhealthy".format(instance["InstanceId"])
    logging.info(msg)
    syslog.syslog(syslog.LOG_INFO, msg)
    try:
        asg_client.set_instance_health(InstanceId=instance["InstanceId"], HealthStatus="Unhealthy")
        try:
            tags = []
            private_ip = instance["PrivateIpAddress"]
            if private_ip in nodes_with_critical_health and "Checks" in nodes_with_critical_health[private_ip]:
                for check in nodes_with_critical_health[private_ip]["Checks"]:
                    tags.append("{}:{}".format("check_id", check["CheckID"]))
            # NOTE(review): instances without tags may omit the "Tags" key
            # entirely — use .get() so an untagged instance doesn't raise.
            for tag in instance.get("Tags", []):
                if tag["Key"] == "XXXCustomTagXXX":
                    tags.append("{}:{}".format("XXX", tag["Value"]))
            statsd_client.increment(METRIC_NAME, tags=tags)
        except Exception as e:
            # Fixed: format string had one placeholder but two arguments,
            # which silently dropped the exception text from the message.
            msg = "Sending metrics for instance [{}] failed: {}".format(instance["InstanceId"], e)
            logging.info(msg)
            syslog.syslog(syslog.LOG_INFO, msg)
    except Exception as e:
        msg = "Setting instance [{}] as Unhealthy failed: {}".format(instance["InstanceId"], e)
        logging.info(msg)
        # Fixed: this failure branch logged to logging only, unlike every
        # other message in the script, which also goes to syslog.
        syslog.syslog(syslog.LOG_INFO, msg)
Every time we apply any consul central config the sidecar proxies (envoy) checks pass to critical in all of the services, not sure for how much time, if its just a moment, or some seconds, but it happens to all services after the config change.
It doesn't happen at the same time for all services; it happens during the 10 minutes after the config change. And the failing check is always the one shown in the UI as Connect Sidecar Listening
in our case the CheckID is service:{sidecarproxyname}:1
type tcp
and the output of the check when failing is just empty. When that same check is passing, the output is TCP connect 127.0.0.1:21000: Success
This graph shows the critical status triggering for 2 different consul central config changes (8:12 and 8:29). Each color is one of the services
Ah, it looks like you run into the same issue that we discovered just last week. There seems to be a regression introduced by https://github.com/hashicorp/consul/commit/f34703fc63c (added to v1.10), which can cause the sidecar proxy health check to flap, and subsequently cause an increase of Catalog.Register
RPC calls, which then causes an increase in disk write activity on the Consul servers.
We are rolling out this fix at the moment:
diff --git a/agent/agent.go b/agent/agent.go
index c3b8570c13..4a666e8d13 100644
--- a/agent/agent.go
+++ b/agent/agent.go
@@ -530,8 +530,7 @@ func (a *Agent) Start(ctx context.Context) error {
}
// Load checks/services/metadata.
- emptyCheckSnapshot := map[structs.CheckID]*structs.HealthCheck{}
- if err := a.loadServices(c, emptyCheckSnapshot); err != nil {
+ if err := a.loadServices(c, nil); err != nil {
return err
}
if err := a.loadChecks(c, nil); err != nil {
@@ -3648,7 +3647,7 @@ func (a *Agent) reloadConfigInternal(newCfg *config.RuntimeConfig) error {
}
// Reload service/check definitions and metadata.
- if err := a.loadServices(newCfg, snap); err != nil {
+ if err := a.loadServices(newCfg, nil); err != nil {
return fmt.Errorf("Failed reloading services: %s", err)
}
if err := a.loadChecks(newCfg, snap); err != nil {
We plan to send a pull request next week.
I'll try that patch and come back with news. Thank you @sayap
Unfortunately the issue isn't fixed with the patch.
I see. We are still on 1.10.12 and the patch seems to work fine. We haven't tried it on 1.14 / 1.15 yet, will take a look..
Just verified that the patch works fine for me on 1.15.2 as well.
Let's elaborate a bit what the patch does, based on my incomplete understanding...
In agent/service_manager.go, agent will make a ConfigEntry.ResolvedServiceConfig
blocking query for each of the sidecar proxy services. If the particular service config for the sidecar proxy doesn't change in 10 minutes, the existing blocking query will time out, and a new blocking query will be created.
However, if there has been any config change in the 10 minutes (even if the changes are totally irrelevant to the sidecar proxy), the function handleUpdate
in agent/service_manager.go will be triggered, which will then cause a supposedly no-op call to agent.addServiceInternal
.
Due to the regression introduced by commit f34703fc63c, this call is no longer a no-op: it instead resets the sidecar proxy's health checks, because the empty map `emptyCheckSnapshot` prevents the status restore in https://github.com/hashicorp/consul/blob/f34703fc63c/agent/agent.go#L2048-L2053 from happening. This health check flapping causes INFO log lines that look like these:
2023-05-26T12:37:48.316+0700 [INFO] agent: Synced check: check=service:dashboard-sidecar-proxy:1
2023-05-26T12:37:48.325+0700 [INFO] agent: Synced check: check=service:dashboard-sidecar-proxy:2
The checks run every 10 seconds, so they will revert to their original passing status within 10 seconds, which then causes more INFO log lines:
2023-05-26T12:37:52.781+0700 [INFO] agent: Synced check: check=service:dashboard-sidecar-proxy:1
"sidecar-proxy:1" is a TCP check on the Envoy's public listener port (i.e. Connect Sidecar Listening), and a TCP check has an initial pause time. Thus, it has a first "Synced check" that sends the critical status to the Consul servers, followed by a second "Synced check" a few seconds later that sends the passing status to the Consul servers. This creates unnecessary network traffic and also unnecessary disk writes on the Consul servers. In your case, it also impacts the watcher script.
"sidecar-proxy:2" is an Alias check for the actual service, and an Alias check has no initial pause time. So, while the regression causes the check to be in the critical status, the check will run immediately and go back to the passing status. Thus, it only has one "Synced check" log line that sends the passing status to the Consul servers. This creates unnecessary network traffic, but shouldn't create any disk writes on the Consul servers, since the status remains the same from the perspective of the Consul servers.
With the patch, these "Synced check" log lines shouldn't happen anymore after some irrelevant config change. I am quite curious why the issue still persists after applying the patch; maybe you can share the log from one of the patched agents? (the patch is only needed for Consul agents, not the Consul servers)
Hey @sayap you are right. Yesterday I applied the patch to just a subset of the platform and I couldn't see a real impact on the number of checks triggered during the changes.
Today I've applied it to all of the platform and effectively, there are no checks triggering on config changes.
I'm a bit worried if there would be side or undesired effects with this change that may lead to new issues or problems.
anyway, thank you very much as I started to think the problem was not in consul but in our deployment.
That's great :grin:
A side effect is that the agents will do a tiny bit more work, as now it has to do req.checkStateSnapshot = a.State.AllChecks()
every time handleUpdate
is triggered. And that was the original behavior before commit f34703fc63c, so I won't worry too much about it.
@huikang we're still using @sayap's hack in version 1.17.2. Is there any related issue or fix coming?
Overview of the Issue
Similar to https://github.com/hashicorp/consul/issues/7446 but it's happening when applying any change to central config entries using the consul cli (config write, config delete)
We have checks watchers configured in consul servers and clients:
- Servers watch the cluster-wide critical health state (`/v1/health/state/critical`) and, if they find any critical checks, mark the affected node as unhealthy so it can be replaced by the autoscaling group.
- Clients watch their local agent checks (`/v1/agent/checks?filter=Status!=passing`) and, if any check is in a non-passing status, mark themselves as an unhealthy node so they can be replaced by the autoscaling group.

Every time we apply a new config entry with the consul cli (consul config write or consul config delete) the watchers are triggered in servers and clients, and the watcher scripts find checks in critical status, leading to the replacement of all of our nodes.
If we don't make any consul config change, it works fine, the unhealthy nodes are replaced when they need to, new nodes join without problems, etc.
Reproduction Steps
Consul info for both Client and Server
Client info
``` agent: check_monitors = 0 check_ttls = 0 checks = 3 services = 2 build: prerelease = revision = version = 1.15.1 version_metadata = consul: acl = disabled known_servers = 3 server = false runtime: arch = amd64 cpu_count = 2 goroutines = 98 max_procs = 2 os = linux version = go1.20.1 serf_lan: coordinate_resets = 0 encrypted = false event_queue = 0 event_time = 34 failed = 0 health_score = 0 intent_queue = 0 left = 5 member_time = 2380157 members = 84 query_queue = 0 query_time = 1 ```Server info
``` agent: check_monitors = 0 check_ttls = 0 checks = 0 services = 0 build: prerelease = revision = version = 1.15.1 version_metadata = consul: acl = disabled bootstrap = false known_datacenters = 1 leader = false leader_addr = 172.31.50.254:8300 server = true raft: applied_index = 23759419 commit_index = 23759419 fsm_pending = 0 last_contact = 7.852926ms last_log_index = 23759419 last_log_term = 55 last_snapshot_index = 23750571 last_snapshot_term = 55 latest_configuration = [{Suffrage:Voter ID:7e97476a-94dd-783f-0645-a9e78861ae6d Address:172.31.50.254:8300} {Suffrage:Voter ID:858454d3-9a87-48ef-c342-05f954076701 Address:172.31.77.53:8300} {Suffrage:Voter ID:d1d55041-728a-6e70-aee7-56421f986ac1 Address:172.31.61.5:8300}] latest_configuration_index = 0 num_peers = 2 protocol_version = 3 protocol_version_max = 3 protocol_version_min = 0 snapshot_version_max = 1 snapshot_version_min = 0 state = Follower term = 55 runtime: arch = amd64 cpu_count = 2 goroutines = 1191 max_procs = 2 os = linux version = go1.20.1 serf_lan: coordinate_resets = 0 encrypted = false event_queue = 0 event_time = 34 failed = 0 health_score = 0 intent_queue = 0 left = 8 member_time = 2380163 members = 84 query_queue = 0 query_time = 1 serf_wan: coordinate_resets = 0 encrypted = false event_queue = 0 event_time = 1 failed = 0 health_score = 0 intent_queue = 0 left = 0 member_time = 25790 members = 3 query_queue = 0 query_time = 1 ```Operating system and Environment details
3 consul servers and ~80 consul clients OS: Ubuntu 20.04 Arch: x86_64 Reproduced with consul versions:
Central config entries:
envoy_dogstatsd_url
configLog Fragments