Open nwmqpa opened 2 months ago
Hi @nwmqpa, I was able to reproduce this using a mixed-architecture environment running in Docker.
# docker-compose.yaml
# Reproduction: two Consul servers of the same release but different
# architectures behind a round-robin Traefik load balancer.
---
version: "3.8"
services:
  consul_amd64:
    # hashicorp/consul:1.18.1
    # OS/ARCH: linux/amd64
    image: hashicorp/consul@sha256:7fce167b0d99d1b2b1288d8dec0ef4dd7d787b6f136c8d6bc59fee077b055734
    platform: linux/amd64
    environment:
      CONSUL_BIND_INTERFACE: eth0
    labels:
      # Quoted: Compose label values are strings; unquoted true/8500 would
      # parse as YAML boolean/integer.
      traefik.enable: "true"
      traefik.http.services.consul.loadbalancer.server.port: "8500"
  consul_arm64:
    # hashicorp/consul:1.18.1
    # OS/ARCH: linux/arm64
    image: hashicorp/consul@sha256:d094ee4455ebe0fcd33cd0de6dd02f8ba6e79ae3406ad5acb341b3cf62fcceda
    platform: linux/arm64
    depends_on:
      - consul_amd64
    environment:
      CONSUL_BIND_INTERFACE: eth0
      # Join the amd64 node (aliased "consul" via links below) as a client.
      CONSUL_LOCAL_CONFIG: |
        {
          "retry_join": ["consul"],
          "server": false
        }
    labels:
      traefik.enable: "true"
      traefik.http.services.consul.loadbalancer.server.port: "8500"
    links:
      - consul_amd64:consul
  traefik:
    image: traefik:v2.11.2
    ports:
      - "8000:8000/tcp"
      - "8080:8080/tcp"
    environment:
      TRAEFIK_ACCESSLOG: "true"
      TRAEFIK_API_INSECURE: "true"
      TRAEFIK_PROVIDERS_DOCKER: "true"
      TRAEFIK_ENTRYPOINTS_web: "true"
      TRAEFIK_ENTRYPOINTS_web_ADDRESS: ":8000"
    labels:
      # Route everything on entrypoint "web" to the consul service
      # (both architecture variants register under the same service name,
      # so Traefik round-robins between them).
      traefik.http.routers.consul.rule: "PathPrefix(`/`)"
      traefik.http.routers.consul.entrypoints: web
      traefik.http.routers.consul.service: consul
    volumes:
      - /var/run/docker.sock:/var/run/docker.sock:ro
After running `docker-compose up`, the Consul UI will be accessible at http://localhost:8000/. Requests to the UI will be distributed across the servers using round-robin load balancing.
The issue is that a few of the assets have a unique content hash in the filename and these asset names differ across the supported OS and architecture specific builds of Consul for a given release.
(Each curl request is being routed to a different server.)
$ diff --unified <(curl http://localhost:8000/ui/ | grep '/ui/assets/chunk') <(curl http://localhost:8000/ui/ | grep '/ui/assets/chunk')
--- /dev/fd/11 2024-04-26 17:30:00
+++ /dev/fd/12 2024-04-26 17:30:00
@@ -1,3 +1,3 @@
-<link rel="stylesheet" href="/ui/assets/chunk.143.4a0b6b80ab1fa229e8ae.css"/>
+<link rel="stylesheet" href="/ui/assets/chunk.143.b1771c7664bb3778d56b.css"/>
<script src="/ui/assets/chunk.924.719761ac5e77d019056f.js"></script>
-<script src="/ui/assets/chunk.143.4a0b6b80ab1fa229e8ae.js"></script>
+<script src="/ui/assets/chunk.143.b1771c7664bb3778d56b.js"></script>
The UI will fail to load if the asset being requested by the browser is not present on the backend server.
A potential solution might be to modify the build workflow to build the UI once for a given release, and then use the same assets for all OS and architecture-specific builds, instead of building it for each OS/arch build as is done today.
I'll mark this as a bug and work with our team to figure out the best path forward.
Overview of the Issue
When using an autoscaling group that is configured with both ARM64 and AMD64 nodes, the Consul web UI can load bundles from different instance types, and this prevents the UI from loading correctly, as bundles are not named the same between architectures.
Reproduction Steps
Consul info for both Client and Server
Client info
``` Output from client 'consul info' command here ``` ``` Client agent HCL config ```Server info
``` gent: check_monitors = 0 check_ttls = 0 checks = 0 services = 0 build: prerelease = revision = 98cb473c version = 1.18.1 version_metadata = consul: acl = enabled bootstrap = false known_datacenters = 1 leader = false leader_addr = REDACTED:8300 server = true raft: applied_index = 1154473 commit_index = 1154473 fsm_pending = 0 last_contact = 59.436445ms last_log_index = 1154473 last_log_term = 26 last_snapshot_index = 1147419 last_snapshot_term = 26 latest_configuration = [REDACTED] latest_configuration_index = 0 num_peers = 2 protocol_version = 3 protocol_version_max = 3 protocol_version_min = 0 snapshot_version_max = 1 snapshot_version_min = 0 state = Follower term = 26 runtime: arch = amd64 cpu_count = 2 goroutines = 519 max_procs = 2 os = linux version = go1.21.8 serf_lan: coordinate_resets = 0 encrypted = true event_queue = 0 event_time = 25 failed = 0 health_score = 0 intent_queue = 0 left = 1 member_time = 214133 members = 15 query_queue = 0 query_time = 1 serf_wan: coordinate_resets = 0 encrypted = true event_queue = 0 event_time = 1 failed = 0 health_score = 0 intent_queue = 0 left = 1 member_time = 62897 members = 4 query_queue = 0 query_time = 1 ``` ``` server = true bind_addr = "{{ GetInterfaceIP \"ens5\" }}" advertise_addr = "{{ GetInterfaceIP \"ens5\" }}" client_addr = "0.0.0.0" data_dir = "/opt/consul" bootstrap_expect = 3 encrypt = "REDACTED" encrypt_verify_incoming = true encrypt_verify_outgoing = true leave_on_terminate = true acl = { enabled = true default_policy = "deny" down_policy = "extend-cache" enable_token_persistence = true } connect { enabled = true ca_provider = "consul" } ports = { http = 8500 # TCP only https = 8501 # TCP only grpc = 8502 # TCP only grpc_tls = 8503 # TCP only dns = 8600 # TCP and UDP server = 8300 # TCP only serf_lan = 8301 # TCP and UDP serf_wan = 8302 # TCP and UDP } node_name = REDACTED datacenter = "ops" primary_datacenter = "ops" retry_join = [REDACTED] node_meta { instance_purpose = "consul" ami_id = 
"REDACTED" instance_type = "t3.small" availability_zone = "eu-west-3b" } autopilot = { max_trailing_logs = 250 cleanup_dead_servers = true last_contact_threshold = "200ms" server_stabilization_time = "10s" } telemetry = { disable_hostname = true prometheus_retention_time = "60s" } ui_config = { enabled = true } auto_encrypt = { allow_tls = true } peering = { enabled = true } tls { defaults { verify_incoming = true verify_outgoing = true verify_server_hostname = true ca_file = "/opt/consul/tls/consul-ca.pem" key_file = "/opt/consul/tls/consul-key.pem" cert_file = "/opt/consul/tls/consul-cert.pem" } https { verify_incoming = false } } ```Operating system and Environment details
Server 1: OS: Amazon Linux 2023 Architecture: x86_64
Server 2: OS: Amazon Linux 2023 Architecture: ARM64
Log Fragments
Not applicable