Closed dani closed 5 days ago
One more thing I've just noticed: the issue appears when submitting a job in a specific nomad namespace (not default). There's no namespace on consul (as I'm using the community edition)
I've finally found where the issue comes from; it's not a bug, sorry for opening this. The problem comes from the default binding-rules created by nomad setup consul
c0c8a4d6-c0dd-58da-ac1c-832362ca3225:
AuthMethod: nomad-workloads
Description: Binding rule for Nomad tasks authenticated using a workload identity
BindType: role
BindName: nomad-${value.nomad_namespace}-tasks
Selector: "nomad_service" not in value
This binding-rule uses the nomad namespace to associate a role. But only the nomad-default-tasks role is created by default, so only the default nomad namespace can be used. Changing this binding-rule to hardcode nomad-default-tasks (instead of using ${value.nomad_namespace}) makes everything work
Nomad version
Nomad v1.8.4 BuildDate 2024-09-17T20:18:34Z Revision 22ab32e6cff66cf52f5e3f115b93de776bc09583
Operating system and Environment details
Everything is running on AlmaLinux 9, with pre-built binaries for Nomad (1.8.4) and Consul (1.19.2). 3 servers, each running nomad / consul / vault; 6 clients, each running a nomad agent and its local consul agent.
mTLS enabled, and ACL too. I'm switching from the old (pre 1.7) way of handling tokens to the new workload identities. No issue for vault, where workload identities are working fine, but consul is not working as expected
Issue
As soon as I enable consul workload identity, by adding this in nomad.hcl of the servers
Consul workload identities
``` service_identity { aud = [ "consul.io", ] ttl = "1h" } task_identity { aud = [ "consul.io", ] ttl = "1h" } ```No allocation can be started; it fails with
Reproduction steps
Consul server config
``` data_dir = "/opt/consul/data" bind_addr = "0.0.0.0" client_addr = "0.0.0.0" advertise_addr = "10.117.7.16" ports { dns = 8600 http = 8500 https = 8501 grpc = 8502 grpc_tls = 8503 serf_lan = 8301 serf_wan = 8302 server = 8300 sidecar_min_port = 21000 sidecar_max_port = 21255 expose_min_port = 21500 expose_max_port = 21755 } retry_join = [ "ct-poc-s-1.ehtrace.local", "ct-poc-s-2.ehtrace.local", "ct-poc-s-3.ehtrace.local", ] server = true bootstrap_expect = 3 performance { raft_multiplier = 1 } encrypt = "XXXXXX=" ui_config { enabled = true } recursors = [ "10.117.7.1", ] domain = "ct-poc.ehtrace.local" telemetry { prometheus_retention_time = "1h" disable_hostname = true } connect { enabled = true } acl { enabled = true default_policy = "deny" tokens { agent = "a864d4de-2dd9-2ac9-8b6a-xxxxxxxxxxxx" config_file_service_registration = "a864d4de-2dd9-2ac9-8b6a-xxxxxxxxxxxx" dns = "16a66b3e-4b50-9b29-74bc-xxxxxxxxxxxx" } } # TLS settings tls { defaults { ca_file = "/opt/consul/tls/ca.crt" cert_file = "/opt/consul/tls/consul.crt" key_file = "/opt/consul/tls/consul.key" verify_incoming = true verify_outgoing = true } internal_rpc { verify_server_hostname = true } } # auto_encrypt, to distribute certificates from servers to clients auto_encrypt { allow_tls = true } limits { http_max_conns_per_client = 400 } ```Consul agent config
``` data_dir = "/opt/consul/data" bind_addr = "0.0.0.0" client_addr = "0.0.0.0" advertise_addr = "10.117.7.21" ports { dns = 8600 http = 8500 https = 8501 grpc = 8502 grpc_tls = 8503 serf_lan = 8301 serf_wan = 8302 server = 8300 sidecar_min_port = 21000 sidecar_max_port = 21255 expose_min_port = 21500 expose_max_port = 21755 } retry_join = [ "ct-poc-s-1.ehtrace.local", "ct-poc-s-2.ehtrace.local", "ct-poc-s-3.ehtrace.local", ] encrypt = "XXXXX=" ui_config { enabled = false } recursors = [ "10.117.7.1", ] alt_domain = "ct-poc.ehtrace.local" telemetry { prometheus_retention_time = "1h" disable_hostname = true } acl { enabled = true default_policy = "deny" tokens { agent = "81d8e631-5789-8b81-85f0-cc5924a5b94c" } } auto_encrypt { tls = true } limits { http_max_conns_per_client = 800 } ```Nomad agent config
``` data_dir = "/opt/nomad/data" plugin_dir = "/opt/nomad/plugins" bind_addr = "0.0.0.0" disable_update_check = true advertise { } ports { http = 4646 rpc = 4647 serf = 4648 } acl { enabled = true } server { enabled = false bootstrap_expect = 3 server_join { retry_join = [ "ct-poc-s-1.ehtrace.local", "ct-poc-s-2.ehtrace.local", "ct-poc-s-3.ehtrace.local", ] } default_scheduler_config { scheduler_algorithm = "spread" memory_oversubscription_enabled = true preemption_config { batch_scheduler_enabled = true system_scheduler_enabled = true sysbatch_scheduler_enabled = true service_scheduler_enabled = true } } } client { enabled = true servers = [ "ct-poc-s-1.ehtrace.local", "ct-poc-s-2.ehtrace.local", "ct-poc-s-3.ehtrace.local", ] drain_on_shutdown { deadline = "1h" force = false ignore_system_jobs = false } host_volume "nomad_alloc" { path = "/opt/nomad/data/alloc" } host_volume "vector_data" { path = "/opt/nomad/data/vector" } host_volume "host_root" { path = "/" } host_volume "host_run_udev" { path = "/run/udev" read_only = "true" } reserved { cpu = 200 memory = 318 disk = 500 reserved_ports = "" } meta { } options { "driver.allowlist" = "exec,docker" } max_kill_timeout = "300s" } plugin "docker" { config { allow_privileged = true auth { config = "/opt/nomad/docker/auth.json" } disable_log_collection = true logging { type = "fluentd" config { fluentd-address = "127.0.0.1:4224" fluentd-async = true fluentd-async-reconnect-interval = "1s" fluentd-buffer-limit = 2097152 env = "NOMAD_JOB_NAME,NOMAD_GROUP_NAME,NOMAD_DC,NOMAD_REGION,NOMAD_TASK_NAME,NOMAD_ALLOC_INDEX,NOMAD_ALLOC_ID,NOMAD_NAMESPACE" } } extra_labels = [ "job_name", "task_group_name", "task_name", "namespace", "node_name", "node_id", ] gc { image_delay = "12h" } } } plugin "raw_exec" { config { enabled = false } } plugin "containerd-driver" { config { enabled = false containerd_runtime = "io.containerd.runc.v2" allow_privileged = true } } ui { enabled = false } telemetry { prometheus_metrics = true 
disable_hostname = true publish_allocation_metrics = true publish_node_metrics = true } consul { } vault { enabled = true address = "https://active.vault.service.ct-poc.ehtrace.local:8200" } tls { ca_file = "/opt/nomad/tls/ca.crt" cert_file = "/opt/nomad/tls/nomad.crt" key_file = "/opt/nomad/tls/nomad.key" http = true rpc = true rpc_upgrade_mode = false verify_https_client = true verify_server_hostname = true } ```Each Nomad also gets a consul token (through the CONSUL_HTTP_TOKEN var) with the policy
In the packet capture, the nomad agent had consul token
be476ba7-6473-e734-6f95-53f672d69e37
Expected Result
Allocations can get their consul token and start
Actual Result
Allocation fails
Job file (if appropriate)
Any job fails 100% of the time. Here are the logs of the consul leader while trying to start a simple allocation
server_consul.txt
And here are the logs of the agent (nomad + consul logs) agent_consul_nomad.txt
Here's a packet capture on lo tcp port 8500 on the node trying to start the allocation (so the nomad agent -> consul agent traffic) nomad_consul.pcap.zip
Additional info