hashicorp / nomad

Nomad is an easy-to-use, flexible, and performant workload orchestrator that can deploy a mix of microservice, batch, containerized, and non-containerized applications. Nomad is easy to operate and scale and has native Consul and Vault integrations.
https://www.nomadproject.io/

restarting server and client results in failing Consul health checks #16453

Closed. suikast42 closed this issue 1 year ago.

suikast42 commented 1 year ago

Nomad version

Nomad v1.5.1
BuildDate 2023-03-10T22:05:57Z
Revision 6c118ddaf95e9bfd3d85f488f255fdc5e14129c7

Consul version

Consul v1.15.1
Revision 7c04b6a0
Build Date 2023-03-07T20:35:33Z

Operating system and Environment details

Ubuntu 22.04 in VMware Workstation; one master node and one worker node.

Issue

If I deploy the job below for the first time, everything works fine. If I reboot only the worker machine, it also works fine.

But if I (re)boot both the master (server) and the worker, the Nomad UI shows a healthy deployment while Consul shows a failing health check.

A restart of the worker deploys the minio job as expected again.

Actual Result

[screenshot]

[screenshot]

Job file (if appropriate)

job "minio" {
  datacenters = ["nomadder1"]
  type = "service"
  reschedule {
    delay          = "10s"
    delay_function = "constant"
    unlimited      = true
  }
  update {
      max_parallel      = 1
      health_check      = "checks"
      # Alloc is marked as unhealthy after this time
      healthy_deadline  = "2m"
      min_healthy_time  = "10s"
  }
  group "minio" {

    count = 1
    volume "stack_core_minio_volume" {
      type      = "host"
      source    = "core_minio_volume"
      read_only = false
    }
   volume "ca_cert" {
      type      = "host"
      source    = "ca_cert"
      read_only = true
    }
    restart {
      attempts = 10
      interval = "2m"
      delay = "5s"
      mode = "fail"
    }
    network {
      mode = "bridge"
      port "http" {
        to = 9000
      }
      port "console" {
        to = 9001
      }
    }

    task "minio" {
      volume_mount {
         volume      = "stack_core_minio_volume"
         destination = "/data"
      }
      volume_mount {
        volume      = "ca_cert"
        # the MinIO server looks for CA certificates in the CAs subfolder of the certs directory.
        # Do not change the subfolder name CAs.
        destination = "/certs/CAs"
      }
      driver = "docker"

      config {
        image = "registry.cloud.private/minio/minio:RELEASE.2023-03-09T23-16-13Z"
        command = "server"
        args = [
          "/data",
          "--console-address",
          ":9001",
          "--certs-dir",
          "/certs"
          ]
         ports = ["http","console"]
      }

     env {
        HOSTNAME = "${NOMAD_ALLOC_NAME}"
        MINIO_SERVER_URL = "https://minio.cloud.private"
        MINIO_PROMETHEUS_AUTH_TYPE = "public"
        MINIO_PROMETHEUS_URL     = "http://mimir.service.consul:9009/prometheus"
        MINIO_PROMETHEUS_JOB_ID  = "integrations/minio"

      }
    template {
       destination = "${NOMAD_SECRETS_DIR}/env.vars"
       env         = true
       change_mode = "restart"
       data        = <<EOF
       {{- with nomadVar "nomad/jobs/minio" -}}
          MINIO_ROOT_USER      = {{.minio_root_user}}
          MINIO_ROOT_PASSWORD  =  {{.minio_root_password}}
        {{- end -}}
       EOF
    }

      resources {
        cpu = 500
        memory = 512
        memory_max = 4096
      }

      service {
        port = "http"
        name = "minio"
        tags = [
         "prometheus_minio",
         "frontend",
         "minio",
         "prometheus:server=${NOMAD_ALLOC_NAME}",
         "prometheus:version=RELEASE.2023-03-09T23-16-13Z",
         "traefik.enable=true",
         "traefik.consulcatalog.connect=false",
         "traefik.http.routers.minio.tls=true",
         "traefik.http.routers.minio.rule=Host(`minio.cloud.private`)",
        ]

      check {
        name      = "minio-live"
        type     = "http"
        path     = "/minio/health/live"
        port     = "http"
        interval = "10s"
        timeout  = "2s"
      }
      check {
          name      = "minio-ready"
          type     = "http"
          path     = "/minio/health/ready"
          port     = "http"
          interval = "10s"
          timeout  = "4s"
          check_restart {
            limit = 3
            grace = "60s"
            ignore_warnings = false
          }
       }
     }
      service {
           port = "console"
           name = "minio-console"
           tags = [
             "console",
             "minio",
             "traefik.enable=true",
             "traefik.consulcatalog.connect=false",
             "traefik.http.routers.minio-console.tls=true",
             "traefik.http.routers.minio-console.rule=Host(`minio.console.cloud.private`)",
           ]
           check {
             type     = "http"
             path     = "/"
             port     = "console"
             interval = "30s"
             timeout  = "2s"
           }
         }
      }
  }
}

Nomad Server logs (if appropriate)

hod=GET path=/v1/job/minio/allocations?index=2651 duration=8.98017897s
Mar 13 17:15:35 master-01 nomad[862]: 2023-03-13T17:15:35.724Z [DEBUG] http: request complete: method=GET path=/v1/client/allocation/8306cc25-592b-9230-ee60-5d973045bac2/stats duration=2.135934ms
Mar 13 17:15:36 master-01 nomad[862]: 2023-03-13T17:15:36.361Z [DEBUG] http: request complete: method=GET path=/v1/job/minio/allocations?index=2654 duration=646.380027ms
Mar 13 17:15:36 master-01 nomad[862]: 2023-03-13T17:15:36.361Z [DEBUG] http: request complete: method=GET path=/v1/job/minio/summary?index=2651 duration=2.973866356s
Mar 13 17:15:36 master-01 nomad[862]: 2023-03-13T17:15:36.361Z [DEBUG] http: request complete: method=GET path=/v1/job/minio/summary?index=2651 duration=9.604144736s
Mar 13 17:15:39 master-01 nomad[862]: 2023-03-13T17:15:39.584Z [DEBUG] nomad: memberlist: Stream connection from=127.0.0.1:56384
Mar 13 17:15:39 master-01 nomad[862]: 2023-03-13T17:15:39.677Z [DEBUG] http: request complete: method=GET path=/v1/client/allocation/8306cc25-592b-9230-ee60-5d973045bac2/stats duration=1.050135ms
Mar 13 17:15:41 master-01 nomad[862]: 2023-03-13T17:15:41.985Z [DEBUG] http: request complete: method=GET path=/v1/metrics?format=prometheus duration=2.096954ms
Mar 13 17:15:43 master-01 nomad[862]: 2023-03-13T17:15:43.676Z [DEBUG] http: request complete: method=GET path=/v1/client/allocation/8306cc25-592b-9230-ee60-5d973045bac2/stats duration=1.192653ms
Mar 13 17:15:47 master-01 nomad[862]: 2023-03-13T17:15:47.678Z [DEBUG] http: request complete: method=GET path=/v1/client/allocation/8306cc25-592b-9230-ee60-5d973045bac2/stats duration=1.810706ms
Mar 13 17:15:49 master-01 nomad[862]: 2023-03-13T17:15:49.585Z [DEBUG] nomad: memberlist: Stream connection from=127.0.0.1:52056
Mar 13 17:15:51 master-01 nomad[862]: 2023-03-13T17:15:51.674Z [DEBUG] http: request complete: method=GET path=/v1/client/allocation/8306cc25-592b-9230-ee60-5d973045bac2/stats duration=1.126754ms
Mar 13 17:15:51 master-01 nomad[862]: 2023-03-13T17:15:51.985Z [DEBUG] http: request complete: method=GET path=/v1/metrics?format=prometheus duration=1.35715ms
Mar 13 17:15:55 master-01 nomad[862]: 2023-03-13T17:15:55.675Z [DEBUG] http: request complete: method=GET path=/v1/client/allocation/8306cc25-592b-9230-ee60-5d973045bac2/stats duration=1.214397ms
Mar 13 17:15:59 master-01 nomad[862]: 2023-03-13T17:15:59.588Z [DEBUG] nomad: memberlist: Stream connection from=127.0.0.1:52266
Mar 13 17:15:59 master-01 nomad[862]: 2023-03-13T17:15:59.674Z [DEBUG] http: request complete: method=GET path=/v1/client/allocation/8306cc25-592b-9230-ee60-5d973045bac2/stats duration=1.349902ms
Mar 13 17:16:01 master-01 nomad[862]: 2023-03-13T17:16:01.985Z [DEBUG] http: request complete: method=GET path=/v1/metrics?format=prometheus duration=1.264898ms
Mar 13 17:16:03 master-01 nomad[862]: 2023-03-13T17:16:03.675Z [DEBUG] http: request complete: method=GET path=/v1/client/allocation/8306cc25-592b-9230-ee60-5d973045bac2/stats duration=1.765445ms
Mar 13 17:16:07 master-01 nomad[862]: 2023-03-13T17:16:07.674Z [DEBUG] http: request complete: method=GET path=/v1/client/allocation/8306cc25-592b-9230-ee60-5d973045bac2/stats duration=1.560067ms
Mar 13 17:16:09 master-01 nomad[862]: 2023-03-13T17:16:09.590Z [DEBUG] nomad: memberlist: Stream connection from=127.0.0.1:35512
Mar 13 17:16:11 master-01 nomad[862]: 2023-03-13T17:16:11.675Z [DEBUG] http: request complete: method=GET path=/v1/client/allocation/8306cc25-592b-9230-ee60-5d973045bac2/stats duration=1.72123ms
Mar 13 17:16:11 master-01 nomad[862]: 2023-03-13T17:16:11.990Z [DEBUG] http: request complete: method=GET path=/v1/metrics?format=prometheus duration=4.115019ms
Mar 13 17:16:15 master-01 nomad[862]: 2023-03-13T17:16:15.675Z [DEBUG] http: request complete: method=GET path=/v1/client/allocation/8306cc25-592b-9230-ee60-5d973045bac2/stats duration=1.512785ms

Nomad Client logs (if appropriate)

Mar 13 17:15:42 worker-01 nomad[927]: 2023-03-13T17:15:42.196Z [DEBUG] client.driver_mgr.docker.docker_logger: starting plugin: driver=docker path=/usr/local/bin/nomad args=["/usr/local/bin/nomad", "docker_logger"]
Mar 13 17:15:42 worker-01 nomad[927]: 2023-03-13T17:15:42.196Z [DEBUG] client.driver_mgr.docker.docker_logger: plugin started: driver=docker path=/usr/local/bin/nomad pid=15936
Mar 13 17:15:42 worker-01 nomad[927]: 2023-03-13T17:15:42.196Z [DEBUG] client.driver_mgr.docker.docker_logger: waiting for RPC address: driver=docker path=/usr/local/bin/nomad
Mar 13 17:15:42 worker-01 nomad[927]: 2023-03-13T17:15:42.221Z [DEBUG] client.driver_mgr.docker.docker_logger: using plugin: driver=docker version=2
Mar 13 17:15:42 worker-01 nomad[927]: 2023-03-13T17:15:42.221Z [DEBUG] client.driver_mgr.docker.docker_logger.nomad: plugin address: driver=docker address=/tmp/plugin4213717586 network=unix @module=docker_logger timestamp=2023-03-13T17:15:42.221Z
Mar 13 17:15:42 worker-01 nomad[927]: 2023-03-13T17:15:42.222Z [DEBUG] client.driver_mgr.docker.docker_logger.nomad: using client connection initialized from environment: driver=docker @module=docker_logger timestamp=2023-03-13T17:15:42.222Z
Mar 13 17:15:42 worker-01 nomad[927]: 2023-03-13T17:15:42.223Z [INFO] client.alloc_runner.task_runner: Task event: alloc_id=8306cc25-592b-9230-ee60-5d973045bac2 task=minio type=Started msg="Task started by client" failed=false
Mar 13 17:15:42 worker-01 nomad[927]: 2023-03-13T17:15:42.240Z [DEBUG] consul.sync: sync complete: registered_services=2 deregistered_services=0 registered_checks=0 deregistered_checks=0
Mar 13 17:15:42 worker-01 nomad[927]: 2023-03-13T17:15:42.422Z [DEBUG] client: updated allocations: index=2656 total=24 pulled=3 filtered=21
Mar 13 17:15:42 worker-01 nomad[927]: 2023-03-13T17:15:42.422Z [DEBUG] client: allocation updates: added=0 removed=0 updated=3 ignored=21
Mar 13 17:15:42 worker-01 nomad[927]: 2023-03-13T17:15:42.425Z [DEBUG] client: allocation updates applied: added=0 removed=0 updated=3 ignored=21 errors=0
Mar 13 17:15:46 worker-01 nomad[927]: 2023-03-13T17:15:46.507Z [DEBUG] http: request complete: method=GET path=/v1/metrics?format=prometheus duration=1.675172ms
Mar 13 17:15:56 worker-01 nomad[927]: 2023-03-13T17:15:56.504Z [DEBUG] http: request complete: method=GET path=/v1/metrics?format=prometheus duration=1.059369ms
Mar 13 17:16:06 worker-01 nomad[927]: 2023-03-13T17:16:06.311Z [DEBUG] http: request complete: method=GET path="/v1/var/nomad/jobs/minio?index=1886&namespace=default&stale=&wait=60000ms" duration=1m0.84551494s
Mar 13 17:16:06 worker-01 nomad[927]: 2023-03-13T17:16:06.505Z [DEBUG] http: request complete: method=GET path=/v1/metrics?format=prometheus duration=1.112804ms
Mar 13 17:16:16 worker-01 nomad[927]: 2023-03-13T17:16:16.505Z [DEBUG] http: request complete: method=GET path=/v1/metrics?format=prometheus duration="988.982µs"
Mar 13 17:16:16 worker-01 nomad[927]: 2023-03-13T17:16:16.723Z [DEBUG] http: request complete: method=GET path="/v1/var/nomad/jobs/security?index=1885&namespace=default&stale=&wait=60000ms" duration=1m3.404017411s
Mar 13 17:16:19 worker-01 nomad[927]: 2023-03-13T17:16:19.944Z [DEBUG] http: request complete: method=GET path="/v1/var/nomad/jobs/security?index=1885&namespace=default&stale=&wait=60000ms" duration=1m0.406644411s
Mar 13 17:16:26 worker-01 nomad[927]: 2023-03-13T17:16:26.508Z [DEBUG] http: request complete: method=GET path=/v1/metrics?format=prometheus duration=1.542904ms
Mar 13 17:16:31 worker-01 nomad[927]: 2023-03-13T17:16:31.131Z [DEBUG] http: request complete: method=GET path="/v1/var/nomad/jobs/security?index=1885&namespace=default&stale=&wait=60000ms" duration=1m0.295490132s
Mar 13 17:16:36 worker-01 nomad[927]: 2023-03-13T17:16:36.506Z [DEBUG] http: request complete: method=GET path=/v1/metrics?format=prometheus duration=1.148015ms
Mar 13 17:16:42 worker-01 nomad[927]: 2023-03-13T17:16:42.711Z [DEBUG] watch.checks: check became unhealthy. Will restart if check doesn't become healthy: alloc_id=8306cc25-592b-9230-ee60-5d973045bac2 check=minio-ready task=minio time_limit=20s
Mar 13 17:16:46 worker-01 nomad[927]: 2023-03-13T17:16:46.506Z [DEBUG] http: request complete: method=GET path=/v1/metrics?format=prometheus duration=1.506482ms

suikast42 commented 1 year ago

That's strange.

If I shut down both the master and the worker and then boot only the master, Nomad shows me the allocations as running.

[screenshot]

tgross commented 1 year ago

Hi @suikast42! Here's the relevant bit from your logs:

Mar 13 17:16:42 worker-01 nomad[927]: 2023-03-13T17:16:42.711Z [DEBUG] watch.checks: check became unhealthy. Will restart if check doesn't become healthy: alloc_id=8306cc25-592b-9230-ee60-5d973045bac2 check=minio-ready task=minio time_limit=20s

The first thing I'd look at in this case is whether the health check should be passing. Just because the client's handle to the allocation has been restored doesn't mean that it's actually able to serve the health check. You should check:

It's also a little unclear whether you've restarted the client agent or rebooted the entire host the client is on. If you reboot the host, the allocation will not be restored when the Nomad client starts, and the client will have to get a rescheduled allocation from the server. That might explain why restarting the server caused the problem.
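
As an aside, if the readiness check is failing only because MinIO needs longer to come up after a full host reboot, a longer grace on the check_restart block is one thing to experiment with. This is only a sketch; the 120s value is arbitrary and not something established in this thread:

      check {
        name     = "minio-ready"
        type     = "http"
        path     = "/minio/health/ready"
        port     = "http"
        interval = "10s"
        timeout  = "4s"

        check_restart {
          limit = 3
          # Arbitrary example value: wait longer after the task starts before
          # failing checks count toward a restart.
          grace           = "120s"
          ignore_warnings = false
        }
      }

Note that grace only delays when failing checks start counting toward a task restart; it does not change how Consul itself reports the check.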

If I shut down both the master and the worker and then boot only the master, Nomad shows me the allocations as running

The server will wait until the client misses a heartbeat before marking the client as down, and only then will the allocation be marked lost. How long this takes varies with the configuration and size of the cluster. See the Client Heartbeats documentation.
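
For reference, the server-side knobs behind that behaviour live in the server stanza of the agent configuration. The following is only an illustrative sketch (the values shown are defaults, not a recommendation for this cluster):

server {
  enabled = true

  # Extra grace on top of the heartbeat TTL before a client is marked down.
  heartbeat_grace = "10s"

  # Lower bound for the heartbeat TTL the server hands out to clients.
  min_heartbeat_ttl = "10s"

  # Cap on how many client heartbeats per second the server schedules.
  max_heartbeats_per_second = 50.0
}

Until that window expires, a rebooted or unreachable client's allocations keep their last reported status, which is why the UI can still show them as running.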

suikast42 commented 1 year ago

I redefined the reschedule, restart, and update configs as follows and no longer see the described issue (the changed blocks are summarized after the job file).

job "minio" {
  datacenters = ["nomadder1"]
  type = "service"
   reschedule {
    delay          = "10s"
    delay_function = "constant"
    unlimited      = true
  }
  update {
      max_parallel      = 1
      health_check      = "checks"
      # Alloc is marked as unhealthy after this time
      healthy_deadline  = "5m"
      auto_revert  = true
      # Mark the alloc as healthy after 10s of passing checks
      min_healthy_time  = "10s"
      # Fail the deployment if the alloc is not healthy within 1h
      progress_deadline = "1h"
  }
  group "minio" {

    count = 1
    volume "stack_core_minio_volume" {
      type      = "host"
      source    = "core_minio_volume"
      read_only = false
    }
   volume "ca_cert" {
      type      = "host"
      source    = "ca_cert"
      read_only = true
    }
    restart {
      attempts = 1
      interval = "1h"
      delay = "5s"
      mode = "fail"
    }
    network {
      mode = "bridge"
      port "http" {
        to = 9000
      }
      port "console" {
        to = 9001
      }
    }

    task "minio" {
      volume_mount {
         volume      = "stack_core_minio_volume"
         destination = "/data"
      }
      volume_mount {
        volume      = "ca_cert"
        # the MinIO server looks for CA certificates in the CAs subfolder of the certs directory.
        # Do not change the subfolder name CAs.
        destination = "/certs/CAs"
      }
      driver = "docker"

      config {
        image = "registry.cloud.private/minio/minio:RELEASE.2023-03-09T23-16-13Z"
        command = "server"
        args = [
          "/data",
          "--console-address",
          ":9001",
          "--certs-dir",
          "/certs"
          ]
         ports = ["http","console"]
      }

     env {
        HOSTNAME = "${NOMAD_ALLOC_NAME}"
        MINIO_SERVER_URL = "https://minio.cloud.private"
        #MINIO_IDENTITY_OPENID_CONFIG_URL="https://security.cloud.private/realms/nomadder/.well-known/openid-configuration"
        MINIO_PROMETHEUS_AUTH_TYPE = "public"
        MINIO_PROMETHEUS_URL     = "http://mimir.service.consul:9009/prometheus"
        MINIO_PROMETHEUS_JOB_ID  = "integrations/minio"
      }
    template {
       destination = "${NOMAD_SECRETS_DIR}/env.vars"
       env         = true
       change_mode = "restart"
       data        = <<EOF
       {{- with nomadVar "nomad/jobs/minio" -}}
          MINIO_ROOT_USER      = {{.minio_root_user}}
          MINIO_ROOT_PASSWORD  =  {{.minio_root_password}}
        {{- end -}}
       EOF
    }

      resources {
        cpu = 500
        memory = 512
        memory_max = 4096
      }

      service {
        port = "http"
        name = "minio"
        tags = [
         "prometheus_minio",
         "frontend",
         "minio",
         "prometheus:server=${NOMAD_ALLOC_NAME}",
         "prometheus:version=RELEASE.2023-03-09T23-16-13Z",
         "traefik.enable=true",
         "traefik.consulcatalog.connect=false",
         "traefik.http.routers.minio.tls=true",
         "traefik.http.routers.minio.rule=Host(`minio.cloud.private`)",
        ]

      check {
        name      = "minio-live"
        type     = "http"
        path     = "/minio/health/live"
        port     = "http"
        interval = "10s"
        timeout  = "2s"
      }
      check {
          name      = "minio-ready"
          type     = "http"
          path     = "/minio/health/ready"
          port     = "http"
          interval = "10s"
          timeout  = "2s"
          check_restart {
            limit = 3
            grace = "60s"
            ignore_warnings = false
          }
       }
     }
      service {
           port = "console"
           name = "minio-console"
           tags = [
             "console",
             "minio",
             "traefik.enable=true",
             "traefik.consulcatalog.connect=false",
             "traefik.http.routers.minio-console.tls=true",
             "traefik.http.routers.minio-console.rule=Host(`minio.console.cloud.private`)",
           ]
           check {
             type     = "http"
             path     = "/"
             port     = "console"
             interval = "10s"
             timeout  = "2s"
             check_restart {
               limit = 3
               grace = "60s"
               ignore_warnings = false
             }
           }
         }
      }
  }
}
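
For anyone comparing against the job file in the original report, the substantive changes appear to be confined to the update and restart blocks (copied from the job above), plus a check_restart block and a 10s interval on the minio-console check:

  update {
      max_parallel      = 1
      health_check      = "checks"
      healthy_deadline  = "5m"
      auto_revert       = true
      min_healthy_time  = "10s"
      progress_deadline = "1h"
  }

  restart {
    attempts = 1
    interval = "1h"
    delay    = "5s"
    mode     = "fail"
  }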