hashicorp / nomad

Nomad is an easy-to-use, flexible, and performant workload orchestrator that can deploy a mix of microservice, batch, containerized, and non-containerized applications. Nomad is easy to operate and scale and has native Consul and Vault integrations.
https://www.nomadproject.io/

raw_exec driver: "unable to configure cgroups: no such file or directory" #23595

Closed djthorpe closed 3 months ago

djthorpe commented 4 months ago

Nomad version

1.8.1

Operating system and Environment details

Debian Linux on ARM

Issue

"unable to configure cgroups: no such file or directory" for a task with a raw_exec driver

Reproduction steps

Here is the task:

 task "init" {
      driver = "raw_exec"

      lifecycle {
        sidecar = false
        hook    = "prestart"
      }

      config {
        // When var.data is empty this is a no-op echo; otherwise install
        // creates the data directory owned by UID 472 (the Grafana image user)
        command = var.data == "" ? "/usr/bin/echo" : "/usr/bin/install"
        args = compact([
          "-d", var.data,
          "-o", "472"
        ])
      }
    }
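When var.data is set, the config above effectively expands to the following command (a sketch: /opt/grafana/data stands in for the real value of var.data, and 472 is the UID the Grafana image runs as):

# create the data directory and hand ownership to the container user
/usr/bin/install -d /opt/grafana/data -o 472

When var.data is empty, compact() drops the empty string, so the task just echoes the remaining flags and is a harmless no-op.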

The full message I am receiving is:

Jul 15, '24 13:00:39 +0200 | Driver Failure | failed to launch command with executor: rpc error: code = Unknown desc = unable to configure cgroups: no such file or directory

Expected Result

The init task ends successfully

Actual Result

The init task fails with the driver error above, which prevents the main task from starting

Potentially related issue: https://github.com/hashicorp/nomad/issues/23250

tgross commented 3 months ago

Hi @djthorpe! A couple of questions:

djthorpe commented 3 months ago

Are you running the Nomad client as root? Yes - attached is a sample client config file

Are you running Nomad in a custom cgroup? No - attached is the systemd service file

Can you provide the logs for the executor process? Yes - attached!

Also attached is the nomad job file for grafana.

datacenter = "10967"
region = "de"
data_dir = "/opt/hashicorp/var/nomad"
bind_addr = "0.0.0.0"
name = "cm3"

advertise  {
  http = "100.99.168.77"
  rpc = "100.99.168.77"
  serf = "100.99.168.77"
}

plugin "raw_exec" {
  config {
    enabled = true
  }
}

plugin "docker" {
  config {
    allow_privileged      = true
    pull_activity_timeout = "10m"
    volumes {
      enabled = true
    }
  }
}

client {
  enabled = true
  servers = [ "cm1.tailnet-db1f.ts.net" ]
  options = {
    "driver.denylist" = "java, exec, qemu"
  }
}

acl {
  enabled = true
}

ui {
  enabled = false
}

[Unit]
Description=Nomad
Documentation=https://www.nomadproject.io/docs/
Wants=network-online.target
After=network-online.target

[Service]
ExecReload=/bin/kill -HUP $MAINPID
ExecStart=/opt/hashicorp/bin/nomad agent -config /opt/hashicorp/etc/nomad
KillMode=process
KillSignal=SIGINT
LimitNOFILE=65536
LimitNPROC=infinity
Restart=on-failure
RestartSec=2
TasksMax=infinity
OOMScoreAdjust=-1000

[Install]
WantedBy=multi-user.target
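For what it's worth, the unit file above sets no Slice= or Delegate= override, so the agent should land in the default system.slice. That can be confirmed against the running agent (assuming the unit is installed as nomad.service):

# show the cgroup systemd placed the Nomad agent in
systemctl status nomad.service | grep -i cgroup

# or read it straight from the main process
cat /proc/$(systemctl show -p MainPID --value nomad.service)/cgroup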

// grafana dashboard software
// Docker Image: grafana/grafana

///////////////////////////////////////////////////////////////////////////////
// VARIABLES

variable "dc" {
  description = "data centers that the job is eligible to run in"
  type        = list(string)
}

variable "namespace" {
  description = "namespace that the job runs in"
  type        = string
  default     = "default"
}

variable "hosts" {
  description = "host constraint for the job, defaults to one host"
  type        = list(string)
  default     = []
}

variable "service_provider" {
  description = "Service provider, either consul or nomad"
  type        = string
  default     = "nomad"
}

variable "service_name" {
  description = "Service name"
  type        = string
  default     = "grafana-http"
}

variable "service_dns" {
  description = "Service discovery DNS"
  type        = list(string)
  default     = []
}

variable "service_type" {
  description = "Run as a service or system"
  type        = string
  default     = "service"
}

variable "docker_image" {
  description = "Docker image"
  type        = string
}

variable "docker_always_pull" {
  description = "Pull docker image on every job restart"
  type        = bool
  default     = false
}

///////////////////////////////////////////////////////////////////////////////

variable "port" {
  description = "Port for plaintext connections"
  type        = number
  default     = 3000
}

variable "data" {
  description = "Data persistence directory"
  type        = string
  default     = ""
}

variable "admin_user" {
  description = "Name for 'admin' user (optional)"
  type        = string
  default     = "admin"
}

variable "admin_password" {
  description = "Password for 'admin' user (required)"
  type        = string
}

variable "admin_email" {
  description = "Email for 'admin' user"
  type        = string
  default     = ""
}

variable "anonymous_enabled" {
  description = "Allow anonymous access"
  type        = bool
  default     = false
}

variable "anonymous_org" {
  description = "Organization name that should be used for unauthenticated users"
  type        = string
  default     = ""
}

variable "anonymous_role" {
  description = "Role for unauthenticated users"
  type        = string
  default     = "Viewer"
}

variable "database" {
  description = "Database connection parameters"
  type        = object({ type = string, host = string, port = number, name = string, user = string, password = string, ssl_mode = string })
  default     = { type : "", host : "", port : 0, name : "", user : "", password : "", ssl_mode : "" }
}

variable "url" {
  description = "URL used for serving the application"
  type        = string
  default     = ""
}

///////////////////////////////////////////////////////////////////////////////
// LOCALS

locals {
  logs_path         = "${NOMAD_ALLOC_DIR}/logs"
  db_path           = var.data == "" ? "${NOMAD_ALLOC_DIR}/data/db" : "/var/lib/grafana/data"
  plugins_path      = var.data == "" ? "${NOMAD_ALLOC_DIR}/data/plugins" : "/var/lib/grafana/plugins"
  provisioning_path = var.data == "" ? "${NOMAD_ALLOC_DIR}/data/provisioning" : "/var/lib/grafana/provisioning"
  db_host           = var.database.host == "" ? "" : format("%s:%d", var.database.host, var.database.port == 0 ? 5432 : var.database.port)
}

///////////////////////////////////////////////////////////////////////////////
// JOB

job "grafana" {
  type        = var.service_type
  datacenters = var.dc
  namespace   = var.namespace

  update {
    min_healthy_time = "10s"
    healthy_deadline = "5m"
    health_check     = "task_states"
  }

  /////////////////////////////////////////////////////////////////////////////////

  group "grafana" {
    count = (length(var.hosts) == 0 || var.service_type == "system") ? 1 : length(var.hosts)

    dynamic "constraint" {
      for_each = length(var.hosts) == 0 ? [] : [join(",", var.hosts)]
      content {
        attribute = "${node.unique.name}"
        operator  = "set_contains_any"
        value     = constraint.value
      }
    }

    network {
      port "http" {
        static = var.port
        to     = 3000
      }
    }

    service {
      tags     = ["grafana", "http"]
      name     = "grafana-http"
      port     = "http"
      provider = var.service_provider
    }

    ephemeral_disk {
      migrate = true
    }

    task "init" {
      driver = "raw_exec"

      lifecycle {
        sidecar = false
        hook    = "prestart"
      }

      config {
        // Set permissions on the directory
        command = var.data == "" ? "/usr/bin/echo" : "/usr/bin/install"
        args = compact([
          "-d", var.data,
          "-o", "472"
        ])
      }
    } // task "init"

    task "daemon" {
      driver = "docker"

      env {
        GF_PATHS_LOGS              = local.logs_path
        GF_PATHS_DATA              = local.db_path
        GF_PATHS_PLUGINS           = local.plugins_path
        GF_PATHS_PROVISIONING      = local.provisioning_path
        GF_SECURITY_ADMIN_USER     = var.admin_user
        GF_SECURITY_ADMIN_PASSWORD = var.admin_password
        GF_SECURITY_ADMIN_EMAIL    = var.admin_email
        GF_AUTH_ANONYMOUS_ENABLED  = var.anonymous_enabled
        GF_AUTH_ANONYMOUS_ORG_NAME = var.anonymous_org
        GF_AUTH_ANONYMOUS_ORG_ROLE = var.anonymous_role
        GF_AUTH_HIDE_VERSION       = true
        GF_DATABASE_TYPE           = var.database.type
        GF_DATABASE_HOST           = local.db_host
        GF_DATABASE_NAME           = var.database.name
        GF_DATABASE_USER           = var.database.user
        GF_DATABASE_PASSWORD       = var.database.password
        GF_DATABASE_SSL_MODE       = var.database.ssl_mode
        GF_SERVER_ROOT_URL         = var.url
      }

      config {
        image       = var.docker_image
        force_pull  = var.docker_always_pull
        ports       = ["http"]
        dns_servers = var.service_dns
        volumes = compact([
          var.data == "" ? "" : format("%s:/var/lib/grafana", var.data)
        ])
      }

    } // task "daemon"
  }   // group "grafana"
}     // job "grafana"
{"@level":"debug","@message":"plugin address","@timestamp":"2024-08-07T10:41:11.126903+02:00","address":"/tmp/plugin813365042","network":"unix"}
{"@level":"error","@message":"failed to configure container, process isolation will not work","@module":"executor","@timestamp":"2024-08-07T10:41:11.132376+02:00","error":"no such file or directory"}
tgross commented 3 months ago

Hi @djthorpe! I've had a look through the relevant sections of the code, and as far as I can tell your host may be missing some of the cgroups that Nomad needs to properly isolate processes and shut down groups of raw_exec processes cleanly.

Depending on whether you're on cgroups v1 or v2, you might be missing the pid or freezer cgroup. Take a look at the cgroup controller requirements docs, which cover both how to diagnose what you might be missing and how to fix it.
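As a quick first check (a sketch; paths assume the standard /sys/fs/cgroup mount), the available controllers can be listed directly:

# cgroups v2: controllers available at the root, and those enabled for children
cat /sys/fs/cgroup/cgroup.controllers
cat /sys/fs/cgroup/cgroup.subtree_control

# cgroups v1: confirm the freezer and pids hierarchies are mounted
ls /sys/fs/cgroup | grep -E 'freezer|pids'

If a v2 controller shows up in cgroup.controllers but not in cgroup.subtree_control, it can usually be enabled as root with echo "+pids" > /sys/fs/cgroup/cgroup.subtree_control.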

djthorpe commented 3 months ago

Hi @tgross, I appreciate you looking into this. I'll take a look at the docs, but just to note: I believe the regression occurred when upgrading from (I think) the Nomad 1.6 series to 1.7 on various nodes. Happy for you to close this issue.

tgross commented 3 months ago

Ok, I'm going to close this out. If you find that you do have all the relevant cgroup controllers, we'll be happy to re-open.