Closed djthorpe closed 3 months ago
Hi @djthorpe! A couple of questions:
Are you running the Nomad client as root? Yes - attached is a sample client config file
Are you running Nomad in a custom cgroup? No - attached is the systemd service file
Can you provide the logs for the executor process? Yes - attached!
Also attached is the nomad job file for grafana.
# Nomad agent configuration for the client node "cm3".

# Agent identity and placement
name       = "cm3"
region     = "de"
datacenter = "10967"

# Local state directory and listen address
data_dir  = "/opt/hashicorp/var/nomad"
bind_addr = "0.0.0.0"

# Addresses advertised for each of the agent's endpoints
advertise {
  http = "100.99.168.77"
  rpc  = "100.99.168.77"
  serf = "100.99.168.77"
}

# Run as a client, joining the listed server
client {
  enabled = true
  servers = ["cm1.tailnet-db1f.ts.net"]

  # Task drivers listed here are denied on this client
  options = {
    "driver.denylist" = "java, exec, qemu"
  }
}

# raw_exec task driver is explicitly enabled
plugin "raw_exec" {
  config {
    enabled = true
  }
}

# Docker task driver: privileged containers and host volume mounts allowed
plugin "docker" {
  config {
    allow_privileged      = true
    pull_activity_timeout = "10m"

    volumes {
      enabled = true
    }
  }
}

# ACLs are enforced
acl {
  enabled = true
}

# The web UI is turned off on this agent
ui {
  enabled = false
}
# systemd unit that runs the Nomad agent.

[Unit]
Description=Nomad
Documentation=https://www.nomadproject.io/docs/
# Start only once the network is reported online
Wants=network-online.target
After=network-online.target

[Service]
# Agent start-up; configuration is read from /opt/hashicorp/etc/nomad
ExecStart=/opt/hashicorp/bin/nomad agent -config /opt/hashicorp/etc/nomad
# Reload by sending SIGHUP to the main process
ExecReload=/bin/kill -HUP $MAINPID

# Stop behaviour: signal only the main process, using SIGINT
KillMode=process
KillSignal=SIGINT

# Restart policy
Restart=on-failure
RestartSec=2

# Resource limits
LimitNOFILE=65536
LimitNPROC=infinity
TasksMax=infinity

# Lowest possible OOM score adjustment for the agent process
OOMScoreAdjust=-1000

[Install]
WantedBy=multi-user.target
// grafana dashboard software
// Docker Image: grafana/grafana

///////////////////////////////////////////////////////////////////////////////
// VARIABLES
//
// Generic job placement, service registration and docker image settings.
// Variables without a default ("dc", "docker_image") must be supplied by the
// caller.

variable "dc" {
  description = "data centers that the job is eligible to run in"
  type        = list(string)
}

variable "namespace" {
  description = "namespace that the job runs in"
  type        = string
  default     = "default"
}

variable "hosts" {
  description = "host constraint for the job, defaults to one host"
  type        = list(string)
  default     = []
}

variable "service_provider" {
  description = "Service provider, either consul or nomad"
  type        = string
  default     = "nomad"
}

variable "service_name" {
  description = "Service name"
  type        = string
  default     = "grafana-http"
}

variable "service_dns" {
  description = "Service discovery DNS"
  type        = list(string)
  default     = []
}

variable "service_type" {
  description = "Run as a service or system"
  type        = string
  default     = "service"
}

variable "docker_image" {
  description = "Docker image"
  type        = string
}

variable "docker_always_pull" {
  description = "Pull docker image on every job restart"
  type        = bool
  default     = false
}
///////////////////////////////////////////////////////////////////////////////
// Grafana-specific settings. "admin_password" has no default and must be
// supplied by the caller.

variable "port" {
  description = "Port for plaintext connections"
  type        = number
  default     = 3000
}

variable "data" {
  description = "Data persistence directory"
  type        = string
  default     = ""
}

variable "admin_user" {
  description = "Name for 'admin' user (optional)"
  type        = string
  default     = "admin"
}

variable "admin_password" {
  description = "Password for 'admin' user (required)"
  type        = string
}

variable "admin_email" {
  description = "Email for 'admin' user"
  type        = string
  default     = ""
}

variable "anonymous_enabled" {
  description = "Allow anonymous access"
  type        = bool
  default     = false
}

variable "anonymous_org" {
  description = "Organization name that should be used for unauthenticated users"
  type        = string
  default     = ""
}

variable "anonymous_role" {
  description = "Role for unauthenticated users"
  type        = string
  default     = "Viewer"
}

// Empty strings and a zero port are the "unset" values for the database.
variable "database" {
  description = "Database connection parameters"
  type = object({
    type     = string
    host     = string
    port     = number
    name     = string
    user     = string
    password = string
    ssl_mode = string
  })
  default = {
    type     = ""
    host     = ""
    port     = 0
    name     = ""
    user     = ""
    password = ""
    ssl_mode = ""
  }
}

variable "url" {
  description = "URL used for serving the application"
  type        = string
  default     = ""
}
///////////////////////////////////////////////////////////////////////////////
// LOCALS
//
// Paths handed to Grafana. When a persistence directory (var.data) is given,
// the fixed paths under /var/lib/grafana are used; otherwise everything lives
// under the allocation directory.

locals {
  logs_path = "${NOMAD_ALLOC_DIR}/logs"

  db_path           = var.data != "" ? "/var/lib/grafana/data" : "${NOMAD_ALLOC_DIR}/data/db"
  plugins_path      = var.data != "" ? "/var/lib/grafana/plugins" : "${NOMAD_ALLOC_DIR}/data/plugins"
  provisioning_path = var.data != "" ? "/var/lib/grafana/provisioning" : "${NOMAD_ALLOC_DIR}/data/provisioning"

  // "host:port" for the database, or empty when no host is configured.
  // A port of 0 falls back to 5432.
  db_host = var.database.host != "" ? format("%s:%d", var.database.host, var.database.port != 0 ? var.database.port : 5432) : ""
}
///////////////////////////////////////////////////////////////////////////////
// JOB
//
// Runs Grafana as a docker task, preceded by a raw_exec prestart task that
// prepares the persistence directory on the host.

job "grafana" {
  type        = var.service_type
  datacenters = var.dc
  namespace   = var.namespace

  update {
    min_healthy_time = "10s"
    healthy_deadline = "5m"
    health_check     = "task_states"
  }

  /////////////////////////////////////////////////////////////////////////////////

  group "grafana" {
    // One instance per listed host; a single instance when no hosts are given
    // or when the job runs with the "system" type.
    count = (length(var.hosts) == 0 || var.service_type == "system") ? 1 : length(var.hosts)

    // Only emitted when hosts are given: restrict placement to those node names.
    dynamic "constraint" {
      for_each = length(var.hosts) == 0 ? [] : [join(",", var.hosts)]
      content {
        attribute = node.unique.name
        operator  = "set_contains_any"
        value     = constraint.value
      }
    }

    network {
      port "http" {
        static = var.port
        to     = 3000
      }
    }

    service {
      tags = ["grafana", "http"]
      // Registered under the configurable service name. The variable's default
      // is "grafana-http", the value that was previously hard-coded here.
      name     = var.service_name
      port     = "http"
      provider = var.service_provider
    }

    ephemeral_disk {
      migrate = true
    }

    // Prestart task (not a sidecar): runs before the "daemon" task.
    task "init" {
      driver = "raw_exec"

      lifecycle {
        sidecar = false
        hook    = "prestart"
      }

      config {
        // Set permissions on the directory: with a persistence directory,
        // `install -d <dir> -o 472` is run. Without one, the command is
        // /usr/bin/echo and compact() drops the empty directory argument.
        command = var.data == "" ? "/usr/bin/echo" : "/usr/bin/install"
        args = compact([
          "-d", var.data,
          "-o", "472"
        ])
      }
    } // task "init"

    task "daemon" {
      driver = "docker"

      // Grafana is configured through GF_* environment variables.
      env {
        GF_PATHS_LOGS              = local.logs_path
        GF_PATHS_DATA              = local.db_path
        GF_PATHS_PLUGINS           = local.plugins_path
        GF_PATHS_PROVISIONING      = local.provisioning_path
        GF_SECURITY_ADMIN_USER     = var.admin_user
        GF_SECURITY_ADMIN_PASSWORD = var.admin_password
        GF_SECURITY_ADMIN_EMAIL    = var.admin_email
        GF_AUTH_ANONYMOUS_ENABLED  = var.anonymous_enabled
        GF_AUTH_ANONYMOUS_ORG_NAME = var.anonymous_org
        GF_AUTH_ANONYMOUS_ORG_ROLE = var.anonymous_role
        GF_AUTH_HIDE_VERSION       = true
        GF_DATABASE_TYPE           = var.database.type
        GF_DATABASE_HOST           = local.db_host
        GF_DATABASE_NAME           = var.database.name
        GF_DATABASE_USER           = var.database.user
        GF_DATABASE_PASSWORD       = var.database.password
        GF_DATABASE_SSL_MODE       = var.database.ssl_mode
        GF_SERVER_ROOT_URL         = var.url
      }

      config {
        image       = var.docker_image
        force_pull  = var.docker_always_pull
        ports       = ["http"]
        dns_servers = var.service_dns
        // Mount the persistence directory only when one is configured;
        // compact() removes the empty entry otherwise.
        volumes = compact([
          var.data == "" ? "" : format("%s:/var/lib/grafana", var.data)
        ])
      }
    } // task "daemon"
  } // group "grafana"
} // job "grafana"
{"@level":"debug","@message":"plugin address","@timestamp":"2024-08-07T10:41:11.126903+02:00","address":"/tmp/plugin813365042","network":"unix"}
{"@level":"error","@message":"failed to configure container, process isolation will not work","@module":"executor","@timestamp":"2024-08-07T10:41:11.132376+02:00","error":"no such file or directory"}
Hi @djthorpe! I've had a look through the relevant sections of the code and as far as I can tell it looks like your host may be missing some of the cgroups that Nomad needs to properly isolate processes and shut down groups of raw_exec processes cleanly.
Depending on whether you're on cgroups v1 or v2, you might be missing the pid or freezer cgroup. Take a look at the cgroup controllers requirements docs, which cover both how to diagnose what you might be missing and how to fix it.
Hi @tgross I appreciate you looking into this. I'll take a look at your docs, but just to note I believe the regression occurred upgrading from (I think) Nomad 1.6 and 1.7 series on various nodes. Happy for you to close this issue.
Ok, I'm going to close this out. If you find that you do have all the relevant cgroup controllers, we'll be happy to re-open.
Nomad version
1.8.1
Operating system and Environment details
Debian Linux on ARM
Issue
"unable to configure cgroups: no such file or directory" for a task with a raw_exec driver
Reproduction steps
Here is the task:
The full message I am receiving is:
Expected Result
The init task ends successfully
Actual Result
The init task prevents the main task from starting
Potentially related issue: https://github.com/hashicorp/nomad/issues/23250