prometheus / node_exporter

Exporter for machine metrics
https://prometheus.io/
Apache License 2.0
10.93k stars 2.33k forks source link

runit collector reports invalid (outdated) service state if runsv (supervisor process) is not running #3070

Open powerman opened 2 months ago

powerman commented 2 months ago

What did you do that produced an error?

Check runit collector sources:

https://github.com/prometheus-community/go-runit/blob/master/runit/runit.go#L82-L99

What did you expect to see?

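It should open the FIFO file supervise/ok for writing before trying to read the supervise/status file. This implements the required check "is the runsv supervisor running?": a non-blocking, write-only open of a FIFO fails (with ENXIO) when no process has it open for reading, which is the case when runsv is not running. If runsv is not running, the contents of its status file are outdated and do not reflect the actual service state.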

This can be fixed by adding this code:

 func (s *service) Status() (*SvStatus, error) {
+   file, err := os.OpenFile(s.file("ok"), os.O_WRONLY|syscall.O_NONBLOCK, 0)
+   if err != nil {
+       return nil, err
+   }
+   _ = file.Close()
    status, err := s.status()
    if err != nil {
        return nil, err
    }

runit's sv tool does the same:

# strace sv status .
execve("/bin/sv", ["sv", "status", "."], 0x7ffcbff55670 /* 61 vars */) = 0
brk(NULL)                               = 0x5639352a8000
openat(AT_FDCWD, "/usr/lib64/libwcwidth-icons.so", O_RDONLY|O_CLOEXEC) = 4
read(4, "\177ELF\2\1\1\0\0\0\0\0\0\0\0\0\3\0>\0\1\0\0\0\0\0\0\0\0\0\0\0"..., 832) = 832
fstat(4, {st_mode=S_IFREG|0755, st_size=14192, ...}) = 0
mmap(NULL, 8192, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0) = 0x7f1c31876000
mmap(NULL, 16408, PROT_READ, MAP_PRIVATE|MAP_DENYWRITE, 4, 0) = 0x7f1c31871000
mmap(0x7f1c31872000, 4096, PROT_READ|PROT_EXEC, MAP_PRIVATE|MAP_FIXED|MAP_DENYWRITE, 4, 0x1000) = 0x7f1c31872000
mmap(0x7f1c31873000, 4096, PROT_READ, MAP_PRIVATE|MAP_FIXED|MAP_DENYWRITE, 4, 0x2000) = 0x7f1c31873000
mmap(0x7f1c31874000, 8192, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_FIXED|MAP_DENYWRITE, 4, 0x2000) = 0x7f1c31874000
close(4)                                = 0
access("/etc/ld.so.preload", R_OK)      = -1 ENOENT (No such file or directory)
openat(AT_FDCWD, "/etc/ld.so.cache", O_RDONLY|O_CLOEXEC) = 4
fstat(4, {st_mode=S_IFREG|0644, st_size=164390, ...}) = 0
mmap(NULL, 164390, PROT_READ, MAP_PRIVATE, 4, 0) = 0x7f1c31848000
close(4)                                = 0
openat(AT_FDCWD, "/lib64/libc.so.6", O_RDONLY|O_CLOEXEC) = 4
read(4, "\177ELF\2\1\1\3\0\0\0\0\0\0\0\0\3\0>\0\1\0\0\0\240d\2\0\0\0\0\0"..., 832) = 832
pread64(4, "\6\0\0\0\4\0\0\0@\0\0\0\0\0\0\0@\0\0\0\0\0\0\0@\0\0\0\0\0\0\0"..., 784, 64) = 784
fstat(4, {st_mode=S_IFREG|0755, st_size=1855744, ...}) = 0
pread64(4, "\6\0\0\0\4\0\0\0@\0\0\0\0\0\0\0@\0\0\0\0\0\0\0@\0\0\0\0\0\0\0"..., 784, 64) = 784
mmap(NULL, 1887088, PROT_READ, MAP_PRIVATE|MAP_DENYWRITE, 4, 0) = 0x7f1c3167b000
mmap(0x7f1c3169f000, 1343488, PROT_READ|PROT_EXEC, MAP_PRIVATE|MAP_FIXED|MAP_DENYWRITE, 4, 0x24000) = 0x7f1c3169f000
mmap(0x7f1c317e7000, 339968, PROT_READ, MAP_PRIVATE|MAP_FIXED|MAP_DENYWRITE, 4, 0x16c000) = 0x7f1c317e7000
mmap(0x7f1c3183a000, 24576, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_FIXED|MAP_DENYWRITE, 4, 0x1be000) = 0x7f1c3183a000
mmap(0x7f1c31840000, 31600, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_FIXED|MAP_ANONYMOUS, -1, 0) = 0x7f1c31840000
close(4)                                = 0
mmap(NULL, 12288, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0) = 0x7f1c31678000
arch_prctl(ARCH_SET_FS, 0x7f1c31678740) = 0
set_tid_address(0x7f1c31678a10)         = 455
set_robust_list(0x7f1c31678a20, 24)     = 0
rseq(0x7f1c31679060, 0x20, 0, 0x53053053) = 0
mprotect(0x7f1c3183a000, 16384, PROT_READ) = 0
mprotect(0x7f1c31874000, 4096, PROT_READ) = 0
mprotect(0x56393432d000, 4096, PROT_READ) = 0
mprotect(0x7f1c318a9000, 8192, PROT_READ) = 0
prlimit64(0, RLIMIT_STACK, NULL, {rlim_cur=8192*1024, rlim_max=RLIM64_INFINITY}) = 0
munmap(0x7f1c31848000, 164390)          = 0
openat(AT_FDCWD, ".", O_RDONLY|O_NONBLOCK) = 4
chdir(".")                              = 0
openat(AT_FDCWD, "supervise/ok", O_WRONLY|O_NONBLOCK) = 5
close(5)                                = 0
openat(AT_FDCWD, "supervise/status", O_RDONLY|O_NONBLOCK) = 5
read(5, "@\0\0\0f|\273\262\30N\370\24\307\3\0\0\0u\0\1", 20) = 20
close(5)                                = 0
newfstatat(AT_FDCWD, "down", 0x7ffd0dedaba0, 0) = -1 ENOENT (No such file or directory)
chdir("log")                            = 0
openat(AT_FDCWD, "supervise/ok", O_WRONLY|O_NONBLOCK) = 5
close(5)                                = 0
openat(AT_FDCWD, "supervise/status", O_RDONLY|O_NONBLOCK) = 5
read(5, "@\0\0\0f|\273\262\30J\\d\306\3\0\0\0u\0\1", 20) = 20
close(5)                                = 0
newfstatat(AT_FDCWD, "down", 0x7ffd0dedaba0, 0) = -1 ENOENT (No such file or directory)
write(1, "run: .: (pid 967) 551089s; run: "..., 55run: .: (pid 967) 551089s; run: log: (pid 966) 551089s
) = 55
fchdir(4)                               = 0
exit_group(0)                           = ?
+++ exited with 0 +++
# 

What did you see instead?

The supervise/ok file is not used by this collector at all.

discordianfish commented 1 month ago

The runit collector is deprecated. I'm not against merging a simple fix, though, if you want to take a stab at it.

powerman commented 1 month ago

The runit collector is deprecated.

Is there any alternative/replacement for it?

SuperQ commented 1 month ago

Nobody has published a separate exporter that I know of yet. But we don't plan to do this work here.