Closed jessereynolds closed 10 years ago
@ali-graham has coded two versions of a fix to this issue. The first one (https://github.com/flpjck/flapjackfeeder/tree/5cb96a2121b12d96c04494526b31bfa03655c26d) segfaulted and I argued that the malloc could be the cause for that. But after the rework from @ali-graham that second version (https://github.com/flpjck/flapjackfeeder/tree/cab6e1bd5f4f296c8b38fb6ca0b18001e2b8393b) shows the same behavior.
root@flapjack:~# strace -Ff /usr/sbin/nagios3 /etc/nagios3/nagios.cfg
...
[pid 8647] sendto(3, "<14>Apr 15 03:58:30 nagios3: HOS"..., 100, MSG_NOSIGNAL, NULL, 0) = 100
[pid 8647] open("/var/log/nagios3/nagios.log", O_RDWR|O_CREAT|O_APPEND, 0666) = 8
[pid 8647] fstat(8, {st_mode=S_IFREG|0644, st_size=5808, ...}) = 0
[pid 8647] mmap(NULL, 4096, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0) = 0x7fc62542d000
[pid 8647] write(8, "[1397559510] HOST ALERT: localho"..., 84) = 84
[pid 8647] close(8) = 0
[pid 8647] munmap(0x7fc62542d000, 4096) = 0
[pid 8647] stat("/etc/localtime", {st_mode=S_IFREG|0644, st_size=2819, ...}) = 0
[pid 8647] stat("/etc/localtime", {st_mode=S_IFREG|0644, st_size=2819, ...}) = 0
[pid 8647] write(4, "*3\r\n$5\r\nLPUSH\r\n$6\r\nevents\r\n$209\r"..., 244) = 244
[pid 8647] read(4, ":1\r\n", 16384) = 4
[pid 8647] unlink("/var/lib/nagios3/spool/checkresults/cBwMfWB") = -1 ENOENT (No such file or directory)
[pid 8647] unlink("/var/lib/nagios3/spool/checkresults/cBwMfWB.ok") = -1 ENOENT (No such file or directory)
[pid 8647] stat("/etc/localtime", {st_mode=S_IFREG|0644, st_size=2819, ...}) = 0
[pid 8647] stat("/etc/localtime", {st_mode=S_IFREG|0644, st_size=2819, ...}) = 0
[pid 8647] stat("/etc/localtime", {st_mode=S_IFREG|0644, st_size=2819, ...}) = 0
[pid 8647] stat("/etc/localtime", {st_mode=S_IFREG|0644, st_size=2819, ...}) = 0
[pid 8647] --- SIGSEGV (Segmentation fault) @ 0 (0) ---
[pid 8647] sendto(3, "<14>Apr 15 03:58:30 nagios3: Cau"..., 61, MSG_NOSIGNAL, NULL, 0) = 61
[pid 8647] open("/var/log/nagios3/nagios.log", O_RDWR|O_CREAT|O_APPEND, 0666) = 8
[pid 8647] fstat(8, {st_mode=S_IFREG|0644, st_size=5892, ...}) = 0
[pid 8647] mmap(NULL, 4096, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0) = 0x7fc62542d000
[pid 8647] write(8, "[1397559510] Caught SIGSEGV, shu"..., 46) = 46
[pid 8647] close(8) = 0
[pid 8647] munmap(0x7fc62542d000, 4096) = 0
[pid 8647] write(1, "Caught SIGSEGV, shutting down...", 32Caught SIGSEGV, shutting down...) = 32
[pid 8647] write(1, "\n", 1
) = 1
[pid 8647] rt_sigreturn(0x7fc62476e720) = 0
[pid 8647] --- SIGSEGV (Segmentation fault) @ 0 (0) ---
[pid 8647] exit_group(-2) = ?
root@flapjack:~# tail /var/log/nagios3/nagios.log
[1397559499] Nagios 3.2.3 starting... (PID=8647)
[1397559499] Local time is Tue Apr 15 03:58:19 PDT 2014
[1397559499] LOG VERSION: 2.0
[1397559499] flapjackfeeder: Copyright (c) 2013 Birger Schmidt, derived from npcdmod
[1397559499] flapjackfeeder: redis host 'localhost'.
[1397559499] flapjackfeeder: redis port '6380'.
[1397559499] flapjackfeeder: Ready to run to have some fun!
[1397559499] Event broker module '/usr/local/lib/flapjackfeeder.o' initialized successfully.
[1397559510] HOST ALERT: localhost;DOWN;SOFT;1;/bin/ping -n -U -w 10 -c 1 127.0.0.1
[1397559510] Caught SIGSEGV, shutting down...
The write to redis (LPUSH) works and the answer from redis is successfully read as well. But only once the segfault follows right after that :-(.
Same thing for icinga:
root@flapjack:~# strace -Ff /usr/sbin/icinga /etc/icinga/icinga.cfg
...
[pid 8775] unlink("/var/lib/icinga/spool/checkresults/c8zgqGQ") = 0
[pid 8775] unlink("/var/lib/icinga/spool/checkresults/c8zgqGQ.ok") = 0
[pid 8775] stat("/var/lib/icinga/spool/checkresults/cI5qsx2", {st_mode=S_IFREG|0600, st_size=488, ...}) = 0
[pid 8775] stat("/var/lib/icinga/spool/checkresults/cI5qsx2.ok", {st_mode=S_IFREG|0600, st_size=0, ...}) = 0
[pid 8775] open("/var/lib/icinga/spool/checkresults/cI5qsx2", O_RDONLY) = 9
[pid 8775] fstat(9, {st_mode=S_IFREG|0600, st_size=488, ...}) = 0
[pid 8775] mmap(NULL, 488, PROT_READ, MAP_PRIVATE, 9, 0) = 0x7ffa51de4000
[pid 8775] munmap(0x7ffa51de4000, 488) = 0
[pid 8775] close(9) = 0
[pid 8775] unlink("/var/lib/icinga/spool/checkresults/cI5qsx2") = 0
[pid 8775] unlink("/var/lib/icinga/spool/checkresults/cI5qsx2.ok") = 0
[pid 8775] getdents(8, /* 0 entries */, 32768) = 0
[pid 8775] brk(0x270a000) = 0x270a000
[pid 8775] close(8) = 0
[pid 8775] sendto(3, "<14>Apr 15 04:17:32 icinga: HOST"..., 99, MSG_NOSIGNAL, NULL, 0) = 99
[pid 8775] open("/var/log/icinga/icinga.log", O_RDWR|O_CREAT|O_APPEND, 0666) = 8
[pid 8775] fstat(8, {st_mode=S_IFREG|0644, st_size=30854, ...}) = 0
[pid 8775] mmap(NULL, 4096, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0) = 0x7ffa51de4000
[pid 8775] write(8, "[1397560652] HOST ALERT: localho"..., 84) = 84
[pid 8775] close(8) = 0
[pid 8775] munmap(0x7ffa51de4000, 4096) = 0
[pid 8775] stat("/etc/localtime", {st_mode=S_IFREG|0644, st_size=2819, ...}) = 0
[pid 8775] stat("/etc/localtime", {st_mode=S_IFREG|0644, st_size=2819, ...}) = 0
[pid 8775] write(4, "*3\r\n$5\r\nLPUSH\r\n$6\r\nevents\r\n$211\r"..., 246) = 246
[pid 8775] read(4, ":1\r\n", 16384) = 4
[pid 8775] stat("/etc/localtime", {st_mode=S_IFREG|0644, st_size=2819, ...}) = 0
[pid 8775] stat("/etc/localtime", {st_mode=S_IFREG|0644, st_size=2819, ...}) = 0
[pid 8775] stat("/etc/localtime", {st_mode=S_IFREG|0644, st_size=2819, ...}) = 0
[pid 8775] --- SIGSEGV (Segmentation fault) @ 0 (0) ---
[pid 8776] +++ killed by SIGSEGV +++
root@flapjack:~# tail /var/log/icinga/icinga.log
[1397560642] Icinga 1.6.1 starting... (PID=8775)
[1397560642] Local time is Tue Apr 15 04:17:22 PDT 2014
[1397560642] LOG VERSION: 2.0
[1397560642] flapjackfeeder: Copyright (c) 2013 Birger Schmidt, derived from npcdmod
[1397560642] flapjackfeeder: redis host 'localhost'.
[1397560642] flapjackfeeder: redis port '6380'.
[1397560642] flapjackfeeder: Ready to run to have some fun!
[1397560642] Event broker module '/usr/local/lib/flapjackfeeder.o' initialized successfully.
[1397560642] Event loop started...
[1397560652] HOST ALERT: localhost;DOWN;SOFT;1;/bin/ping -n -U -w 10 -c 1 127.0.0.1
Yeah! I found the bug in the fix ;-) Often the long service output is empty and the escaping code failed in that case. The fix is pushed... Probably we could go back to the malloc code as well if we want to.
Excellent, nice work @bs-github
The malloc code probably commits less sins against good practice :+1: