Closed amuhametov closed 1 month ago
Right, this issue needs to be fixed in livestatus. We did that a few years ago in Naemon as well: we changed Naemon's livestatus to send the command to the query handler instead of injecting it directly, to avoid race conditions.
@sni, do you have a link to that issue?
no, you need to fix nagios yourselves
@sni What do you think about this?
open("/dev/tty", O_RDWR|O_NOCTTY|O_NONBLOCK) = -1 ENXIO (No such device or address)
writev(2, [{"*** Error in `", 14}, {"/usr/bin/nagios", 15}, {"': ", 3}, {"double free or corruption (fasttop)", 35}, {": 0x", 4}, {"00000000018c7960", 16}, {" ***\n", 5}], 7) = 92
mmap(NULL, 4096, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0) = 0x7f5997eed000
mmap(NULL, 134217728, PROT_NONE, MAP_PRIVATE|MAP_ANONYMOUS|MAP_NORESERVE, -1, 0) = 0x7f57d4000000
munmap(0x7f57d8000000, 67108864) = 0
mprotect(0x7f57d4000000, 135168, PROT_READ|PROT_WRITE) = 0
futex(0x7f59977c7d00, FUTEX_WAKE_PRIVATE, 2147483647) = 0
futex(0x7f5994702190, FUTEX_WAKE_PRIVATE, 2147483647) = 0
write(2, "======= Backtrace: =========\n", 29) = 29
writev(2, [{"/lib64/libc.so.6", 16}, {"(", 1}, {"+0x", 3}, {"81499", 5}, {")", 1}, {"[0x", 3}, {"7f599747e499", 12}, {"]\n", 2}], 8) = 43
writev(2, [{"/usr/bin/nagios", 15}, {"(", 1}, {"my_strtok_with_free", 19}, {"+0x", 3}, {"af", 2}, {")", 1}, {"[0x", 3}, {"4177ef", 6}, {"]\n", 2}], 9) = 52
writev(2, [{"/usr/bin/nagios", 15}, {"(", 1}, {"parse_check_output", 18}, {"+0x", 3}, {"410", 3}, {")", 1}, {"[0x", 3}, {"4244d5", 6}, {"]\n", 2}], 9) = 52
writev(2, [{"/usr/bin/nagios", 15}, {"[0x", 3}, {"41e967", 6}, {"]\n", 2}], 4) = 26
writev(2, [{"/usr/bin/nagios", 15}, {"(", 1}, {"handle_async_service_check_result", 33}, {"+0x", 3}, {"186", 3}, {")", 1}, {"[0x", 3}, {"41f82b", 6}, {"]\n", 2}], 9) = 67
writev(2, [{"/usr/bin/nagios", 15}, {"(", 1}, {"process_check_result", 20}, {"+0x", 3}, {"f0", 2}, {")", 1}, {"[0x", 3}, {"44f9de", 6}, {"]\n", 2}], 9) = 53
writev(2, [{"/usr/lib64/mod_gearman/mod_gearman_nagios4.o", 44}, {"(", 1}, {"+0x", 3}, {"e2da", 4}, {")", 1}, {"[0x", 3}, {"7f5993d8b2da", 12}, {"]\n", 2}], 8) = 70
writev(2, [{"/usr/bin/nagios", 15}, {"(", 1}, {"neb_make_callbacks", 18}, {"+0x", 3}, {"b1", 2}, {")", 1}, {"[0x", 3}, {"41722d", 6}, {"]\n", 2}], 9) = 51
writev(2, [{"/usr/bin/nagios", 15}, {"(", 1}, {"broker_timed_event", 18}, {"+0x", 3}, {"a2", 2}, {")", 1}, {"[0x", 3}, {"4149a0", 6}, {"]\n", 2}], 9) = 51
writev(2, [{"/usr/bin/nagios", 15}, {"(", 1}, {"handle_timed_event", 18}, {"+0x", 3}, {"5b", 2}, {")", 1}, {"[0x", 3}, {"438960", 6}, {"]\n", 2}], 9) = 51
writev(2, [{"/usr/bin/nagios", 15}, {"(", 1}, {"event_execution_loop", 20}, {"+0x", 3}, {"38f", 3}, {")", 1}, {"[0x", 3}, {"43887d", 6}, {"]\n", 2}], 9) = 54
writev(2, [{"/usr/bin/nagios", 15}, {"(", 1}, {"main", 4}, {"+0x", 3}, {"fc6", 3}, {")", 1}, {"[0x", 3}, {"41464e", 6}, {"]\n", 2}], 9) = 38
writev(2, [{"/lib64/libc.so.6", 16}, {"(", 1}, {"__libc_start_main", 17}, {"+0x", 3}, {"f5", 2}, {")", 1}, {"[0x", 3}, {"7f599741f445", 12}, {"]\n", 2}], 9) = 57
writev(2, [{"/usr/bin/nagios", 15}, {"[0x", 3}, {"412fa9", 6}, {"]\n", 2}], 4) = 26
write(2, "======= Memory map: ========\n", 29) = 29
open("/proc/self/maps", O_RDONLY) = 57
My bad. Proposed patch did not apply. Will try to backport command query handler from naemon.
Hi, @amuhametov. Just wanted to let you know that @hedenface and I are on holiday for Thanksgiving. My home environment doesn't really have the capabilities for this type of development work, so I'll try to look into this next week. That said, please let us know if you make any progress. We appreciate the effort you've put into this.
Obviously, calling process_external_command1()
from a broker is not a good idea and leads to a race.
I've applied the simple patch originally introduced in naemon-livestatus https://github.com/naemon/naemon-livestatus/commit/94b773c1b4014814a79081db131a433f8399069a#diff-5d94690a6dc25ee5c765d4239af5ee93 to check-mk-livestatus and it works for me with no nagios modifications required (thanks, @sni).
Also, I've tested https://github.com/naemon/naemon-livestatus/commit/2b9f4f8c9895911fd1f606a31e2b006ceb42eadf; it works fine but requires modification of both nagios and check-mk-livestatus:
diff --git a/base/query-handler.c b/base/query-handler.c
index 9e67a3cc..28505d2d 100644
--- a/base/query-handler.c
+++ b/base/query-handler.c
@@ -469,6 +469,33 @@ static int qh_core(int sd, char *buf, unsigned int len)
return 404;
}
+static int qh_command(int sd, char *buf, unsigned int len)
+{
+ char *space;
+
+ if (!*buf || !strcmp(buf, "help")) {
+ nsock_printf_nul(sd, "Query handler for naemon commands.\n"
+ "Available commands:\n"
+ " run <command> Run a command\n"
+ );
+ return 0;
+ }
+ if ((space = memchr(buf, ' ', len)))
+ * (space++) = 0;
+ if (space) {
+ int res = process_external_command1(space);
+ if (res == OK) {
+ return 200;
+ } else {
+ nsock_printf_nul(sd, "%d: %s", res, qh_strerror(res));
+ return 0;
+ }
+ }
+
+ nsock_printf_nul(sd, "%d: %s", 404, qh_strerror(404));
+ return 404;
+}
+
int qh_init(const char *path)
{
int result = 0;
@@ -540,5 +567,10 @@ int qh_init(const char *path)
logit(NSLOG_INFO_MESSAGE, FALSE, "qh: help for the query handler registered\n");
}
+ result = qh_register_handler("command", "Nagios external commands interface", 0, qh_command);
+ if (result == OK) {
+ logit(NSLOG_INFO_MESSAGE, FALSE, "qh: command service query handler registered\n");
+ }
+
return 0;
}
process_external_command1(). Requires the query_socket nagios.conf option to be set explicitly.
--- livestatus.src.orig/src/Store.cc 2018-04-23 16:47:51.000000000 +0300
+++ livestatus.src/src/Store.cc 2018-11-28 17:15:06.604769980 +0300
@@ -207,7 +207,23 @@
lock_guard<mutex> lg(_command_mutex);
#ifdef NAGIOS4
- process_external_command1((char *)command);
+ int ret, sd;
+ char *buf;
+
+ sd = nsock_unix(qh_socket_path, NSOCK_TCP | NSOCK_CONNECT);
+ if (sd < 0) {
+ Informational(logger()) << "Failed to connect to query socket '" << qh_socket_path << "\n"; // << "': " << nsock_strerror(sd) << ": " << strerror(errno);
+ return;
+ }
+ ret = nsock_printf_nul(sd, "#command run %s", command);
+ if (ret < 0) {
+ Informational(logger()) << "failed to submit command by query handler";
+ }
+ while(read(sd, buf, 1024) > 0) {
+ Informational(logger()) << "query handler: " << buf << "\n";
+ }
+ close(sd);
+ return;
#else
int buffer_items = -1;
/* int ret = */
[root@koji1 check_mk-1.4.0p31]# diff -ruN --exclude=Makefile livestatus.src.orig/src/Store.cc livestatus.src/src/Store.cc; diff -u livestatus.src.orig/src/NagiosMockup.cc livestatus.src/src/NagiosMockup.cc
--- livestatus.src.orig/src/Store.cc 2018-04-23 16:47:51.000000000 +0300
+++ livestatus.src/src/Store.cc 2018-11-28 17:15:06.604769980 +0300
@@ -207,7 +207,23 @@
lock_guard<mutex> lg(_command_mutex);
#ifdef NAGIOS4
- process_external_command1((char *)command);
+ int ret, sd;
+ char *buf;
+
+ sd = nsock_unix(qh_socket_path, NSOCK_TCP | NSOCK_CONNECT);
+ if (sd < 0) {
+ Informational(logger()) << "Failed to connect to query socket '" << qh_socket_path << "\n"; // << "': " << nsock_strerror(sd) << ": " << strerror(errno);
+ return;
+ }
+ ret = nsock_printf_nul(sd, "#command run %s", command);
+ if (ret < 0) {
+ Informational(logger()) << "failed to submit command by query handler";
+ }
+ while(read(sd, buf, 1024) > 0) {
+ Informational(logger()) << "query handler: " << buf << "\n";
+ }
+ close(sd);
+ return;
#else
int buffer_items = -1;
/* int ret = */
--- livestatus.src.orig/src/NagiosMockup.cc 2018-04-23 16:47:51.000000000 +0300
+++ livestatus.src/src/NagiosMockup.cc 2018-11-27 22:50:54.371739007 +0300
@@ -1,4 +1,6 @@
#include <ctime>
+#include <stdarg.h>
+
extern "C" {
// dummy types -----------------------------------------------------------------
@@ -101,6 +103,24 @@
service *service_list;
timeperiod *timeperiod_list;
+char *qh_socket_path;
+const char *nsock_strerror(int code)
+{
+ return "Unknown error";
+}
+
+int nsock_printf_nul(int, char const*, ...)
+//int nsock_printf_nul(int sd, const char *fmt, ...)
+{
+ return 0;
+}
+
+int nsock_unix(const char *path, unsigned int flags)
+{
+ return 0;
+}
+
+
// imports ---------------------------------------------------------------------
int nebmodule_init(int flags, char *args, void *handle);
Anyway, this issue is not a pure nagios "bug", but a broker<->nagios interaction issue.
I can't reproduce the solution from amuhametov... :/ The Nagios QH command handler is started and working, but livestatus can't be compiled: undefined reference to nsock_unix :(
@ArDark2 which versions of nagios and livestatus are you trying to compile?
nagios 4.4.3
Livestatus 1.5.0p10
Commands sent directly to nagios are processed without any trouble, so I suppose it's only livestatus and my clumsy hands :)
printf "@command run [1549535266] SCHEDULE_FORCED_SVC_CHECK;127.0.0.1;_LUN_1;1549535266\0" | unixcat /usr/local/nagios/tmp/nagios/nagios.qh
___ADDED:
I also tried with livestatus 1.4.0p31... same errors on compile:
livestatus.o: undefined reference to `nsock_printf_nul(int, char const*, ...)'
livestatus.o: undefined reference to `nsock_unix(char const*, unsigned int)'
I moved the definitions in NagiosMockup.cc into extern "C++" { int nsock_unix(const char *path, unsigned int flags) { return 0; }; int nsock_printf_nul(int, char const *, ...) { return 0; }; }. It compiled without errors, but when starting nagios in non-daemon mode I get the error: livestatus.o: undefined symbol: _Z10nsock_unixPKcj. This happens on this line: sd = nsock_unix( qh_socket_path , NSOCK_TCP | NSOCK_CONNECT ); ___ADDED: I tried the "small patch" and it's working, but I'm looking forward to the QH approach as the more interesting and stable(?) solution.
@amuhametov - I can't find any difference between that file and https://github.com/NagiosEnterprises/nagioscore/issues/593#issuecomment-442464482. Is it the same? If so, it's not working... PS: the query handler started successfully in the nagios process, and I can query it.
@amuhametov - Thanks again! I finally got it working with nagios 4.4.3 and livestatus 1.5.0p10! p.1 is unchanged; for p.2 it should be as below:
--------------------------diff -ruN --exclude=Makefile NagiosMockup.cc.orig NagiosMockup.cc
--- NagiosMockup.cc.orig 2019-01-07 15:35:31.000000000 +0300
+++ NagiosMockup.cc 2019-03-07 16:24:54.946623200 +0300
@@ -101,6 +101,9 @@
service *service_list;
timeperiod *timeperiod_list;
+char *qh_socket_path;
+
+
// imports ---------------------------------------------------------------------
int nebmodule_init(int flags, char *args, void *handle);
--------------------------diff -ruN --exclude=Makefile Store.cc.orig Store.cc
--- Store.cc.orig 2019-01-07 15:35:31.000000000 +0300
+++ Store.cc 2019-03-07 16:23:57.773117663 +0300
@@ -43,6 +43,10 @@
#include "Table.h"
#include "mk_logwatch.h"
+#include "../nagios4/lib/nsock.c"
+#include "../nagios4/lib/nsock.h"
+
+
Store::Store(MonitoringCore *mc)
: _mc(mc)
, _log_cache(mc, mc->maxCachedMessages())
@@ -294,7 +298,23 @@
// The Nagios headers are (once again) not const-correct...
auto cmd = const_cast<char *>(command_str.c_str());
#ifdef NAGIOS4
- process_external_command1(cmd);
+ int ret, sd;
+ char *buf;
+
+ sd = nsock_unix(qh_socket_path, NSOCK_TCP | NSOCK_CONNECT);
+ if (sd < 0) {
+ Informational(logger()) << "Failed to connect to query socket '" << qh_socket_path << "\n"; // << "': " << nsock_strerror(sd) << ": " << strerror(errno);
+ return;
+ }
+ ret = nsock_printf_nul(sd, "#command run %s", cmd);
+ if (ret < 0) {
+ Informational(logger()) << "failed to submit command by query handler";
+ }
+ while(read(sd, buf, 1024) > 0) {
+ Informational(logger()) << "query handler: " << buf << "\n";
+ }
+ close(sd);
+ return;
#else
submit_external_command(cmd, nullptr);
#endif
And a small patch for hosts with dots in their names (relevant for nagios 4.3.3)
------------------------------diff -ruN --exclude=Makefile RegExp.cc.orig RegExp.cc
--- RegExp.cc.orig 2019-01-07 15:35:31.000000000 +0300
+++ RegExp.cc 2019-03-07 16:22:21.490269005 +0300
@@ -79,7 +79,7 @@
Impl(const std::string &str, Case c, Syntax s)
: _regex(s == Syntax::literal
? std::regex_replace(
- str, std::regex(R"([.^$|()\[\]{}*+?\\])"), R"(\\&)",
+ str, std::regex(R"([.^$|()\[\]{}*+?\\])"), R"(\&)",
std::regex_constants::format_sed)
: str,
c == Case::respect
What I have installed: centos 7.5 (all packages are up-to-date); nagios-4.4.2 + the patch proposed in https://github.com/NagiosEnterprises/nagioscore/issues/573; thruk-2.24; mod_gearman-3.0.6 + nagios 4.4 patch; check-mk-livestatus-1.4.0p31 (epel).
The problem is that nagios segfaults or aborts (SIGABRT) almost every time I try to "balance all hosts and services" from thruk's core scheduling overview page.
Here is most recent backtrace (sigabrt after 'balance all hosts and services'):
And related nagios log:
Here is another backtrace (segfault after 'balance all hosts and services'):
related nagios log:
@sni, I'm pretty sure this problem is not within mod_gearman. @hedenface, @Madlohe, I need your assistance. There is definitely something I'm missing.