Open vrubiella opened 4 years ago
Hi @vrubiella, thanks for reporting this.
I haven't been able to reproduce the issue so far, but I agree that the behavior shown in your logs is definitely wrong. Is there any chance that you could post your nagios.cfg as well as any configuration for the service/templates used to create "SMTP 578"? Feel free to scrub out any credentials or sensitive information, of course.
Yes, here are the config files; almost all values are defaults. nagios.cfg:
cat nagios.cfg
log_file=/var/log/nagios/nagios.log
cfg_file=/usr/local/nagios/etc/objects/templates.cfg
cfg_file=/usr/local/nagios/etc/objects/commands.cfg
cfg_file=/usr/local/nagios/etc/objects/timeperiods.cfg
cfg_file=/usr/local/nagios/etc/objects/contacts.cfg
object_cache_file=/usr/local/nagios/var/objects.cache
precached_object_file=/usr/local/nagios/var/objects.precache
resource_file=/usr/local/nagios/etc/resource.cfg
status_file=/usr/local/nagios/var/status.dat
status_update_interval=10
nagios_user=nagios
nagios_group=nagios
check_external_commands=1
command_file=/usr/local/nagios/var/rw/nagios.cmd
lock_file=/run/nagios.lock
temp_file=/usr/local/nagios/var/nagios.tmp
temp_path=/tmp
event_broker_options=-1
broker_module=/usr/local/lib/mk-livestatus/livestatus.o /usr/local/nagios/var/rw/live
log_rotation_method=d
log_archive_path=/usr/local/nagios/var/archives
use_syslog=1
log_notifications=1
log_service_retries=1
log_host_retries=1
log_event_handlers=1
log_initial_states=0
log_current_states=1
log_external_commands=1
log_passive_checks=1
service_inter_check_delay_method=s
max_service_check_spread=30
service_interleave_factor=s
host_inter_check_delay_method=s
max_host_check_spread=30
check_result_reaper_frequency=10
max_check_result_reaper_time=30
check_result_path=/usr/local/nagios/var/spool/checkresults
max_check_result_file_age=3600
cached_host_check_horizon=15
cached_service_check_horizon=15
enable_predictive_host_dependency_checks=1
enable_predictive_service_dependency_checks=1
soft_state_dependencies=0
auto_reschedule_checks=0
auto_rescheduling_interval=30
auto_rescheduling_window=180
service_check_timeout=60
host_check_timeout=30
event_handler_timeout=30
notification_timeout=30
ocsp_timeout=5
ochp_timeout=5
perfdata_timeout=5
retain_state_information=1
state_retention_file=/usr/local/nagios/var/retention.dat
retention_update_interval=60
use_retained_program_state=1
use_retained_scheduling_info=1
retained_host_attribute_mask=0
retained_service_attribute_mask=0
retained_process_host_attribute_mask=0
retained_process_service_attribute_mask=0
retained_contact_host_attribute_mask=0
retained_contact_service_attribute_mask=0
interval_length=60
check_for_updates=1
bare_update_check=0
use_aggressive_host_checking=0
execute_service_checks=1
accept_passive_service_checks=1
execute_host_checks=1
accept_passive_host_checks=1
enable_notifications=1
enable_event_handlers=1
obsess_over_services=0
obsess_over_hosts=0
translate_passive_host_checks=0
passive_host_checks_are_soft=0
check_for_orphaned_services=1
check_for_orphaned_hosts=1
check_service_freshness=1
service_freshness_check_interval=60
service_check_timeout_state=c
check_host_freshness=0
host_freshness_check_interval=60
additional_freshness_latency=15
enable_flap_detection=1
low_service_flap_threshold=5.0
high_service_flap_threshold=20.0
low_host_flap_threshold=5.0
high_host_flap_threshold=20.0
date_format=us
illegal_object_name_chars=`~!$%^&*|'"<>?,()=
illegal_macro_output_chars=`~$&|'"<>
use_regexp_matching=0
use_true_regexp_matching=0
admin_email=nagios@localhost
admin_pager=pagenagios@localhost
daemon_dumps_core=0
use_large_installation_tweaks=0
enable_environment_macros=0
debug_level=0
debug_verbosity=1
debug_file=/usr/local/nagios/var/nagios.debug
max_debug_file_size=1000000
allow_empty_hostgroup_assignment=0
#Nagflux + Influx + Grafana
#disabled
process_performance_data=0
host_perfdata_file=/usr/local/nagios/var/host-perfdata
host_perfdata_file_template=DATATYPE::HOSTPERFDATA\tTIMET::$TIMET$\tHOSTNAME::$HOSTNAME$\tHOSTPERFDATA::$HOSTPERFDATA$\tHOSTCHECKCOMMAND::$HOSTCHECKCOMMAND$
host_perfdata_file_mode=a
host_perfdata_file_processing_interval=15
host_perfdata_file_processing_command=process-host-perfdata-file-nagflux
service_perfdata_file=/usr/local/nagios/var/service-perfdata
service_perfdata_file_template=DATATYPE::SERVICEPERFDATA\tTIMET::$TIMET$\tHOSTNAME::$HOSTNAME$\tSERVICEDESC::$SERVICEDESC$\tSERVICEPERFDATA::$SERVICEPERFDATA$\tSERVICECHECKCOMMAND::$SERVICECHECKCOMMAND$
service_perfdata_file_mode=a
service_perfdata_file_processing_interval=15
service_perfdata_file_processing_command=process-service-perfdata-file-nagflux
#Custom values
max_concurrent_checks=1000
cfg_dir=/usr/local/nagios/etc/objects/salt-managed
cfg_file=/usr/local/nagios/etc/objects/nagflux-commands.cfg
Service
define service {
use uptime
hostgroup_name fnad,vxad
servicegroups mail
service_description SMTP 578
check_command check__smtp578
}
Sample host
define host {
use uptime
host_name vxada.srv.cat
hostgroups vxad, category_99999
alias vxada
address vxada.srv.cat
register 1
}
Command:
define command {
command_name check__smtp587
command_line $USER1$/check_smtp -w 2 -c 19 -t 20 -p 587 -H $HOSTADDRESS$ $ARG1$
}
Templates
define host {
use generic-host
name uptime
check_interval 2
retry_interval 1
check_period 24x7
register 0
process_perf_data 1
max_check_attempts 3
notification_interval 120
check_command check__host
contact_groups slack
action_url XXXXhost=$HOSTNAME$
}
define service {
use generic-service
name uptime
check_interval 1
retry_interval 1
check_period 24x7
register 0
process_perf_data 1
max_check_attempts 2
notification_interval 120
contact_groups slack
action_url XXXXX$SERVICEDESC$
}
I've finally had a chance to do some testing.
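Here are the changes I made from the posted config (all visible in the files below): log_file points to /usr/local/nagios/var/nagios.log, the livestatus broker_module line is commented out, debug_level and debug_verbosity are raised to 2, the salt-managed cfg_dir and nagflux-commands.cfg entries are commented out and replaced by two vrubiella-samples cfg_file entries, and the SMTP command is defined as check__smtp578 on port 578 so that it matches the service's check_command.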
I'll post the configs I used here. There's more substantial stuff towards the end.
Here's nagios.cfg:
log_file=/usr/local/nagios/var/nagios.log
cfg_file=/usr/local/nagios/etc/objects/templates.cfg
cfg_file=/usr/local/nagios/etc/objects/commands.cfg
cfg_file=/usr/local/nagios/etc/objects/timeperiods.cfg
cfg_file=/usr/local/nagios/etc/objects/contacts.cfg
object_cache_file=/usr/local/nagios/var/objects.cache
precached_object_file=/usr/local/nagios/var/objects.precache
resource_file=/usr/local/nagios/etc/resource.cfg
status_file=/usr/local/nagios/var/status.dat
status_update_interval=10
nagios_user=nagios
nagios_group=nagios
check_external_commands=1
command_file=/usr/local/nagios/var/rw/nagios.cmd
lock_file=/run/nagios.lock
temp_file=/usr/local/nagios/var/nagios.tmp
temp_path=/tmp
event_broker_options=-1
#broker_module=/usr/local/lib/mk-livestatus/livestatus.o /usr/local/nagios/var/rw/live
log_rotation_method=d
log_archive_path=/usr/local/nagios/var/archives
use_syslog=1
log_notifications=1
log_service_retries=1
log_host_retries=1
log_event_handlers=1
log_initial_states=0
log_current_states=1
log_external_commands=1
log_passive_checks=1
service_inter_check_delay_method=s
max_service_check_spread=30
service_interleave_factor=s
host_inter_check_delay_method=s
max_host_check_spread=30
check_result_reaper_frequency=10
max_check_result_reaper_time=30
check_result_path=/usr/local/nagios/var/spool/checkresults
max_check_result_file_age=3600
cached_host_check_horizon=15
cached_service_check_horizon=15
enable_predictive_host_dependency_checks=1
enable_predictive_service_dependency_checks=1
soft_state_dependencies=0
auto_reschedule_checks=0
auto_rescheduling_interval=30
auto_rescheduling_window=180
service_check_timeout=60
host_check_timeout=30
event_handler_timeout=30
notification_timeout=30
ocsp_timeout=5
ochp_timeout=5
perfdata_timeout=5
retain_state_information=1
state_retention_file=/usr/local/nagios/var/retention.dat
retention_update_interval=60
use_retained_program_state=1
use_retained_scheduling_info=1
retained_host_attribute_mask=0
retained_service_attribute_mask=0
retained_process_host_attribute_mask=0
retained_process_service_attribute_mask=0
retained_contact_host_attribute_mask=0
retained_contact_service_attribute_mask=0
interval_length=60
check_for_updates=1
bare_update_check=0
use_aggressive_host_checking=0
execute_service_checks=1
accept_passive_service_checks=1
execute_host_checks=1
accept_passive_host_checks=1
enable_notifications=1
enable_event_handlers=1
obsess_over_services=0
obsess_over_hosts=0
translate_passive_host_checks=0
passive_host_checks_are_soft=0
check_for_orphaned_services=1
check_for_orphaned_hosts=1
check_service_freshness=1
service_freshness_check_interval=60
service_check_timeout_state=c
check_host_freshness=0
host_freshness_check_interval=60
additional_freshness_latency=15
enable_flap_detection=1
low_service_flap_threshold=5.0
high_service_flap_threshold=20.0
low_host_flap_threshold=5.0
high_host_flap_threshold=20.0
date_format=us
illegal_object_name_chars=`~!$%^&*|'"<>?,()=
illegal_macro_output_chars=`~$&|'"<>
use_regexp_matching=0
use_true_regexp_matching=0
admin_email=nagios@localhost
admin_pager=pagenagios@localhost
daemon_dumps_core=0
use_large_installation_tweaks=0
enable_environment_macros=0
debug_level=2
debug_verbosity=2
debug_file=/usr/local/nagios/var/nagios.debug
max_debug_file_size=1000000
allow_empty_hostgroup_assignment=0
#Nagflux + Influx + Grafana
#disabled
process_performance_data=0
host_perfdata_file=/usr/local/nagios/var/host-perfdata
host_perfdata_file_template=DATATYPE::HOSTPERFDATA\tTIMET::$TIMET$\tHOSTNAME::$HOSTNAME$\tHOSTPERFDATA::$HOSTPERFDATA$\tHOSTCHECKCOMMAND::$HOSTCHECKCOMMAND$
host_perfdata_file_mode=a
host_perfdata_file_processing_interval=15
host_perfdata_file_processing_command=process-host-perfdata-file-nagflux
service_perfdata_file=/usr/local/nagios/var/service-perfdata
service_perfdata_file_template=DATATYPE::SERVICEPERFDATA\tTIMET::$TIMET$\tHOSTNAME::$HOSTNAME$\tSERVICEDESC::$SERVICEDESC$\tSERVICEPERFDATA::$SERVICEPERFDATA$\tSERVICECHECKCOMMAND::$SERVICECHECKCOMMAND$
service_perfdata_file_mode=a
service_perfdata_file_processing_interval=15
service_perfdata_file_processing_command=process-service-perfdata-file-nagflux
#Custom values
max_concurrent_checks=1000
#cfg_dir=/usr/local/nagios/etc/objects/salt-managed
#cfg_file=/usr/local/nagios/etc/objects/nagflux-commands.cfg
cfg_file=/usr/local/nagios/etc/objects/vrubiella-samples.cfg
cfg_file=/usr/local/nagios/etc/objects/vrubiella-samples-groups.cfg
Here's vrubiella-samples.cfg:
define command {
command_name check__smtp578
command_line $USER1$/check_smtp -w 2 -c 19 -t 20 -p 578 -H $HOSTADDRESS$ $ARG1$
}
define command {
command_name check__host
command_line /bin/echo "host was checked :)"
}
define host {
use generic-host
name uptime
check_interval 2
retry_interval 1
check_period 24x7
register 0
process_perf_data 1
max_check_attempts 3
notification_interval 120
check_command check__host
contact_groups slack
action_url XXXXhost=$HOSTNAME$
}
define service {
use generic-service
name uptime
check_interval 1
retry_interval 1
check_period 24x7
register 0
process_perf_data 1
max_check_attempts 2
notification_interval 120
contact_groups slack
action_url XXXXX$SERVICEDESC$
}
define host {
use uptime
host_name vxada.srv.cat
hostgroups vxad, category_99999
alias vxada
address vxada.srv.cat
register 1
}
define service {
use uptime
hostgroup_name fnad,vxad
servicegroups mail
service_description SMTP 578
check_command check__smtp578
}
and here's vrubiella-samples-groups.cfg:
define contactgroup {
contactgroup_name slack
alias Placeholder (slack)
members nagiosadmin
}
define servicegroup {
servicegroup_name mail
alias Placeholder (mail)
}
define hostgroup {
hostgroup_name vxad
alias Placeholder (vxad)
}
define hostgroup {
hostgroup_name category_99999
alias Placeholder (category_99999)
}
define hostgroup {
hostgroup_name fnad
alias Placeholder (fnad)
}
To simulate the misbehaving smtp server, here's my "check_smtp":
#!/bin/bash
echo "SMTP OK - .109 seconds"
#sleep 1000
#echo "SMTP Probably not okay"
#exit 2
I also left in all of the default objects in the objects directory (so contacts/commands/templates/timeperiods are all defined).
When I set this up on Core 4.4.5, here are my own results:
[root@localhost repos]# cat /usr/local/nagios/var/nagios.log | grep "SMTP 578"
[1582145109] wproc: host=vxada.srv.cat; service=SMTP 578;
[1582145109] Warning: Check of service 'SMTP 578' on host 'vxada.srv.cat' timed out after 61.046s!
[1582145109] SERVICE ALERT: vxada.srv.cat;SMTP 578;CRITICAL;SOFT;1;(Service check timed out after 61.05 seconds)
[1582145169] SERVICE ALERT: vxada.srv.cat;SMTP 578;OK;HARD;2;SMTP OK - .109 seconds
[1582145229] SERVICE ALERT: vxada.srv.cat;SMTP 578;CRITICAL;SOFT;1;SMTP Probably not okay
[1582145289] SERVICE NOTIFICATION: nagiosadmin;vxada.srv.cat;SMTP 578;CRITICAL;notify-service-by-email;SMTP Probably not okay
[1582145289] SERVICE ALERT: vxada.srv.cat;SMTP 578;CRITICAL;HARD;2;SMTP Probably not okay
[1582145469] SERVICE NOTIFICATION: nagiosadmin;vxada.srv.cat;SMTP 578;OK;notify-service-by-email;SMTP OK - .109 seconds
[1582145469] SERVICE ALERT: vxada.srv.cat;SMTP 578;OK;HARD;2;SMTP OK - .109 seconds
which unfortunately doesn't reproduce your issue. I see two separate problems here:
(I also verified that setting max_check_attempts to 1 didn't cause the behavior you reported on the machine I was using).
So from my perspective, here are the next steps:
Has any progress been made on this issue? Is it possible it only happens when the service HARD state is triggered due to the host being in a DOWN state (I think that's what I am seeing in some logs where I am observing this same problem)?
Hello, I have the same problem. When the service goes into a HARD state because the machine is unreachable and one service check attempt has failed, its recovery is wrongly logged as a SOFT state, but the Nagios engine behaves as if it were a HARD state (which is the correct state) and therefore does not retry. So only one SOFT UP state is logged until the next day.
I'm having problems with availability reports for "hard" states because when some services recover from a CRITICAL HARD state, they remain in a SOFT OK state until the next day.
I see related issues with soft/hard states in previous releases, but I'm on the latest stable release, 4.4.5.
Log entries example: service "SMTP 578" recovers from HARD CRITICAL at 4:44 PM as SOFT OK, and HARD OK is not logged until the next day; no other log entries are registered for this service:
Next Day:
Note: I'm using daily log rotation.