Icinga / icinga2

The core of our monitoring platform with a powerful configuration language and REST API.
https://icinga.com/docs/icinga2/latest
GNU General Public License v2.0
2k stars 574 forks source link

validation error for service to service dependency with parent_service_name declared as variable #7250

Closed czstrfr closed 5 years ago

czstrfr commented 5 years ago

Dear all,

I think we found a bug: when we try to declare a service-to-service dependency dynamically, using a custom variable for parent_service_name, config validation fails with the error "State filter is invalid for host dependency".

To Reproduce

1.

// Reproduction case: a service-to-service dependency whose parent service name
// is taken from a custom variable instead of a string literal.
apply Dependency "disable-ssh-checks-if-sid-down" to Service {
  // NOTE(review): per the maintainer's reply in this thread, `vars.sid2watch` is
  // undefined in this scope, so parent_service_name ends up empty and the
  // dependency is validated as a host dependency.
  parent_service_name = vars.sid2watch
  // This is the attribute the validation error in step 5 points at.
  states = ["OK"]
  disable_checks = true
  disable_notifications = true
  ignore_soft_states = false
  // Applies to every by_ssh service on Solaris hosts that define vars.sids2watch;
  // the filter does not look at any service-level variable.
  assign where service.check_command == "by_ssh" && host.vars.os == "Solaris" && host.vars.sids2watch
}

2.

// Base template imported by the SID services in this report: sets the check
// schedule and selects the "by_ssh" check command.
template Service "solaris-checks-service" {
  max_check_attempts = 3
  check_interval = 3m
  retry_interval = 100s
  check_command = "by_ssh"
  // by_ssh_* custom variables; presumably read by the by_ssh CheckCommand as its
  // SSH login, key file, timeout and options -- confirm against the command definition.
  vars.by_ssh_logname = "icinga"
  vars.by_ssh_identity = "/var/lib/icinga2/api/zones/satellite1.example.net/_etc/authfiles/icinga2sun.key"
  vars.by_ssh_skip_stderr = "true"
  vars.by_ssh_quiet = "true"
  vars.by_ssh_timeout = "60"
  vars.by_ssh_ipv4 = "true"
  vars.by_ssh_options = "StrictHostKeyChecking=no"
}

3.

// Example host used for the reproduction.
object Host "host1.example.net" {
  display_name = "DWH Oracle INT"
  import "generic-host"
  vars.os = "Solaris"
  vars.services = "oracledb"
  vars.orasids = ["dwhuqa","dwroqa"]
  vars.oraasm = "true"
  vars.parents = ["platform1.example.net"]
  address = "10.20.30.44"
  // vars.sids2watch is a dictionary keyed by SID name. The "sids2watch_service"
  // apply-for rule in step 4 iterates over it and merges each entry's procs_*
  // values into the generated service's vars.
  vars.sids2watch       ["dwhuqa"]={
                        procs_user = "oracle",
                        procs_warning = "",
                        procs_critical = "1:1",
                        procs_argument = "ora_pmon_dwhuqa"}
  vars.sids2watch       ["dwroqa"]={
                        procs_user = "oracle",
                        procs_warning = "",
                        procs_critical = "1:1",
                        procs_argument = "ora_pmon_dwroqa"}
}

4.

// Generates one process-check service per entry in host.vars.sids2watch.
// The resulting service name is the rule name with the SID name appended
// (e.g. "sids2watch_servicedwhuqa" in the object dump further down).
apply Service "sids2watch_service" for (sid_name => config in host.vars.sids2watch){
  import "solaris-checks-service"
  display_name = "oracle SID status: " + sid_name
  vars.by_ssh_command = "/opt/icinga/nagios-plugins/check_procs"
  // Merge the per-SID procs_* settings from the host dictionary into this service.
  vars += config
  // Records this service's own name so other rules can read it via service.vars.sid2watch.
  vars.sid2watch = "sids2watch_service" +sid_name
  // Arguments handed to check_procs; the $...$ macros refer to the procs_* vars merged above.
  vars.by_ssh_arguments = {
                        "-w" = {
                                value = "$procs_warning$",
                                }
                        "-c" = {
                                value = "$procs_critical$",
                                }
                        "-u" = {
                                value = "$procs_user$",
                                }
                        "-a" = {
                                value = "$procs_argument$",
                                }
                          }
  assign where host.vars.sids2watch
}

5.

[2019-06-18 18:17:01 +0200] critical/config: Error: Validation failed for object 'host1.example.net!oracle-blocking-sessions-check.shdwhuqa!disable-ssh-checks-if-sid-down' of type 'Dependency'; Attribute 'states': State filter is invalid for host dependency.
Location: in /etc/icinga2/zones.d/global-templates/dependencies.conf: 8:3-8:17
/etc/icinga2/zones.d/global-templates/dependencies.conf(6): apply Dependency "disable-ssh-checks-if-sid-down" to Service {
/etc/icinga2/zones.d/global-templates/dependencies.conf(7):   parent_service_name = vars.sid2watch
/etc/icinga2/zones.d/global-templates/dependencies.conf(8):   states = ["OK"]
                                                              ^^^^^^^^^^^^^^^
/etc/icinga2/zones.d/global-templates/dependencies.conf(9):   disable_checks = true
/etc/icinga2/zones.d/global-templates/dependencies.conf(10):   disable_notifications = true

6.

Object 'host1.example.net!oracle-blocking-sessions-check.shdwhuqa!disable-ssh-checks-if-sid-down' of type 'Dependency':
  % declared in '/etc/icinga2/zones.d/global-templates/dependencies.conf', lines 6:1-6:36
  * __name = "host1.example.net!oracle-blocking-sessions-check.shdwhuqa!disable-ssh-checks-if-sid-down"
  * child_host_name = "host1.example.net"
    % = modified in '/etc/icinga2/zones.d/global-templates/dependencies.conf', lines 6:1-6:36
  * child_service_name = "oracle-blocking-sessions-check.shdwhuqa"
    % = modified in '/etc/icinga2/zones.d/global-templates/dependencies.conf', lines 6:1-6:36
  * disable_checks = true
    % = modified in '/etc/icinga2/zones.d/global-templates/dependencies.conf', lines 9:3-9:23
  * disable_notifications = true
    % = modified in '/etc/icinga2/zones.d/global-templates/dependencies.conf', lines 10:3-10:30
  * ignore_soft_states = false
    % = modified in '/etc/icinga2/zones.d/global-templates/dependencies.conf', lines 11:3-11:28
  * name = "disable-ssh-checks-if-sid-down"
  * package = "_etc"
    % = modified in '/etc/icinga2/zones.d/global-templates/dependencies.conf', lines 6:1-6:36
  * parent_host_name = "host1.example.net"
    % = modified in '/etc/icinga2/zones.d/global-templates/dependencies.conf', lines 6:1-6:36
  * parent_service_name = "sids2watch_servicedwhuqa"
    % = modified in '/etc/icinga2/zones.d/global-templates/dependencies.conf', lines 7:3-7:50
  * period = ""
  * source_location
    * first_column = 1
    * first_line = 6
    * last_column = 36
    * last_line = 6
    * path = "/etc/icinga2/zones.d/global-templates/dependencies.conf"
  * states = [ "OK" ]
    % = modified in '/etc/icinga2/zones.d/global-templates/dependencies.conf', lines 8:3-8:17
  * templates = [ "disable-ssh-checks-if-sid-down" ]
    % = modified in '/etc/icinga2/zones.d/global-templates/dependencies.conf', lines 6:1-6:36
  * type = "Dependency"
  * vars = null
  * zone = "satellite1.example.net"
    % = modified in '/etc/icinga2/zones.d/global-templates/dependencies.conf', lines 6:1-6:36

Expected behavior

Config validation should evaluate the variable in exactly the same way as an explicitly declared service name, which passes config validation and works as expected.

Your Environment

Additional context

If parent_service_name is declared directly as a service name, config validation succeeds and the dependency is triggered when the parent service changes state from OK to any other state:

1.

// Working variant of the failing rule: identical except that
// parent_service_name is a string literal naming one concrete service.
// Per the report, this passes config validation.
apply Dependency "disable-ssh-checks-if-sid-down" to Service {
  parent_service_name = "sids2watch_servicedwhuqa"
  states = ["OK"]
  disable_checks = true
  disable_notifications = true
  ignore_soft_states = false
  assign where service.check_command == "by_ssh" && host.vars.os == "Solaris" && host.vars.sids2watch
}

2.

[root@master1.example.net global-templates]# icinga2 daemon -C
[2019-06-19 10:29:34 +0200] information/cli: Icinga application loader (version: r2.10.4-1)
[2019-06-19 10:29:34 +0200] information/cli: Loading configuration file(s).
[2019-06-19 10:29:34 +0200] information/ConfigItem: Committing config item(s).
[2019-06-19 10:29:34 +0200] information/ApiListener: My API identity: master1.example.net
[2019-06-19 10:29:34 +0200] information/ConfigItem: Instantiated 1375 Services.
[2019-06-19 10:29:34 +0200] information/ConfigItem: Instantiated 1 IcingaApplication.
[2019-06-19 10:29:34 +0200] information/ConfigItem: Instantiated 218 Hosts.
[2019-06-19 10:29:34 +0200] information/ConfigItem: Instantiated 1 FileLogger.
[2019-06-19 10:29:34 +0200] information/ConfigItem: Instantiated 20 Dependencies.
[2019-06-19 10:29:34 +0200] information/ConfigItem: Instantiated 2 NotificationCommands.
[2019-06-19 10:29:34 +0200] information/ConfigItem: Instantiated 21 Notifications.
[2019-06-19 10:29:34 +0200] information/ConfigItem: Instantiated 1 NotificationComponent.
[2019-06-19 10:29:34 +0200] information/ConfigItem: Instantiated 11 HostGroups.
[2019-06-19 10:29:34 +0200] information/ConfigItem: Instantiated 1 ApiListener.
[2019-06-19 10:29:34 +0200] information/ConfigItem: Instantiated 1 PerfdataWriter.
[2019-06-19 10:29:34 +0200] information/ConfigItem: Instantiated 9 Comments.
[2019-06-19 10:29:34 +0200] information/ConfigItem: Instantiated 1 CheckerComponent.
[2019-06-19 10:29:34 +0200] information/ConfigItem: Instantiated 102 Zones.
[2019-06-19 10:29:34 +0200] information/ConfigItem: Instantiated 1 ExternalCommandListener.
[2019-06-19 10:29:34 +0200] information/ConfigItem: Instantiated 100 Endpoints.
[2019-06-19 10:29:34 +0200] information/ConfigItem: Instantiated 2 ApiUsers.
[2019-06-19 10:29:34 +0200] information/ConfigItem: Instantiated 1 User.
[2019-06-19 10:29:34 +0200] information/ConfigItem: Instantiated 247 CheckCommands.
[2019-06-19 10:29:34 +0200] information/ConfigItem: Instantiated 1 IdoPgsqlConnection.
[2019-06-19 10:29:34 +0200] information/ConfigItem: Instantiated 1 UserGroup.
[2019-06-19 10:29:34 +0200] information/ConfigItem: Instantiated 6 ServiceGroups.
[2019-06-19 10:29:34 +0200] information/ConfigItem: Instantiated 3 TimePeriods.
[2019-06-19 10:29:34 +0200] information/ScriptGlobal: Dumping variables to file '/var/cache/icinga2/icinga2.vars'
[2019-06-19 10:29:34 +0200] information/cli: Finished validating the configuration file(s).

3.

Object 'host1.example.net!sids2watch_servicedwhuqa' of type 'Service':
  % declared in '/etc/icinga2/zones.d/satellite1.example.net/services/sids2watch.conf', lines 1:0-1:82
  * __name = "host1.example.net!sids2watch_servicedwhuqa"
  * action_url = ""
  * check_command = "by_ssh"
    % = modified in '/etc/icinga2/zones.d/global-templates/templates.conf', lines 67:3-67:26
  * check_interval = 180
    % = modified in '/etc/icinga2/zones.d/global-templates/templates.conf', lines 65:3-65:21
  * check_period = ""
  * check_timeout = null
  * command_endpoint = ""
  * display_name = "oracle SID status: dwhuqa"
    % = modified in '/etc/icinga2/zones.d/satellite1.example.net/services/sids2watch.conf', lines 3:3-3:49
  * enable_active_checks = true
  * enable_event_handler = true
  * enable_flapping = false
  * enable_notifications = true
  * enable_passive_checks = true
  * enable_perfdata = true
  * event_command = ""
  * flapping_threshold = 0
  * flapping_threshold_high = 30
  * flapping_threshold_low = 25
  * groups = [ ]
  * host_name = "host1.example.net"
    % = modified in '/etc/icinga2/zones.d/satellite1.example.net/services/sids2watch.conf', lines 1:0-1:82
  * icon_image = ""
  * icon_image_alt = ""
  * max_check_attempts = 3
    % = modified in '/etc/icinga2/zones.d/global-templates/templates.conf', lines 64:3-64:24
  * name = "sids2watch_servicedwhuqa"
    % = modified in '/etc/icinga2/zones.d/satellite1.example.net/services/sids2watch.conf', lines 1:0-1:82
  * notes = ""
  * notes_url = ""
  * package = "_etc"
    % = modified in '/etc/icinga2/zones.d/satellite1.example.net/services/sids2watch.conf', lines 1:0-1:82
  * retry_interval = 100
    % = modified in '/etc/icinga2/zones.d/global-templates/templates.conf', lines 66:3-66:23
  * source_location
    * first_column = 0
    * first_line = 1
    * last_column = 82
    * last_line = 1
    * path = "/etc/icinga2/zones.d/satellite1.example.net/services/sids2watch.conf"
  * templates = [ "sids2watch_servicedwhuqa", "solaris-checks-service" ]
    % = modified in '/etc/icinga2/zones.d/satellite1.example.net/services/sids2watch.conf', lines 1:0-1:82
    % = modified in '/etc/icinga2/zones.d/global-templates/templates.conf', lines 63:1-63:41
  * type = "Service"
  * vars
    % = modified in '/etc/icinga2/zones.d/satellite1.example.net/services/sids2watch.conf', lines 5:3-5:16
    * by_ssh_arguments
      % = modified in '/etc/icinga2/zones.d/satellite1.example.net/services/sids2watch.conf', lines 7:3-20:6
      * -a
        * value = "$procs_argument$"
      * -c
        * value = "$procs_critical$"
      * -u
        * value = "$procs_user$"
      * -w
        * value = "$procs_warning$"
    * by_ssh_command = "/opt/icinga/nagios-plugins/check_procs"
      % = modified in '/etc/icinga2/zones.d/satellite1.example.net/services/sids2watch.conf', lines 4:3-4:64
    * by_ssh_identity = "/var/lib/icinga2/api/zones/satellite1.example.net/_etc/authfiles/icinga2sun.key"
      % = modified in '/etc/icinga2/zones.d/global-templates/templates.conf', lines 69:3-69:108
    * by_ssh_ipv4 = "true"
      % = modified in '/etc/icinga2/zones.d/global-templates/templates.conf', lines 73:3-73:27
    * by_ssh_logname = "icinga"
      % = modified in '/etc/icinga2/zones.d/global-templates/templates.conf', lines 68:3-68:32
    * by_ssh_options = "StrictHostKeyChecking=no"
      % = modified in '/etc/icinga2/zones.d/global-templates/templates.conf', lines 74:3-74:50
    * by_ssh_quiet = "true"
      % = modified in '/etc/icinga2/zones.d/global-templates/templates.conf', lines 71:3-71:28
    * by_ssh_skip_stderr = "true"
      % = modified in '/etc/icinga2/zones.d/global-templates/templates.conf', lines 70:3-70:34
    * by_ssh_timeout = "60"
      % = modified in '/etc/icinga2/zones.d/global-templates/templates.conf', lines 72:3-72:28
    * procs_argument = "ora_pmon_dwhuqa"
    * procs_critical = "1:1"
    * procs_user = "oracle"
    * procs_warning = ""
    * sid2watch = "sids2watch_servicedwhuqa"
      % = modified in '/etc/icinga2/zones.d/satellite1.example.net/services/sids2watch.conf', lines 6:3-6:49
  * volatile = false
  * zone = "satellite1.example.net"
    % = modified in '/etc/icinga2/zones.d/satellite1.example.net/services/sids2watch.conf', lines 1:0-1:82
dnsmichi commented 5 years ago

vars.sid2watch is undefined in this scope, and as such an empty value is assigned to parent_service_name. When it is empty, Icinga assumes that the parent object is of the Host type - whereas the filter type with OK is invalid.

apply Dependency "disable-ssh-checks-if-sid-down" to Service {
  parent_service_name = vars.sid2watch

In order to fix this, use service.vars.sid2watch which actually involves the service object in this scope.

czstrfr commented 5 years ago

Good afternoon Michael,

Many thanks for the super quick reply, I appreciate it. Unfortunately I had already tried that, with the same result:

[2019-06-19 14:03:55 +0200] critical/config: Error: Validation failed for object 'host1.example.net!oracle-tablespace-usage-monitoring.shdwhuqa!sid-up' of type 'Dependency'; Attribute 'states': State filter is invalid for host dependency.
Location: in /etc/icinga2/zones.d/global-templates/dependencies.conf: 8:3-8:17
/etc/icinga2/zones.d/global-templates/dependencies.conf(6): apply Dependency "sid-up" to Service {
/etc/icinga2/zones.d/global-templates/dependencies.conf(7):   parent_service_name = service.vars.sid2watch
/etc/icinga2/zones.d/global-templates/dependencies.conf(8):   states = ["OK"]
                                                              ^^^^^^^^^^^^^^^
/etc/icinga2/zones.d/global-templates/dependencies.conf(9):   disable_checks = true
/etc/icinga2/zones.d/global-templates/dependencies.conf(10):   disable_notifications = true
dnsmichi commented 5 years ago

Hm, ok. I'm a little short on time right now, but you can use the script debugger to analyse whether, and why, parent_service_name is not populated here.

// Debugging variant suggested by the maintainer: the `debugger` statement is a
// breakpoint, so the rule's scope can be inspected when the config is validated
// with the script debugger enabled (`icinga2 daemon -X -C`).
apply Dependency "disable-ssh-checks-if-sid-down" to Service {
  parent_service_name = service.vars.sid2watch

  debugger //add the statement breakpoint

  states = ["OK"]
  disable_checks = true
  disable_notifications = true
  ignore_soft_states = false
  // This filter only tests host.vars.sids2watch; it does not test service.vars.sid2watch.
  assign where service.check_command == "by_ssh" && host.vars.os == "Solaris" && host.vars.sids2watch
}

Invoke the script debugger.

icinga2 daemon -X -C 

Print this, service, and so on in the local scope. Maybe the service variable is empty for that specific object where the apply rule matches.

Hint: The assign where expression doesn't check that service.vars.sid2watch exists.

  assign where service.check_command == "by_ssh" && host.vars.os == "Solaris" && host.vars.sids2watch
czstrfr commented 5 years ago

Hi Michael,

My apologies: there is already a relevant open ticket under icingaweb2-module-director, although I think that is not the right place, since this is not a Director issue. Shall I close this one?

Many thanks for your help and great work.

Regards

Frantisek

dnsmichi commented 5 years ago

The linked issue is a feature request to allow object references in apply rules inside the Director - that's a different one. Your problem is with static config files and some of your services don't provide the data - as such the validation fails. Fix that, and it will work.

czstrfr commented 5 years ago

Dear Michael,

Many thanks for your hints, but the variable stayed empty no matter what I tried. After a bit of digging I found another, elegant way to automate it, so I am sharing my findings for others, since I really love this project and want to contribute a tiny bit too:

// Example host for the working setup.
object Host "solariszone1.example.net" {
  import "generic-host"
  display_name = "DWH Test1 QA"
  address = "10.20.30.40"
  vars.os = "Solaris"
  // Iterated by the "Parent" apply-for dependency rule below.
  vars.parents = ["solarisldom1.example.net"]
  // Dictionary keyed by SID name; the apply-for rules below iterate over it to
  // create per-SID services and dependencies.
  vars.sids2watch       ["dwhsid1"]={
                        procs_user = "oracle",
                        procs_warning = "",
                        procs_critical = "1:1",
                        procs_argument = "ora_pmon_dwhsid1"}
  vars.sids2watch       ["dwhsid2"]={
                        procs_user = "oracle",
                        procs_warning = "",
                        procs_critical = "1:1",
                        procs_argument = "ora_pmon_dwhsid2"}
}

// Creates one SID status service per entry in host.vars.sids2watch. The
// dependency rule further down refers to these services as "sids2watch" + sidname.
apply Service "sids2watch" for (sidname => config in host.vars.sids2watch){
  import "solaris-checks-service"
  display_name = "oracle SID status: " + sidname
  vars.by_ssh_command = "/opt/icinga/nagios-plugins/check_procs"
  // Merge the per-SID procs_* settings from the host dictionary into this service.
  vars += config
  // Arguments handed to check_procs; the $...$ macros refer to the procs_* vars merged above.
  vars.by_ssh_arguments = {
                        "-w" = {
                                value = "$procs_warning$",
                                }
                        "-c" = {
                                value = "$procs_critical$",
                                }
                        "-u" = {
                                value = "$procs_user$",
                                }
                        "-a" = {
                                value = "$procs_argument$",
                                }
                          }
  assign where host.vars.sids2watch
}

// Creates one session-count check per entry in host.vars.sids2watch on Solaris
// hosts; the SID name is passed to the script as its argument.
apply Service "oracle-active-session-count-check" for (sidname => config in host.vars.sids2watch) {
  import "oracledb-check-service"
  vars.by_ssh_command = "sudo su - oracle /u01/icinga/oracle-active-session-count-check.sh " +sidname
  // Merge the per-SID settings from the host dictionary into this service.
  vars += config
  assign where host.vars.os == "Solaris" && host.vars.sids2watch
}

// Creates one host-to-host dependency per entry in host.vars.parents, with that
// entry as the parent host. Only hosts that have both an address and
// vars.parents are matched.
apply Dependency "Parent" for (parent in host.vars.parents) to Host {
  parent_host_name = parent
  assign where host.address && host.vars.parents
}

// Creates, per SID in host.vars.sids2watch, a service-to-service dependency on
// the SID status service "sids2watch" + sidname. Because the rule iterates over
// the host dictionary, the parent name is built from the loop key and never
// depends on a service-level custom variable.
apply Dependency "disabel-ssh-checks-if-sid-down" for (sidname => config in host.vars.sids2watch) to Service {
  parent_service_name = "sids2watch" + sidname
  states = ["OK"]
  disable_checks = true
  disable_notifications = true
  ignore_soft_states = false
  // Child services are those whose name ends with the SID name. The parent
  // service's own name ends with it too, so it is excluded explicitly to avoid
  // generating a dependency of the SID service on itself.
  // NOTE(review): a SID name that is a suffix of another SID name would still
  // cross-match here -- confirm SID names are suffix-free.
  assign where match("*" + sidname, service.name) && service.name != "sids2watch" + sidname
}