influxdata / telegraf

Agent for collecting, processing, aggregating, and writing metrics, logs, and other arbitrary data.
https://influxdata.com/telegraf
MIT License
14.59k stars 5.56k forks source link

input.vsphere: Error in discovery #4824

Closed lee-costa closed 6 years ago

lee-costa commented 6 years ago

Relevant telegraf.conf:

    # Read metrics from one or many vCenters
    [[inputs.vsphere]]
        ## List of vCenter URLs to be monitored. These three lines must be uncommented
      ## and edited for the plugin to work.
      vcenters = [ "https://myip/sdk" ]
      username = "user@corp.local"
      password = "paswword"

     ## VMs
      ## Typical VM metrics (if omitted or empty, all metrics are collected)
      vm_metric_include = [
        "cpu.demand.average",
        "cpu.idle.summation",
        "cpu.latency.average",
        "cpu.readiness.average",
        "cpu.ready.summation",
        "cpu.run.summation",
        "cpu.usagemhz.average",
        "cpu.used.summation",
        "cpu.wait.summation",
        "mem.active.average",
        "mem.granted.average",
        "mem.latency.average",
        "mem.swapin.average",
        "mem.swapinRate.average",
        "mem.swapout.average",
        "mem.swapoutRate.average",
        "mem.usage.average",
        "mem.vmmemctl.average",
        "net.bytesRx.average",
        "net.bytesTx.average",
        "net.droppedRx.summation",
        "net.droppedTx.summation",
        "net.usage.average",
        "power.power.average",    
        "virtualDisk.numberReadAveraged.average",
        "virtualDisk.numberWriteAveraged.average",
        "virtualDisk.read.average",
        "virtualDisk.readOIO.latest",
        "virtualDisk.throughput.usage.average",
        "virtualDisk.totalReadLatency.average",
        "virtualDisk.totalWriteLatency.average",
        "virtualDisk.write.average",
        "virtualDisk.writeOIO.latest",
        "sys.uptime.latest",
      ]
      # vm_metric_exclude = [] ## Nothing is excluded by default
      # vm_instances = true ## true by default

      ## Hosts 
      ## Typical host metrics (if omitted or empty, all metrics are collected)
      host_metric_include = [
        "cpu.coreUtilization.average",
        "cpu.costop.summation",
        "cpu.demand.average",
        "cpu.idle.summation",
        "cpu.latency.average",
        "cpu.readiness.average",
        "cpu.ready.summation",
        "cpu.swapwait.summation",
        "cpu.usage.average",
        "cpu.usagemhz.average",
        "cpu.used.summation",
        "cpu.utilization.average",
        "cpu.wait.summation",
        "disk.deviceReadLatency.average",
        "disk.deviceWriteLatency.average",
        "disk.kernelReadLatency.average",
        "disk.kernelWriteLatency.average",
        "disk.numberReadAveraged.average",
        "disk.numberWriteAveraged.average",
        "disk.read.average",
        "disk.totalReadLatency.average",
        "disk.totalWriteLatency.average",
        "disk.write.average",
        "mem.active.average",
        "mem.latency.average",
        "mem.state.latest",
        "mem.swapin.average",
        "mem.swapinRate.average",
        "mem.swapout.average",
        "mem.swapoutRate.average",
        "mem.totalCapacity.average",
        "mem.usage.average",
        "mem.vmmemctl.average",
        "net.bytesRx.average",
        "net.bytesTx.average",
        "net.droppedRx.summation",
        "net.droppedTx.summation",
        "net.errorsRx.summation",
        "net.errorsTx.summation",
        "net.usage.average",
        "power.power.average",
        "storageAdapter.numberReadAveraged.average",
        "storageAdapter.numberWriteAveraged.average",
        "storageAdapter.read.average",
        "storageAdapter.write.average",
        "sys.uptime.latest",
      ]
      # host_metric_exclude = [] ## Nothing excluded by default
      # host_instances = true ## true by default

      ## Clusters 
      # cluster_metric_include = [] ## if omitted or empty, all metrics are collected
      # cluster_metric_exclude = [] ## Nothing excluded by default
      # cluster_instances = true ## true by default

      ## Datastores 
      # datastore_metric_include = [] ## if omitted or empty, all metrics are collected
      # datastore_metric_exclude = [] ## Nothing excluded by default
      # datastore_instances = false ## false by default for Datastores only

      ## Datacenters
      datacenter_metric_include = [] ## if omitted or empty, all metrics are collected
      datacenter_metric_exclude = [ "*" ] ## Datacenters are not collected by default.
      # datacenter_instances = false ## false by default for Datacenters

      ## Plugin Settings  
      ## separator character to use for measurement and field names (default: "_")
      # separator = "_"

      ## number of objects to retrieve per query for realtime resources (vms and hosts)
      ## set to 64 for vCenter 5.5 and 6.0 (default: 256)
      # max_query_objects = 256

      ## number of metrics to retrieve per query for non-realtime resources (clusters and datastores)
      ## set to 64 for vCenter 5.5 and 6.0 (default: 256)
      # max_query_metrics = 256

      ## number of go routines to use for collection and discovery of objects and metrics
      # collect_concurrency = 1
      # discover_concurrency = 1

      ## whether or not to force discovery of new objects on initial gather call before collecting metrics
      ## when true for large environments this may cause errors for time elapsed while collecting metrics
      ## when false (default) the first collection cycle may result in no or limited metrics while objects are discovered
      # force_discover_on_init = false

      ## the interval before (re)discovering objects subject to metrics collection (default: 300s)
      # object_discovery_interval = "300s"

      ## timeout applies to any of the API requests made to vCenter
      # timeout = "20s"

      ## Optional SSL Config
      # ssl_ca = "/path/to/cafile"
      # ssl_cert = "/path/to/certfile"
      # ssl_key = "/path/to/keyfile"
      ## Use SSL but skip chain & host verification
       insecure_skip_verify = true

System info:

Ubuntu 16.04, Telegraf 1.8.1

[Include Telegraf version, operating system name, and other relevant details]

Steps to reproduce:

  1. ...Start telegraf service
  2. ...

Expected behavior:

No errors

Actual behavior:

It works, but the log displays this error:

[input.vsphere]: Error in discovery for 192.168.139.130: ServerFaultCode: The object 'vim.view.ContainerView:session[527efa2a-f72f-7390-c6fe-eed26612f30b]52b003ca-f94f-2bb9-f35e

Additional info:

[Include gist of relevant config, logs, etc.]

    Oct 07 13:47:20 ubuntuTEL telegraf[12578]: 2018-10-07T17:47:20Z E! [input.vsphere]: Error in discovery for VCENTER_IP_ADDRESS: ServerFaultCode: The object 'vim.view.ContainerView:session[527efa2a-f72f-7390-c6fe-eed26612f30b]52b003ca-f94f-2bb9-f35e
    Oct 07 13:52:20 ubuntuTEL telegraf[12578]: 2018-10-07T17:52:20Z E! [input.vsphere]: Error in discovery for VCENTER_IP_ADDRESS: ServerFaultCode: The object 'vim.view.ContainerView:session[527efa2a-f72f-7390-c6fe-eed26612f30b]52b003ca-f94f-2bb9-f35e
    Oct 07 13:57:20 ubuntuTEL telegraf[12578]: 2018-10-07T17:57:20Z E! [input.vsphere]: Error in discovery for VCENTER_IP_ADDRESS: ServerFaultCode: The object 'vim.view.ContainerView:session[527efa2a-f72f-7390-c6fe-eed26612f30b]52b003ca-f94f-2bb9-f35e
    Oct 07 14:02:20 ubuntuTEL telegraf[12578]: 2018-10-07T18:02:20Z E! [input.vsphere]: Error in discovery for VCENTER_IP_ADDRESS: ServerFaultCode: The object 'vim.view.ContainerView:session[527efa2a-f72f-7390-c6fe-eed26612f30b]52b003ca-f94f-2bb9-f35e
    Oct 07 14:07:20 ubuntuTEL telegraf[12578]: 2018-10-07T18:07:20Z E! [input.vsphere]: Error in discovery for VCENTER_IP_ADDRESS: ServerFaultCode: The object 'vim.view.ContainerView:session[527efa2a-f72f-7390-c6fe-eed26612f30b]52b003ca-f94f-2bb9-f35e
    Oct 07 14:12:20 ubuntuTEL telegraf[12578]: 2018-10-07T18:12:20Z E! [input.vsphere]: Error in discovery for VCENTER_IP_ADDRESS: ServerFaultCode: The object 'vim.view.ContainerView:session[527efa2a-f72f-7390-c6fe-eed26612f30b]52b003ca-f94f-2bb9-f35e
    Oct 07 14:17:20 ubuntuTEL telegraf[12578]: 2018-10-07T18:17:20Z E! [input.vsphere]: Error in discovery for VCENTER_IP_ADDRESS: ServerFaultCode: The object 'vim.view.ContainerView:session[527efa2a-f72f-7390-c6fe-eed26612f30b]52b003ca-f94f-2bb9-f35e
    Oct 07 14:22:20 ubuntuTEL telegraf[12578]: 2018-10-07T18:22:20Z E! [input.vsphere]: Error in discovery for VCENTER_IP_ADDRESS: ServerFaultCode: The object 'vim.view.ContainerView:session[527efa2a-f72f-7390-c6fe-eed26612f30b]52b003ca-f94f-2bb9-f35e
    Oct 07 14:27:20 ubuntuTEL telegraf[12578]: 2018-10-07T18:27:20Z E! [input.vsphere]: Error in discovery for VCENTER_IP_ADDRESS: ServerFaultCode: The object 'vim.view.ContainerView:session[527efa2a-f72f-7390-c6fe-eed26612f30b]52b003ca-f94f-2bb9-f35e
prydin commented 6 years ago

Could you please run it with the -debug flag and send me the logs? My initial guess is that some object in vCenter is corrupted. In that case, the error is relatively benign, since you're only going to miss data for that particular object (which is probably broken anyway).

Debug logs will greatly help me determine what's going on here.

lee-costa commented 6 years ago

Turns out a simple restart of Telegraf fixed the issue, and it has yet to reappear.

I will follow up with the debug flag if it shows up again.

Thank you.