influxdata / telegraf

Agent for collecting, processing, aggregating, and writing metrics, logs, and other arbitrary data.
https://influxdata.com/telegraf
MIT License
14.51k stars 5.55k forks source link

Data sequence issue when combining multiple sensors into one metric. #6321

Closed mohsin106 closed 1 year ago

mohsin106 commented 5 years ago

Relevant telegraf.conf:

# No global tags are defined.
[global_tags]

# Agent-wide settings: batching, buffering, flush cadence and host identity.
[agent]
round_interval = true
# Batch and buffer sizes are counted in metrics.
metric_batch_size = 1000
metric_buffer_limit = 10000
# No jitter on collection or flush.
collection_jitter = "0s"
flush_interval = "5s"
flush_jitter = "0s"
# Normal log verbosity: neither debug nor quiet.
debug = false
quiet = false
# Fixed hostname; it is not omitted from the emitted metrics.
hostname = "telegraf-agent"
omit_hostname = false

# Juniper JTI OpenConfig telemetry input for a single router.
[[inputs.jti_openconfig_telemetry]]
servers = ["routerone.domain.net:50051"]
sample_frequency = "60000ms"
# Credentials and client id are "$NAME" placeholders; presumably they are
# substituted from the environment when the config is loaded - confirm.
username = "$routeruser"
password = "$routerpass"
client_id = "$containerName"
# A single sensor entry: the alias "interfacesAll" followed by two
# space-separated sensor paths, so both paths report under one measurement.
sensors = ["interfacesAll /junos/system/linecard/interface/ /interfaces/interface/state/description/"]
ssl_cert = "/etc/telegraf/juniper_tls_cert.pem"
str_as_tags = false
# Fields removed from every metric produced by this input.
fielddrop = [
  "/interfaces/interface/state/last-change",
  "/interfaces/interface/init_time"
]

# Shorten the long sensor-path field and tag keys to plain names.
# Each [[processors.rename.replace]] sub-table is one rename operation.
[[processors.rename]]

# --- Interfaces sensor: description, counters and state ---

[[processors.rename.replace]]
field = "/interfaces/interface/state/description"
dest = "description"

[[processors.rename.replace]]
field = "/interfaces/interface/state/counters/carrier-transitions"
dest = "carrier-transitions"

[[processors.rename.replace]]
field = "/interfaces/interface/state/counters/in-broadcast-pkts"
dest = "in-broadcast-pkts"

[[processors.rename.replace]]
field = "/interfaces/interface/state/counters/in-errors"
dest = "in-errors"

[[processors.rename.replace]]
field = "/interfaces/interface/state/counters/in-multicast-pkts"
dest = "in-multicast-pkts"

[[processors.rename.replace]]
field = "/interfaces/interface/state/counters/in-octets"
dest = "in-octets"

[[processors.rename.replace]]
field = "/interfaces/interface/state/counters/in-pkts"
dest = "in-pkts"

[[processors.rename.replace]]
field = "/interfaces/interface/state/counters/in-unicast-pkts"
dest = "in-unicast-pkts"

[[processors.rename.replace]]
field = "/interfaces/interface/state/counters/out-broadcast-pkts"
dest = "out-broadcast-pkts"

[[processors.rename.replace]]
field = "/interfaces/interface/state/counters/out-discards"
dest = "out-discards"

[[processors.rename.replace]]
field = "/interfaces/interface/state/counters/out-errors"
dest = "out-errors"

[[processors.rename.replace]]
field = "/interfaces/interface/state/counters/out-multicast-pkts"
dest = "out-multicast-pkts"

[[processors.rename.replace]]
field = "/interfaces/interface/state/counters/out-octets"
dest = "out-octets"

[[processors.rename.replace]]
field = "/interfaces/interface/state/counters/out-pkts"
dest = "out-pkts"

# Per-queue output counters (source keys carry an "out-queue/" prefix).

[[processors.rename.replace]]
field = "/interfaces/interface/state/counters/out-queue/-avg-buffer-occupancy"
dest = "avg-buffer-occupancy"

[[processors.rename.replace]]
field = "/interfaces/interface/state/counters/out-queue/-bytes"
dest = "bytes"

[[processors.rename.replace]]
field = "/interfaces/interface/state/counters/out-queue/-cur-buffer-occupancy"
dest = "cur-buffer-occupancy"

[[processors.rename.replace]]
field = "/interfaces/interface/state/counters/out-queue/-peak-buffer-occupancy"
dest = "peak-buffer-occupancy"

[[processors.rename.replace]]
field = "/interfaces/interface/state/counters/out-queue/-pkts"
dest = "pkts"

[[processors.rename.replace]]
field = "/interfaces/interface/state/counters/out-queue/-red-drop-bytes"
dest = "red-drop-bytes"

[[processors.rename.replace]]
field = "/interfaces/interface/state/counters/out-queue/-red-drop-pkts"
dest = "red-drop-pkts"

[[processors.rename.replace]]
field = "/interfaces/interface/state/counters/out-queue/-tail-drop-pkts"
dest = "tail-drop-pkts"

[[processors.rename.replace]]
field = "/interfaces/interface/state/counters/out-queue/allocated-buffer-size-ping"
dest = "allocated-buffer-size-ping"

[[processors.rename.replace]]
tag = "/interfaces/interface/state/counters/out-queue/@queue-number"
dest = "queue-number"

[[processors.rename.replace]]
field = "/interfaces/interface/state/counters/out-unicast-pkts"
dest = "out-unicast-pkts"

[[processors.rename.replace]]
field = "/interfaces/interface/state/high-speed"
dest = "high-speed"

[[processors.rename.replace]]
field = "/interfaces/interface/state/oper-status"
dest = "oper-status"

[[processors.rename.replace]]
field = "/interfaces/interface/state/parent_ae_name"
dest = "parent-ae-name"

# --- Interfaces description sensor: interface name tag ---

[[processors.rename.replace]]
tag = "/interfaces/interface/@name"
dest = "interface-name"

# --- CPU and NPU sensor: component / property tags and value ---

[[processors.rename.replace]]
tag = "/components/component/propertiesproperty/@name"
dest = "property-name"

[[processors.rename.replace]]
tag = "/components/component/@name"
dest = "component-name"

[[processors.rename.replace]]
field = "/components/component/propertiesproperty/state/value"
dest = "property-value"

# --- LSPs sensor: tunnel counters and tags ---

[[processors.rename.replace]]
field = "/mpls/lsps/constrained-path/tunnels/tunnel/state/counters/bytes"
dest = "bytes"

[[processors.rename.replace]]
field = "/mpls/lsps/constrained-path/tunnels/tunnel/state/counters/packets"
dest = "packets"

[[processors.rename.replace]]
tag = "/mpls/lsps/constrained-path/tunnels/tunnel/@name"
dest = "tunnel-name"

[[processors.rename.replace]]
tag = "/mpls/lsps/constrained-path/tunnels/tunnel/@source"
dest = "tunnel-source"

[[processors.rename.replace]]
tag = "/mpls/lsps/constrained-path/tunnels/tunnel/state/counters/@name"
dest = "tunnel-state-counters-name"

# Convert the "parent-ae-name" field to a tag.
#
# "parent-ae-name" only exists after processors.rename has renamed
# "/interfaces/interface/state/parent_ae_name", so this processor must run
# after the rename. Telegraf does not guarantee the execution order of
# processors that have no explicit `order`; setting it here places the
# conversion after the rename processor, which has none.
[[processors.converter]]
  order = 1
  [processors.converter.fields]
    tag = ["parent-ae-name"]

# InfluxDB output; namepass limits it to the "interfacesAll" measurement.
[[outputs.influxdb]]
namepass = ["interfacesAll"]
urls = ["http://10.10.10.10:8086"]
database = "telemetry_collection"  # required
precision = "s"
write_consistency = "any"
timeout = "5s"

System info:

Telegraf v1.11 running in Docker; Juniper PTX 10K running NA 17.3.

Steps to reproduce:

  1. Configure telegraf.conf to push interfaces and interface description sensor data to one metric
  2. Run container
  3. Execute this query: select "description", "out-octets" from interfacesAll

Expected behavior:

"out-octets" is from the interfaces sensor and "description" is from the interfaces description sensor. I was expecting to see values for both fields print out together for the same timestamp.

select "description", "out-octets" from interfacesAll
name: interfacesAll
time          description                                    out-octets
----          -----------                                    ----------
1566930757470 description data here                          2458526936904172
1566930757470 description data here                          2610511645004384
1566930757470 description data here                          1184936510558520
1566930757553 description data here                          3820847642682687

Actual behavior:

select "description", "out-octets" from interfacesAll
name: interfacesAll
time          description                                    out-octets
----          -----------                                    ----------
1566930757470                                                2458526936904172
1566930757470                                                2610511645004384
1566930757470                        1184936510558520
1566930757470                                         3820847642682687
1566930757553                                                
1566930757553                                                
1566930760963 description data for router
1566930760963 description data for router
1566930760963 description data for router
1566930760963 description data for router
1566930760963 description data for router

Additional info:

I'm looking to join the interfaces sensor data with interfaces description sensor data. I thought this method of combining two sensors into one metric would accomplish that.

When telegraf starts, does it try to collect data from both sensors concurrently?

daldoyle commented 4 years ago

We have been having trouble getting the JTI plugin to work well, and I think we are seeing the same or a similar issue. I'm neither a Go developer nor a streaming telemetry expert by any stretch, so my code-diving might be wrong, but I suspect it's related to this: https://github.com/influxdata/telegraf/blob/master/plugins/inputs/jti_openconfig_telemetry/openconfig_telemetry.go#L204

The timestamp sent back by the device appears to vary even between sequences sent in the same batch. If my response generates say 10 "sequences" this is problematic because each will have a slightly different timestamp and it appears that a "single" piece of data can bridge multiple sequences. Asking for "/network-instances/network-instance/protocols/protocol/bgp/neighbors/neighbor" as the sensor is a good example of this in my testing.

Below is a rough output I put together from debug logging. Note that the response for A.B.C.D IPv4_Unicast is broken up between sequence 75 and 76, each with a different timestamp. This makes reassembly impossible (?) in telegraf, I think. I have tried the merge aggregator, but since it factors in the timestamp I don't think it will work; perhaps I didn't use it correctly.

Apologies - this is a bit long/ugly. If there's a better way to format I would be happy to do so.

sequence_number: 75
timestamp: 1587480906419
  key: __timestamp__
  uint_value: 1587480906693
  key: __junos_re_stream_creation_timestamp__
  uint_value: 1587480905730
  key: __junos_re_payload_get_timestamp__
  uint_value: 1587480906410
  key: __prefix__
  str_value: /network-instances/network-instance[instance-name='master']/
.....stuff earlier in sequence..........
  key: protocols/protocol/bgp/neighbors/neighbor[neighbor-address='A.B.C.D']/afi-safis/afi-safi[afi-safi-name='IPV4_UNICAST']/state/prefixes/received
  uint_value: 218
  key: protocols/protocol/bgp/neighbors/neighbor[neighbor-address='A.B.C.D']/afi-safis/afi-safi[afi-safi-name='IPV4_UNICAST']/state/prefixes/sent
  uint_value: 20804
sequence_number: 76
timestamp: 1587480906429
  key: __timestamp__
  uint_value: 1587480906786
  key: __junos_re_stream_creation_timestamp__
  uint_value: 1587480905730
  key: __junos_re_payload_get_timestamp__
  uint_value: 1587480906419
  key: __prefix__
  str_value: /network-instances/network-instance[instance-name='master']/
  key: protocols/protocol/bgp/neighbors/neighbor[neighbor-address='A.B.C.D']/afi-safis/afi-safi[afi-safi-name='IPV4_UNICAST']/state/prefixes/installed
  uint_value: 218
  key: protocols/protocol/bgp/neighbors/neighbor[neighbor-address='A.B.C.D']/afi-safis/afi-safi[afi-safi-name='IPV4_UNICAST']/state/prefixes/accepted
  uint_value: 218
srebhan commented 1 year ago

@mohsin106 does this issue still exist in current Telegraf versions?

mohsin106 commented 1 year ago

I haven't tried that sensor path in a while. I'm tied up with another project at the moment. We are trying to move away from the JTI plugin and use gNMI plugin instead.

srebhan commented 1 year ago

@mohsin106 so what should we do with this issue? If you cannot test anymore I'd say we close it!?

mohsin106 commented 1 year ago

@srebhan I agree we should close it. Maybe someone else who experiences this problem in the future can reopen it if need be.