influxdata / telegraf

Agent for collecting, processing, aggregating, and writing metrics, logs, and other arbitrary data.
https://influxdata.com/telegraf
MIT License
14.89k stars 5.6k forks source link

[BUG] Input procstat missing out on collection interval (1s) #2315

Closed discoduck2x closed 7 years ago

discoduck2x commented 7 years ago

Versions: influxdb 1.1.1 , telegraf 1.1.2

telegraf.conf:

interval = "1s"

[[inputs.procstat]] pattern = "influxdb"

server is not under heavy load, just trying to get cpu usage from the influxdb process , not getting cpu usage every second but rather here and there as show in picture:

image

sparrc commented 7 years ago

do you have collection jitter set? can you provide your full config file? can you try using a pidfile and see if you get the same result?

discoduck2x commented 7 years ago

@sparrc it seems to me to be related to values below 1 , as this grafana graph shows , the top is distinct cpu_usage values, the bottom is the count - both grouped by time 1second. If i refresh grafana manually thus inducing load on the influxdb process then i get one sample every second,, but then the influxdb process is idling - there are gaps. telegraf.conf below image:

image

# Global tags can be specified here in key="value" format.
[global_tags]
  # dc = "us-east-1" # will tag all metrics with dc=us-east-1
  # rack = "1a"
  ## Environment variables can be used as tags, and throughout the config file
  # user = "$USER"

# Configuration for telegraf agent
[agent]
  ## Default data collection interval for all inputs
  interval = "1s"
  ## Rounds collection interval to 'interval'
  ## ie, if interval="10s" then always collect on :00, :10, :20, etc.
  round_interval = true

  ## Telegraf will send metrics to outputs in batches of at most
  ## metric_batch_size metrics.
  ## This controls the size of writes that Telegraf sends to output plugins.
  metric_batch_size = 1000

  ## For failed writes, telegraf will cache metric_buffer_limit metrics for each
  ## output, and will flush this buffer on a successful write. Oldest metrics
  ## are dropped first when this buffer fills.
  ## This buffer only fills when writes fail to output plugin(s).
  metric_buffer_limit = 10000

  ## Collection jitter is used to jitter the collection by a random amount.
  ## Each plugin will sleep for a random time within jitter before collecting.
  ## This can be used to avoid many plugins querying things like sysfs at the
  ## same time, which can have a measurable effect on the system.
  collection_jitter = "0s"

  ## Default flushing interval for all outputs. You shouldn't set this below
  ## interval. Maximum flush_interval will be flush_interval + flush_jitter
  flush_interval = "10s"
  ## Jitter the flush interval by a random amount. This is primarily to avoid
  ## large write spikes for users running a large number of telegraf instances.
  ## ie, a jitter of 5s and interval 10s means flushes will happen every 10-15s
  flush_jitter = "0s"

  ## By default, precision will be set to the same timestamp order as the
  ## collection interval, with the maximum being 1s.
  ## Precision will NOT be used for service inputs, such as logparser and statsd.
  ## Valid values are "ns", "us" (or "µs"), "ms", "s".
  precision = ""

  ## Logging configuration:
  ## Run telegraf with debug log messages.
  debug = false
  ## Run telegraf in quiet mode (error log messages only).
  quiet = false
  ## Specify the log file name. The empty string means to log to stderr.
  logfile = ""

  ## Override default hostname, if empty use os.Hostname()
  hostname = ""
  ## If set to true, do no set the "host" tag in the telegraf agent.
  omit_hostname = false

###############################################################################
#                            OUTPUT PLUGINS                                   #
###############################################################################

# Configuration for influxdb server to send metrics to
[[outputs.influxdb]]
  ## The full HTTP or UDP endpoint URL for your InfluxDB instance.
  ## Multiple urls can be specified as part of the same cluster,
  ## this means that only ONE of the urls will be written to each interval.
  # urls = ["udp://localhost:8089"] # UDP endpoint example
  urls = ["http://localhost:8086"] # required
  ## The target database for metrics (telegraf will create it if not exists).
  database = "servers_internal" # required

  ## Retention policy to write to. Empty string writes to the default rp.
  retention_policy = ""
  ## Write consistency (clusters only), can be: "any", "one", "quorum", "all"
  write_consistency = "any"

  ## Write timeout (for the InfluxDB client), formatted as a string.
  ## If not provided, will default to 5s. 0s means no timeout (not recommended).
  timeout = "5s"
  # username = "telegraf"
  # password = "metricsmetricsmetricsmetrics"
  ## Set the user agent for HTTP POSTs (can be useful for log differentiation)
  # user_agent = "telegraf"
  ## Set UDP payload size, defaults to InfluxDB UDP Client default (512 bytes)
  # udp_payload = 512

  ## Optional SSL Config
  # ssl_ca = "/etc/telegraf/ca.pem"
  # ssl_cert = "/etc/telegraf/cert.pem"
  # ssl_key = "/etc/telegraf/key.pem"
  ## Use SSL but skip chain & host verification
  # insecure_skip_verify = false

###############################################################################
#                            INPUT PLUGINS                                    #
###############################################################################

# Read metrics about cpu usage
[[inputs.cpu]]
  ## Whether to report per-cpu stats or not
  percpu = true
  ## Whether to report total system cpu stats or not
  totalcpu = true
  ## If true, collect raw CPU time metrics.
  collect_cpu_time = false

# Read metrics about disk usage by mount point
#[[inputs.disk]]
  ## By default, telegraf gather stats for all mountpoints.
  ## Setting mountpoints will restrict the stats to the specified mountpoints.
  # mount_points = ["/"]

  ## Ignore some mountpoints by filesystem type. For example (dev)tmpfs (usually
  ## present on /run, /var/run, /dev/shm or /dev).
 # ignore_fs = ["tmpfs", "devtmpfs"]

# Read metrics about disk IO by device
#[[inputs.diskio]]
  ## By default, telegraf will gather stats for all devices including
  ## disk partitions.
  ## Setting devices will restrict the stats to the specified devices.
  # devices = ["sda", "sdb"]
  ## Uncomment the following line if you need disk serial numbers.
  # skip_serial_number = false

# Get kernel statistics from /proc/stat
#[[inputs.kernel]]
  # no configuration

# Read metrics about memory usage
#[[inputs.mem]]
  # no configuration

# Get the number of processes and group them by status
#[[inputs.processes]]
  # no configuration

# Read metrics about swap memory usage
#[[inputs.swap]]
  # no configuration

# Read metrics about system load & uptime
#[[inputs.system]]
  # no configuration

#[[inputs.procstat]]
#pattern = "logstash"
#fielddrop = ["cpu_time_*","i*","m*","n*","p*","r*","v*","w*"]

#[[inputs.procstat]]
#pattern = "elasticsearch"
#fielddrop = ["cpu_time_*","i*","m*","n*","p*","r*","v*","w*"]

#[[inputs.procstat]]
#pattern = "telegraf"
#fielddrop = ["cpu_time_*","i*","m*","n*","p*","r*","v*","w*"]

[[inputs.procstat]]
pattern = "influxdb"
#fielddrop = ["cpu_time_*","i*","m*","n*","p*","r*","v*","w*"]
discoduck2x commented 7 years ago

@sparrc - same result with pid/exe options aswell