influxdata / telegraf

Agent for collecting, processing, aggregating, and writing metrics, logs, and other arbitrary data.
https://influxdata.com/telegraf
MIT License

Incorrect telegraf service termination during operating system reboot #14834

Closed Erikov-K closed 5 months ago

Erikov-K commented 8 months ago

Relevant telegraf.conf

[global_tags]

[agent]
  interval = "15s"
  round_interval = true
  metric_batch_size = 1000
  metric_buffer_limit = 10000
  collection_jitter = "0s"
  flush_interval = "15s"
  flush_jitter = "0s"
  precision = ""
  debug = true
  quiet = false
  logfile = "/Program Files/Telegraf/telegraf-metrics.log"
  logfile_rotation_max_size = "1MB"
  logfile_rotation_max_archives = 5
  hostname = ""
  omit_hostname = false

[[outputs.prometheus_client]]
  listen = ":9182"
  metric_version = 2

###############################################################################
#                            INPUT PLUGINS                                    #
###############################################################################

[[inputs.win_perf_counters]]

  [[inputs.win_perf_counters.object]]
    ObjectName = "Processor"
    Instances = ["*"]
    Counters = [
      "% Idle Time",
      "% Interrupt Time",
      "% Privileged Time",
      "% User Time",
      "% Processor Time",
      "% DPC Time",
    ]
    Measurement = "win_cpu"

  [[inputs.win_perf_counters.object]]
    ObjectName = "Memory"
    Instances = ["------"]
    Counters = [
      "Available Bytes",
      "Cache Faults/sec",
      "Demand Zero Faults/sec",
      "Page Faults/sec",
      "Pages/sec",
      "Transition Faults/sec",
      "Pool Nonpaged Bytes",
      "Pool Paged Bytes",
      "Standby Cache Reserve Bytes",
      "Standby Cache Normal Priority Bytes",
      "Standby Cache Core Bytes",
    ]
    Measurement = "win_mem"

  [[inputs.win_perf_counters.object]]
    ObjectName = "Paging File"
    Instances = ["_Total"]
    Counters = [
      "% Usage",
    ]
    Measurement = "win_swap"

  [[inputs.win_perf_counters.object]]
    ObjectName = "LogicalDisk"
    Instances = ["*"]
    Counters = [
      "% Idle Time",
      "% Disk Time",
      "% Disk Read Time",
      "% Disk Write Time",
      "% Free Space",
      "Current Disk Queue Length",
      "Free Megabytes",
    ]
    Measurement = "win_disk"

  [[inputs.win_perf_counters.object]]
    ObjectName = "PhysicalDisk"
    Instances = ["*"]
    Counters = [
      "Disk Read Bytes/sec",
      "Disk Write Bytes/sec",
      "Current Disk Queue Length",
      "Disk Reads/sec",
      "Disk Writes/sec",
      "% Disk Time",
      "% Disk Read Time",
      "% Disk Write Time",
    ]
    Measurement = "win_diskio"

  [[inputs.win_perf_counters.object]]
    ObjectName = "Network Interface"
    Instances = ["*"]
    Counters = [
      "Bytes Received/sec",
      "Bytes Sent/sec",
      "Packets Received/sec",
      "Packets Sent/sec",
      "Packets Received Discarded",
      "Packets Outbound Discarded",
      "Packets Received Errors",
      "Packets Outbound Errors",
    ]
    Measurement = "win_net"

  [[inputs.win_perf_counters.object]]
    ObjectName = "System"
    Instances = ["------"]
    Counters = [
      "Context Switches/sec",
      "System Calls/sec",
      "Processor Queue Length",
      "System Up Time",
    ]
    Measurement = "win_system"

 [[inputs.win_perf_counters.object]]
     ObjectName = "MSExchange ADAccess Domain Controllers"
     Counters = ["LDAP Read Time","LDAP Search Time","LDAP Searches Timed Out per Minute","Long Running LDAP Operations/min"]
     Instances = ["*"]
     Measurement = "msexch"

 [[inputs.win_perf_counters.object]]
    ObjectName = "MSExchangeTransport Queues"
    Counters = ["External Aggregate Delivery Queue Length (All External Queues)",
                "Internal Aggregate Delivery Queue Length (All Internal Queues)"]
    Instances = ["_total"]
    Measurement = "msexch_queue"

 [[inputs.win_perf_counters.object]]
    ObjectName = "MSExchange Replication"
    Counters = ["CopyQueueLength",
            "ReplayQueueLength",
            "FailedSuspended",
            "Failed",
            "Suspended"]
    Instances = ["_total"]
    Measurement = "msexch_repl"

 [[inputs.win_perf_counters.object]]
    ObjectName = "MSExchangeWS"
    Counters = ["Total Failed Subscriptions"]
    Instances = ["------"]
    Measurement = "msexch_ews"

 [[inputs.win_perf_counters.object]]
    ObjectName = "MSExchange OWA"
    Counters = ["Current Unique Users",
        "Requests/sec"]
    Instances = ["------"]
    Measurement = "msexch_owa"

 [[inputs.win_perf_counters.object]]
    ObjectName = "MSExchange Availability Service"
    Counters = ["Average Time to Process a Free Busy Request"]
    Instances = ["------"]
    Measurement = "msexch_FreeBusy"

 [[inputs.win_perf_counters.object]]
    ObjectName = "MSExchangeFrontEndTransport SmtpSend"
    Counters = ["Bytes Sent Total",
        "Messages Sent Total"]
    Instances = ["*"]
    Measurement = "msexch_FETransport"

 [[inputs.win_perf_counters.object]]
    ObjectName = "MSExchangeIS Store"
    Counters = ["RPC Average Latency",
        "RPC Requests",
        "RPC Operations/sec"]
    Instances = ["*"]
    Measurement = "msexch_ISStore"

[[inputs.win_services]]
  service_names = [
      "MSExchangeTransport",
      "MSExchangeTransportLogSearch",
      "MSExchangeHMRecovery",
      "MSExchangeIMAP4BE",
      "MSExchangeMitigation",
      "MSExchangePOP3BE",
      "MSExchangeThrottling",
      "MSExchangeServiceHost",
      "MSExchangeFastSearch",
      "MSExchangeRepl",
      "MSExchangeRPC",
      "MSExchangePop3",
      "MSExchangeSubmission", 
      "MSExchangeDelivery",
      "MSExchangeMailboxReplication",
      "MSExchangeMailboxAssistants",
      "MSExchangeIS", 
      "MSExchangeImap4",
      "MSExchangeHM",
      "MSExchangeFrontendTransport",
      "MSExchangeEdgeSync",
      "MSExchangeDiagnostics",
      "MSExchangeAntispamUpdate",
      "MSExchangeADTopology"
  ]

Logs from Telegraf

2024-02-16T10:31:28Z I! Starting Telegraf 1.28.5 brought to you by InfluxData the makers of InfluxDB
2024-02-16T10:31:28Z I! Available plugins: 240 inputs, 9 aggregators, 29 processors, 24 parsers, 59 outputs, 5 secret-stores
2024-02-16T10:31:28Z I! Loaded inputs: win_perf_counters win_services
2024-02-16T10:31:28Z I! Loaded aggregators: 
2024-02-16T10:31:28Z I! Loaded processors: 
2024-02-16T10:31:28Z I! Loaded secretstores: 
2024-02-16T10:31:28Z I! Loaded outputs: prometheus_client
2024-02-16T10:31:28Z I! Tags enabled: host=UK-N2-MBX02
2024-02-16T10:31:28Z I! [agent] Config: Interval:15s, Quiet:false, Hostname:"UK-N2-MBX02", Flush Interval:15s
2024-02-16T10:31:28Z D! [agent] Initializing plugins
2024-02-16T10:31:28Z D! [agent] Connecting outputs
2024-02-16T10:31:28Z D! [agent] Attempting connection to [outputs.prometheus_client]
2024-02-16T10:31:28Z I! [outputs.prometheus_client] Listening on http://[::]:9182/metrics
2024-02-16T10:31:28Z D! [agent] Successfully connected to outputs.prometheus_client
2024-02-16T10:31:28Z D! [agent] Starting service inputs
2024-02-16T10:31:32Z D! [inputs.win_perf_counters] Gathering from localhost
2024-02-16T10:31:32Z D! [inputs.win_perf_counters] Gathering from localhost finished in 512.5µs
2024-02-16T10:31:43Z D! [outputs.prometheus_client] Wrote batch of 62 metrics in 0s
2024-02-16T10:31:43Z D! [outputs.prometheus_client] Buffer fullness: 0 / 10000 metrics
2024-02-16T10:31:45Z D! [inputs.win_perf_counters] Gathering from localhost
2024-02-16T10:31:45Z D! [inputs.win_perf_counters] Gathering from localhost finished in 511.1µs
2024-02-16T10:31:58Z D! [outputs.prometheus_client] Wrote batch of 62 metrics in 0s
2024-02-16T10:31:58Z D! [outputs.prometheus_client] Buffer fullness: 0 / 10000 metrics
2024-02-16T10:32:00Z D! [inputs.win_perf_counters] Gathering from localhost
2024-02-16T10:32:00Z D! [inputs.win_perf_counters] Gathering from localhost finished in 511.7µs
2024-02-16T10:32:13Z D! [outputs.prometheus_client] Wrote batch of 62 metrics in 0s
2024-02-16T10:32:13Z D! [outputs.prometheus_client] Buffer fullness: 0 / 10000 metrics
2024-02-16T10:32:15Z D! [inputs.win_perf_counters] Gathering from localhost
2024-02-16T10:32:15Z D! [inputs.win_perf_counters] Gathering from localhost finished in 0s
2024-02-16T10:32:28Z D! [outputs.prometheus_client] Wrote batch of 62 metrics in 0s
2024-02-16T10:32:28Z D! [outputs.prometheus_client] Buffer fullness: 0 / 10000 metrics
2024-02-16T10:32:30Z D! [inputs.win_perf_counters] Gathering from localhost
2024-02-16T10:32:30Z D! [inputs.win_perf_counters] Gathering from localhost finished in 503.5µs
2024-02-16T10:32:43Z D! [outputs.prometheus_client] Wrote batch of 62 metrics in 0s
2024-02-16T10:32:43Z D! [outputs.prometheus_client] Buffer fullness: 0 / 10000 metrics
2024-02-16T10:32:46Z D! [inputs.win_perf_counters] Gathering from localhost
2024-02-16T10:32:46Z D! [inputs.win_perf_counters] Gathering from localhost finished in 511µs
2024-02-16T10:32:58Z D! [outputs.prometheus_client] Wrote batch of 62 metrics in 0s
2024-02-16T10:32:58Z D! [outputs.prometheus_client] Buffer fullness: 0 / 10000 metrics
2024-02-16T10:33:00Z D! [inputs.win_perf_counters] Gathering from localhost
2024-02-16T10:33:00Z D! [inputs.win_perf_counters] Gathering from localhost finished in 513.5µs
2024-02-16T10:33:10Z D! [agent] Stopping service inputs
2024-02-16T10:33:10Z D! [agent] Input channel closed
2024-02-16T10:33:10Z I! [agent] Hang on, flushing any cached metrics before shutdown
2024-02-16T10:33:10Z D! [outputs.prometheus_client] Wrote batch of 62 metrics in 1.3935ms
2024-02-16T10:33:10Z D! [outputs.prometheus_client] Buffer fullness: 0 / 10000 metrics
2024-02-16T10:33:10Z I! [agent] Stopping running outputs
2024-02-16T10:33:10Z D! [agent] Stopped Successfully

System info

Telegraf 1.28.5, Windows Server 2022, Exchange 2019 CU13

Docker

No response

Steps to reproduce

I have a strange issue where the server hangs during an operating system reboot at the "stopping services" step. My environment consists of the following components:

When the telegraf service is running, the server hangs during the reboot process at the "stopping services" step for 10-15 minutes. In the Windows System log at this time I can see that various services go into the stopped state. I can also see in the telegraf log file "I! [agent] Hang on, flushing any cached metrics before shutdown" and "I! [agent] Stopping running outputs". But the Windows System log does not contain a record about the telegraf service stopping.

At this time, if I connect to the server (using Enter-PSSession) and run get-service telegraf, I see that the telegraf service state is Running. Then I run stop-service telegraf, the server reboots immediately, and in the System event log I see the record "The telegraf service entered the stopped state." But if I do not stop the telegraf service manually during the reboot process, the server stays in the "stopping services" state for 10-15 minutes...
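Roughly, the manual intervention looks like this (a sketch; the hostname and service name are the ones from my environment):

Enter-PSSession -ComputerName UK-N2-MBX02
Get-Service -Name telegraf      # still reports Running, even though the telegraf log already says "Stopped Successfully"
Stop-Service -Name telegraf     # after this, the pending reboot continues immediately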

The state of the Exchange services (running or stopped) does not affect the telegraf service state during the system reboot process.

Can you reproduce this case and tell me why the telegraf service does not stop correctly?

Expected behavior

A record stating that the telegraf service stopped successfully must appear in the Windows System event log when the operating system goes through the reboot process.

Actual behavior

When the operating system receives the reboot command, the telegraf log records that the agent shut down, but the telegraf service continues running.

Additional info

No response

powersj commented 8 months ago

Hi,

Can you reproduce this case and tell me why the telegraf service does not stop correctly?

I do not have access to Windows Server or Exchange, so I will not be able to reproduce on a similar system. I do have access to Windows 11 and I do not see any difference in reboot time between before the service was installed, while the service was installed, and again after uninstalling the service.

My suggestion to you is to do the following:

  1. Does this reproduce on the CLI? Namely, run telegraf for a few collection cycles and then hit Ctrl-C. Does it close relatively quickly? (A rough sketch is below this list.)
  2. Start removing plugins from your config and see if you can narrow down whether one particular plugin is causing the issue. Start with your single output, switch to say [[outputs.file]] and see if that helps. Then start removing the perf counters one by one.
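For step 1, something like this should work from PowerShell (a sketch; the paths are assumptions based on your config and may need adjusting):

& "C:\Program Files\Telegraf\telegraf-metrics.exe" --config "C:\Program Files\Telegraf\telegraf-metrics.conf" --debug
# let it run a few collection intervals, then press Ctrl-C and note how long it takes to exit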
Erikov-K commented 8 months ago

Hi!

If I start telegraf from the CLI or PowerShell, it works as expected. I can terminate the telegraf process using Ctrl + C and it terminates successfully.

Following your recommendations, I replaced the output plugin with [[outputs.file]] (instead of [[outputs.prometheus_client]]) and kept only one Processor object from [[inputs.win_perf_counters.object]].

Telegraf config file now looks like this:

[global_tags]

[agent]
  interval = "15s"
  round_interval = true
  metric_batch_size = 1000
  metric_buffer_limit = 10000
  collection_jitter = "0s"
  flush_interval = "15s"
  flush_jitter = "0s"
  precision = ""
  debug = true
  quiet = false
  logfile = "/Program Files/Telegraf/telegraf-metrics.log"
  logfile_rotation_max_size = "1MB"
  logfile_rotation_max_archives = 5
  hostname = ""
  omit_hostname = false

[[outputs.file]]
  files = ["C:/Program Files/Telegraf/telegraf-metrics_outputs.file.txt"]
  data_format = "influx"

[[inputs.win_perf_counters]]

  [[inputs.win_perf_counters.object]]
    ObjectName = "Processor"
    Instances = ["*"]
    Counters = [
      "% Idle Time",
    ]
    Measurement = "win_cpu"

Then I started the telegraf service and sent the server a reboot command (delayed by 30 seconds).

Below are the steps you can use to reproduce my case.

Step 1

[DateTime]::UtcNow.ToString('u')
Get-Service -name telegraf-metrics | Stop-Service -PassThru -Verbose

Step 2

Change telegraf-config

Step 3

[DateTime]::UtcNow.ToString('u')
Get-Service -name telegraf-metrics | Start-Service -PassThru -Verbose

Step 4

[DateTime]::UtcNow.ToString('u')
Invoke-Expression -Command "shutdown /r /t 30"

Step 5

[DateTime]::UtcNow.ToString('u')
Get-Service -name telegraf-metrics

Attachments:

Erikov-K commented 8 months ago

Hi!

I did some research.

I rolled the telegraf version back to earlier releases one by one, and it was experimentally determined that starting from version v1.18.1 the problem reproduces consistently.

On version 1.18.0, the telegraf service is terminated correctly by the operating system and the reboot proceeds without freezing. When the server is rebooted with the telegraf service on version 1.18.0, Event 7036 appears in the operating system event log with information about the successful stop of the telegraf service: “The Telegraf for Metrics service entered the stopped state.”

powersj commented 8 months ago

I am still not seeing this with your commands.

(screenshot attached)

My system rebooted as expected 30 seconds after issuing the reboot command.

Going to need something else to show that this is actually a telegraf issue.

Erikov-K commented 8 months ago

Hi!

The problem with rebooting the operating system exists on servers with MS Exchange installed. As you can see above (my previous message), on telegraf versions below 1.18.1 the reboot problem does not exist.

What changed from version 1.18.0 to version 1.18.1?

powersj commented 8 months ago

The problem with rebooting the operating system exists on servers with MS Exchange installed.

The config you provided did not even interact with Exchange, right?

What was changed from version 1.8.0 to version 1.8.1

that version is from 8 years ago :D and there appear to be no differences related to your report: https://github.com/influxdata/telegraf/compare/1.8.0...1.8.1

Erikov-K commented 8 months ago

Instead of laughing at me, you can install the MS Exchange server role and reproduce this problem.

The telegraf service, running in this environment with an "empty" config, does not allow the operating system to reboot correctly. If I manually stop the telegraf service before rebooting the OS, everything works as expected.

We can set up a video meeting where I will show you this strange case. Are you ready to invest some time in it?

powersj commented 8 months ago

Instead of laughing at me, you can install the MS Exchange server role and reproduce this problem.

I'm not sure why you think I am laughing at you, but I certainly was not.

Are you ready to invest some time in it?

If you are going to be dismissive of this entire conversation and the back and forth we have had, then I am happy to close this issue. I have both tried to reproduce this issue with what I have available to me and worked to get additional details from you.

As I started off with, I do not have access to exchange.

I am not sure what else a video or a video call will provide in terms of resolving the issue. I need something to point to an actual issue in Telegraf itself.

You said this only occurs a) when telegraf is run as a service and b) when Exchange is installed. However, that does not actually prove an issue with telegraf if an external service that we are not interacting with also needs to be installed. The minimal config you provided collects processor stats, nothing from Exchange.

If you remove the win_perf_counters and replace it with something like [[inputs.cpu]], does it still occur? That would help answer whether it is related to an input collecting metrics from Windows or whether it is due to something else.

powersj commented 8 months ago

The telegraf service, running in this environment with an "empty" config, does not allow the operating system to reboot correctly.

Let me also ask how are you creating the service? Because this would also support the idea that telegraf itself is not at fault here and that something with how the service is managed or created is causing issues.

Erikov-K commented 8 months ago

If you remove the win_perf_counters and replace it with something like [[inputs.cpu]], does it still occur?

I replaced the input plugin, as you recommended, and nothing changed. :(

Let me also ask how are you creating the service?

The service was installed with the following PowerShell command:

. "C:\Program Files\Telegraf\telegraf-metrics.exe" --service install --config "C:\Program Files\Telegraf\telegraf-metrics.conf" --service-name="telegraf-metrics" --service-display-name="Telegraf for Metrics"

Erikov-K commented 8 months ago

I am trying to understand why the telegraf service, running on version 1.18.0 with the same "empty" telegraf config (which does not collect any Exchange metrics), does not prevent the operating system reboot process and successfully registers a record in the system event log: "The Telegraf for Metrics service entered the stopped state."

But if I stop the telegraf service, replace the binary with version 1.18.1 and start the service again, the operating system reboot process waits in the "stopping services" state (the system event log after the reboot contains no record that the telegraf service was stopped).

powersj commented 8 months ago

running on version 1.18.0

I see you updated your previous message ;) you previously said 1.8, not 1.18. The diff is a little different now: https://github.com/influxdata/telegraf/compare/v1.18.0...v1.18.1

There is one addition to the agent where we attempt to close the outputs. A way you can test whether this is the issue is to update your [[outputs.file]] config. Instead of logging to a file, just use stderr. So instead of:

[[outputs.file]]
files = ["C:/Program Files/Telegraf/telegraf-metrics_outputs.file.txt"]
data_format = "influx"

use

[[outputs.file]]

Update the service with that new config and give it a try.

Thanks!

Erikov-K commented 8 months ago

Hi!

I apologize for misleading you. I am already a little confused about which version of telegraf works correctly in my case and which does not.

Note: when I said 1.18.0, I meant the pre-release 1.18.0rc0, because if you scroll through the list of releases you will not see zip archives for Windows for version 1.18.0, only for 1.18.0rc0.

But if you open the URL of each release directly, you can find the zip archive for Windows there.

I double-checked everything again by running repeated tests and can see that my problem starts between 1.18.0rc0 and 1.18.0rc1. Both versions write records to the operating system event log, where I can see the actual time when the telegraf service was stopped.

1.18.0rc0 - quick reboot.
23.02.2024 11:29:11 - EventID 1074. "The process C:\Windows\system32\wbem\wmiprvse.exe (UK-N2-MBX02) has initiated the restart of computer UK-N2-MBX02..."
23.02.2024 11:29:15 - EventID 7036. "The Telegraf for Metrics service entered the stopped state."
23.02.2024 11:32:47 - EventID 6005. "The Event log service was started."

1.18.0rc1 - long reboot.
23.02.2024 11:42:08 - EventID 1074. "The process C:\Windows\system32\wbem\wmiprvse.exe (UK-N2-MBX02) has initiated the restart of computer UK-N2-MBX02..."
23.02.2024 11:52:12 - EventID 7036. "The Telegraf for Metrics service entered the stopped state."
23.02.2024 11:55:43 - EventID 6005. "The Event log service was started."
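For reference, I pulled these records with a rough PowerShell query like this (a sketch; event IDs as above, filter text may need adjusting):

Get-WinEvent -FilterHashtable @{ LogName = 'System'; Id = 1074, 7036, 6005 } -MaxEvents 50 |
    Where-Object { $_.Message -match 'Telegraf|restart|Event log' } |
    Select-Object TimeCreated, Id, Message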

Do you have any ideas about what the reason could be?

powersj commented 8 months ago

my problem starts between 1.18.0rc0 and 1.18.0rc1.

ok so here is the diff: https://github.com/influxdata/telegraf/compare/v1.18.0-rc0...v1.18.0-rc1

Some notable changes:

Can you provide the logs from telegraf-metrics.log for both those versions? I'm curious to see the final set of messages.

Erikov-K commented 8 months ago

Hi!

Can you provide the logs from telegraf-metrics.log for both those versions?

Yes, of course! I reproduced the steps (start service -> reboot) on 1.18.0rc0 and 1.18.0rc1 with the same telegraf config. After each reboot completed, I saved the collected telegraf logs and system log messages into separate archives (also including the telegraf output and exe file). As you can see below, the telegraf log content is the same on both versions, but the system event log records about the telegraf service stopping are different.

Note: telegraf uses UTC when writing events to its log file, but the operating system uses GMT+7.

1.18.0rc0

.\telegraf-metrics.exe --version
Telegraf 1.18.0 (git: HEAD b9da09d0)

Messages from system log:

23.02.2024 23:31:24    EventID 7036    The Telegraf for Metrics service entered the running state.
23.02.2024 23:32:11    EventID 1074    The process C:\Windows\system32\wbem\wmiprvse.exe (UK-N2-MBX02) has initiated the restart of computer UK-N2-MBX02...
23.02.2024 23:32:17    EventID 7036    The Telegraf for Metrics service entered the stopped state.
23.02.2024 23:35:50    EventID 6005    The Event log service was started.
23.02.2024 23:35:50    EventID 6013    The system uptime is 66 seconds.

Messages from telegraf log:

2024-02-23T16:31:24Z I! Loaded inputs: cpu
2024-02-23T16:31:24Z I! Loaded aggregators: 
2024-02-23T16:31:24Z I! Loaded processors: 
2024-02-23T16:31:24Z I! Loaded outputs: file
2024-02-23T16:31:24Z I! Tags enabled: host=UK-N2-MBX02
2024-02-23T16:31:24Z I! [agent] Config: Interval:15s, Quiet:false, Hostname:"UK-N2-MBX02", Flush Interval:15s
2024-02-23T16:31:24Z D! [agent] Initializing plugins
2024-02-23T16:31:24Z D! [agent] Connecting outputs
2024-02-23T16:31:24Z D! [agent] Attempting connection to [outputs.file]
2024-02-23T16:31:24Z D! [agent] Successfully connected to outputs.file
2024-02-23T16:31:24Z D! [agent] Starting service inputs
2024-02-23T16:31:39Z D! [outputs.file] Buffer fullness: 0 / 10000 metrics
2024-02-23T16:31:54Z D! [outputs.file] Wrote batch of 11 metrics in 0s
2024-02-23T16:31:54Z D! [outputs.file] Buffer fullness: 0 / 10000 metrics
2024-02-23T16:32:09Z D! [outputs.file] Wrote batch of 11 metrics in 0s
2024-02-23T16:32:09Z D! [outputs.file] Buffer fullness: 0 / 10000 metrics
2024-02-23T16:32:15Z D! [agent] Stopping service inputs
2024-02-23T16:32:15Z D! [agent] Input channel closed
2024-02-23T16:32:15Z I! [agent] Hang on, flushing any cached metrics before shutdown
2024-02-23T16:32:15Z D! [outputs.file] Wrote batch of 11 metrics in 0s
2024-02-23T16:32:15Z D! [outputs.file] Buffer fullness: 0 / 10000 metrics
2024-02-23T16:32:15Z D! [agent] Stopped Successfully

1.18.0rc1

.\telegraf-metrics.exe --version
Telegraf 1.18.0 (git: HEAD 14445d5f)

Messages from system log:

24.02.2024 0:00:10    EventID 7036    The Telegraf for Metrics service entered the running state.
24.02.2024 0:01:06    EventID 1074    The process C:\Windows\system32\wbem\wmiprvse.exe (UK-N2-MBX02) has initiated the restart of computer UK-N2-MBX02...
24.02.2024 0:11:11    EventID 7036    The Telegraf for Metrics service entered the stopped state.
24.02.2024 0:14:55    EventID 6005    The Event log service was started.
24.02.2024 0:14:56    EventID 6013    The system uptime is 66 seconds.

Messages from telegraf log:

2024-02-23T17:00:10Z I! Loaded inputs: cpu
2024-02-23T17:00:10Z I! Loaded aggregators: 
2024-02-23T17:00:10Z I! Loaded processors: 
2024-02-23T17:00:10Z I! Loaded outputs: file
2024-02-23T17:00:10Z I! Tags enabled: host=UK-N2-MBX02
2024-02-23T17:00:10Z I! [agent] Config: Interval:15s, Quiet:false, Hostname:"UK-N2-MBX02", Flush Interval:15s
2024-02-23T17:00:10Z D! [agent] Initializing plugins
2024-02-23T17:00:10Z D! [agent] Connecting outputs
2024-02-23T17:00:10Z D! [agent] Attempting connection to [outputs.file]
2024-02-23T17:00:10Z D! [agent] Successfully connected to outputs.file
2024-02-23T17:00:10Z D! [agent] Starting service inputs
2024-02-23T17:00:25Z D! [outputs.file] Buffer fullness: 0 / 10000 metrics
2024-02-23T17:00:40Z D! [outputs.file] Wrote batch of 11 metrics in 514.7µs
2024-02-23T17:00:40Z D! [outputs.file] Buffer fullness: 0 / 10000 metrics
2024-02-23T17:00:55Z D! [outputs.file] Wrote batch of 11 metrics in 0s
2024-02-23T17:00:55Z D! [outputs.file] Buffer fullness: 0 / 10000 metrics
2024-02-23T17:01:09Z D! [agent] Stopping service inputs
2024-02-23T17:01:09Z D! [agent] Input channel closed
2024-02-23T17:01:09Z I! [agent] Hang on, flushing any cached metrics before shutdown
2024-02-23T17:01:09Z D! [outputs.file] Wrote batch of 11 metrics in 0s
2024-02-23T17:01:09Z D! [outputs.file] Buffer fullness: 0 / 10000 metrics
2024-02-23T17:01:09Z D! [agent] Stopped Successfully

Attachments:

powersj commented 8 months ago

2024-02-23T16:32:15Z D! [agent] Stopped Successfully
2024-02-23T17:01:09Z D! [agent] Stopped Successfully

Both of these messages from the telegraf binary appear in a timely fashion, meaning telegraf got the signal to shut down and completed the shutdown steps successfully.

For v1.18.0rc1, if I combine the logs to make it easier to see the delay:

24.02.2024 0:01:06     EventID 1074    The process C:\Windows\system32\wbem\wmiprvse.exe (UK-N2-MBX02) has initiated the restart of computer UK-N2-MBX02...
2024-02-23T17:01:09Z D! [agent] Stopping service inputs
2024-02-23T17:01:09Z D! [agent] Input channel closed
2024-02-23T17:01:09Z I! [agent] Hang on, flushing any cached metrics before shutdown
2024-02-23T17:01:09Z D! [outputs.file] Wrote batch of 11 metrics in 0s
2024-02-23T17:01:09Z D! [outputs.file] Buffer fullness: 0 / 10000 metrics
2024-02-23T17:01:09Z D! [agent] Stopped Successfully
24.02.2024 0:11:11     EventID 7036    The Telegraf for Metrics service entered the stopped state.
24.02.2024 0:14:55     EventID 6005    The Event log service was started.
24.02.2024 0:14:56     EventID 6013    The system uptime is 66 seconds.

There are no changes in the diff between those versions, other than the Go version, that I can think of that would cause some sort of delay. Given you said this only occurs with Exchange, my hunch is that the way the service is created or run, or something else on the system like a virus scanner, is causing issues for you.

Questions:

  1. Instead of using Telegraf to create the service, have you tried using sc.exe and/or something like nssm? (A rough sketch is below this list.)
  2. Do you have a virus scanner or something similar on the system? Does it report issues? Can you disable it?
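For question 1, an untested sc.exe sketch (the paths are taken from your earlier install command; the quoting of binPath may need adjusting in PowerShell):

sc.exe create telegraf-metrics start= auto DisplayName= "Telegraf for Metrics" `
    binPath= '"C:\Program Files\Telegraf\telegraf-metrics.exe" --config "C:\Program Files\Telegraf\telegraf-metrics.conf"'
# note: sc.exe requires a space after each option's '='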

ZnakVA commented 8 months ago

Hi, @powersj! I'm researching this problem too. Answers to your latest questions:

  1. A virus scanner is not installed on this server.
  2. Creating the Telegraf service via sc.exe does not change anything.
powersj commented 8 months ago

@ZnakVA,

Can you provide some details on your system? Windows version? Do you also have Exchange installed, or any other services?

How did you determine that it was the telegraf service getting hung?

ZnakVA commented 8 months ago

@powersj,

All details were provided earlier by Erikov-K. We are researching one and the same problem together.

powersj commented 8 months ago

We are researching one and the same problem together.

Ah you didn't mention that :)

My only remaining thought would be that it has to do with a change in Go itself, as between those two versions we upgraded to Go v1.16. You could try building telegraf v1.18.0-rc0 with Go v1.16, and if it reproduces, that would mean this is something with upstream Go's service management code.
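Roughly, something along these lines (a sketch, not exact build instructions):

git clone https://github.com/influxdata/telegraf.git
cd telegraf
git checkout v1.18.0-rc0
go version                 # confirm which Go toolchain is on PATH (1.15.x vs 1.16)
go build .\cmd\telegraf    # produces telegraf.exe in the current directory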

Otherwise, without the ability to reproduce or any additional logs, I am out of ideas.

ZnakVA commented 7 months ago

@powersj,

I compiled two different executables using different versions of Go (1.16 and 1.15.8) from the same telegraf-1.18.0-rc0 sources; the result is:

with Go version 1.16 - long reboot
with Go version 1.15.8 - fast reboot

powersj commented 7 months ago

with Go version 1.16 - long reboot
with Go version 1.15.8 - fast reboot

Thanks for trying that out. That would point at this being a change in the upstream Go library that we use to create services. The next step would be to look at what changed between those versions with respect to the service calls we make.
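Something like this could scope that search (a sketch; the paths are guesses at where the relevant code and dependency pins live):

# list commits between the two release candidates touching the agent, entry point, or dependencies
git log --oneline v1.18.0-rc0..v1.18.0-rc1 -- agent cmd/telegraf go.mod go.sum
# check whether the Windows service wrapper dependency was bumped
git diff v1.18.0-rc0 v1.18.0-rc1 -- go.mod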

ZnakVA commented 7 months ago

Additionally,

ZnakVA commented 6 months ago

Additionally,

srebhan commented 5 months ago

@Erikov-K and @ZnakVA, I reworked the service handling and we should now have more control over the whole process. Can you please try the binary in PR #15372 and let me know how it behaves on your system?
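One rough way to sanity-check the shutdown timing with that binary (a sketch; service name as you configured it earlier):

Measure-Command { Stop-Service -Name telegraf-metrics }    # should complete in seconds, not minutes
Get-WinEvent -FilterHashtable @{ LogName = 'System'; Id = 7036 } -MaxEvents 5 |
    Where-Object { $_.Message -match 'Telegraf' } |
    Select-Object TimeCreated, Message
Start-Service -Name telegraf-metrics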

ZnakVA commented 4 months ago

Hello @srebhan, @powersj! I'm sorry, I didn't have a chance to check earlier. With the new binary from PR https://github.com/influxdata/telegraf/pull/15372, rebooting is fast!!! Thank you very much for your support!!! In which official version of Telegraf can we expect these changes?

srebhan commented 4 months ago

The fix was already merged and released in v1.31.0 (check the milestone in the PR), which was published two weeks ago. ;-) Thanks for testing!