treasure-data / omnibus-td-agent

td-agent (Fluentd) Packaging Scripts
https://docs.treasuredata.com/articles/td-agent-changelog
Apache License 2.0
82 stars 131 forks source link

td-agent takes 2 minutes to stop if the InfluxDB IP is incorrect #281

Open bhakta0007 opened 3 years ago

bhakta0007 commented 3 years ago
root@bhakta-edge1:/lib/systemd/system# dpkg -l | grep td-
ii  td-agent                              3.8.1-0                            amd64        Treasure Agent: A data collector for Treasure Data

service td-agent stop or service td-agent restart on Ubuntu takes 2 minutes for the service to stop when I have an incorrect IP address for the InfluxDB host.

<match app.agent>
  @type influxdb
  host  192.168.31.31
  port  8086
  dbname at_logs
  user  fluentd
  password  maskedPassword@123
  measurement my_measurement
  use_ssl false
  time_precision ns
  tag_keys ["levelname", "host"]
  sequence_tag _seq
  <buffer>
    @type memory
    chunk_limit_records 1024
    flush_interval 30
    retry_wait 1.0
  </buffer>
</match>

strace of the process shows it's waiting to connect (tried this right after starting the td-agent with the above invalid influxdb server config).

write(2, "W, [2021-04-11T11:10:04.063194 #"..., 253) = 253
futex(0x7f28ea84b0d8, FUTEX_WAIT_PRIVATE, 0, {tv_sec=29, tv_nsec=999998676}) = -1 ETIMEDOUT (Connection timed out)
futex(0x7f28ea84b128, FUTEX_WAKE_PRIVATE, 1) = 0
clone(child_stack=0x7f28e8973f70, flags=CLONE_VM|CLONE_FS|CLONE_FILES|CLONE_SIGHAND|CLONE_THREAD|CLONE_SYSVSEM|CLONE_SETTLS|CLONE_PARENT_SETTID|CLONE_CHILD_CLEARTID, parent_tidptr=0x7f28e89749d0, tls=0x7f28e8974700, child_tidptr=0x7f28e89749d0) = 797
socket(AF_INET, SOCK_STREAM|SOCK_CLOEXEC, IPPROTO_TCP) = 10
futex(0x7f28ea84f068, FUTEX_WAKE_PRIVATE, 1) = 1
futex(0x7f28ea84f010, FUTEX_WAKE_PRIVATE, 1) = 1
getpid()                                = 673
getpid()                                = 673
write(6, "!", 1)                        = 1
futex(0x7f28ea84f010, FUTEX_WAKE_PRIVATE, 1) = 1
futex(0x7f28ea84f010, FUTEX_WAKE_PRIVATE, 1) = 0
connect(10, {sa_family=AF_INET, sin_port=htons(8086), sin_addr=inet_addr("192.168.31.31")}, 16) = -1 ECONNREFUSED (Connection refused)
close(10)                               = 0
futex(0x7f28e39f9cd8, FUTEX_WAKE_PRIVATE, 1) = 1
futex(0x7f28e39f9d28, FUTEX_WAKE_PRIVATE, 1) = 1
futex(0x7f28ea84f068, FUTEX_WAKE_PRIVATE, 1) = 1
futex(0x7f28ea84f010, FUTEX_WAKE_PRIVATE, 1) = 1
getpid()                                = 673
write(2, "W, [2021-04-11T11:10:34.068623 #"..., 253) = 253
futex(0x7f28ea84b0d8, FUTEX_WAIT_PRIVATE, 0, {tv_sec=29, tv_nsec=999998803}) = ? ERESTART_RESTARTBLOCK (Interrupted by signal)
--- SIGTERM {si_signo=SIGTERM, si_code=SI_USER, si_pid=660, si_uid=113} ---
getpid()                                = 673
getpid()                                = 673
write(4, "!", 1)                        = 1
rt_sigreturn({mask=[]})                 = -1 EINTR (Interrupted system call)
futex(0x7f28ea84b128, FUTEX_WAKE_PRIVATE, 1) = 0
futex(0x7f28ea84b0dc, FUTEX_WAIT_PRIVATE, 0, {tv_sec=29, tv_nsec=200914940}) = -1 ETIMEDOUT (Connection timed out)
futex(0x7f28ea84b128, FUTEX_WAKE_PRIVATE, 1) = 0
clone(child_stack=0x7f28e8973f70, flags=CLONE_VM|CLONE_FS|CLONE_FILES|CLONE_SIGHAND|CLONE_THREAD|CLONE_SYSVSEM|CLONE_SETTLS|CLONE_PARENT_SETTID|CLONE_CHILD_CLEARTID, parent_tidptr=0x7f28e89749d0, tls=0x7f28e8974700, child_tidptr=0x7f28e89749d0) = 820
socket(AF_INET, SOCK_STREAM|SOCK_CLOEXEC, IPPROTO_TCP) = 10
futex(0x7f28ea84f06c, FUTEX_WAKE_PRIVATE, 1) = 1
futex(0x7f28ea84f010, FUTEX_WAKE_PRIVATE, 1) = 1
getpid()                                = 673
getpid()                                = 673
write(6, "!", 1)                        = 1
futex(0x7f28ea84f010, FUTEX_WAKE_PRIVATE, 1) = 1
futex(0x7f28ea84f010, FUTEX_WAKE_PRIVATE, 1) = 0
connect(10, {sa_family=AF_INET, sin_port=htons(8086), sin_addr=inet_addr("192.168.31.31")}, 16) = -1 ECONNREFUSED (Connection refused)
close(10)                               = 0
futex(0x7f28e38760d8, FUTEX_WAKE_PRIVATE, 1) = 1
futex(0x7f28e3876128, FUTEX_WAKE_PRIVATE, 1) = 1
futex(0x7f28ea84f06c, FUTEX_WAKE_PRIVATE, 1) = 1
futex(0x7f28ea84f010, FUTEX_WAKE_PRIVATE, 1) = 1
getpid()                                = 673
write(2, "W, [2021-04-11T11:11:04.073785 #"..., 253) = 253
futex(0x7f28ea84b0dc, FUTEX_WAIT_PRIVATE, 0, {tv_sec=29, tv_nsec=999998510}) = ?
+++ killed by SIGKILL +++

And then finally it's killed by SIGKILL due to the

TimeoutStopSec=120 setting that is present in /lib/systemd/system/td-agent.service.