Closed kingnarmer closed 1 year ago
Here is strace output of fluentbit while issue is taking place.
[pid 4276] epoll_wait(8, [], 256, 0) = 0
[pid 4276] epoll_wait(8, <unfinished ...>
[pid 4259] <... clock_nanosleep resumed>0x7ffe585ffa00) = 0
[pid 4319] <... clock_nanosleep resumed>0x7fbd84ff4990) = 0
[pid 4259] clock_nanosleep(CLOCK_REALTIME, 0, {tv_sec=1, tv_nsec=0}, <unfinished ...>
[pid 4319] clock_nanosleep(CLOCK_REALTIME, 0, {tv_sec=1, tv_nsec=0}, <unfinished ...>
[pid 4320] <... epoll_wait resumed>[{EPOLLIN, {u32=2208346240, u64=140451933892736}}], 256, -1) = 1
[pid 4320] accept4(85, {sa_family=AF_INET, sin_port=htons(33088), sin_addr=inet_addr("10.42.6.245")}, [16], SOCK_CLOEXEC|SOCK_NONBLOCK) = 196
[pid 4320] epoll_ctl(82, EPOLL_CTL_ADD, 196, {EPOLLIN|EPOLLERR|EPOLLHUP|EPOLLRDHUP, {u32=2208661504, u64=140451934208000}}) = 0
[pid 4320] epoll_wait(82, [{EPOLLIN, {u32=2208661504, u64=140451934208000}}], 256, -1) = 1
[pid 4320] recvfrom(196, "GET /api/v1/metrics/prometheus H"..., 4096, 0, NULL, NULL) = 256
[pid 4320] lstat("/api/v1/metrics/prometheus", 0x7fbd847f3260) = -1 ENOENT (No such file or directory)
[pid 4320] writev(196, [{iov_base="HTTP/1.1 200 OK\r\n", iov_len=17}, {iov_base="Server: Monkey/1.7.0\r\nDate: Sun,"..., iov_len=59}, {iov_base="Transfer-Encoding: chunked\r\n", iov_len=28}], 3) = 104
[pid 4320] writev(196, [{iov_base="Content-Type: text/plain; versio"..., iov_len=41}, {iov_base="\r\n", iov_len=2}], 2) = 43
[pid 4320] sendto(196, "CAF\r\n", 5, 0, NULL, 0) = 5
[pid 4320] epoll_ctl(82, EPOLL_CTL_DEL, 196, NULL) = 0
[pid 4320] sendto(196, "# HELP fluentbit_filter_add_reco"..., 3247, 0, NULL, 0) = 3247
[pid 4320] sendto(196, "\r\n", 2, 0, NULL, 0) = 2
[pid 4320] epoll_ctl(82, EPOLL_CTL_ADD, 196, {EPOLLOUT|EPOLLERR|EPOLLHUP|EPOLLRDHUP, {u32=2208661504, u64=140451934208000}}) = 0
[pid 4320] epoll_wait(82, [{EPOLLOUT, {u32=2208661504, u64=140451934208000}}], 256, -1) = 1
[pid 4320] epoll_ctl(82, EPOLL_CTL_DEL, 196, NULL) = 0
[pid 4320] sendto(196, "0\r\n\r\n", 5, 0, NULL, 0) = 5
[pid 4320] epoll_ctl(82, EPOLL_CTL_ADD, 196, {EPOLLOUT|EPOLLERR|EPOLLHUP|EPOLLRDHUP, {u32=2208661504, u64=140451934208000}}) = 0
[pid 4320] epoll_wait(82, [{EPOLLOUT, {u32=2208661504, u64=140451934208000}}], 256, -1) = 1
[pid 4320] epoll_ctl(82, EPOLL_CTL_DEL, 196, NULL) = 0
[pid 4320] epoll_ctl(82, EPOLL_CTL_ADD, 196, {EPOLLIN|EPOLLERR|EPOLLHUP|EPOLLRDHUP, {u32=2208661504, u64=140451934208000}}) = 0
[pid 4320] epoll_wait(82, <unfinished ...>
[pid 4276] <... epoll_wait resumed>[{EPOLLIN, {u32=2309334320, u64=140452034880816}}], 256, -1) = 1
[pid 4276] read(30, "8\0\0\0\2\0\0\0\0\0\0\0\0\0\0\0", 16) = 16
[pid 4276] fstat(110, {st_mode=S_IFREG|0640, st_size=66118024, ...}) = 0
[pid 4276] fstat(110, {st_mode=S_IFREG|0640, st_size=66118024, ...}) = 0
[pid 4276] read(110, "{\"log\":\"{\\\"timestamp\\\": \\\"2022-1"..., 999999) = 705
[pid 4276] openat(AT_FDCWD, "/buffers/tail.0/1-1666556948.871454490.flb", O_RDWR|O_CREAT, 0600) = 235
[pid 4276] fstat(235, {st_mode=S_IFREG|0600, st_size=1620966, ...}) = 0
[pid 4276] mmap(NULL, 1622016, PROT_READ|PROT_WRITE, MAP_SHARED, 235, 0) = 0x7fbd87072000
[pid 4276] fallocate(235, 0, 0, 1654784) = 0
[pid 4276] mremap(0x7fbd87072000, 1620966, 1654784, MREMAP_MAYMOVE) = 0x7fbd8626b000
[pid 4276] fstat(235, {st_mode=S_IFREG|0600, st_size=1654784, ...}) = 0
[pid 4276] ftruncate(235, 1622547) = 0
[pid 4276] mremap(0x7fbd8626b000, 1654784, 1622547, MREMAP_MAYMOVE) = 0x7fbd8626b000
[pid 4276] msync(0x7fbd8626b000, 1622547, MS_ASYNC) = 0
[pid 4276] fstat(235, {st_mode=S_IFREG|0600, st_size=1622547, ...}) = 0
[pid 4276] munmap(0x7fbd8626b000, 1622547) = 0
[pid 4276] fstat(235, {st_mode=S_IFREG|0600, st_size=1622547, ...}) = 0
[pid 4276] close(235) = 0
[pid 4276] lseek(28, 3988192, SEEK_SET) = 3988192
[pid 4276] write(28, "\0\0\0\6\0\0\0\6\r\351\340\0367\25\305\222\321\10\236\7a\371\247\275", 24) = 24
[pid 4276] lseek(28, 3988216, SEEK_SET) = 3988216
[pid 4276] write(28, "\r\2\243\0\24\1\346\1\1\346\16\231\r\351\r@\f\247\f\17\vs\n\316\n\21\t[\10\257\7\372"..., 4096) = 4096
[pid 4276] fstat(110, {st_mode=S_IFREG|0640, st_size=66118024, ...}) = 0
[pid 4276] fstat(110, {st_mode=S_IFREG|0640, st_size=66118024, ...}) = 0
[pid 4276] write(26, "\2\300\0\0\0\0\0\0", 8) = 8
[pid 4276] epoll_wait(8, [{EPOLLIN, {u32=2309335280, u64=140452034881776}}], 256, 0) = 1
[pid 4276] fstat(31, {st_mode=S_IFREG|0640, st_size=4403, ...}) = 0
[pid 4276] fstat(32, {st_mode=S_IFREG|0640, st_size=13744, ...}) = 0
[pid 4276] fstat(33, {st_mode=S_IFREG|0640, st_size=7961, ...}) = 0
[pid 4276] fstat(34, {st_mode=S_IFREG|0640, st_size=2920775, ...}) = 0
[pid 4276] fstat(35, {st_mode=S_IFREG|0640, st_size=2400, ...}) = 0
[pid 4276] fstat(36, {st_mode=S_IFREG|0640, st_size=2402, ...}) = 0
[pid 4276] fstat(38, {st_mode=S_IFREG|0640, st_size=10357041, ...}) = 0
[pid 4276] fstat(39, {st_mode=S_IFREG|0640, st_size=24506, ...}) = 0
[pid 4276] fstat(40, {st_mode=S_IFREG|0640, st_size=7103, ...}) = 0
[pid 4276] fstat(41, {st_mode=S_IFREG|0640, st_size=7404, ...}) = 0
[pid 4276] fstat(42, {st_mode=S_IFREG|0640, st_size=4401, ...}) = 0
[pid 4276] fstat(45, {st_mode=S_IFREG|0640, st_size=51445665, ...}) = 0
[pid 4276] fstat(37, {st_mode=S_IFREG|0640, st_size=43090, ...}) = 0
[pid 4276] fstat(43, {st_mode=S_IFREG|0640, st_size=14383, ...}) = 0
[pid 4276] fstat(44, {st_mode=S_IFREG|0640, st_size=58678, ...}) = 0
[pid 4276] fstat(48, {st_mode=S_IFREG|0640, st_size=4535, ...}) = 0
[pid 4276] fstat(91, {st_mode=S_IFREG|0640, st_size=535303443, ...}) = 0
[pid 4276] fstat(93, {st_mode=S_IFREG|0640, st_size=3253, ...}) = 0
[pid 4276] fstat(100, {st_mode=S_IFREG|0640, st_size=8808, ...}) = 0
[pid 4276] fstat(50, {st_mode=S_IFREG|0640, st_size=9592429, ...}) = 0
[pid 4276] fstat(47, {st_mode=S_IFREG|0640, st_size=55201, ...}) = 0
[pid 4276] fstat(95, {st_mode=S_IFREG|0640, st_size=358890, ...}) = 0
[pid 4276] fstat(101, {st_mode=S_IFREG|0640, st_size=0, ...}) = 0
[pid 4276] fstat(98, {st_mode=S_IFREG|0640, st_size=399499, ...}) = 0
[pid 4276] fstat(99, {st_mode=S_IFREG|0640, st_size=3493987, ...}) = 0
[pid 4276] fstat(97, {st_mode=S_IFREG|0640, st_size=485074, ...}) = 0
[pid 4276] fstat(106, {st_mode=S_IFREG|0640, st_size=4988672, ...}) = 0
[pid 4276] fstat(94, {st_mode=S_IFREG|0640, st_size=1108480, ...}) = 0
[pid 4276] fstat(104, {st_mode=S_IFREG|0640, st_size=512, ...}) = 0
[pid 4276] fstat(105, {st_mode=S_IFREG|0640, st_size=826332, ...}) = 0
[pid 4276] fstat(108, {st_mode=S_IFREG|0640, st_size=170534, ...}) = 0
[pid 4276] fstat(109, {st_mode=S_IFREG|0640, st_size=54241668, ...}) = 0
[pid 4276] fstat(110, {st_mode=S_IFREG|0640, st_size=66118024, ...}) = 0
[pid 4276] fstat(113, {st_mode=S_IFREG|0640, st_size=2613, ...}) = 0
[pid 4276] fstat(122, {st_mode=S_IFREG|0640, st_size=0, ...}) = 0
[pid 4276] fstat(116, {st_mode=S_IFREG|0640, st_size=2113652, ...}) = 0
[pid 4276] fstat(118, {st_mode=S_IFREG|0640, st_size=477181, ...}) = 0
[pid 4276] fstat(120, {st_mode=S_IFREG|0640, st_size=1962, ...}) = 0
[pid 4276] fstat(121, {st_mode=S_IFREG|0640, st_size=262, ...}) = 0
[pid 4276] fstat(117, {st_mode=S_IFREG|0640, st_size=2341173, ...}) = 0
[pid 4276] fstat(119, {st_mode=S_IFREG|0640, st_size=1295136, ...}) = 0
[pid 4276] fstat(123, {st_mode=S_IFREG|0640, st_size=2306003, ...}) = 0
[pid 4276] fstat(169, {st_mode=S_IFREG|0640, st_size=6667120, ...}) = 0
[pid 4276] fstat(212, {st_mode=S_IFREG|0640, st_size=37270, ...}) = 0
[pid 4276] fstat(115, {st_mode=S_IFREG|0640, st_size=3417367, ...}) = 0
[pid 4276] fstat(171, {st_mode=S_IFREG|0640, st_size=229, ...}) = 0
[pid 4276] fstat(192, {st_mode=S_IFREG|0640, st_size=229, ...}) = 0
[pid 4276] fstat(214, {st_mode=S_IFREG|0640, st_size=872, ...}) = 0
[pid 4276] fstat(250, {st_mode=S_IFREG|0640, st_size=958, ...}) = 0
[pid 4276] fstat(270, {st_mode=S_IFREG|0640, st_size=1399694, ...}) = 0
[pid 4276] read(25, "\2\300\0\0\0\0\0\0", 8) = 8
[pid 4276] epoll_wait(8, [], 256, 0) = 0
[pid 4276] epoll_wait(8, [{EPOLLIN, {u32=2309334320, u64=140452034880816}}], 256, -1) = 1
[pid 4276] read(30, "]\0\0\0\2\0\0\0\0\0\0\0\0\0\0\0", 16) = 16
[pid 4276] fstat(270, {st_mode=S_IFREG|0640, st_size=1399866, ...}) = 0
[pid 4276] fstat(270, {st_mode=S_IFREG|0640, st_size=1399866, ...}) = 0
[pid 4276] read(270, "{\"log\":\"10.42.5.1 - - [23/Oct/20"..., 999999) = 172
[pid 4276] openat(AT_FDCWD, "/buffers/tail.0/1-1666558108.597004765.flb", O_RDWR|O_CREAT, 0600) = 235
[pid 4276] fstat(235, {st_mode=S_IFREG|0600, st_size=1492976, ...}) = 0
[pid 4276] mmap(NULL, 1495040, PROT_READ|PROT_WRITE, MAP_SHARED, 235, 0) = 0x7fbd87091000
[pid 4276] fallocate(235, 0, 0, 1527808) = 0
[pid 4276] mremap(0x7fbd87091000, 1492976, 1527808, MREMAP_MAYMOVE) = 0x7fbd8628a000
[pid 4276] fstat(235, {st_mode=S_IFREG|0600, st_size=1527808, ...}) = 0
[pid 4276] ftruncate(235, 1494422) = 0
[pid 4276] mremap(0x7fbd8628a000, 1527808, 1494422, MREMAP_MAYMOVE) = 0x7fbd8628a000
[pid 4276] msync(0x7fbd8628a000, 1494422, MS_ASYNC) = 0
[pid 4276] fstat(235, {st_mode=S_IFREG|0600, st_size=1494422, ...}) = 0
[pid 4276] munmap(0x7fbd8628a000, 1494422) = 0
[pid 4276] fstat(235, {st_mode=S_IFREG|0600, st_size=1494422, ...}) = 0
[pid 4276] close(235) = 0
[pid 4276] lseek(28, 3992312, SEEK_SET) = 3992312```
Another strace output
strace: Process 4259 attached with 8 threads
[pid 4320] accept4(85, {sa_family=AF_INET, sin_port=htons(54944), sin_addr=inet_addr("10.42.6.245")}, [16], SOCK_CLOEXEC|SOCK_NONBLOCK) = 260
[pid 4320] recvfrom(260, "GET /api/v1/metrics/prometheus HTTP/1.1\r\nHost: 10.42.5.141:2020\r\nUser-Agent: Prometheus/2.28.1\r\nAccept: application/openmetrics-text; version=0.0.1,text/plain;version=0.0.4;q=0.5,*/*;q=0.1\r\nAccept-Encoding: gzip\r\nX-Prometheus-Scrape-Timeout-Seconds: 10\r\n\r\n", 4096, 0, NULL, NULL) = 256
[pid 4320] sendto(260, "CAF\r\n", 5, 0, NULL, 0) = 5
[pid 4320] sendto(260, "# HELP fluentbit_filter_add_records_total Fluentbit metrics.\n# TYPE fluentbit_filter_add_records_total counter\nfluentbit_filter_add_records_total{name=\"kubernetes.0\"} 0 1666561263158\n# HELP fluentbit_filter_drop_records_total Fluentbit metrics.\n# TYPE fluentbit_filter_drop_records_total counter\nfluentbit_filter_drop_records_total{name=\"kubernetes.0\"} 0 1666561263158\n# HELP fluentbit_input_bytes_total Number of input bytes.\n# TYPE fluentbit_input_bytes_total counter\nfluentbit_input_bytes_total{name=\"storage_backlog.1\"} 0 1666561263158\nfluentbit_input_bytes_total{name=\"tail.0\"} 781354425 1666561263158\n# HELP fluentbit_input_files_closed_total Fluentbit metrics.\n# TYPE fluentbit_input_files_closed_total counter\nfluentbit_input_files_closed_total{name=\"tail.0\"} 43 1666561263158\n# HELP fluentbit_input_files_opened_total Fluentbit metrics.\n# TYPE fluentbit_input_files_opened_total counter\nfluentbit_input_files_opened_total{name=\"tail.0\"} 93 1666561263158\n# HELP fluentbit_input_files_rotated_total Fluentbit metrics.\n# TYPE fluentbit_input_files_rotated_total counter\nfluentbit_input_files_rotated_total{name=\"tail.0\"} 0 1666561263158\n# HELP fluentbit_input_records_total Number of input records.\n# TYPE fluentbit_input_records_total counter\nfluentbit_input_records_total{name=\"storage_backlog.1\"} 0 1666561263158\nfluentbit_input_records_total{name=\"tail.0\"} 3906850 1666561263158\n# HELP fluentbit_output_dropped_records_total Number of dropped records.\n# TYPE fluentbit_output_dropped_records_total counter\nfluentbit_output_dropped_records_total{name=\"forward.0\"} 0 1666561263158\n# HELP fluentbit_output_errors_total Number of output errors.\n# TYPE fluentbit_output_errors_total counter\nfluentbit_output_errors_total{name=\"forward.0\"} 0 1666561263158\n# HELP fluentbit_output_proc_bytes_total Number of processed output bytes.\n# TYPE fluentbit_output_proc_bytes_total counter\nfluentbit_output_proc_bytes_total{name=\"forward.0\"} 150836065 1666561263158\n# HELP fluentbit_output_proc_records_total Number of processed output records.\n# TYPE fluentbit_output_proc_records_total counter\nfluentbit_output_proc_records_total{name=\"forward.0\"} 131853 1666561263158\n# HELP fluentbit_output_retried_records_total Number of retried records.\n# TYPE fluentbit_output_retried_records_total counter\nfluentbit_output_retried_records_total{name=\"forward.0\"} 294516 1666561263158\n# HELP fluentbit_output_retries_failed_total Number of abandoned batches because the maximum number of re-tries was reached.\n# TYPE fluentbit_output_retries_failed_total counter\nfluentbit_output_retries_failed_total{name=\"forward.0\"} 0 1666561263158\n# HELP fluentbit_output_retries_total Number of output retries.\n# TYPE fluentbit_output_retries_total counter\nfluentbit_output_retries_total{name=\"forward.0\"} 14046 1666561263158\n# HELP fluentbit_uptime Number of seconds that Fluent Bit has been running.\n# TYPE fluentbit_uptime counter\nfluentbit_uptime 400321\n# HELP process_start_time_seconds Start time of the process since unix epoch in seconds.\n# TYPE process_start_time_seconds gauge\nprocess_start_time_seconds 1666160942\n# HELP fluentbit_build_info Build version information.\n# TYPE fluentbit_build_info gauge\nfluentbit_build_info{version=\"1.9.5\",edition=\"Community\"} 1\n", 3247, 0, NULL, 0) = 3247
[pid 4320] sendto(260, "\r\n", 2, 0, NULL, 0) = 2
[pid 4320] sendto(260, "0\r\n\r\n", 5, 0, NULL, 0) = 5
[pid 4320] accept4(85, {sa_family=AF_INET, sin_port=htons(44824), sin_addr=inet_addr("10.42.6.245")}, [16], SOCK_CLOEXEC|SOCK_NONBLOCK) = 210
[pid 4320] recvfrom(210, "GET /api/v1/metrics/prometheus HTTP/1.1\r\nHost: 10.42.5.141:2020\r\nUser-Agent: Prometheus/2.28.1\r\nAccept: application/openmetrics-text; version=0.0.1,text/plain;version=0.0.4;q=0.5,*/*;q=0.1\r\nAccept-Encoding: gzip\r\nX-Prometheus-Scrape-Timeout-Seconds: 10\r\n\r\n", 4096, 0, NULL, NULL) = 256
strace: Process 4238 attached with 15 threads
[pid 38058] --- SIGURG {si_signo=SIGURG, si_code=SI_TKILL, si_pid=4238, si_uid=0} ---
[pid 4246] --- SIGURG {si_signo=SIGURG, si_code=SI_TKILL, si_pid=4238, si_uid=0} ---
[pid 38058] --- SIGURG {si_signo=SIGURG, si_code=SI_TKILL, si_pid=4238, si_uid=0} ---
[pid 4246] --- SIGURG {si_signo=SIGURG, si_code=SI_TKILL, si_pid=4238, si_uid=0} ---
[pid 3003214] --- SIGURG {si_signo=SIGURG, si_code=SI_TKILL, si_pid=4238, si_uid=0} ---
[pid 4251] --- SIGURG {si_signo=SIGURG, si_code=SI_TKILL, si_pid=4238, si_uid=0} ---```
More updated strace output isn link below .
https://gist.github.com/kingnarmer/b7cef66798a38e833798522fe17eeea8
This issue is stale because it has been open 90 days with no activity. Remove stale label or comment or this will be closed in 5 days. Maintainers can add the exempt-stale
label.
Appreciate help remediating this issue.
This issue is stale because it has been open 90 days with no activity. Remove stale label or comment or this will be closed in 5 days. Maintainers can add the exempt-stale
label.
This issue was closed because it has been stalled for 5 days with no activity.
Bug Report
Describe the bug I am running into issues with k8s fluentbit not recovering after fluentd restart. I have read all similar github issues opened for this error, non of workarounds resolved the issue on my case.
Setup I have is fluentbit (1.9.7) --> fluentd --> kafka(3.2.1) , fluentd service is ClusterIP .
Appreciate any pointer to remediate this issue.
Thanks
Summary of changes tried
To Reproduce
[tls] error: unexpected EOF [output:forward:forward.0] no upstream connections available
Then followed by several logs of
[ info] [task] re-schedule retry=0x7ff34206b938 2040 in the next 56 seconds