Open uristernik opened 2 weeks ago
The issue reproduced in another cluster with the stable 3.1.9 build.
The dump looks a bit different. It is still stuck on phn_next_set, but it looks like this time different functions are called on the way to the deadlock.
Thread 6 (Thread 0xffff821ea9c0 (LWP 20)):
#0 0x0000ffff86f37c74 in __GI_epoll_pwait (epfd=75, events=0xffff8120c000, maxevents=256, timeout=-1, set=0x0) at ../sysdeps/unix/sysv/linux/epoll_pwait.c:40
#1 0x0000aaaae6b1db80 in _mk_event_wait_2 (loop=0xffff81216000, timeout=-1) at /src/fluent-bit/lib/monkey/mk_core/mk_event_epoll.c:444
#2 0x0000aaaae6b1dfe4 in mk_event_wait (loop=0xffff81216000) at /src/fluent-bit/lib/monkey/mk_core/mk_event.c:207
#3 0x0000aaaae6b166a0 in mk_server_worker_loop (server=0xffff84865700) at /src/fluent-bit/lib/monkey/mk_server/mk_server.c:506
#4 0x0000aaaae6b0d790 in mk_sched_launch_worker_loop (data=0xffff82c00008) at /src/fluent-bit/lib/monkey/mk_server/mk_scheduler.c:417
#5 0x0000ffff86ecee30 in start_thread (arg=0xffff83b69f77) at ./nptl/pthread_create.c:442
#6 0x0000ffff86f37adc in thread_start () at ../sysdeps/unix/sysv/linux/aarch64/clone.S:79
Thread 5 (Thread 0xffff82bfa9c0 (LWP 19)):
#0 0x0000ffff86f036b0 in __GI___clock_nanosleep (clock_id=<optimized out>, clock_id@entry=0, flags=flags@entry=0, req=req@entry=0xffff82bfa108, rem=rem@entry=0xffff82bfa108) at ../sysdeps/unix/sysv/linux/clock_nanosleep.c:48
#1 0x0000ffff86f0861c in __GI___nanosleep (req=req@entry=0xffff82bfa108, rem=rem@entry=0xffff82bfa108) at ../sysdeps/unix/sysv/linux/nanosleep.c:25
#2 0x0000ffff86f084e8 in __sleep (seconds=0) at ../sysdeps/posix/sleep.c:55
#3 0x0000aaaae6b19f30 in mk_clock_worker_init (data=0xffff84865700) at /src/fluent-bit/lib/monkey/mk_server/mk_clock.c:124
#4 0x0000ffff86ecee30 in start_thread (arg=0xffff83b69fa7) at ./nptl/pthread_create.c:442
#5 0x0000ffff86f37adc in thread_start () at ../sysdeps/unix/sysv/linux/aarch64/clone.S:79
Thread 4 (Thread 0xffff83b6a9c0 (LWP 18)):
#0 0x0000ffff86f37c74 in __GI_epoll_pwait (epfd=69, events=0xffff83cb8200, maxevents=8, timeout=-1, set=0x0) at ../sysdeps/unix/sysv/linux/epoll_pwait.c:40
#1 0x0000aaaae6b1db80 in _mk_event_wait_2 (loop=0xffff83c9dcb0, timeout=-1) at /src/fluent-bit/lib/monkey/mk_core/mk_event_epoll.c:444
#2 0x0000aaaae6b1dfe4 in mk_event_wait (loop=0xffff83c9dcb0) at /src/fluent-bit/lib/monkey/mk_core/mk_event.c:207
#3 0x0000aaaae6b04e14 in mk_lib_worker (data=0xffff83c9dcc8) at /src/fluent-bit/lib/monkey/mk_server/mk_lib.c:154
#4 0x0000ffff86ecee30 in start_thread (arg=0xffff857f9e57) at ./nptl/pthread_create.c:442
#5 0x0000ffff86f37adc in thread_start () at ../sysdeps/unix/sysv/linux/aarch64/clone.S:79
Thread 3 (Thread 0xffff847fa9c0 (LWP 17)):
#0 0x0000ffff86f37c74 in __GI_epoll_pwait (epfd=11, events=0xffff8481d000, maxevents=32, timeout=-1, set=0x0) at ../sysdeps/unix/sysv/linux/epoll_pwait.c:40
#1 0x0000aaaae6b1db80 in _mk_event_wait_2 (loop=0xffff8480b018, timeout=-1) at /src/fluent-bit/lib/monkey/mk_core/mk_event_epoll.c:444
#2 0x0000aaaae6b1dfe4 in mk_event_wait (loop=0xffff8480b018) at /src/fluent-bit/lib/monkey/mk_core/mk_event.c:207
#3 0x0000aaaae6643bac in log_worker_collector (data=0xffff84816000) at /src/fluent-bit/src/flb_log.c:131
#4 0x0000aaaae666c6ec in step_callback (data=<optimized out>) at /src/fluent-bit/src/flb_worker.c:43
#5 0x0000ffff86ecee30 in start_thread (arg=0xffff857f9e67) at ./nptl/pthread_create.c:442
#6 0x0000ffff86f37adc in thread_start () at ../sysdeps/unix/sysv/linux/aarch64/clone.S:79
Thread 2 (Thread 0xffff857fa9c0 (LWP 16)):
#0 phn_next_set (offset=40, next=0x0, phn=0xffff84a1f680) at /src/fluent-bit/lib/jemalloc-5.3.0/include/jemalloc/internal/ph.h:76
#1 phn_merge_siblings (cmp=<optimized out>, offset=40, phn=<optimized out>) at /src/fluent-bit/lib/jemalloc-5.3.0/include/jemalloc/internal/ph.h:206
#2 ph_merge_children (cmp=<optimized out>, offset=<optimized out>, phn=<optimized out>) at /src/fluent-bit/lib/jemalloc-5.3.0/include/jemalloc/internal/ph.h:244
#3 ph_remove (cmp=<optimized out>, offset=40, phn=phn@entry=0xffff84a26c80, ph=ph@entry=0xffff84a03c50) at /src/fluent-bit/lib/jemalloc-5.3.0/include/jemalloc/internal/ph.h:389
#4 je_edata_heap_remove (ph=ph@entry=0xffff84a03c50, phn=phn@entry=0xffff84a26c80) at /src/fluent-bit/lib/jemalloc-5.3.0/src/edata.c:6
#5 0x0000aaaae66158dc in je_eset_remove (eset=0xffff84a03ab0, edata=edata@entry=0xffff84a26c80) at /src/fluent-bit/lib/jemalloc-5.3.0/src/eset.c:126
#6 0x0000aaaae6618d34 in extent_activate_locked (edata=0xffff84a26c80, eset=<optimized out>, ecache=0xffff84a03a38, pac=0xffff84a03a00, tsdn=0xffff857fb110) at /src/fluent-bit/lib/jemalloc-5.3.0/src/extent.c:280
#7 extent_recycle_extract (ehooks=0xffff84a000c0, guarded=<optimized out>, alignment=64, size=69632, expand_edata=0x0, ecache=0xffff84a03a38, pac=0xffff84a03a00, tsdn=0xffff857fb110) at /src/fluent-bit/lib/jemalloc-5.3.0/src/extent.c:444
#8 extent_recycle (tsdn=tsdn@entry=0xffff857fb110, pac=pac@entry=0xffff84a03a00, ehooks=ehooks@entry=0xffff84a000c0, ecache=ecache@entry=0xffff84a03a38, expand_edata=expand_edata@entry=0x0, size=size@entry=69632, alignment=alignment@entry=64, zero=zero@entry=false, commit=commit@entry=0xffff857f8fef, growing_retained=growing_retained@entry=false, guarded=guarded@entry=false) at /src/fluent-bit/lib/jemalloc-5.3.0/src/extent.c:606
#9 0x0000aaaae6618fd0 in je_ecache_alloc (tsdn=tsdn@entry=0xffff857fb110, pac=pac@entry=0xffff84a03a00, ehooks=ehooks@entry=0xffff84a000c0, ecache=ecache@entry=0xffff84a03a38, expand_edata=expand_edata@entry=0x0, size=size@entry=69632, alignment=alignment@entry=64, zero=zero@entry=false, guarded=guarded@entry=false) at /src/fluent-bit/lib/jemalloc-5.3.0/src/extent.c:87
#10 0x0000aaaae662354c in pac_alloc_real (guarded=false, zero=false, alignment=64, size=69632, ehooks=0xffff84a000c0, pac=0xffff84a03a00, tsdn=0xffff857fb110) at /src/fluent-bit/lib/jemalloc-5.3.0/src/pac.c:116
#11 pac_alloc_impl (tsdn=0xffff857fb110, self=0xffff84a03a00, size=69632, alignment=64, zero=<optimized out>, guarded=<optimized out>, frequent_reuse=<optimized out>, deferred_work_generated=<optimized out>) at /src/fluent-bit/lib/jemalloc-5.3.0/src/pac.c:178
#12 0x0000aaaae6622058 in pai_alloc (deferred_work_generated=0xffff857f9137, frequent_reuse=false, guarded=false, zero=false, alignment=64, size=69632, self=0xffff84a03a00, tsdn=0xffff857fb110) at /src/fluent-bit/lib/jemalloc-5.3.0/include/jemalloc/internal/pai.h:43
#13 je_pa_alloc (tsdn=tsdn@entry=0xffff857fb110, shard=shard@entry=0xffff84a039e8, size=size@entry=69632, alignment=alignment@entry=64, slab=slab@entry=false, szind=szind@entry=47, zero=false, guarded=false, deferred_work_generated=deferred_work_generated@entry=0xffff857f9137) at /src/fluent-bit/lib/jemalloc-5.3.0/src/pa.c:139
#14 0x0000aaaae65ea210 in je_arena_extent_alloc_large (tsdn=tsdn@entry=0xffff857fb110, arena=arena@entry=0xffff84a01040, usize=usize@entry=65536, alignment=alignment@entry=64, zero=zero@entry=false) at /src/fluent-bit/lib/jemalloc-5.3.0/src/arena.c:338
#15 0x0000aaaae661ff94 in je_large_palloc (tsdn=tsdn@entry=0xffff857fb110, arena=<optimized out>, usize=65536, alignment=alignment@entry=64, zero=false) at /src/fluent-bit/lib/jemalloc-5.3.0/src/large.c:37
#16 0x0000aaaae662042c in je_large_malloc (tsdn=tsdn@entry=0xffff857fb110, arena=<optimized out>, usize=<optimized out>, zero=<optimized out>) at /src/fluent-bit/lib/jemalloc-5.3.0/src/large.c:17
#17 0x0000aaaae65ed8d4 in je_arena_malloc_hard (tsdn=tsdn@entry=0xffff857fb110, arena=<optimized out>, arena@entry=0x0, size=size@entry=65536, ind=ind@entry=47, zero=zero@entry=false) at /src/fluent-bit/lib/jemalloc-5.3.0/src/arena.c:1205
#18 0x0000aaaae65ee738 in arena_malloc (slow_path=true, tcache=0xffff857fb478, zero=false, ind=<optimized out>, size=65536, arena=0x0, tsdn=0xffff857fb110) at /src/fluent-bit/lib/jemalloc-5.3.0/include/jemalloc/internal/arena_inlines_b.h:162
#19 arena_ralloc_move_helper (tcache=0xffff857fb478, zero=false, alignment=8192, usize=65536, arena=0x0, tsdn=0xffff857fb110) at /src/fluent-bit/lib/jemalloc-5.3.0/src/arena.c:1446
#20 je_arena_ralloc (tsdn=tsdn@entry=0xffff857fb110, arena=0x0, ptr=ptr@entry=0xffff83d02000, oldsize=oldsize@entry=8192, size=size@entry=65536, alignment=alignment@entry=0, zero=zero@entry=false, tcache=tcache@entry=0xffff857fb478, hook_args=hook_args@entry=0xffff857f9448) at /src/fluent-bit/lib/jemalloc-5.3.0/src/arena.c:1488
#21 0x0000aaaae65deb64 in iralloct (hook_args=0xffff857f9448, arena=<optimized out>, tcache=0xffff857fb478, zero=false, alignment=<optimized out>, size=65536, oldsize=8192, ptr=0xffff83d02000, tsdn=0xffff857fb110) at /src/fluent-bit/lib/jemalloc-5.3.0/include/jemalloc/internal/jemalloc_internal_inlines_c.h:194
#22 do_rallocx (ptr=0xffff83d02000, size=65536, flags=<optimized out>, is_realloc=<optimized out>) at /src/fluent-bit/lib/jemalloc-5.3.0/src/jemalloc.c:3537
#23 0x0000aaaae669677c in msgpack_sbuffer_write (data=0xffff857f9b60, buf=0xffff810226c0 "\337", len=50427) at /src/fluent-bit/lib/msgpack-c/include/msgpack/sbuffer.h:81
#24 0x0000aaaae669517c in msgpack_pack_str_body (l=50427, b=0xffff810226c0, x=<optimized out>) at /src/fluent-bit/lib/msgpack-c/include/msgpack/pack_template.h:784
#25 flb_log_event_encoder_append_value (context=<optimized out>, target_field=<optimized out>, increment_entry_count=<optimized out>, value_type=<optimized out>, value_buffer=0xffff810226c0 "\337", value_length=50427) at /src/fluent-bit/src/flb_log_event_encoder_primitives.c:146
#26 0x0000aaaae66959c4 in flb_log_event_encoder_append_raw_msgpack (context=<optimized out>, target_field=<optimized out>, value_buffer=<optimized out>, value_size=<optimized out>) at /src/fluent-bit/src/flb_log_event_encoder_primitives.c:468
#27 0x0000aaaae6694ae4 in flb_log_event_encoder_emit_record (context=context@entry=0xffff857f9a38) at /src/fluent-bit/src/flb_log_event_encoder.c:200
#28 0x0000aaaae6694bf0 in flb_log_event_encoder_commit_record (context=context@entry=0xffff857f9a38) at /src/fluent-bit/src/flb_log_event_encoder.c:267
#29 0x0000aaaae6789a30 in cb_kube_filter (data=0xffff808b3b00, bytes=<optimized out>, tag=0xffff8482a740 "kube.var.log.containers.<redacted>.log", tag_len=176, out_buf=0xffff857f9cc8, out_bytes=0xffff857f9cd0, f_ins=<optimized out>, i_ins=<optimized out>, filter_context=0xffff83b7cd40, config=0xffff85c3ea40) at /src/fluent-bit/plugins/filter_kubernetes/kubernetes.c:724
#30 0x0000aaaae6651f84 in flb_filter_do (ic=ic@entry=0xffff80f04e20, data=0xffff808b3b00, bytes=<optimized out>, out_data=out_data@entry=0xffff857f9da0, out_bytes=out_bytes@entry=0xffff857f9da8, tag=tag@entry=0xffff8499b580 "kube.var.log.containers.<redacted>.log", tag_len=tag_len@entry=176, config=0xffff85c3ea40) at /src/fluent-bit/src/flb_filter.c:158
#31 0x0000aaaae664f694 in input_chunk_append_raw (in=0xffff85ca1400, event_type=0, n_records=<optimized out>, tag=0xffff8499b580 "kube.var.log.containers.<redacted>.log", tag_len=<optimized out>, buf=0xffff808b3b00, buf_size=28856) at /src/fluent-bit/src/flb_input_chunk.c:1588
#32 0x0000aaaae664feb4 in flb_input_chunk_append_raw (in=in@entry=0xffff85ca1400, event_type=event_type@entry=0, records=records@entry=2, tag=tag@entry=0xffff8499b580 "kube.var.log.containers.<redacted>.log", tag_len=tag_len@entry=176, buf=<optimized out>, buf_size=<optimized out>) at /src/fluent-bit/src/flb_input_chunk.c:1929
#33 0x0000aaaae66903f8 in input_log_append (ins=0xffff85ca1400, processor_starting_stage=processor_starting_stage@entry=0, records=2, tag=0xffff8499b580 "kube.var.log.containers.<redacted>.log", tag_len=176, buf=0xffff808b3b00, buf_size=28856) at /src/fluent-bit/src/flb_input_log.c:71
#34 0x0000aaaae6690494 in flb_input_log_append (ins=<optimized out>, tag=<optimized out>, tag_len=<optimized out>, buf=<optimized out>, buf_size=<optimized out>) at /src/fluent-bit/src/flb_input_log.c:90
#35 0x0000aaaae66c366c in ml_stream_buffer_flush (file=0xffff848b8640, ctx=0xffff84832400) at /src/fluent-bit/plugins/in_tail/tail_file.c:412
#36 process_content (bytes=<synthetic pointer>, file=0xffff848b8640) at /src/fluent-bit/plugins/in_tail/tail_file.c:630
#37 flb_tail_file_chunk (file=file@entry=0xffff848b8640) at /src/fluent-bit/plugins/in_tail/tail_file.c:1555
#38 0x0000aaaae66be0c0 in in_tail_collect_static (ins=<optimized out>, config=0xffff85c3ea40, in_context=0xffff84832400) at /src/fluent-bit/plugins/in_tail/tail.c:200
#39 0x0000aaaae664d18c in flb_input_collector_fd (fd=fd@entry=23, config=config@entry=0xffff85c3ea40) at /src/fluent-bit/src/flb_input.c:1970
#40 0x0000aaaae666372c in flb_engine_handle_event (config=0xffff85c3ea40, mask=<optimized out>, fd=23) at /src/fluent-bit/src/flb_engine.c:575
#41 flb_engine_start (config=config@entry=0xffff85c3ea40) at /src/fluent-bit/src/flb_engine.c:941
#42 0x0000aaaae6642c14 in flb_lib_worker (data=0xffff85c183a0) at /src/fluent-bit/src/flb_lib.c:674
#43 0x0000ffff86ecee30 in start_thread (arg=0xffffe0335d57) at ./nptl/pthread_create.c:442
#44 0x0000ffff86f37adc in thread_start () at ../sysdeps/unix/sysv/linux/aarch64/clone.S:79
Thread 1 (Thread 0xffff87779020 (LWP 15)):
#0 0x0000ffff86f036b0 in __GI___clock_nanosleep (clock_id=<optimized out>, clock_id@entry=0, flags=flags@entry=0, req=req@entry=0xffffe0335ee8, rem=rem@entry=0xffffe0335ee8) at ../sysdeps/unix/sysv/linux/clock_nanosleep.c:48
#1 0x0000ffff86f0861c in __GI___nanosleep (req=req@entry=0xffffe0335ee8, rem=rem@entry=0xffffe0335ee8) at ../sysdeps/unix/sysv/linux/nanosleep.c:25
#2 0x0000ffff86f084e8 in __sleep (seconds=0, seconds@entry=1) at ../sysdeps/posix/sleep.c:55
#3 0x0000aaaae65d6e60 in flb_main (argc=<optimized out>, argv=<optimized out>) at /src/fluent-bit/src/fluent-bit.c:1388
#4 0x0000ffff86e77740 in __libc_start_call_main (main=main@entry=0xaaaae65d55c0 <main>, argc=argc@entry=4, argv=argv@entry=0xffffe03361a8) at ../sysdeps/nptl/libc_start_call_main.h:58
#5 0x0000ffff86e77818 in __libc_start_main_impl (main=0xaaaae65d55c0 <main>, argc=4, argv=0xffffe03361a8, init=<optimized out>, fini=<optimized out>, rtld_fini=<optimized out>, stack_end=<optimized out>) at ../csu/libc-start.c:360
#6 0x0000aaaae65d5670 in _start ()
Here is another example:
Thread 2 (Thread 0xffff89bfa9c0 (LWP 18)):
#0 phn_prev_set (offset=40, prev=0x0, phn=0xffff88e35d80) at /src/fluent-bit/lib/jemalloc-5.3.0/include/jemalloc/internal/ph.h:76
#1 phn_merge_siblings (cmp=<optimized out>, offset=40, phn=<optimized out>) at /src/fluent-bit/lib/jemalloc-5.3.0/include/jemalloc/internal/ph.h:184
#2 ph_merge_children (cmp=<optimized out>, offset=<optimized out>, phn=<optimized out>) at /src/fluent-bit/lib/jemalloc-5.3.0/include/jemalloc/internal/ph.h:244
#3 ph_remove (cmp=<optimized out>, offset=40, phn=phn@entry=0xffff88e35700, ph=ph@entry=0xffff88e03c50) at /src/fluent-bit/lib/jemalloc-5.3.0/include/jemalloc/internal/ph.h:389
#4 je_edata_heap_remove (ph=ph@entry=0xffff88e03c50, phn=phn@entry=0xffff88e35700) at /src/fluent-bit/lib/jemalloc-5.3.0/src/edata.c:6
#5 0x0000aaaad76858dc in je_eset_remove (eset=eset@entry=0xffff88e03ab0, edata=edata@entry=0xffff88e35700) at /src/fluent-bit/lib/jemalloc-5.3.0/src/eset.c:126
#6 0x0000aaaad7686628 in extent_coalesce (forward=false, outer=0xffff88e35700, inner=0xffff88e33980, ecache=<optimized out>, ehooks=0xffff88e000c0, pac=0xffff88e03a00, tsdn=0xffff89bfb110) at /src/fluent-bit/lib/jemalloc-5.3.0/src/extent.c:806
#7 extent_try_coalesce_impl (tsdn=tsdn@entry=0xffff89bfb110, pac=pac@entry=0xffff88e03a00, ehooks=ehooks@entry=0xffff88e000c0, ecache=ecache@entry=0xffff88e03a38, edata=edata@entry=0xffff88e33980, coalesced=coalesced@entry=0xffff89bf92ef) at /src/fluent-bit/lib/jemalloc-5.3.0/src/extent.c:855
#8 0x0000aaaad7687fbc in extent_try_coalesce_large (coalesced=0xffff89bf92ef, edata=0xffff88e33980, ecache=0xffff88e03a38, ehooks=0xffff88e000c0, pac=0xffff88e03a00, tsdn=0xffff89bfb110) at /src/fluent-bit/lib/jemalloc-5.3.0/src/extent.c:884
#9 je_extent_record (tsdn=tsdn@entry=0xffff89bfb110, pac=pac@entry=0xffff88e03a00, ehooks=ehooks@entry=0xffff88e000c0, ecache=ecache@entry=0xffff88e03a38, edata=edata@entry=0xffff88e33980) at /src/fluent-bit/lib/jemalloc-5.3.0/src/extent.c:937
#10 0x0000aaaad7688134 in je_ecache_dalloc (tsdn=tsdn@entry=0xffff89bfb110, pac=pac@entry=0xffff88e03a00, ehooks=ehooks@entry=0xffff88e000c0, ecache=ecache@entry=0xffff88e03a38, edata=edata@entry=0xffff88e33980) at /src/fluent-bit/lib/jemalloc-5.3.0/src/extent.c:147
#11 0x0000aaaad7692d38 in pac_dalloc_impl (tsdn=0xffff89bfb110, self=0xffff88e03a00, edata=0xffff88e33980, deferred_work_generated=0xffff89bf935f) at /src/fluent-bit/lib/jemalloc-5.3.0/src/pac.c:277
#12 0x0000aaaad7690880 in large_dalloc_finish_impl (edata=<optimized out>, arena=0xffff88e01040, tsdn=0xffff89bfb110) at /src/fluent-bit/lib/jemalloc-5.3.0/src/large.c:253
#13 je_large_dalloc_finish (tsdn=tsdn@entry=0xffff89bfb110, edata=<optimized out>) at /src/fluent-bit/lib/jemalloc-5.3.0/src/large.c:266
#14 0x0000aaaad769dc70 in tcache_bin_flush_impl (small=false, nflush=2, ptrs=0xffff89bfb468, binind=<optimized out>, cache_bin=<optimized out>, tcache=0xffff89c000c0, tsd=0xffff89bfb110) at /src/fluent-bit/lib/jemalloc-5.3.0/src/tcache.c:460
#15 tcache_bin_flush_bottom (small=false, rem=<optimized out>, binind=<optimized out>, cache_bin=0xffff89bf9838, tcache=0xffff89c000c0, tsd=0xffff89bfb110) at /src/fluent-bit/lib/jemalloc-5.3.0/src/tcache.c:519
#16 je_tcache_bin_flush_large (tsd=0xffff89bfb110, tcache=0xffff89c000c0, cache_bin=0xffff89bf9838, binind=<optimized out>, rem=<optimized out>) at /src/fluent-bit/lib/jemalloc-5.3.0/src/tcache.c:535
#17 0x0000aaaad769f7bc in je_tcache_gc_dalloc_event_handler () at /src/fluent-bit/lib/jemalloc-5.3.0/src/tcache.c:223
#18 0x0000aaaad76a1d58 in je_te_event_trigger (tsd=0xffff89bfb110, ctx=<optimized out>) at /src/fluent-bit/lib/jemalloc-5.3.0/src/thread_event.c:299
#19 0x0000aaaad7652948 in te_event_advance (is_alloc=false, usize=524288, tsd=0xffff89bfb110) at /src/fluent-bit/lib/jemalloc-5.3.0/include/jemalloc/internal/thread_event.h:287
#20 thread_dalloc_event (usize=524288, tsd=0xffff89bfb110) at /src/fluent-bit/lib/jemalloc-5.3.0/include/jemalloc/internal/thread_event.h:293
#21 ifree (slow_path=false, tcache=0xffff89bfb478, ptr=<optimized out>, tsd=0xffff89bfb110) at /src/fluent-bit/lib/jemalloc-5.3.0/src/jemalloc.c:2896
#22 je_free_default (ptr=<optimized out>) at /src/fluent-bit/lib/jemalloc-5.3.0/src/jemalloc.c:3014
#23 0x0000aaaad76530b8 in free (ptr=<optimized out>) at /src/fluent-bit/lib/jemalloc-5.3.0/src/jemalloc.c:3162
#24 0x0000aaaad7b73be4 in decr_count (buffer=<optimized out>) at /src/fluent-bit/lib/msgpack-c/src/unpack.c:348
#25 0x0000aaaad7804498 in apply_modifying_rules (ctx=<optimized out>, log_event=0xffff89bf9730, log_encoder=0xffff89bf9838) at /src/fluent-bit/plugins/filter_modify/modify.c:1448
#26 cb_modify_filter (data=<optimized out>, bytes=166552, tag=<optimized out>, tag_len=<optimized out>, out_buf=0xffff89bf9ac8, out_size=<optimized out>, f_ins=<optimized out>, i_ins=<optimized out>, context=<optimized out>, config=0xffff8a03ef00) at /src/fluent-bit/plugins/filter_modify/modify.c:1526
#27 0x0000aaaad76c1f84 in flb_filter_do (ic=ic@entry=0xffff850e8280, data=0xffff83322f00, bytes=<optimized out>, out_data=out_data@entry=0xffff89bf9ba0, out_bytes=out_bytes@entry=0xffff89bf9ba8, tag=tag@entry=0xffff850eb0c0 "kube.var.log.containers.<redacted>.log", tag_len=tag_len@entry=149, config=0xffff8a03ef00) at /src/fluent-bit/src/flb_filter.c:158
#28 0x0000aaaad76bf694 in input_chunk_append_raw (in=0xffff8a0a1400, event_type=0, n_records=<optimized out>, tag=0xffff850eb0c0 "kube.var.log.containers.<redacted>.log", tag_len=<optimized out>, buf=0xffff83322f00, buf_size=163930) at /src/fluent-bit/src/flb_input_chunk.c:1588
#29 0x0000aaaad76bfeb4 in flb_input_chunk_append_raw (in=in@entry=0xffff8a0a1400, event_type=event_type@entry=0, records=records@entry=1, tag=tag@entry=0xffff850eb0c0 "kube.var.log.containers.<redacted>.log", tag_len=tag_len@entry=149, buf=<optimized out>, buf_size=<optimized out>) at /src/fluent-bit/src/flb_input_chunk.c:1929
#30 0x0000aaaad77003f8 in input_log_append (ins=0xffff8a0a1400, processor_starting_stage=processor_starting_stage@entry=0, records=1, tag=0xffff850eb0c0 "kube.var.log.containers.<redacted>.log", tag_len=149, buf=0xffff83322f00, buf_size=163930) at /src/fluent-bit/src/flb_input_log.c:71
#31 0x0000aaaad7700494 in flb_input_log_append (ins=<optimized out>, tag=<optimized out>, tag_len=<optimized out>, buf=<optimized out>, buf_size=<optimized out>) at /src/fluent-bit/src/flb_input_log.c:90
#32 0x0000aaaad77327b8 in ml_stream_buffer_flush (file=0xffff88087380, ctx=0xffff88c32400) at /src/fluent-bit/plugins/in_tail/tail_file.c:412
#33 ml_flush_callback (parser=<optimized out>, mst=0xffff87ec1c60, data=0xffff88087380, buf_data=<optimized out>, buf_size=163930) at /src/fluent-bit/plugins/in_tail/tail_file.c:919
#34 0x0000aaaad76e8fb0 in flb_ml_flush_stream_group (ml_parser=0xffff8a04a070, mst=mst@entry=0xffff87ec1c60, group=group@entry=0xffff88024200, forced_flush=forced_flush@entry=1) at /src/fluent-bit/src/multiline/flb_ml.c:1516
#35 0x0000aaaad76e9624 in flb_ml_flush_parser_instance (ml=ml@entry=0xffff88c97000, parser_i=parser_i@entry=0xffff88c3f7a0, stream_id=stream_id@entry=8343155020032301013, forced_flush=forced_flush@entry=1) at /src/fluent-bit/src/multiline/flb_ml.c:117
#36 0x0000aaaad7707a9c in flb_ml_stream_id_destroy_all (ml=0xffff88c97000, stream_id=8343155020032301013) at /src/fluent-bit/src/multiline/flb_ml_stream.c:316
#37 0x0000aaaad7732dcc in flb_tail_file_remove (file=file@entry=0xffff88087380) at /src/fluent-bit/plugins/in_tail/tail_file.c:1256
#38 0x0000aaaad7735584 in flb_tail_file_purge (ins=0xffff8a0a1400, config=<optimized out>, context=0xffff88c32400) at /src/fluent-bit/plugins/in_tail/tail_file.c:1979
#39 0x0000aaaad76bd18c in flb_input_collector_fd (fd=fd@entry=121, config=config@entry=0xffff8a03ef00) at /src/fluent-bit/src/flb_input.c:1970
#40 0x0000aaaad76d372c in flb_engine_handle_event (config=0xffff8a03ef00, mask=<optimized out>, fd=121) at /src/fluent-bit/src/flb_engine.c:575
#41 flb_engine_start (config=config@entry=0xffff8a03ef00) at /src/fluent-bit/src/flb_engine.c:941
#42 0x0000aaaad76b2c14 in flb_lib_worker (data=0xffff8a0183a0) at /src/fluent-bit/src/flb_lib.c:674
#43 0x0000ffff8b36ee90 in start_thread (arg=0xffffd3286727) at ./nptl/pthread_create.c:442
#44 0x0000ffff8b3d7b1c in thread_start () at ../sysdeps/unix/sysv/linux/aarch64/clone.S:79
Here is the I/O on that exact pod:
Bug Report
Describe the bug
The Fluent Bit pod deadlocks: CPU usage rises and no further log processing happens. I created a core dump and examined it:
To Reproduce
I am not able to reproduce it at the moment. It doesn't happen frequently.
Expected behavior
No deadlocks. The only workaround at present is to monitor for pods that reach their CPU limit and kill them.
Screenshots
Your Environment
Additional context
Full config: