Closed nigoroll closed 1 year ago
core analysis:
(gdb) bt
#0 0x00007f41e457c387 in raise () from /lib64/libc.so.6
#1 0x00007f41e457da78 in abort () from /lib64/libc.so.6
#2 0x000000000044edb6 in pan_ic (func=0x51b5d4 <__func__.9484> "cli_quit", file=0x51b03b "cache/cache_main.c", line=375,
cond=0x51b335 "It's time for the big quit", kind=VAS_WRONG) at cache/cache_panic.c:848
#3 0x00000000004eec40 in VAS_Fail (func=0x51b5d4 <__func__.9484> "cli_quit", file=0x51b03b "cache/cache_main.c", line=375,
cond=0x51b335 "It's time for the big quit", kind=VAS_WRONG) at vas.c:67
#4 0x0000000000448264 in cli_quit (sig=3) at cache/cache_main.c:375
#5 <signal handler called>
#6 0x00007f41e491fa33 in pthread_cond_wait@@GLIBC_2.3.2 () from /lib64/libpthread.so.0
#7 0x0000000000447169 in Lck_CondWaitUntil (cond=0x7f41bd046980, lck=0x7f41bd046948, when=0) at cache/cache_lck.c:239
#8 0x0000000000446f57 in Lck_CondWait (cond=0x7f41bd046980, lck=0x7f41bd046948) at cache/cache_lck.c:212
#9 0x00007f41c358cd0c in dom_healthy (ctx=0x79a740 <ctx_cli>, d=<optimized out>, changed=0x7ffcac728988) at vmod_dynamic.c:310
#10 0x000000000042b27b in VRT_Healthy (ctx=0x79a740 <ctx_cli>, d=0x7f41bd250970, changed=0x7ffcac728988) at cache/cache_director.c:251
#11 0x00007f41c4fc4591 in vdir_update_health (ctx=0x79a740 <ctx_cli>, vd=0x7f41e3cbf900) at vmod_directors.c:344
#12 0x00007f41c4fc4d37 in vmod_fallback_list (ctx=0x79a740 <ctx_cli>, dir=0x7f41bd249880, vsb=0x7f41e3f2a790, pflag=0, jflag=0)
at vmod_directors_fall_back.c:92
#13 0x000000000042b6f1 in do_list (cli=0x7f41e3cc0830, d=0x7f41bd249880, priv=0x7ffcac728b80) at cache/cache_director.c:342
#14 0x000000000045ff11 in vcl_iterdir (cli=0x7f41e3cc0830, pat=0x7f41bd274540 "reload-varnish_20230712071723_4427.*",
vcl=0x7f41e3ffa7a0, func=0x42b52a <do_list>, priv=0x7ffcac728b80) at cache/cache_vcl.c:384
#15 0x000000000046017b in VCL_IterDirector (cli=0x7f41e3cc0830, pat=0x7f41e3d511c0 "reload-varnish_20230712071723_4427.*",
func=0x42b52a <do_list>, priv=0x7ffcac728b80) at cache/cache_vcl.c:423
#16 0x000000000042bf2c in cli_backend_list (cli=0x7f41e3cc0830, av=0x7f41e3c1d600, priv=0x0) at cache/cache_director.c:448
#17 0x00000000004f077a in cls_dispatch (cli=0x7f41e3cc0830, cp=0x793d20 <backend_cmds>, av=0x7f41e3c1d600, ac=2) at vcli_serve.c:255
#18 0x00000000004f0aee in cls_exec (cfd=0x7f41e3cc0800, av=0x7f41e3c1d600) at vcli_serve.c:320
#19 0x00000000004f1098 in cls_feed (cfd=0x7f41e3cc0800,
p=0x7ffcac728d52 "\noad-varnish_20230712071723_4427.1689146243.162252/vgc.so 1auto\nND__",
e=0x7ffcac728d53 "oad-varnish_20230712071723_4427.1689146243.162252/vgc.so 1auto\nND__") at vcli_serve.c:416
#20 0x00000000004f2183 in VCLS_Poll (cs=0x7f41e3cc5fb0, cli=0x7f41e3cc0830, timeout=-1) at vcli_serve.c:608
#21 0x0000000000426ab4 in CLI_Run () at cache/cache_cli.c:110
#22 0x00000000004485dd in child_main (sigmagic=1, altstksz=131072) at cache/cache_main.c:465
#23 0x000000000049cae7 in mgt_launch_child (cli=0x0) at mgt/mgt_child.c:408
#24 0x000000000049d98f in MCH_Start_Child () at mgt/mgt_child.c:703
#25 0x00000000004a44a8 in main (argc=56, argv=0x7ffcac72b0e8) at mgt/mgt_main.c:959
(gdb) thr 166
[Switching to thread 166 (Thread 0x7f415be42700 (LWP 4447))]
#0 0x00007f41e492254d in __lll_lock_wait () from /lib64/libpthread.so.0
(gdb) bt
#0 0x00007f41e492254d in __lll_lock_wait () from /lib64/libpthread.so.0
#1 0x00007f41e491de9b in _L_lock_883 () from /lib64/libpthread.so.0
#2 0x00007f41e491dd68 in pthread_mutex_lock () from /lib64/libpthread.so.0
#3 0x00000000004469e9 in Lck__Lock (lck=0x79b690 <vcl_mtx>, p=0x5260b0 <__func__.8910> "VRT_AddDirector", l=211)
at cache/cache_lck.c:124
#4 0x0000000000474e94 in VRT_AddDirector (ctx=0x7f415be41a10, m=0x512b40 <vbe_methods>, priv=0x7f415cc12100, fmt=0x512ecf "%s")
at cache/cache_vrt_vcl.c:211
#5 0x000000000041d04f in VRT_new_backend_clustered (ctx=0x7f415be41a10, vc=0x0, vrt=0x7f415be41930, via=0x7f41e3fed880)
at cache/cache_backend.c:738
#6 0x000000000041d1a7 in VRT_new_backend (ctx=0x7f415be41a10, vrt=0x7f415be41930, via=0x7f41e3fed880) at cache/cache_backend.c:756
#7 0x00007f41c358b16d in ref_add (r=0x7f415ac12040, ctx=0x7f415be41a10) at vmod_dynamic.c:605
#8 dom_update (res=<optimized out>, now=<optimized out>, priv=<optimized out>, dom=0x7f41bd046900) at vmod_dynamic.c:716
#9 dom_lookup_thread (priv=0x7f41bd046900) at vmod_dynamic.c:796
#10 0x00007f41e491bea5 in start_thread () from /lib64/libpthread.so.0
#11 0x00007f41e4644b0d in clone () from /lib64/libc.so.6
So a backend.list
CLI command checks the health state. dom_healthy
encounters a delayed creation and blocks here:
while at the same time, creation of the backend is waiting for the cli_mtx
.
So basically a similar problem as the one which we handled in 821ad8f5bb26f77f9f377cd19a60f810afbf65cc: The list
callback can depend on the cli_mtx
, also not indirectly.
Seen with aa7919e76a3184d158a4d7f1698ad8925353d4b4 on varnish-cache unmerged_code_20230711_235413 which is based on d99f62a9c5d4c6de492abae63510555d51a53324