nigoroll / libvmod-dynamic

The Varnish dns/named director continued
BSD 2-Clause "Simplified" License
96 stars 34 forks source link

Hang & child cli timeout/quit related to VRT_Healthy #101

Closed nigoroll closed 1 year ago

nigoroll commented 1 year ago

Seen with aa7919e76a3184d158a4d7f1698ad8925353d4b4 on varnish-cache unmerged_code_20230711_235413 which is based on d99f62a9c5d4c6de492abae63510555d51a53324

Panic at: Wed, 12 Jul 2023 07:18:04 GMT
Wrong turn at cache/cache_main.c:375:
It's time for the big quit
version = varnish-trunk revision eb42e0a09dec02915caf6b821ff531f3d684de5b, vrt api = 17.0
ident = Linux,3.10.0-862.14.4.el7.x86_64,x86_64,-jnone,-smalloc,-sdefault,-hcritbit,epoll
now = 146433270.810059 (mono), 1689146282.167964 (real)
Backtrace:
  0x44e46c: /opt/varnish/sbin/varnishd() [0x44e46c]
  0x44eb79: /opt/varnish/sbin/varnishd() [0x44eb79]
  0x4eec40: /opt/varnish/sbin/varnishd(VAS_Fail+0x53) [0x4eec40]
  0x448264: /opt/varnish/sbin/varnishd() [0x448264]
  0x7f41e457c400: /lib64/libc.so.6(+0x36400) [0x7f41e457c400]
  0x7f41e491fa33: /lib64/libpthread.so.0(pthread_cond_wait+0xc3) [0x7f41e491fa33]
  0x447169: /opt/varnish/sbin/varnishd(Lck_CondWaitUntil+0x141) [0x447169]
  0x446f57: /opt/varnish/sbin/varnishd(Lck_CondWait+0x27) [0x446f57]
  0x7f41c358cd0c: ./vmod_cache/_vmod_dynamic.29b766e616262ab0ef9252419367ce48fbf642168ba34fc382b1742a77dac5c4(+0x7d0c) [0x7f41c358cd0c]
  0x42b27b: /opt/varnish/sbin/varnishd(VRT_Healthy+0x15e) [0x42b27b]
  0x7f41c4fc4591: ./vmod_cache/_vmod_directors.edab1c2b46260ba7857d12e0b786d29b78460cd930959505a67272486eac4683(+0x3591) [0x7f41c4fc4591]
  0x7f41c4fc4d37: ./vmod_cache/_vmod_directors.edab1c2b46260ba7857d12e0b786d29b78460cd930959505a67272486eac4683(+0x3d37) [0x7f41c4fc4d37]
  0x42b6f1: /opt/varnish/sbin/varnishd() [0x42b6f1]
  0x45ff11: /opt/varnish/sbin/varnishd() [0x45ff11]
  0x46017b: /opt/varnish/sbin/varnishd(VCL_IterDirector+0x23e) [0x46017b]
  0x42bf2c: /opt/varnish/sbin/varnishd() [0x42bf2c]
  0x4f077a: /opt/varnish/sbin/varnishd() [0x4f077a]
  0x4f0aee: /opt/varnish/sbin/varnishd() [0x4f0aee]
  0x4f1098: /opt/varnish/sbin/varnishd() [0x4f1098]
  0x4f2183: /opt/varnish/sbin/varnishd(VCLS_Poll+0x2c6) [0x4f2183]
nigoroll commented 1 year ago

core analysis:

(gdb) bt
#0  0x00007f41e457c387 in raise () from /lib64/libc.so.6
#1  0x00007f41e457da78 in abort () from /lib64/libc.so.6
#2  0x000000000044edb6 in pan_ic (func=0x51b5d4 <__func__.9484> "cli_quit", file=0x51b03b "cache/cache_main.c", line=375, 
    cond=0x51b335 "It's time for the big quit", kind=VAS_WRONG) at cache/cache_panic.c:848
#3  0x00000000004eec40 in VAS_Fail (func=0x51b5d4 <__func__.9484> "cli_quit", file=0x51b03b "cache/cache_main.c", line=375, 
    cond=0x51b335 "It's time for the big quit", kind=VAS_WRONG) at vas.c:67
#4  0x0000000000448264 in cli_quit (sig=3) at cache/cache_main.c:375
#5  <signal handler called>
#6  0x00007f41e491fa33 in pthread_cond_wait@@GLIBC_2.3.2 () from /lib64/libpthread.so.0
#7  0x0000000000447169 in Lck_CondWaitUntil (cond=0x7f41bd046980, lck=0x7f41bd046948, when=0) at cache/cache_lck.c:239
#8  0x0000000000446f57 in Lck_CondWait (cond=0x7f41bd046980, lck=0x7f41bd046948) at cache/cache_lck.c:212
#9  0x00007f41c358cd0c in dom_healthy (ctx=0x79a740 <ctx_cli>, d=<optimized out>, changed=0x7ffcac728988) at vmod_dynamic.c:310
#10 0x000000000042b27b in VRT_Healthy (ctx=0x79a740 <ctx_cli>, d=0x7f41bd250970, changed=0x7ffcac728988) at cache/cache_director.c:251
#11 0x00007f41c4fc4591 in vdir_update_health (ctx=0x79a740 <ctx_cli>, vd=0x7f41e3cbf900) at vmod_directors.c:344
#12 0x00007f41c4fc4d37 in vmod_fallback_list (ctx=0x79a740 <ctx_cli>, dir=0x7f41bd249880, vsb=0x7f41e3f2a790, pflag=0, jflag=0)
    at vmod_directors_fall_back.c:92
#13 0x000000000042b6f1 in do_list (cli=0x7f41e3cc0830, d=0x7f41bd249880, priv=0x7ffcac728b80) at cache/cache_director.c:342
#14 0x000000000045ff11 in vcl_iterdir (cli=0x7f41e3cc0830, pat=0x7f41bd274540 "reload-varnish_20230712071723_4427.*", 
    vcl=0x7f41e3ffa7a0, func=0x42b52a <do_list>, priv=0x7ffcac728b80) at cache/cache_vcl.c:384
#15 0x000000000046017b in VCL_IterDirector (cli=0x7f41e3cc0830, pat=0x7f41e3d511c0 "reload-varnish_20230712071723_4427.*", 
    func=0x42b52a <do_list>, priv=0x7ffcac728b80) at cache/cache_vcl.c:423
#16 0x000000000042bf2c in cli_backend_list (cli=0x7f41e3cc0830, av=0x7f41e3c1d600, priv=0x0) at cache/cache_director.c:448
#17 0x00000000004f077a in cls_dispatch (cli=0x7f41e3cc0830, cp=0x793d20 <backend_cmds>, av=0x7f41e3c1d600, ac=2) at vcli_serve.c:255
#18 0x00000000004f0aee in cls_exec (cfd=0x7f41e3cc0800, av=0x7f41e3c1d600) at vcli_serve.c:320
#19 0x00000000004f1098 in cls_feed (cfd=0x7f41e3cc0800, 
    p=0x7ffcac728d52 "\noad-varnish_20230712071723_4427.1689146243.162252/vgc.so 1auto\nND__", 
    e=0x7ffcac728d53 "oad-varnish_20230712071723_4427.1689146243.162252/vgc.so 1auto\nND__") at vcli_serve.c:416
#20 0x00000000004f2183 in VCLS_Poll (cs=0x7f41e3cc5fb0, cli=0x7f41e3cc0830, timeout=-1) at vcli_serve.c:608
#21 0x0000000000426ab4 in CLI_Run () at cache/cache_cli.c:110
#22 0x00000000004485dd in child_main (sigmagic=1, altstksz=131072) at cache/cache_main.c:465
#23 0x000000000049cae7 in mgt_launch_child (cli=0x0) at mgt/mgt_child.c:408
#24 0x000000000049d98f in MCH_Start_Child () at mgt/mgt_child.c:703
#25 0x00000000004a44a8 in main (argc=56, argv=0x7ffcac72b0e8) at mgt/mgt_main.c:959

(gdb) thr 166
[Switching to thread 166 (Thread 0x7f415be42700 (LWP 4447))]
#0  0x00007f41e492254d in __lll_lock_wait () from /lib64/libpthread.so.0
(gdb) bt
#0  0x00007f41e492254d in __lll_lock_wait () from /lib64/libpthread.so.0
#1  0x00007f41e491de9b in _L_lock_883 () from /lib64/libpthread.so.0
#2  0x00007f41e491dd68 in pthread_mutex_lock () from /lib64/libpthread.so.0
#3  0x00000000004469e9 in Lck__Lock (lck=0x79b690 <vcl_mtx>, p=0x5260b0 <__func__.8910> "VRT_AddDirector", l=211)
    at cache/cache_lck.c:124
#4  0x0000000000474e94 in VRT_AddDirector (ctx=0x7f415be41a10, m=0x512b40 <vbe_methods>, priv=0x7f415cc12100, fmt=0x512ecf "%s")
    at cache/cache_vrt_vcl.c:211
#5  0x000000000041d04f in VRT_new_backend_clustered (ctx=0x7f415be41a10, vc=0x0, vrt=0x7f415be41930, via=0x7f41e3fed880)
    at cache/cache_backend.c:738
#6  0x000000000041d1a7 in VRT_new_backend (ctx=0x7f415be41a10, vrt=0x7f415be41930, via=0x7f41e3fed880) at cache/cache_backend.c:756
#7  0x00007f41c358b16d in ref_add (r=0x7f415ac12040, ctx=0x7f415be41a10) at vmod_dynamic.c:605
#8  dom_update (res=<optimized out>, now=<optimized out>, priv=<optimized out>, dom=0x7f41bd046900) at vmod_dynamic.c:716
#9  dom_lookup_thread (priv=0x7f41bd046900) at vmod_dynamic.c:796
#10 0x00007f41e491bea5 in start_thread () from /lib64/libpthread.so.0
#11 0x00007f41e4644b0d in clone () from /lib64/libc.so.6

So a backend.list CLI command checks the health state. dom_healthy encounters a delayed creation and blocks here:

https://github.com/nigoroll/libvmod-dynamic/blob/aa7919e76a3184d158a4d7f1698ad8925353d4b4/src/vmod_dynamic.c#L309-L310

while at the same time, creation of the backend is waiting for the cli_mtx.

So basically a similar problem as the one which we handled in 821ad8f5bb26f77f9f377cd19a60f810afbf65cc: The list callback can depend on the cli_mtx, also not indirectly.