varnishcache / varnish-cache

Varnish Cache source code repository
https://www.varnish-cache.org
Other
3.69k stars 377 forks source link

esi:includes should not try minimal responses #4022

Closed xcir closed 12 months ago

xcir commented 12 months ago

Expected Behavior

No panic.

Even if it is some kind of error, 503 VCL failed is preferable instead of panic.

Current Behavior

ubuntu@px02:/etc/varnish$ sudo varnishadm panic.show
Panic at: Fri, 17 Nov 2023 03:45:38 GMT
Wrong turn at cache/cache_esi_deliver.c:81:
esi:includes should not try minimal responses
version = varnish-7.4.2 revision cd1d10ab53a6f6115b2b4f3b2a1da94c1f749f80, vrt api = 18.0
ident = Linux,5.15.0-52-generic,x86_64,-junix,-smalloc,-sfile,-sfile,-sfile,-sfile,-sdefault,-hcritbit,epoll
now = 8280.937522 (mono), 1700192737.416492 (real)
Backtrace:
  0x5649e5bb0b62: /usr/sbin/varnishd(+0x5cb62) [0x5649e5bb0b62]
  0x5649e5c30ad9: /usr/sbin/varnishd(VAS_Fail+0x49) [0x5649e5c30ad9]
  0x5649e5b9270c: /usr/sbin/varnishd(+0x3e70c) [0x5649e5b9270c]
  0x5649e5bb8ca9: /usr/sbin/varnishd(+0x64ca9) [0x5649e5bb8ca9]
  0x5649e5bb91c0: /usr/sbin/varnishd(CNT_Request+0xf0) [0x5649e5bb91c0]
  0x5649e5b94400: /usr/sbin/varnishd(+0x40400) [0x5649e5b94400]
  0x5649e5b9040c: /usr/sbin/varnishd(VDP_bytes+0x8c) [0x5649e5b9040c]
  0x5649e5c0cf23: /usr/sbin/varnishd(+0xb8f23) [0x5649e5c0cf23]
  0x5649e5b90bac: /usr/sbin/varnishd(VDP_DeliverObj+0x6c) [0x5649e5b90bac]
  0x5649e5be9a24: /usr/sbin/varnishd(h2_deliver+0x8b4) [0x5649e5be9a24]
  0x5649e5bb6f31: /usr/sbin/varnishd(+0x62f31) [0x5649e5bb6f31]
  0x5649e5bb91c0: /usr/sbin/varnishd(CNT_Request+0xf0) [0x5649e5bb91c0]
  0x5649e5beb95b: /usr/sbin/varnishd(h2_do_req+0x5b) [0x5649e5beb95b]
  0x5649e5bd87f2: /usr/sbin/varnishd(+0x847f2) [0x5649e5bd87f2]
  0x5649e5bd8dd8: /usr/sbin/varnishd(+0x84dd8) [0x5649e5bd8dd8]
  0x7fa6bb764ac3: /lib/x86_64-linux-gnu/libc.so.6(+0x94ac3) [0x7fa6bb764ac3]
  0x7fa6bb7f6a40: /lib/x86_64-linux-gnu/libc.so.6(+0x126a40) [0x7fa6bb7f6a40]
argv = {
  [0] = \"/usr/sbin/varnishd\",
  [1] = \"-P\",
  [2] = \"/run/varnish/varnishd.pid\",
  [3] = \"-f\",
  [4] = \"/etc/varnish/default.vcl\",
  [5] = \"-a\",
  [6] = \":80\",
  [7] = \"-T\",
  [8] = \"****:6082\",
  [9] = \"-s\",
  [10] = \"memory=malloc,100M\",
  [11] = \"-s\",
  [12] = \"file=file,/var/lib/varnish/varnish_storage.bin,1G,8k\",
  [13] = \"-s\",
  [14] = \"file2=file,/var/lib/varnish/varnish_storage2.bin,1G,8k\",
  [15] = \"-s\",
  [16] = \"file2_img=file,/var/lib/varnish/varnish_storage2-img.bin,15G,16k\",
  [17] = \"-s\",
  [18] = \"file2_img_thumb=file,/var/lib/varnish/varnish_storage2-img-thumb.bin,15G,16k\",
  [19] = \"-a\",
  [20] = \"****:88,PROXY\",
  [21] = \"-p\",
  [22] = \"feature=+http2\",
  [23] = \"-p\",
  [24] = \"ping_interval=1\",
  [25] = \"-p\",
  [26] = \"vcc_err_unref=off\",
  [27] = \"-p\",
  [28] = \"nuke_limit=6000\",
  [29] = \"-p\",
  [30] = \"first_byte_timeout=300\",
  [31] = \"-p\",
  [32] = \"idle_send_timeout=300\",
  [33] = \"-p\",
  [34] = \"shortlived=4\",
  [35] = \"-p\",
  [36] = \"pipe_timeout=300\",
  [37] = \"-p\",
  [38] = \"feature=+esi_disable_xml_check\",
  [39] = \"-p\",
  [40] = \"thread_pool_timeout=150\",
  [41] = \"-p\",
  [42] = \"thread_pool_min=180\",
  [43] = \"-p\",
  [44] = \"thread_pool_max=2000\",
  [45] = \"-p\",
  [46] = \"thread_queue_limit=60\",
  [47] = \"-p\",
  [48] = \"h2_max_concurrent_streams=150\",
  [49] = \"-p\",
  [50] = \"between_bytes_timeout=120\",
  [51] = \"-p\",
  [52] = \"http_req_size=48k\",
  [53] = \"-p\",
  [54] = \"http_req_hdr_len=12k\",
  [55] = \"-p\",
  [56] = \"workspace_client=164k\",
  [57] = \"-p\",
  [58] = \"workspace_backend=164k\",
  [59] = \"-p\",
  [60] = \"http_max_hdr=128\",
}
pthread.self = 0x7fa6b232b640
pthread.name = (cache-worker)
pthread.attr = {
  guard = 4096,
  stack_bottom = 0x7fa6b2318000,
  stack_top = 0x7fa6b232c000,
  stack_size = 81920,
}
thr.req = 0x7f9ea8b61060 {
  vxid = 851990, transport = ESI_INCLUDE
  step = Req Step synth
  req_body = none,
  err_code = 503, err_reason = VCL failed,
  restarts = 0, esi_level = 1,
  vary_b = (nil), vary_e = (nil),
  d_ttl = -1.000000, d_grace = -1.000000,
  storage = (nil),
  director_hint = 0x7fa6bb0200d0 {
    cli_name = reload_20231117_034408_80643.ws03,
    admin_health = probe, changed = 1700192736.023445,
    type = backend {
      conn_pool = 0x7fa6bb047cc0 {
        ident = 0x99fa223d9285b8b9ff85d6569e4ff9eb2dfc4e5b50cf0905e8cc6f89b384bfa5,
        vrt_endpoint = 0x7fa6bb1679b0 {
          ipv4 = ****, port = 80,
        },
      },
      hosthdr = ****,
      n_conn = 1,
    },
  },
  sess = 0x7fa6b2f1a320 {
    fd = 54, vxid = 32794,
    t_open = 1700192737.339832,
    t_idle = 1700192737.403753,
    ws = 0x7fa6b2f1a360 {
      id = \"ses\",
      {s, f, r, e} = {0x7fa6b2f1a3b8, +264, (nil), +576},
    },
    transport = HTTP/2 {
      h2_sess = 0x7fa6b233d870 {
        refcnt = 2, bogosity = 0, error = (null)
        open_streams = 0, highest_stream = 1, goaway_last_stream = 0,
        local_settings = {0x1000, 0x1, 0x96, 0xffff, 0x4000, 0x7fffffff},
        remote_settings = {0x1000, 0x0, 0x1, 0x800000, 0x10000, 0x7fffffff},
        {rxf_len, rxf_type, rxf_flags, rxf_stream} = {4, 3, 0x0, 1},
        stream = 0x7f9ea8bc91d8 {
          id = 0, state = idle,
          h2_sess = 0x7fa6b233d870, scheduled = 0, error = (null),
          t_send = 0.000000, t_winupd = 0.000000,
          t_window = 8381959, r_window = 65535,
          rxbuf = NULL
        },
        stream = 0x7f9ea8b986d8 {
          id = 1, state = c-rem,
          h2_sess = 0x7fa6b233d870, scheduled = 1, error = CANCEL,
          t_send = 1700192737.375131, t_winupd = 0.000000,
          t_window = 8381959, r_window = 65535,
          rxbuf = NULL
        },
    }
    client = **** 58294 ****:88,
    local.endpoint = ****:88,
    local.socket = a1,
    local.ip = ****:88,
    remote.ip = ****:26312,
    server.ip = ****:443,
    client.ip = ****:58294,
  },
  worker = 0x7fa6b23294f0 {
    ws = 0x7fa6b2329578 {
      id = \"wrk\",
      {s, f, r, e} = {0x7fa6b2328a60, +0, (nil), +2040},
    },
    VCL::method = MISS,
    VCL::methods = {RECV, HASH, MISS},
  },
  ws = 0x7f9ea8b611c0 {
    id = \"req\",
    {s, f, r, e} = {0x7f9ea8b66d98, +64, (nil), +144032},
  },
  http_conn = 0x7f9ea8b66d10 {
    doclose = null(Not Closing)
    ws = NULL
    {rxbuf_b, rxbuf_e} = {(nil), (nil)},
    {pipeline_b, pipeline_e} = {(nil), (nil)},
    content_length = 0,
    body_status = NULL,
    first_byte_timeout = 0.000000,
    between_bytes_timeout = 0.000000,
  },
  http[req] = 0x7f9ea8b61260 {
    ws = 0x7f9ea8b611c0 {
      [Already dumped, see above]
    },
    hdrs {
      \"GET\",
      ***,
    },
  },
  http[resp] = 0x7f9ea8b623d0 {
    ws = 0x7f9ea8b611c0 {
      [Already dumped, see above]
    },
    hdrs {
      \"HTTP/1.1\",
      \"503\",
      \"VCL failed\",
      \"Date: Fri, 17 Nov 2023 03:45:37 GMT\",
      \"Server: Varnish\",
      \"X-Varnish: 851990\",
    },
  },
  vdc = 0x7f9ea8b66cd0 {
    .magic = 0x00000000 EXPECTED: VDP_CTX_MAGIC=0xee501df7
  }
  vcl[vcl] = 0x7fa6bb04a9c0 {
    name = \"reload_20231117_034408_80643\",
    busy = 20,
    discard = 0,
    state = auto,
    temp = warm,
    conf = 0x7fa397de5d00 {
      syntax = \"41\",
      srcname = {
        [0] = \"/etc/varnish/default.vcl\",
        [1] = \"/etc/varnish/health.vcl\",
        [2] = \"/etc/varnish/banip.vcl\",
        [3] = \"/etc/varnish/define.vcl\",
        [4] = \"/etc/varnish/auth.vcl\",
        [5] = \"/etc/varnish/prefilter.vcl\",
        [6] = \"/etc/varnish/ext.vcl\",
        [7] = \"/etc/varnish/***.vcl\",
        [8] = \"/etc/varnish/***.vcl\",
        [9] = \"/etc/varnish/***.vcl\",
        [10] = \"/etc/varnish/***.vcl\",
        [11] = \"/etc/varnish/main.vcl\",
        [12] = \"<builtin>\",
      },
      instances = {
        ***
      },
    },
  },
  vpi = 0x7fa6b2328a50 {
    handling (VCL::return) = 0x4 (fail),
    ref = 536,
    vpi_ref = 0x7fa397de5940 {
      source = 12 (\"<builtin>\"),
      offset = 3675,
      line = 155,
      pos = 1,
      src = \"}\"
    },
  },
  flags = {
    req_reset,
  },
  privs = 0x7f9ea8b61250 {
  },
  top = 0x7f9ea8b986b8 {
    req = 0x7f9ea8b929a0 {
      vxid = 32795, transport = HTTP/2
      step = Req Step transmit
      req_body = none,
      restarts = 0, esi_level = 0,
      vary_b = (nil), vary_e = (nil),
      d_ttl = -1.000000, d_grace = -1.000000,
      storage = (nil),
      director_hint = 0x7fa6b51f52d0 {
        cli_name = reload_20231117_034408_80643.ws_rand,
        admin_health = probe, changed = 1700192736.037162,
        type = random {
        },
      },
      sess = 0x7fa6b2f1a320 {
        [Already dumped, see above]
      },
      worker = 0x7fa6b23294f0 {
        [Already dumped, see above]
      },
      ws = 0x7f9ea8b92b00 {
        id = \"req\",
        {s, f, r, e} = {0x7f9ea8b986d8, +1944, (nil), +144032},
      },
      http_conn = 0x7f9ea8b98650 {
        doclose = null(Not Closing)
        ws = NULL
        {rxbuf_b, rxbuf_e} = {(nil), (nil)},
        {pipeline_b, pipeline_e} = {(nil), (nil)},
        content_length = 0,
        body_status = NULL,
        first_byte_timeout = 0.000000,
        between_bytes_timeout = 0.000000,
      },
      http[req] = 0x7f9ea8b92ba0 {
        ws = 0x7f9ea8b92b00 {
          [Already dumped, see above]
        },
        hdrs {
          \"GET\",
          ***
        },
      },
      http[resp] = 0x7f9ea8b93d10 {
        ws = 0x7f9ea8b92b00 {
          [Already dumped, see above]
        },
        hdrs {
          \"HTTP/1.1\",
          \"200\",
          \"OK\",
          ***
          \"cache-control: no-store, no-cache, must-revalidate, private\",
        },
      },
      vdc = 0x7f9ea8b98610 {
        nxt = 0x7f9ea8b98e38,
        retval = 0,
        filters = {
          esi = 0x7f9ea8b98e00 { priv = 0x7fa6b5216c00 }
          H2B = 0x7f9ea8b98e38 { priv = 0x7f9ea8b986d8 }
        },
      },
      vcl[vcl] = 0x7fa6bb04a9c0 {
        [Already dumped, see above]
      },
      vpi = 0x7fa6b2328a50 {
        [Already dumped, see above]
      },
      objcore[REQ] = 0x7f9ea993e000 {
        refcnt = 1,
        flags = {hfm, private},
        exp_flags = {},
        exp = {1700192737.373705, 180.000000, 3600.000000, 0.000000},
        objhead = 0x7fa6bb0d9150,
        stevedore = 0x7fa6bb03f700 (malloc Transient) {
          Simple = 0x7f9ea9969000,
          Obj = 0x7fa6b521ff18 {priv=0x7fa6b521ff10, ptr=0x7f9ea9969000, len=752, space=752},
          LEN = 0x000000000000158c,
          VXID = 0x00000000000d0001,
          FLAGS = 0x16,
          GZIPBITS = 0x0000000000000050000000000000ac10000000000000ac1a00000000000027e8,
          LASTMODIFIED = 0x41d955b878400000,
          VARY = {len=0, ptr=(nil)},
          HEADERS = {len=624, ptr=0x7f9ea9969080},
          ESIDATA = 0x7f9ea97ea058 {priv=0x7f9ea97ea050, ptr=0x7f9ea9739800, len=1308, space=1308},
          Body = 0x7f9ea97ea0a8 {priv=0x7f9ea97ea0a0, ptr=0x7fa6bb0fd400, len=2091, space=2091},
        },
      },
      flags = {
      },
      privs = 0x7f9ea8b92b90 {
        priv = 0x7f9ea8b98c40 {{p (nil) l 0 m (nil) t \"\"} vmod 7fa397de7d48},
        priv = 0x7f9ea8b98960 {{p 0x7fa6b521a980 l 0 m 0x7fa6adaa1d30 t \"vmod_var_priv_task\"} vmod 7fa397de8048},
      },
      top = 0x7f9ea8b986b8 {
        [Already dumped, see above]
      },
    },
    privs = 0x7f9ea8b986d0 {
    },
    vcl[vcl0] = NULL
  },
},
thr.busyobj = NULL
thr.worker = 0x7fa6b23294f0 {
  [Already dumped, see above]
},
vmods = {
  std = {0x7fa6bb0d91c0, Varnish 7.4.2 cd1d10ab53a6f6115b2b4f3b2a1da94c1f749f80, 0.0},
  directors = {0x7fa6bb0d9230, Varnish 7.4.2 cd1d10ab53a6f6115b2b4f3b2a1da94c1f749f80, 0.0},
  vsthrottle = {0x7fa6bb0d9310, Varnish 7.4.2 cd1d10ab53a6f6115b2b4f3b2a1da94c1f749f80, 18.0},
  tcp = {0x7fa6bb0d93f0, Varnish 7.4.2 cd1d10ab53a6f6115b2b4f3b2a1da94c1f749f80, 0.0},
  digest = {0x7fa6bb0d9460, Varnish 7.4.2 cd1d10ab53a6f6115b2b4f3b2a1da94c1f749f80, 0.0},
  xcounter = {0x7fa6bb0d9540, Varnish 7.4.2 cd1d10ab53a6f6115b2b4f3b2a1da94c1f749f80, 18.0},
  accept = {0x7fa6bb0d9620, Varnish 7.4.2 cd1d10ab53a6f6115b2b4f3b2a1da94c1f749f80, 0.0},
  xkey = {0x7fa6bb0d9700, Varnish 7.4.2 cd1d10ab53a6f6115b2b4f3b2a1da94c1f749f80, 0.0},
  var = {0x7fa6bb0d97e0, Varnish 7.4.2 cd1d10ab53a6f6115b2b4f3b2a1da94c1f749f80, 18.0},
  cookie = {0x7fa6bb0d9850, Varnish 7.4.2 cd1d10ab53a6f6115b2b4f3b2a1da94c1f749f80, 0.0},
},
pools = {
  pool = 0x7fa6b3c00000 {
    nidle = 167,
    nthr = 180,
    lqueue = 0
  },
  pool = 0x7fa6b3c00640 {
    nidle = 167,
    nthr = 180,
    lqueue = 0
  },
},

varnishstat -1 https://gist.github.com/xcir/767063b6d64bf86eb9e3c5fa64453200

Possible Solution

No response

Steps to Reproduce (for bugs)

No response

Context

I have raised the version from 7.3.0 to 7.4.2 on a proxy that uses ESI and it caused a panic. I think it is some kind of race condition because nothing happened when I looked at the panicked URL again, but I can't think of the cause at all at the moment. The frequency was relatively high, and the panics occurred within a few minutes.

image

Varnish Cache version

varnishd (varnish-7.4.2 revision cd1d10ab53a6f6115b2b4f3b2a1da94c1f749f80)

Operating system

ubuntu22.04.3

Source of binary packages used (if any)

No response

xcir commented 12 months ago

I set vcl_req_reset feature to disable and it looks like it no longer causes panics. (I'm still waiting to see what happens.) Looking at some of the panics, it appears that the error occurs when the H2 reset(error=CANCEL) is issued and includes ESI processing.

dridi commented 12 months ago

Can you try an explicit return (fail) from an ESI task?

xcir commented 12 months ago

created vtc. https://gist.github.com/xcir/5d6df8e2a82f3d5e60505b710c0e3b47

ubuntu@proxy04:~/vtc$ diff esi.vtc esi.p.vtc 
5c5
< #varnish v1 -cliok "param.set feature -vcl_req_reset"
---
> varnish v1 -cliok "param.set feature -vcl_req_reset"

###+vcl_req_reset
ubuntu@proxy04:~/vtc$ sudo varnishtest esi.vtc |tail -n1
#    top  TEST esi.vtc FAILED (1.178) exit=2

###-vcl_req_reset
ubuntu@proxy04:~/vtc$ sudo varnishtest esi.p.vtc 
#    top  TEST esi.p.vtc passed (1.221)

remove this comment out, it works fine.

xcir commented 12 months ago

@Dridi Sorry, I missed it. If the VTC is not enough, I will do it.

dridi commented 12 months ago

Simpler reproducer:

--- i/bin/varnishtest/tests/e00003.vtc
+++ w/bin/varnishtest/tests/e00003.vtc
@@ -21,11 +21,13 @@ server s1 {

 varnish v1 -vcl+backend {
        sub vcl_synth {
+               if (req.esi_level > 0) { return (fail); }
                set resp.body = """
                """;
                return (deliver);
        }
        sub vcl_recv {
+               if (req.esi_level > 0) { return (fail); }
                if (req.esi_level > 0) {
                        set req.url = req.url + req.esi_level;
                        if (req.url ~ "^/synth") {

When the circuit breaker kicks in, it emulates a return (fail) that goes first to vcl_synth but failing ESI from vcl_synth will panic. The vcl_req_reset flag exposed that since it will act as a circuit breaker again before entering vcl_synth.

dridi commented 12 months ago

Here is a patch that went in Varnish Enterprise for a different reason due to the parallel nature of ESI there:

--- i/bin/varnishd/cache/cache_req_fsm.c
+++ w/bin/varnishd/cache/cache_req_fsm.c
@@ -333,7 +333,11 @@ cnt_synth(struct worker *wrk, struct req *req)

        VSLb_ts_req(req, "Process", W_TIM_real(wrk));

-       if (wrk->vpi->handling == VCL_RET_FAIL) {
+       while (wrk->vpi->handling == VCL_RET_FAIL) {
+               if (req->esi_level > 0) {
+                       wrk->vpi->handling = VCL_RET_DELIVER;
+                       break;
+               }
                VSB_destroy(&synth_body);
                (void)VRB_Ignore(req);
                (void)req->transport->minimal_response(req, 500);

I overlooked it while porting the patch series here based on what this was solving according to the commit message. I did not realize the problem with parallel execution of ESI was hiding another problem with serial ESI.

dridi commented 12 months ago

Bugwash: push patch with test coverage.