apache / trafficserver

Apache Traffic Server™ is a fast, scalable and extensible HTTP/1.1 and HTTP/2 compliant caching proxy server.
https://trafficserver.apache.org/
Apache License 2.0
1.82k stars 804 forks source link

H3 crash HQSession::~HQSession #9783

Closed bryancall closed 8 months ago

bryancall commented 1 year ago
(gdb) bt full
#0  __GI_raise (sig=sig@entry=6) at ../sysdeps/unix/sysv/linux/raise.c:50
        set = {__val = {0, 11014569033467807663, 3253139319627950568, 173988353035902449, 1, 13979173243358019584, 579005069656919567, 283686952306183, 9006311663777206111,
            4383123015962362557, 7951664624561075015, 985160869559618686, 0, 0, 0, 0}}
        pid = <optimized out>
        tid = <optimized out>
        ret = <optimized out>
#1  0x00007f8305540859 in __GI_abort () at abort.c:79
        save_stage = 1
        act = {__sigaction_handler = {sa_handler = 0x0, sa_sigaction = 0x0}, sa_mask = {__val = {0 <repeats 16 times>}}, sa_flags = 0, sa_restorer = 0x7f82f4120420}
        sigs = {__val = {32, 0 <repeats 15 times>}}
#2  0x00007f83062fdff1 in ink_abort (message_format=0x7f830636b040 "%s:%d: failed assertion `%s`") at ink_error.cc:99
        ap = {{gp_offset = 32, fp_offset = 48, overflow_arg_area = 0x7f83028a7d80, reg_save_area = 0x7f83028a7cc0}}
#3  0x00007f83062f8e9d in _ink_assert (expression=0x5581803b5240 "this->_transaction_list.head == nullptr", file=0x5581803b5230 "Http3Session.cc", line=42) at ink_assert.cc:37
No locals.
#4  0x000055818029739c in HQSession::~HQSession (this=0x7f82f40e93c0, __vtt_parm=0x5581805552e0 <VTT for Http3Session+8>, __in_chrg=<optimized out>) at Http3Session.cc:42
No locals.
#5  0x0000558180297d30 in Http3Session::~Http3Session (this=0x7f82f40e93c0, __in_chrg=<optimized out>, __vtt_parm=<optimized out>) at Http3Session.cc:179
No locals.
#6  0x0000558180297d7c in Http3Session::~Http3Session (this=0x7f82f40e93c0, __in_chrg=<optimized out>, __vtt_parm=<optimized out>) at Http3Session.cc:184
No locals.
#7  0x000055818029065c in Http3App::~Http3App (this=0x7f82f41249a0, __in_chrg=<optimized out>) at Http3App.cc:65
No locals.
#8  0x000055818029071e in Http3App::~Http3App (this=0x7f82f41249a0, __in_chrg=<optimized out>) at Http3App.cc:68
No locals.
#9  0x0000558180218484 in QUICApplicationMap::~QUICApplicationMap (this=0x7f82f4120420, __in_chrg=<optimized out>) at quic/QUICApplicationMap.h:41
        it = <optimized out>
#10 0x0000558180214cb2 in QUICNetVConnection::free (this=0x7f82980ae570, t=0x5581827d0840) at QUICNetVConnection_quiche.cc:118
        __FUNCTION__ = "free"
#11 0x00005581801ecca8 in NetHandler::free_netevent (this=0x5581827d4c40, ne=0x7f82980ae758) at UnixNet.cc:357
        t = 0x5581827d0840
#12 0x00005581801f02bc in InactivityCop::check_inactivity (this=0x7f82f40a0c90, event=2, e=0x7f82f40a0d20) at UnixNet.cc:74
        ne = 0x7f82980ae758
        lock = {m = {m_ptr = 0x7f829be01aa0}, lock_acquired = true}
        now = 1686069249361633384
        nh = @0x5581827d4c40: {<Continuation> = {<force_VFPT_to_top> = {_vptr.force_VFPT_to_top = 0x558180546870 <vtable for NetHandler+16>},
            handler = (int (Continuation::*)(Continuation * const, int, void *)) 0x5581801ed02c <NetHandler::mainNetEvent(int, Event*)>,
            handler_name = 0x558180388d14 "&NetHandler::mainNetEvent", mutex = {m_ptr = 0x7f82f40a0c10}, link = {<SLink<Continuation>> = {next = 0x0}, prev = 0x0},
            control_flags = {raw_flags = 0}, thread_affinity = 0x0}, <EThread::LoopTailHandler> = {_vptr.LoopTailHandler = 0x5581805468a0 <vtable for NetHandler+64>},
          thread = 0x5581827d0840, trigger_event = 0x0, read_ready_list = {<DLL<NetEvent, NetEvent::Link_read_ready_link>> = {head = 0x0}, tail = 0x0},
          write_ready_list = {<DLL<NetEvent, NetEvent::Link_write_ready_link>> = {head = 0x0}, tail = 0x0}, open_list = {<DLL<NetEvent, NetEvent::Link_open_link>> = {
              head = 0x7f82f53ab298}, tail = 0x7f82980a8328}, cop_list = {head = 0x0}, read_enable_list = {al = {head = {s = {pointer = 0x1, version = 117},
                data = 2158269056624017539073}, name = 0x558180389583 "AtomicSLL", offset = 136}}, write_enable_list = {al = {head = {s = {pointer = 0x1, version = 0},
                data = 1}, name = 0x558180389583 "AtomicSLL", offset = 248}}, keep_alive_queue = {<DLL<NetEvent, NetEvent::Link_keep_alive_queue_link>> = {
              head = 0x7f82f53b2fa8}, tail = 0x7f82f53ab298}, keep_alive_queue_size = 4, active_queue = {<DLL<NetEvent, NetEvent::Link_active_queue_link>> = {head = 0x0},
            tail = 0x0}, active_queue_size = 0, uring_evio = {fd = -1, event_loop = 0x0, syscall = true, type = 0, data = {untyped = 0x0, ne = 0x0, dnscon = 0x0, na = 0x0,
              uc = 0x0, dh = 0x0}}, static global_config = {max_connections_in = 30000, max_requests_in = 0, inactive_threshold_in = 0,
            transaction_no_activity_timeout_in = 0, keep_alive_no_activity_timeout_in = 0, default_inactivity_timeout = 86400}, config = {max_connections_in = 30000,
            max_requests_in = 0, inactive_threshold_in = 0, transaction_no_activity_timeout_in = 0, keep_alive_no_activity_timeout_in = 0,
            default_inactivity_timeout = 86400}, max_connections_per_thread_in = 3750, max_requests_per_thread_in = 0, static CONFIG_ITEM_COUNT = 6,
          static config_value_affects_per_thread_value = {<std::_Base_bitset<1>> = {_M_w = 3}, <No data fields>}, static active_thread_types = {<std::_Base_bitset<1>> = {
              _M_w = 5}, <No data fields>}}
        __FUNCTION__ = "check_inactivity"
maskit commented 1 year ago

H3Transaction removes itself from _transaction_list by calling H3Session::remove_transaction in its destructor, but the destructor is not called because the transaction is still on going (i.e. HttpSM has not called transaction_done) although there's no traffic.

We need to stop the transaction and delete the H3Transaction instance first if inactivity timeout kicks in.

bryancall commented 1 year ago

image

maskit commented 1 year ago

I looked into this a bit more. My comment above was not very accurate (the abort happens even if transaction_done() is called by HttpSM).

There are (at least) two issues around this. When inactive timeout happens on a QUIC connection:

  1. The event is not propagated to streams on the connection due to lack of implementation, and transactions keep running without knowing connection close due to timeout. This causes the abort. RESET_STREAM frame, which closes one stream, can cause a similar result too.
  2. The event is not propagated to the session because there is no path to propagate events from QUICNetVC to Http3App/Session. Although closing streams/transactions at the destructors is doable, that would be the last resort if we have no idea why transactions are not closed. When inactive timeout happens, Http3App/Session should notify the event to all transactions.

On H1 and H2, their session directly interacts SSLNetVC, so they call SSLNetVC::do_io_read/write and that establishes a way to propagate events. On H3, H3App/Session cannot directly read/write data from/to QUICNetVC (everything is done on streams), so it seemed like there's no need to call do_io_read/write. But we probably need to call them to receive events.

duke8253 commented 11 months ago

Just documenting how I regenerated this crash.

I'm running h2load with the following command ./bin/h2load -n 10000 -c 100 -m 10 --npn-list=h3 https://127.0.0.1:9090. where 9090 is the port I set for QUIC. This command should return fail for all requests, since the path isn't specified like it should for the generator.so plugin. ATS crashes after a while with the following backtrace.

#0  0x000014c5bbf38acf in raise () from /lib64/libc.so.6
#1  0x000014c5bbf0bea5 in abort () from /lib64/libc.so.6
#2  0x0000000000913ffe in ink_abort (message_format=0xd570bc "%s:%d: failed assertion `%s`") at /home/feid/git/duke-ats/src/tscore/ink_error.cc:99
#3  0x000000000091197d in _ink_assert (expression=0xe5d020 "this->_transaction_list.head == nullptr",
    file=0xe5cfe8 "/home/feid/git/duke-ats/src/proxy/http3/Http3Session.cc", line=42) at /home/feid/git/duke-ats/src/tscore/ink_assert.cc:37
#4  0x0000000000d08832 in HQSession::~HQSession (this=0x14c59411bb00, __vtt_parm=0xe5d940 <VTT for Http3Session+8>, __in_chrg=<optimized out>)
    at /home/feid/git/duke-ats/src/proxy/http3/Http3Session.cc:42
#5  0x0000000000d090c4 in Http3Session::~Http3Session (this=0x14c59411bb00, __in_chrg=<optimized out>, __vtt_parm=<optimized out>)
    at /home/feid/git/duke-ats/src/proxy/http3/Http3Session.cc:180
#6  0x0000000000d09104 in Http3Session::~Http3Session (this=0x14c59411bb00, __in_chrg=<optimized out>, __vtt_parm=<optimized out>)
    at /home/feid/git/duke-ats/src/proxy/http3/Http3Session.cc:185
#7  0x0000000000d02e84 in Http3App::~Http3App (this=0x14c594119190, __in_chrg=<optimized out>) at /home/feid/git/duke-ats/src/proxy/http3/Http3App.cc:71
#8  0x0000000000d02f42 in Http3App::~Http3App (this=0x14c594119190, __in_chrg=<optimized out>) at /home/feid/git/duke-ats/src/proxy/http3/Http3App.cc:74
#9  0x0000000000c79d27 in QUICApplicationMap::~QUICApplicationMap (this=0x14c594039b50, __in_chrg=<optimized out>)
    at /home/feid/git/duke-ats/include/iocore/net/quic/QUICApplicationMap.h:41
#10 0x0000000000c77408 in QUICNetVConnection::free_thread (this=0x14c51803fb50, t=0x14c5bad2d010) at /home/feid/git/duke-ats/src/iocore/net/QUICNetVConnection.cc:128
#11 0x0000000000c9dd7a in NetHandler::free_netevent (this=0x14c5bad2dba0, ne=0x14c51803fd70) at /home/feid/git/duke-ats/src/iocore/net/NetHandler.cc:228
#12 0x0000000000c6065a in InactivityCop::check_inactivity (this=0x14c594000c50, event=2, e=0x14c594000ce0) at /home/feid/git/duke-ats/src/iocore/net/UnixNet.cc:98
#13 0x00000000008e54b9 in Continuation::handleEvent (this=0x14c594000c50, event=2, data=0x14c594000ce0)
    at /home/feid/git/duke-ats/include/iocore/eventsystem/Continuation.h:228
#14 0x0000000000cdac60 in EThread::process_event (this=0x14c5bad2d010, e=0x14c594000ce0, calling_code=2) at /home/feid/git/duke-ats/src/iocore/eventsystem/UnixEThread.cc:150
#15 0x0000000000cdb26a in EThread::execute_regular (this=0x14c5bad2d010) at /home/feid/git/duke-ats/src/iocore/eventsystem/UnixEThread.cc:255
#16 0x0000000000cdb555 in EThread::execute (this=0x14c5bad2d010) at /home/feid/git/duke-ats/src/iocore/eventsystem/UnixEThread.cc:334
#17 0x0000000000cd9fa0 in spawn_thread_internal (a=0x2375600) at /home/feid/git/duke-ats/src/iocore/eventsystem/Thread.cc:78
#18 0x000014c5bc2b71ca in start_thread () from /lib64/libpthread.so.0
#19 0x000014c5bbf23e73 in clone () from /lib64/libc.so.6
brbzull0 commented 10 months ago

Adding a note here: I am seeing this assert in the CI quiche test. What called my attention is that we had this test passing before 7th November, and this issue date is earlier so, there was something checked in the code that made this issue raise in CI.