lpereira / lwan

Experimental, scalable, high performance HTTP server
https://lwan.ws
GNU General Public License v2.0
5.94k stars 548 forks source link

Unexpected socket closing, round#2 #348

Open pontscho opened 2 years ago

pontscho commented 2 years ago

Hi,

I found an other weird behaviour in Lwan when I started to use our server for serving video streams for masses. After a point, when the client drop the connection, for example it had a network issue, it's connection stuck in the queue and start closing its socket after that connection was dropped.

Steps for reproducing:

1) Please apply this patch:

diff --git a/src/lib/lwan-thread.c b/src/lib/lwan-thread.c
index 797acf98..db242a73 100644
--- a/src/lib/lwan-thread.c
+++ b/src/lib/lwan-thread.c
@@ -940,6 +940,7 @@ static void *thread_io_loop(void *data)

             if (UNLIKELY(event->events & (EPOLLRDHUP | EPOLLHUP))) {
                 if ((conn->flags & CONN_AWAITED_FD) != CONN_SUSPENDED) {
+                    lwan_status_info("HUP fd: %d flags: %x, %d", lwan_connection_get_fd(lwan, conn), conn->flags, (conn->flags & CONN_AWAITED_FD) != CONN_SUSPENDED);
                     timeout_queue_expire(&tq, conn);
                     continue;
                 }
diff --git a/src/lib/lwan-tq.c b/src/lib/lwan-tq.c
index 102443c1..f4e36a8d 100644
--- a/src/lib/lwan-tq.c
+++ b/src/lib/lwan-tq.c
@@ -94,7 +94,9 @@ void timeout_queue_expire(struct timeout_queue *tq,
         conn->coro = NULL;
     }

-    close(lwan_connection_get_fd(tq->lwan, conn));
+    if (close(lwan_connection_get_fd(tq->lwan, conn))) {
+        lwan_status_info("close error, fd: %d", lwan_connection_get_fd(tq->lwan, conn));
+    }
 }

 void timeout_queue_expire_waiting(struct timeout_queue *tq)
diff --git a/src/lib/lwan.c b/src/lib/lwan.c
index 9fd66e68..0a7db006 100644
--- a/src/lib/lwan.c
+++ b/src/lib/lwan.c
@@ -54,7 +54,7 @@ clockid_t monotonic_clock_id = CLOCK_MONOTONIC;

 static const struct lwan_config default_config = {
     .listener = "localhost:8080",
-    .keep_alive_timeout = 15,
+    .keep_alive_timeout = 2,
     .quiet = false,
     .proxy_protocol = false,
     .allow_cors = false,
diff --git a/src/samples/hello/main.c b/src/samples/hello/main.c
index 16e68bea..b7dc6720 100644
--- a/src/samples/hello/main.c
+++ b/src/samples/hello/main.c
@@ -26,6 +26,9 @@ LWAN_HANDLER_ROUTE(hello_world, "/")
     response->mime_type = "text/plain";
     lwan_strbuf_set_static(response->buffer, message, sizeof(message) - 1);

+    while (1)
+        lwan_request_sleep(request, 10);
+
     return HTTP_OK;
 }

2) Compile Lwan and please start hello sample app. 3) Please compile and run this small test application:


#include <stdlib.h>
#include <unistd.h>
#include <sys/types.h>
#include <sys/socket.h>
#include <arpa/inet.h>

#define __USE_LARGEFILE64
#include <sys/resource.h>

static const char request[] =
    "GET / HTTP/1.1\r\n"
    "Accept: */*\r\n"
    "Accept-Encoding: gzip, deflate, br\r\n"
    "Accept-Language: en-US,en;q=0.5\r\n"
    "Cache-Control: no-cache\r\n"
    "Connection: keep-alive, Upgrade\r\n"
    "Pragma: no-cache\r\n"
    "Sec-Fetch-Dest: websocket\r\n"
    "Sec-Fetch-Mode: websocket\r\n"
    "Sec-Fetch-Site: cross-site\r\n"
    "Sec-WebSocket-Extensions: permessage-deflate\r\n"
    "Sec-WebSocket-Key: ui27BbCABJn5wWU2Jc1Ejw==\r\n"
    "Sec-WebSocket-Version: 13\r\n"
    "Upgrade: websocket\r\n"
    "User-Agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:104.0) Gecko/20100101 Firefox/104.0\r\n"
    "\r\n"
;

int main(void)
{
    struct rlimit l = {
        .rlim_cur = 8192,
        .rlim_max = 8192,
    };
    setrlimit(RLIMIT_NOFILE, &l);

    for (int i = 0;i < 5000;i++) {
        int fd = socket(AF_INET, SOCK_STREAM | SOCK_NONBLOCK | SOCK_CLOEXEC, 0);

        struct sockaddr_in addr = {
            .sin_addr.s_addr = inet_addr("127.0.0.1"),
            .sin_family = AF_INET,
            .sin_port = htons(8080),
        };

        connect(fd, (struct sockaddr *)&addr , sizeof(addr));
        send(fd, request, sizeof(request) - 1, 0);
    }

    sleep(3);
    abort();

    return 0;
}

Server output will be something like this:

lwan/build$ ./src/samples/hello/hello
2502216 lwan-job.c:101 lwan_job_thread_init() Initializing low priority job thread
2502216 lwan-tables.c:46 lwan_tables_init() Uncompressing MIME type table: 8489->31379 bytes, 957 entries
2502216 lwan.c:594 setup_from_config() Loading configuration file: hello.conf
2502216 lwan-config.c:803 config_open_path() Could not open configuration file: hello.conf: No such file or directory (error number 2)
2502216 lwan-response.c:77 lwan_response_init() Initializing default response
2502216 lwan.c:842 lwan_init_with_config() Initializing lwan web server
2502216 lwan.c:868 lwan_init_with_config() Using 16 threads, maximum 65536 sockets per thread
2502216 lwan-readahead.c:186 lwan_readahead_init() Initializing low priority readahead thread
2502216 lwan-thread.c:1257 lwan_thread_init() Initializing threads
2502216 lwan-thread.c:1275 lwan_thread_init() 16 CPUs of 16 are online. Reading topology to pre-schedule clients
2502218 lwan-thread.c:901 thread_io_loop() Worker thread #1 starting
2502216 lwan-socket.c:206 listen_addrinfo() Listening on http://127.0.0.1:8080
2502219 lwan-thread.c:901 thread_io_loop() Worker thread #2 starting
2502220 lwan-thread.c:901 thread_io_loop() Worker thread #3 starting
2502221 lwan-thread.c:901 thread_io_loop() Worker thread #4 starting
2502222 lwan-thread.c:901 thread_io_loop() Worker thread #5 starting
2502223 lwan-thread.c:901 thread_io_loop() Worker thread #6 starting
2502224 lwan-thread.c:901 thread_io_loop() Worker thread #7 starting
2502225 lwan-thread.c:901 thread_io_loop() Worker thread #8 starting
2502226 lwan-thread.c:901 thread_io_loop() Worker thread #9 starting
2502227 lwan-thread.c:901 thread_io_loop() Worker thread #10 starting
2502228 lwan-thread.c:901 thread_io_loop() Worker thread #11 starting
2502229 lwan-thread.c:901 thread_io_loop() Worker thread #12 starting
2502230 lwan-thread.c:901 thread_io_loop() Worker thread #13 starting
2502231 lwan-thread.c:901 thread_io_loop() Worker thread #14 starting
2502232 lwan-thread.c:901 thread_io_loop() Worker thread #15 starting
2502233 lwan-thread.c:901 thread_io_loop() Worker thread #16 starting
2502216 lwan-thread.c:1360 lwan_thread_init() Worker threads created and ready to serve
2502216 lwan.c:909 lwan_main_loop() Ready to serve
2502224 lwan-thread.c:943 thread_io_loop() HUP fd: 1414 flags: 64, 1
2502227 lwan-thread.c:943 thread_io_loop() HUP fd: 1401 flags: 64, 1
...
2512259 lwan-thread.c:943 thread_io_loop() HUP fd: 87 flags: 46, 1
2512259 lwan-thread.c:943 thread_io_loop() HUP fd: 71 flags: 46, 1
2512259 lwan-thread.c:943 thread_io_loop() HUP fd: 55 flags: 46, 1

This means connections were closed by test application abort well. Until this everything went fine.

After keep-alive timeout reached Lwan starts sending these messages:

2512261 lwan-tq.c:100 timeout_queue_expire() close error, fd: 889
2512261 lwan-tq.c:100 timeout_queue_expire() close error, fd: 73
2512261 lwan-tq.c:100 timeout_queue_expire() close error, fd: 41
...

Unfortunately this means sometimes that close() in the timeout_queue_expire() function hit another valid connection was created by another http request.

After I reverted back this commit (https://github.com/lpereira/lwan/commit/7a8a0eb7023122ad9605627c45af03b78fca44c4) everything get back to normal.

Thanks, pontscho

lpereira commented 1 year ago

I'm sorry, I've only seen this issue today. This is indeed really odd, thanks for reporting! I'll see if I can reproduce this.

lpereira commented 1 year ago

I'm not able to reproduce this here on Linux. Are you running the server on macOS?

pontscho commented 1 year ago

Hi,

no problem, I also haven't enough time for this problem.

Today I tested it with latest master, it's still available. I used Ubuntu, usually I use this distribution to getting closer to the release time.

Last log:

...
2754467 lwan-tq.c:100 timeout_queue_expire() close error, fd: 266
2754467 lwan-tq.c:100 timeout_queue_expire() close error, fd: 266
2754467 lwan-tq.c:100 timeout_queue_expire() close error, fd: 266
2754467 lwan-tq.c:100 timeout_queue_expire() close error, fd: 266
2754467 lwan-tq.c:100 timeout_queue_expire() close error, fd: 266
2754467 lwan-tq.c:100 timeout_queue_expire() close error, fd: 266
2754467 lwan-tq.c:100 timeout_queue_expire() close error, fd: 266
2754467 lwan-tq.c:100 timeout_queue_expire() close error, fd: 266
2754467 lwan-tq.c:100 timeout_queue_expire() close error, fd: 266
^C
~/lwan/build/src/samples/hello$ lsb_release -a
No LSB modules are available.
Distributor ID: Ubuntu
Description:    Ubuntu 20.04.3 LTS
Release:    20.04
Codename:   focal
~/lwan/build/src/samples/hello$ date
Fri Dec  2 11:22:20 CET 2022
~/lwan/build/src/samples/hello$ lscpu
Architecture:                    x86_64
CPU op-mode(s):                  32-bit, 64-bit
Byte Order:                      Little Endian
Address sizes:                   40 bits physical, 48 bits virtual
CPU(s):                          16
On-line CPU(s) list:             0-15
Thread(s) per core:              2
Core(s) per socket:              1
Socket(s):                       8
NUMA node(s):                    1
Vendor ID:                       AuthenticAMD
CPU family:                      23
Model:                           1
Model name:                      AMD EPYC Processor (with IBPB)
Stepping:                        2
CPU MHz:                         2999.998
BogoMIPS:                        5999.99
L1d cache:                       256 KiB
L1i cache:                       512 KiB
L2 cache:                        4 MiB
L3 cache:                        16 MiB
NUMA node0 CPU(s):               0-15
Flags:                           fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ht syscall nx mmxext fxsr_opt pdpe1gb rdtscp lm rep_good nopl cpuid extd_apicid tsc_known_freq pni pclmulqdq ssse3 fma cx16 sse4_1 sse4_2 x2apic movbe popcnt aes xsave avx f16c rdrand hypervisor lahf_lm cmp_legacy svm cr8_legacy abm sse4a misalignsse 3dnowprefetch osvw topoext ibpb vmmcall fsgsbase bmi1 avx2 smep bmi2 rdseed adx smap clflushopt sha_ni xsaveopt xsavec xgetbv1 arat npt nrip_save