lesismal / nbio

Pure Go 1000k+ connections solution, support tls/http1.x/websocket and basically compatible with net/http, with high-performance and low memory cost, non-blocking, event-driven, easy-to-use.
MIT License
2.17k stars 153 forks source link

curious about the "total success" #410

Closed kolinfluence closed 5 months ago

kolinfluence commented 6 months ago

I tried to write a TLS client to test RPS, but the total success count is only around 58,000.

I ran it against the default TLS server. Why?

package main

import (
        "bytes"
        "fmt"
        "log"
        "runtime"
        "sync/atomic"
        "time"

        "github.com/lesismal/nbio"
        "github.com/lesismal/nbio/extension/tls"
)

// Benchmark state for the echo client.
var (
        // qps counts the echoes that matched since the last report; the
        // OnData callback increments it and main swaps it back to zero.
        qps int64
        // total accumulates every per-interval qps value taken by main.
        total int64

        // tlsConfig is the client TLS configuration; certificate
        // verification is disabled.
        tlsConfig = &tls.Config{InsecureSkipVerify: true}
)

// main is the TLS echo client from the report. It dials a single TLS
// connection, hands it to an nbio engine, queues one million writes on it in
// a tight loop, and then prints the per-second echo counter forever.
func main() {
        var (
                // wbuf is both the payload that is sent and the exact value the
                // OnData callback expects to receive back.
                wbuf = []byte("hello world")
                addr = "localhost:8888"
        )

        g := nbio.NewEngine(nbio.Config{})

        isClient := true
        g.OnOpen(tls.WrapOpen(tlsConfig, isClient, func(c *nbio.Conn, tlsConn *tls.Conn) {
                log.Println("OnOpen:", c.RemoteAddr().String())
                // tlsConn.Write(wbuf)
        }))
        // The err argument (the reason for the close) is not logged here.
        g.OnClose(tls.WrapClose(func(c *nbio.Conn, tlsConn *tls.Conn, err error) {
                log.Println("OnClose:", c.RemoteAddr().String())
        }))
        // Echo check: a chunk that is byte-for-byte equal to wbuf counts as one
        // success and triggers the next write; any other chunk closes the
        // connection.
        g.OnData(tls.WrapData(func(c *nbio.Conn, tlsConn *tls.Conn, data []byte) {
                if bytes.Equal(wbuf, data) {
                        // The result of Write is discarded.
                        tlsConn.Write(wbuf)
                        atomic.AddInt64(&qps, 1)
                } else {
                        c.Close()
                }
        }))

        err := g.Start()
        if err != nil {
                // Only reported; execution continues even when Start failed.
                fmt.Printf("Start failed: %v\n", err)
        }
        // main never returns (see the reporting loop below), so this deferred
        // Stop does not run in practice.
        defer g.Stop()

        // Single iteration: exactly one connection is created.
        for i := 0; i < 1; i++ {
                func() {
                        // step 1: make a tls.Conn by tls.Dial
                        tlsConn, err := tls.Dial("tcp", addr, tlsConfig)
                        if err != nil {
                                log.Fatalf("Dial failed: %v\n", err)
                        }
                        // step 2:
                        // add tls.Conn.conn to gopher, and get the nbio.Conn. the new nbio.Conn is non-blocking
                        nbConn, err := nbio.NBConn(tlsConn.Conn())
                        if err != nil {
                                log.Fatalf("AddConn failed: %v\n", err)
                        }
                        // step 3: set tls.Conn and nbio.Conn to each other, and add nbio.Conn to the gopher
                        isNonblock := true
                        nbConn.SetSession(tlsConn)
                        tlsConn.ResetConn(nbConn, isNonblock)
                        g.AddConn(nbConn)

                        // step 4: write data here or in the OnOpen handler or anywhere

                        start := time.Now()

                        // One million back-to-back writes on the same connection; every
                        // Write result is discarded, so a failed write goes unnoticed.
                        for j:=0;j<1000000;j++ {
                                tlsConn.Write(wbuf)
                        }

                        // Time spent issuing the Write calls above.
                        elapsed := time.Since(start)
                        fmt.Printf("Executed requests in %s\n", elapsed)

                }()
        }

        // Reporting loop: once per second, take and reset the success counter.
        // The condition is the constant true, so this loop never exits.
        ticker := time.NewTicker(time.Second)
        for i := 1; true; i++ {
                <-ticker.C
                nSuccess := atomic.SwapInt64(&qps, 0)
                total += nSuccess
                fmt.Printf("running for %v seconds, NumGoroutine: %v, success: %v, totalSuccess: %v\n", i, runtime.NumGoroutine(), nSuccess, total)
        }

        // Unreachable: the loop above has no exit.
        select {}
}
lesismal commented 6 months ago

I run it and it runs fine:

image

Although it runs fine, it's not recommended to write this much in a for loop to a non-blocking conn, as your code does:

// Tight loop from the client: one million writes are queued on a single
// non-blocking connection, and the result of each Write is discarded.
for j:=0;j<1000000;j++ {
    tlsConn.Write(wbuf)
}
kolinfluence commented 6 months ago

@lesismal can u pls help check this again?

it doesnt show up like yours.

using ubuntu 22.04 golang 1.22

for the for loop, is there a better way to simulate it? i mean what's a proper way to "benchmark" this for production stress test?

server

package main

import (
        "log"

        "github.com/lesismal/llib/std/crypto/tls"
        "github.com/lesismal/nbio"
        ntls "github.com/lesismal/nbio/extension/tls"
)

// main runs the TLS echo server: it listens on localhost:8888 and writes every
// chunk delivered to OnData straight back on the same connection.
func main() {
        cert, err := tls.X509KeyPair(rsaCertPEM, rsaKeyPEM)
        if err != nil {
                log.Fatalf("tls.X509KeyPair failed: %v", err)
        }
        // NOTE(review): in the standard crypto/tls, InsecureSkipVerify only
        // affects clients; presumably it is a no-op for this server config.
        // Confirm against the llib fork before relying on it.
        tlsConfig := &tls.Config{
                Certificates:       []tls.Certificate{cert},
                InsecureSkipVerify: true,
        }

        g := nbio.NewEngine(nbio.Config{
                Network: "tcp",
                Addrs:   []string{"localhost:8888"},
        })
        isClient := false
        g.OnOpen(ntls.WrapOpen(tlsConfig, isClient, func(c *nbio.Conn, tlsConn *tls.Conn) {
                log.Println("OnOpen:", c.RemoteAddr().String())
        }))
        g.OnClose(ntls.WrapClose(func(c *nbio.Conn, tlsConn *tls.Conn, err error) {
                // Log the close reason as well; without it a dropped connection
                // cannot be diagnosed from the server log.
                log.Println("OnClose:", c.RemoteAddr().String(), err)
        }))
        g.OnData(ntls.WrapData(func(c *nbio.Conn, tlsConn *tls.Conn, data []byte) {
                // log.Println("OnData:", c.RemoteAddr().String(), string(data))
                // Echo the chunk back and report a failed write instead of
                // dropping the error silently.
                if _, err := tlsConn.Write(data); err != nil {
                        log.Println("echo write failed:", c.RemoteAddr().String(), err)
                }
        }))

        // log.Fatalf exits the process, so no return is needed after it.
        if err := g.Start(); err != nil {
                log.Fatalf("nbio.Start failed: %v\n", err)
        }
        defer g.Stop()

        g.Wait()
}

var rsaCertPEM = []byte(`-----BEGIN CERTIFICATE-----
MIIDazCCAlOgAwIBAgIUJeohtgk8nnt8ofratXJg7kUJsI4wDQYJKoZIhvcNAQEL
BQAwRTELMAkGA1UEBhMCQVUxEzARBgNVBAgMClNvbWUtU3RhdGUxITAfBgNVBAoM
GEludGVybmV0IFdpZGdpdHMgUHR5IEx0ZDAeFw0yMDEyMDcwODIyNThaFw0zMDEy
MDUwODIyNThaMEUxCzAJBgNVBAYTAkFVMRMwEQYDVQQIDApTb21lLVN0YXRlMSEw
HwYDVQQKDBhJbnRlcm5ldCBXaWRnaXRzIFB0eSBMdGQwggEiMA0GCSqGSIb3DQEB
AQUAA4IBDwAwggEKAoIBAQCy+ZrIvwwiZv4bPmvKx/637ltZLwfgh3ouiEaTchGu
IQltthkqINHxFBqqJg44TUGHWthlrq6moQuKnWNjIsEc6wSD1df43NWBLgdxbPP0
x4tAH9pIJU7TQqbznjDBhzRbUjVXBIcn7bNknY2+5t784pPF9H1v7h8GqTWpNH9l
cz/v+snoqm9HC+qlsFLa4A3X9l5v05F1uoBfUALlP6bWyjHAfctpiJkoB9Yw1TJa
gpq7E50kfttwfKNkkAZIbib10HugkMoQJAs2EsGkje98druIl8IXmuvBIF6nZHuM
lt3UIZjS9RwPPLXhRHt1P0mR7BoBcOjiHgtSEs7Wk+j7AgMBAAGjUzBRMB0GA1Ud
DgQWBBQdheJv73XSOhgMQtkwdYPnfO02+TAfBgNVHSMEGDAWgBQdheJv73XSOhgM
QtkwdYPnfO02+TAPBgNVHRMBAf8EBTADAQH/MA0GCSqGSIb3DQEBCwUAA4IBAQBf
SKVNMdmBpD9m53kCrguo9iKQqmhnI0WLkpdWszc/vBgtpOE5ENOfHGAufHZve871
2fzTXrgR0TF6UZWsQOqCm5Oh3URsCdXWewVMKgJ3DCii6QJ0MnhSFt6+xZE9C6Hi
WhcywgdR8t/JXKDam6miohW8Rum/IZo5HK9Jz/R9icKDGumcqoaPj/ONvY4EUwgB
irKKB7YgFogBmCtgi30beLVkXgk0GEcAf19lHHtX2Pv/lh3m34li1C9eBm1ca3kk
M2tcQtm1G89NROEjcG92cg+GX3GiWIjbI0jD1wnVy2LCOXMgOVbKfGfVKISFt0b1
DNn00G8C6ttLoGU2snyk
-----END CERTIFICATE-----
`)

var rsaKeyPEM = []byte(`-----BEGIN RSA PRIVATE KEY-----
MIIEogIBAAKCAQEAsvmayL8MImb+Gz5rysf+t+5bWS8H4Id6LohGk3IRriEJbbYZ
KiDR8RQaqiYOOE1Bh1rYZa6upqELip1jYyLBHOsEg9XX+NzVgS4HcWzz9MeLQB/a
SCVO00Km854wwYc0W1I1VwSHJ+2zZJ2Nvube/OKTxfR9b+4fBqk1qTR/ZXM/7/rJ
6KpvRwvqpbBS2uAN1/Zeb9ORdbqAX1AC5T+m1soxwH3LaYiZKAfWMNUyWoKauxOd
JH7bcHyjZJAGSG4m9dB7oJDKECQLNhLBpI3vfHa7iJfCF5rrwSBep2R7jJbd1CGY
0vUcDzy14UR7dT9JkewaAXDo4h4LUhLO1pPo+wIDAQABAoIBAF6yWwekrlL1k7Xu
jTI6J7hCUesaS1yt0iQUzuLtFBXCPS7jjuUPgIXCUWl9wUBhAC8SDjWe+6IGzAiH
xjKKDQuz/iuTVjbDAeTb6exF7b6yZieDswdBVjfJqHR2Wu3LEBTRpo9oQesKhkTS
aFF97rZ3XCD9f/FdWOU5Wr8wm8edFK0zGsZ2N6r57yf1N6ocKlGBLBZ0v1Sc5ShV
1PVAxeephQvwL5DrOgkArnuAzwRXwJQG78L0aldWY2q6xABQZQb5+ml7H/kyytef
i+uGo3jHKepVALHmdpCGr9Yv+yCElup+ekv6cPy8qcmMBqGMISL1i1FEONxLcKWp
GEJi6QECgYEA3ZPGMdUm3f2spdHn3C+/+xskQpz6efiPYpnqFys2TZD7j5OOnpcP
ftNokA5oEgETg9ExJQ8aOCykseDc/abHerYyGw6SQxmDbyBLmkZmp9O3iMv2N8Pb
Nrn9kQKSr6LXZ3gXzlrDvvRoYUlfWuLSxF4b4PYifkA5AfsdiKkj+5sCgYEAzseF
XDTRKHHJnzxZDDdHQcwA0G9agsNj64BGUEjsAGmDiDyqOZnIjDLRt0O2X3oiIE5S
TXySSEiIkxjfErVJMumLaIwqVvlS4pYKdQo1dkM7Jbt8wKRQdleRXOPPN7msoEUk
Ta9ZsftHVUknPqblz9Uthb5h+sRaxIaE1llqDiECgYATS4oHzuL6k9uT+Qpyzymt
qThoIJljQ7TgxjxvVhD9gjGV2CikQM1Vov1JBigj4Toc0XuxGXaUC7cv0kAMSpi2
Y+VLG+K6ux8J70sGHTlVRgeGfxRq2MBfLKUbGplBeDG/zeJs0tSW7VullSkblgL6
nKNa3LQ2QEt2k7KHswryHwKBgENDxk8bY1q7wTHKiNEffk+aFD25q4DUHMH0JWti
fVsY98+upFU+gG2S7oOmREJE0aser0lDl7Zp2fu34IEOdfRY4p+s0O0gB+Vrl5VB
L+j7r9bzaX6lNQN6MvA7ryHahZxRQaD/xLbQHgFRXbHUyvdTyo4yQ1821qwNclLk
HUrhAoGAUtjR3nPFR4TEHlpTSQQovS8QtGTnOi7s7EzzdPWmjHPATrdLhMA0ezPj
Mr+u5TRncZBIzAZtButlh1AHnpN/qO3P0c0Rbdep3XBc/82JWO8qdb5QvAkxga3X
BpA7MNLxiqss+rCbwf3NbWxEMiDQ2zRwVoafVFys7tjmv6t2Xck=
-----END RSA PRIVATE KEY-----
`)

client

package main

import (
        "bytes"
        "fmt"
        "log"
        "net/http"
        _ "net/http/pprof"
        "runtime"
        "sync/atomic"
        "time"

        "github.com/lesismal/nbio"
        "github.com/lesismal/nbio/extension/tls"
)

// Benchmark state for the echo client.
var (
        // qps counts the echoes that matched since the last report; the
        // OnData callback increments it and main swaps it back to zero.
        qps int64
        // total accumulates every per-interval qps value taken by main.
        total int64

        // tlsConfig is the client TLS configuration; certificate
        // verification is disabled.
        tlsConfig = &tls.Config{InsecureSkipVerify: true}
)

// main is the TLS echo client with a pprof endpoint. It dials a single TLS
// connection to 127.0.0.1:8888, hands it to an nbio engine, queues ten
// million writes on it in a tight loop, and then prints the per-second echo
// counter forever.
func main() {
        // Serve the default mux on localhost:6060 (net/http/pprof is imported
        // for its side effects by this program).
        go func() {
                log.Println(http.ListenAndServe("localhost:6060", nil))
        }()

        var (
                // wbuf is both the payload that is sent and the exact value the
                // OnData callback expects to receive back.
                wbuf = []byte("hello")
                addr = "127.0.0.1:8888"
        )

        g := nbio.NewEngine(nbio.Config{})

        isClient := true
        g.OnOpen(tls.WrapOpen(tlsConfig, isClient, func(c *nbio.Conn, tlsConn *tls.Conn) {
                log.Println("OnOpen:", c.RemoteAddr().String())
                // tlsConn.Write(wbuf)
        }))
        // The err argument (the reason for the close) is not logged here.
        g.OnClose(tls.WrapClose(func(c *nbio.Conn, tlsConn *tls.Conn, err error) {
                log.Println("OnClose:", c.RemoteAddr().String())
        }))
        // Echo check: a chunk that is byte-for-byte equal to wbuf counts as one
        // success and triggers the next write; any other chunk closes the
        // connection.
        g.OnData(tls.WrapData(func(c *nbio.Conn, tlsConn *tls.Conn, data []byte) {
                if bytes.Equal(wbuf, data) {
                        // The result of Write is discarded.
                        tlsConn.Write(wbuf)
                        atomic.AddInt64(&qps, 1)
                } else {
                        c.Close()
                }
        }))

        err := g.Start()
        if err != nil {
                // Only reported; execution continues even when Start failed.
                fmt.Printf("Start failed: %v\n", err)
        }
        // main never returns (see the reporting loop below), so this deferred
        // Stop does not run in practice.
        defer g.Stop()

        // Single iteration: exactly one connection is created.
        for i := 0; i < 1; i++ {
                func() {
                        // step 1: make a tls.Conn by tls.Dial
                        tlsConn, err := tls.Dial("tcp", addr, tlsConfig)
                        if err != nil {
                                log.Fatalf("Dial failed: %v\n", err)
                        }
                        // step 2:
                        // add tls.Conn.conn to gopher, and get the nbio.Conn. the new nbio.Conn is non-blocking
                        nbConn, err := nbio.NBConn(tlsConn.Conn())
                        if err != nil {
                                log.Fatalf("AddConn failed: %v\n", err)
                        }
                        // step 3: set tls.Conn and nbio.Conn to each other, and add nbio.Conn to the gopher
                        isNonblock := true
                        nbConn.SetSession(tlsConn)
                        tlsConn.ResetConn(nbConn, isNonblock)
                        g.AddConn(nbConn)

                        // step 4: write data here or in the OnOpen handler or anywhere

                        start := time.Now()

                        // Ten million back-to-back writes on the same connection; every
                        // Write result is discarded, so a failed write goes unnoticed.
                        for j:=0;j<10000000;j++ {
                                tlsConn.Write(wbuf)
                        }

                        // Time spent issuing the Write calls above.
                        elapsed := time.Since(start)
                        fmt.Printf("Executed requests in %s\n", elapsed)

                }()
        }

        // Reporting loop: once per second, take and reset the success counter.
        // The condition is the constant true, so this loop never exits.
        ticker := time.NewTicker(time.Second)
        for i := 1; true; i++ {
                <-ticker.C
                nSuccess := atomic.SwapInt64(&qps, 0)
                total += nSuccess
                fmt.Printf("running for %v seconds, NumGoroutine: %v, success: %v, totalSuccess: %v\n", i, runtime.NumGoroutine(), nSuccess, total)
        }

        // Unreachable: the loop above has no exit.
        select {}
}
kolinfluence commented 6 months ago

@lesismal this is my output

(base) root@ubuntu:/home/ubuntu/nbio-examples/tls/client# ./client
2024/03/24 03:18:43.080 [INF] NBIO Engine[NB] start
2024/03/24 03:18:43 OnOpen: 127.0.0.1:41700
2024/03/24 03:18:43 OnOpen: 127.0.0.1:8888
2024/03/24 03:18:45 OnClose: 127.0.0.1:41700
2024/03/24 03:18:45 OnClose: 127.0.0.1:8888
Executed requests in 2.6989956s
running for 1 seconds, NumGoroutine: 3, success: 68073, totalSuccess: 68073
running for 2 seconds, NumGoroutine: 3, success: 0, totalSuccess: 68073
running for 3 seconds, NumGoroutine: 3, success: 0, totalSuccess: 68073
running for 4 seconds, NumGoroutine: 3, success: 0, totalSuccess: 68073
running for 5 seconds, NumGoroutine: 3, success: 0, totalSuccess: 68073
running for 6 seconds, NumGoroutine: 3, success: 0, totalSuccess: 68073
running for 7 seconds, NumGoroutine: 3, success: 0, totalSuccess: 68073
running for 8 seconds, NumGoroutine: 3, success: 0, totalSuccess: 68073
running for 9 seconds, NumGoroutine: 3, success: 0, totalSuccess: 68073
running for 10 seconds, NumGoroutine: 3, success: 0, totalSuccess: 68073
running for 11 seconds, NumGoroutine: 3, success: 0, totalSuccess: 68073
running for 12 seconds, NumGoroutine: 3, success: 0, totalSuccess: 68073
lesismal commented 6 months ago

For the echo benchmark, the client already has resend logic in my examples:

// Client-side resend logic: a chunk equal to wbuf is counted as one success
// and answered with another write of wbuf; any other chunk closes the
// connection.
g.OnData(tls.WrapData(func(c *nbio.Conn, tlsConn *tls.Conn, data []byte) {
    if bytes.Equal(wbuf, data) {
        tlsConn.Write(wbuf)
        atomic.AddInt64(&qps, 1)
    } else {
        c.Close()
    }
}))

That means when the client receives one message, it sends another message to the server. So you only need to send one message yourself once it is connected, which is what I did in my original example.

For more tests, you can implement it yourself.

lesismal commented 6 months ago

@lesismal this is my output

(base) root@ubuntu:/home/ubuntu/nbio-examples/tls/client# ./client
2024/03/24 03:18:43.080 [INF] NBIO Engine[NB] start
2024/03/24 03:18:43 OnOpen: 127.0.0.1:41700
2024/03/24 03:18:43 OnOpen: 127.0.0.1:8888
2024/03/24 03:18:45 OnClose: 127.0.0.1:41700
2024/03/24 03:18:45 OnClose: 127.0.0.1:8888
Executed requests in 2.6989956s
running for 1 seconds, NumGoroutine: 3, success: 68073, totalSuccess: 68073
running for 2 seconds, NumGoroutine: 3, success: 0, totalSuccess: 68073
running for 3 seconds, NumGoroutine: 3, success: 0, totalSuccess: 68073
running for 4 seconds, NumGoroutine: 3, success: 0, totalSuccess: 68073
running for 5 seconds, NumGoroutine: 3, success: 0, totalSuccess: 68073
running for 6 seconds, NumGoroutine: 3, success: 0, totalSuccess: 68073
running for 7 seconds, NumGoroutine: 3, success: 0, totalSuccess: 68073
running for 8 seconds, NumGoroutine: 3, success: 0, totalSuccess: 68073
running for 9 seconds, NumGoroutine: 3, success: 0, totalSuccess: 68073
running for 10 seconds, NumGoroutine: 3, success: 0, totalSuccess: 68073
running for 11 seconds, NumGoroutine: 3, success: 0, totalSuccess: 68073
running for 12 seconds, NumGoroutine: 3, success: 0, totalSuccess: 68073

Please print the close error and check why the connection was closed.

lesismal commented 6 months ago

I see OnOpen and OnClose logs for two connections. Are you running the client and server in the same process, using the same Engine?

kolinfluence commented 6 months ago

@lesismal both client and server are in two different folders like your example, i just replaced/modify the code and run go build that's all.

let me check why it's closed...

kolinfluence commented 6 months ago

@lesismal ./server

2024/03/24 03:44:46.033 [INF] NBIO Engine[NB] start listen on: ["tcp@127.0.0.1:8888"]
2024/03/24 03:45:01 OnOpen: 127.0.0.1:37288
2024/03/24 03:45:03 OnClose: 127.0.0.1:37288 invalid argument
./client
2024/03/24 03:45:01.025 [INF] NBIO Engine[NB] start
2024/03/24 03:45:01 OnOpen: 127.0.0.1:8888
2024/03/24 03:45:03 OnClose: 127.0.0.1:8888
Executed requests in 2.504090903s
running for 1 seconds, NumGoroutine: 3, success: 70173, totalSuccess: 70173
running for 2 seconds, NumGoroutine: 3, success: 0, totalSuccess: 70173
running for 3 seconds, NumGoroutine: 3, success: 0, totalSuccess: 70173
running for 4 seconds, NumGoroutine: 3, success: 0, totalSuccess: 70173
running for 5 seconds, NumGoroutine: 3, success: 0, totalSuccess: 70173
running for 6 seconds, NumGoroutine: 3, success: 0, totalSuccess: 70173
running for 7 seconds, NumGoroutine: 3, success: 0, totalSuccess: 70173
running for 8 seconds, NumGoroutine: 3, success: 0, totalSuccess: 70173
running for 9 seconds, NumGoroutine: 3, success: 0, totalSuccess: 70173
running for 10 seconds, NumGoroutine: 3, success: 0, totalSuccess: 70173
running for 11 seconds, NumGoroutine: 3, success: 0, totalSuccess: 70173
running for 12 seconds, NumGoroutine: 3, success: 0, totalSuccess: 70173
running for 13 seconds, NumGoroutine: 3, success: 0, totalSuccess: 70173
running for 14 seconds, NumGoroutine: 3, success: 0, totalSuccess: 70173
running for 15 seconds, NumGoroutine: 3, success: 0, totalSuccess: 70173
running for 16 seconds, NumGoroutine: 3, success: 0, totalSuccess: 70173
running for 17 seconds, NumGoroutine: 3, success: 0, totalSuccess: 70173
running for 18 seconds, NumGoroutine: 3, success: 0, totalSuccess: 70173
running for 19 seconds, NumGoroutine: 3, success: 0, totalSuccess: 70173
running for 20 seconds, NumGoroutine: 3, success: 0, totalSuccess: 70173
running for 21 seconds, NumGoroutine: 3, success: 0, totalSuccess: 70173
running for 22 seconds, NumGoroutine: 3, success: 0, totalSuccess: 70173
running for 23 seconds, NumGoroutine: 3, success: 0, totalSuccess: 70173
running for 24 seconds, NumGoroutine: 3, success: 0, totalSuccess: 70173
running for 25 seconds, NumGoroutine: 3, success: 0, totalSuccess: 70173
running for 26 seconds, NumGoroutine: 3, success: 0, totalSuccess: 70173
running for 27 seconds, NumGoroutine: 3, success: 0, totalSuccess: 70173
running for 28 seconds, NumGoroutine: 3, success: 0, totalSuccess: 70173
running for 29 seconds, NumGoroutine: 3, success: 0, totalSuccess: 70173
running for 30 seconds, NumGoroutine: 3, success: 0, totalSuccess: 70173
lesismal commented 6 months ago

Please just try the original nbio-examples/tls code without modifying it and see whether it works. I ran it and nothing was wrong, but I'm on macOS right now.

kolinfluence commented 6 months ago

@lesismal u are right, my bad. let me check what's wrong. thx

kolinfluence commented 6 months ago

@lesismal found the issue, which is on client side. it's on the line z:=0; ..... z++ for loop. the example mentioned u can put "tlsConn.Write" anywhere... so i ran 1mil for loop to test...

i think i discovered a bug. high volumetric for loop sending, bytes.Equal(wbuf,data) <-- this line failed? not sure. can u help test? thx

        // Echo check in the client: a chunk equal to wbuf is counted as one
        // success and answered with another write of wbuf; any other chunk
        // closes the connection.
        g.OnData(tls.WrapData(func(c *nbio.Conn, tlsConn *tls.Conn, data []byte) {
                if bytes.Equal(wbuf, data) {
                        tlsConn.Write(wbuf)
                        atomic.AddInt64(&qps, 1)
                } else {
                        c.Close()
                }
        }))
package main

import (
        "bytes"
        "fmt"
        "log"
        "runtime"
        "sync/atomic"
        "time"

        "github.com/lesismal/nbio"
        "github.com/lesismal/nbio/extension/tls"
)

// Benchmark state for the echo client.
var (
        // qps counts the echoes that matched since the last report; the
        // OnData callback increments it and main swaps it back to zero.
        qps int64
        // total accumulates every per-interval qps value taken by main.
        total int64

        // tlsConfig is the client TLS configuration; certificate
        // verification is disabled.
        tlsConfig = &tls.Config{InsecureSkipVerify: true}
)

// main is the TLS echo client in its single-write form: it dials one TLS
// connection, hands it to an nbio engine, sends wbuf once, and then prints the
// per-second echo counter forever. Further writes come from the OnData
// callback each time a matching echo arrives.
func main() {
        var (
                // wbuf is both the payload that is sent and the exact value the
                // OnData callback expects to receive back.
                wbuf = []byte("hello world")
                addr = "localhost:8888"
        )

        g := nbio.NewEngine(nbio.Config{})

        isClient := true
        g.OnOpen(tls.WrapOpen(tlsConfig, isClient, func(c *nbio.Conn, tlsConn *tls.Conn) {
                log.Println("OnOpen:", c.RemoteAddr().String())
                // tlsConn.Write(wbuf)
        }))
        // The err argument (the reason for the close) is not logged here.
        g.OnClose(tls.WrapClose(func(c *nbio.Conn, tlsConn *tls.Conn, err error) {
                log.Println("OnClose:", c.RemoteAddr().String())
        }))
        // Echo check: a chunk that is byte-for-byte equal to wbuf counts as one
        // success and triggers the next write; any other chunk closes the
        // connection.
        g.OnData(tls.WrapData(func(c *nbio.Conn, tlsConn *tls.Conn, data []byte) {
                if bytes.Equal(wbuf, data) {
                        // The result of Write is discarded.
                        tlsConn.Write(wbuf)
                        atomic.AddInt64(&qps, 1)
                } else {
                        c.Close()
                }
        }))

        err := g.Start()
        if err != nil {
                // Only reported; execution continues even when Start failed.
                fmt.Printf("Start failed: %v\n", err)
        }
        // main never returns (see the reporting loop below), so this deferred
        // Stop does not run in practice.
        defer g.Stop()

        // Single iteration: exactly one connection is created.
        for i := 0; i < 1; i++ {
                func() {
                        // step 1: make a tls.Conn by tls.Dial
                        tlsConn, err := tls.Dial("tcp", addr, tlsConfig)
                        if err != nil {
                                log.Fatalf("Dial failed: %v\n", err)
                        }
                        // step 2:
                        // add tls.Conn.conn to gopher, and get the nbio.Conn. the new nbio.Conn is non-blocking
                        nbConn, err := nbio.NBConn(tlsConn.Conn())
                        if err != nil {
                                log.Fatalf("AddConn failed: %v\n", err)
                        }
                        // step 3: set tls.Conn and nbio.Conn to each other, and add nbio.Conn to the gopher
                        isNonblock := true
                        nbConn.SetSession(tlsConn)
                        tlsConn.ResetConn(nbConn, isNonblock)
                        g.AddConn(nbConn)

                        // step 4: write data here or in the OnOpen handler or anywhere
                        // The ten-million-iteration loop is disabled here, so exactly one
                        // write is issued; its result is discarded.
//                      for z:=0;z<10000000;z++ { //uncomment this and it will not work... why?
                                tlsConn.Write(wbuf)
//                      }
                }()
        }

        // Reporting loop: once per second, take and reset the success counter.
        // The condition is the constant true, so this loop never exits.
        ticker := time.NewTicker(time.Second)
        for i := 1; true; i++ {
                <-ticker.C
                nSuccess := atomic.SwapInt64(&qps, 0)
                total += nSuccess
                fmt.Printf("running for %v seconds, NumGoroutine: %v, success: %v, totalSuccess: %v\n", i, runtime.NumGoroutine(), nSuccess, total)
        }
}
lesismal commented 6 months ago

I ran it on Ubuntu and reproduced the problem. It happens because we write a lot and the TCP send queue fills up; the data is then cached by nbio.Conn, which waits until the fd is writable again and then writes the cached data. Finally, when nbio calls syscall.Writev, the huge number of []byte buffers (or their huge total size) makes the syscall fail. I also tried copying all of the [][]byte into a single []byte and changing syscall.Writev to syscall.Write, but that also failed because of the huge size of the []byte.

Now I make a commit to add data flow control for the nbio.Conn flush, and it works fine on my ubuntu now. please try it:

go get -u github.com/lesismal/nbio@575223a3921b8c2b2359cdb5d205a11686380a66

Thanks a lot Bro!

lesismal commented 6 months ago

But please note that we should still avoid writing this much in a production environment. We can set Engine.MaxWriteBufferSize to limit how much nbio.Conn caches. If that limit is exceeded, nbio.Conn.Write fails.

kolinfluence commented 6 months ago

@lesismal i did this: go get -u github.com/lesismal/nbio@575223a3921b8c2b2359cdb5d205a11686380a66

and the program still shows:

 ./client
2024/03/24 07:38:55.836 [INF] NBIO Engine[NB] start with [3 eventloop, MaxOpenFiles: 1048576]
2024/03/24 07:38:55 OnOpen: 127.0.0.1:8888
2024/03/24 07:38:58 OnClose: 127.0.0.1:8888
running for 1 seconds, NumGoroutine: 4, success: 52647, totalSuccess: 52647
running for 2 seconds, NumGoroutine: 4, success: 0, totalSuccess: 52647
running for 3 seconds, NumGoroutine: 4, success: 0, totalSuccess: 52647
running for 4 seconds, NumGoroutine: 4, success: 0, totalSuccess: 52647
running for 5 seconds, NumGoroutine: 4, success: 0, totalSuccess: 52647

need the ability to handle ddos. i do have certain protection / rate limit on my side but... can this be pushed further? i mean actually it shouldnt break right? i havent come across a tcp connection that can break this way though. i'm not sure about other implementation but instead of breaking, it should slow down or something. i'm not sure.

but i'm sure it shouldnt break. can u think of a better way to fix this? i just tried to do mutex over the iterations but it still breaks.

can u think of a way to fix this? client to connection break if there are great number of tlsConn.Write? please provide a workaround of sorts. thx

kolinfluence commented 6 months ago

@lesismal the code here that crashed. it's funny that when i do a single for loop run and commented out the "

if bytes.Equal(wbuf, data) {

section, it works. but if i just run the whole program in its entirety, it crashed.

package main

import (
        "bytes"
        "fmt"
        "log"
        "runtime"
        "sync/atomic"
        "time"

        "github.com/lesismal/nbio"
        "github.com/lesismal/nbio/extension/tls"
)

// Benchmark state for the echo client.
var (
        // qps counts the echoes that matched since the last report; the
        // OnData callback increments it and main swaps it back to zero.
        qps int64
        // total accumulates every per-interval qps value taken by main.
        total int64

        // tlsConfig is the client TLS configuration; certificate
        // verification is disabled.
        tlsConfig = &tls.Config{InsecureSkipVerify: true}
)

// main is the TLS echo client with the flood loop enabled: it dials one TLS
// connection, hands it to an nbio engine, queues ten million writes on it in
// a tight loop, and then prints the per-second echo counter forever.
func main() {
        var (
                // wbuf is both the payload that is sent and the exact value the
                // OnData callback expects to receive back.
                wbuf = []byte("hello world")
                addr = "localhost:8888"
        )

        g := nbio.NewEngine(nbio.Config{})

        isClient := true
        g.OnOpen(tls.WrapOpen(tlsConfig, isClient, func(c *nbio.Conn, tlsConn *tls.Conn) {
                log.Println("OnOpen:", c.RemoteAddr().String())
                // tlsConn.Write(wbuf)
        }))
        // The err argument (the reason for the close) is not logged here.
        g.OnClose(tls.WrapClose(func(c *nbio.Conn, tlsConn *tls.Conn, err error) {
                log.Println("OnClose:", c.RemoteAddr().String())
        }))
        // Echo check: a chunk that is byte-for-byte equal to wbuf counts as one
        // success and triggers the next write; any other chunk closes the
        // connection.
        g.OnData(tls.WrapData(func(c *nbio.Conn, tlsConn *tls.Conn, data []byte) {
                if bytes.Equal(wbuf, data) {
                        // The result of Write is discarded.
                        tlsConn.Write(wbuf)
                        atomic.AddInt64(&qps, 1)
                } else {
                        c.Close()
                }
        }))

        err := g.Start()
        if err != nil {
                // Only reported; execution continues even when Start failed.
                fmt.Printf("Start failed: %v\n", err)
        }
        // main never returns (see the reporting loop below), so this deferred
        // Stop does not run in practice.
        defer g.Stop()

        // Single iteration: exactly one connection is created.
        for i := 0; i < 1; i++ {
                func() {
                        // step 1: make a tls.Conn by tls.Dial
                        tlsConn, err := tls.Dial("tcp", addr, tlsConfig)
                        if err != nil {
                                log.Fatalf("Dial failed: %v\n", err)
                        }
                        // step 2:
                        // add tls.Conn.conn to gopher, and get the nbio.Conn. the new nbio.Conn is non-blocking
                        nbConn, err := nbio.NBConn(tlsConn.Conn())
                        if err != nil {
                                log.Fatalf("AddConn failed: %v\n", err)
                        }
                        // step 3: set tls.Conn and nbio.Conn to each other, and add nbio.Conn to the gopher
                        isNonblock := true
                        nbConn.SetSession(tlsConn)
                        tlsConn.ResetConn(nbConn, isNonblock)
                        g.AddConn(nbConn)

                        // step 4: write data here or in the OnOpen handler or anywhere
                        // Ten million back-to-back writes on the same connection; every
                        // Write result is discarded, so a failed write goes unnoticed.
for z:=0;z<10000000;z++ { //uncomment this and it will not work... why?
                                tlsConn.Write(wbuf)
}
                }()
        }

        // Reporting loop: once per second, take and reset the success counter.
        // The condition is the constant true, so this loop never exits.
        ticker := time.NewTicker(time.Second)
        for i := 1; true; i++ {
                <-ticker.C
                nSuccess := atomic.SwapInt64(&qps, 0)
                total += nSuccess
                fmt.Printf("running for %v seconds, NumGoroutine: %v, success: %v, totalSuccess: %v\n", i, runtime.NumGoroutine(), nSuccess, total)
        }
}
lesismal commented 6 months ago

try this commit:

go get -u github.com/lesismal/nbio@a768b89

I optimized the nbio.Conn write cache to reduce the number of small buffers cached by nbio.Conn and to make calling writev faster.

kolinfluence commented 6 months ago

@lesismal updated. tested, its still not working.

go get -u github.com/lesismal/nbio@a768b89
go: downloading github.com/lesismal/nbio v1.5.3-0.20240324142751-a768b89f838c
go: upgraded github.com/lesismal/nbio v1.5.3-0.20240324111656-575223a3921b => v1.5.3-0.20240324142751-a768b89f838c
(base) root@ubuntu:/home/ubuntu/nbio-examples/tls/client# vi client.go 
(base) root@ubuntu:/home/ubuntu/nbio-examples/tls/client# go build client.go 
(base) root@ubuntu:/home/ubuntu/nbio-examples/tls/client# ./client
2024/03/24 10:42:59.488 [INF] NBIO Engine[NB] start with [3 eventloop, MaxOpenFiles: 1048576]
2024/03/24 10:42:59 OnOpen: 127.0.0.1:35964
2024/03/24 10:42:59 OnOpen: 127.0.0.1:8888
2024/03/24 10:43:01 OnClose: 127.0.0.1:35964
2024/03/24 10:43:01 OnClose: 127.0.0.1:8888
running for 1 seconds, NumGoroutine: 4, success: 54840, totalSuccess: 54840
running for 2 seconds, NumGoroutine: 4, success: 0, totalSuccess: 54840
running for 3 seconds, NumGoroutine: 4, success: 0, totalSuccess: 54840
running for 4 seconds, NumGoroutine: 4, success: 0, totalSuccess: 54840
lesismal commented 6 months ago

Did you update the nbio version for both the server and the client? Please make sure both are using that commit.

kolinfluence commented 6 months ago

@lesismal it's working now. thx. can u do a page and ask for people to submit use case as case studies? just curious who will be using it. i still need to test and implement it further.

is this used in production?

lesismal commented 6 months ago

Projects using nbio can be listed here: https://github.com/lesismal/nbio/issues/128

is this used in production?

Yes, it's in production. But yours is the first issue from someone writing this much data, haha.

kolinfluence commented 6 months ago

it puzzles me why people use non standard lib if it's not using extreme data etc.

anyway, i have one last question,

success: 0, 

under "high" cpu load, using the standard example of tls, the connection is closed. can you debug this? it seems like a bug to me. otherwise, can you provide update for both server and client to respond to the "dropped" / closed connection?

i've tested it and everything is great except this last bit. it's not very assuring if used in production and connection dropped this way. i am wondering how to ensure connectivity or reconnection. please show example of how u would remedy this and think about how to ensure connectivity resilience in future.

thx in advance. this should be the last area. it's really great work

p.s.: it is still dropping the connection sometimes. I think the buffer is somehow being overwritten or something, I'm not sure; it still disconnects at the bytes.Equal check.

kolinfluence commented 6 months ago

i just checked the production list and the company's site using nbio is not functional anymore, network crashed due to high load?

https://github.com/lesismal/nbio/issues/128 https://dungeonmasters.app/

anyway, u can start a new issue page and link that to the readme so as to drive adoption. hope to see this as mainstream for websocket etc.

lesismal commented 6 months ago

i just checked the production list and the company's site using nbio is not functional anymore, network crashed due to high load?

I think it's just because nicoroy2561's game app stopped maintenance. It's not because of performance.

anyway, u can start a new issue page and link that to the readme so as to drive adoption. hope to see this as mainstream for websocket etc.

Good idea, thank you for your opinion very much! But, because I developed nbio late and it is not famous in the community, so there are not many users sharing their usage. Years ago when golang began to be popular, gorilla and some other repos that developed very early, became famous very easily because they were the first few that could be used. Respect to these early frameworks! Some other repos also got a lot of stars, even if they are not full features and not easy to use, because they are early trying, and they inspired us to go further. But for now, for nbio, it's not easy to get attention from the community, because most of the early frameworks can satisfy most of the users' needs, the users don't need to change then they don't want to change, even though nbio do a lot more than the early frameworks.

We can see some users here: https://github.com/lesismal/nbio/network/dependents Some of them are in production, some are not. And some more projects are developed in private repos.

kolinfluence commented 6 months ago

@lesismal pls provide solution for this:

success: 0, 

u can run it in multiple client to simulate this issue, it will break connection. after a while under load or something. sometimes doesnt even start. will follow up on this issue tml. thx

lesismal commented 6 months ago

I ran 3 clients; it is slow but it works. Please try to send data at a normal frequency; otherwise you should consider batching your data in your own logic. Don't push so many small buffers to a single connection in one big loop, as it congests the TCP window.

lesismal commented 6 months ago

By the way, the bytes.Equal check in the default test is only for testing. TCP/TLS transports streaming data, so if you write a lot, each OnData callback may not receive a chunk of the same size, and the qps check may fail. I used bytes.Equal for my own small-buffer echo tests, to check for dirty-memory problems. You can customize the code yourself to run performance tests, e.g. to find a suitable buffer size or sending frequency.

lesismal commented 6 months ago

Also, you can customize extension/tls: it is an extension, but really just an example. The implementations of nbhttp and nbhttp/websocket don't use it directly, but implement the same features in a similar way.

lesismal commented 5 months ago

I won't provide this much help with customizing user functions, so I'm closing this and some other issues.