grpc / grpc-go

The Go language implementation of gRPC. HTTP/2 based RPC
https://grpc.io
Apache License 2.0
21.07k stars 4.38k forks source link

Flaky test: Test/ClientCloseReturnsEarlyWhenGoAwayWriteHangs #7606

Closed arjan-bal closed 4 weeks ago

arjan-bal commented 1 month ago

Sample failure: https://github.com/grpc/grpc-go/actions/runs/10780783629/job/29897278416?pr=7498

Logs

panic: test timed out after 7m0s
running tests:
    Test (6m40s)
    Test/ClientCloseReturnsEarlyWhenGoAwayWriteHangs (6m38s)

goroutine 2385 [running]:
testing.(*M).startAlarm.func1()
    /opt/hostedtoolcache/go/1.22.6/x64/src/testing/testing.go:2366 +0x265
created by time.goFunc
    /opt/hostedtoolcache/go/1.22.6/x64/src/time/sleep.go:177 +0x45

goroutine 1 [chan receive, 6 minutes]:
testing.(*T).Run(0xc000162340, {0xc548bb, 0x4}, 0xc929a0)
    /opt/hostedtoolcache/go/1.22.6/x64/src/testing/testing.go:1750 +0x851
testing.runTests.func1(0xc000162340)
    /opt/hostedtoolcache/go/1.22.6/x64/src/testing/testing.go:2161 +0x86
testing.tRunner(0xc000162340, 0xc000e61b10)
    /opt/hostedtoolcache/go/1.22.6/x64/src/testing/testing.go:1689 +0x21f
testing.runTests(0xc00019e228, {0x10ea020, 0x4, 0x4}, {0xc00026dbb8?, 0xc00026dc00?, 0x10f3200?})
    /opt/hostedtoolcache/go/1.22.6/x64/src/testing/testing.go:2159 +0x8bf
testing.(*M).Run(0xc0001a4fa0)
    /opt/hostedtoolcache/go/1.22.6/x64/src/testing/testing.go:2027 +0xf18
main.main()
    _testmain.go:57 +0x2be

goroutine 2285 [chan receive, 6 minutes]:
testing.(*T).Run(0xc0001624e0, {0xb8ebc2, 0x2b}, 0xc0000444a0)
    /opt/hostedtoolcache/go/1.22.6/x64/src/testing/testing.go:1750 +0x851
google.golang.org/grpc/internal/grpctest.RunSubTests(0xc0001624e0, {0xd4aba0, 0x1154000})
    /home/runner/work/grpc-go/grpc-go/internal/grpctest/grpctest.go:114 +0x3d8
google.golang.org/grpc/internal/transport.Test(0xc0001624e0)
    /home/runner/work/grpc-go/grpc-go/internal/transport/transport_test.go:60 +0x35
testing.tRunner(0xc0001624e0, 0xc929a0)
    /opt/hostedtoolcache/go/1.22.6/x64/src/testing/testing.go:1689 +0x21f
created by testing.(*T).Run in goroutine 1
    /opt/hostedtoolcache/go/1.22.6/x64/src/testing/testing.go:1742 +0x826

goroutine 2367 [sync.Mutex.Lock, 6 minutes]:
sync.runtime_SemacquireMutex(0x72ca2d?, 0x0?, 0xc00020aa00?)
    /opt/hostedtoolcache/go/1.22.6/x64/src/runtime/sema.go:77 +0x25
sync.(*Mutex).lockSlow(0xc000202870)
    /opt/hostedtoolcache/go/1.22.6/x64/src/sync/mutex.go:171 +0x213
sync.(*Mutex).Lock(0xc000202870)
    /opt/hostedtoolcache/go/1.22.6/x64/src/sync/mutex.go:90 +0x55
google.golang.org/grpc/internal/transport.(*http2Client).GetGoAwayReason(0xc0002026c8)
    /home/runner/work/grpc-go/grpc-go/internal/transport/http2_client.go:1418 +0x55
google.golang.org/grpc/internal/transport.(*http2Client).Close(0xc0002026c8, {0xd49120, 0xc00003a9f0})
    /home/runner/work/grpc-go/grpc-go/internal/transport/http2_client.go:1031 +0x7b2
google.golang.org/grpc/internal/transport.s.TestClientCloseReturnsEarlyWhenGoAwayWriteHangs({{}}, 0xc0001e0680)
    /home/runner/work/grpc-go/grpc-go/internal/transport/transport_test.go:2846 +0x7e7
google.golang.org/grpc/internal/grpctest.RunSubTests.func1(0xc0001e0680)
    /home/runner/work/grpc-go/grpc-go/internal/grpctest/grpctest.go:122 +0x10e
testing.tRunner(0xc0001e0680, 0xc0000444a0)
    /opt/hostedtoolcache/go/1.22.6/x64/src/testing/testing.go:1689 +0x21f
created by testing.(*T).Run in goroutine 2285
    /opt/hostedtoolcache/go/1.22.6/x64/src/testing/testing.go:1742 +0x826

goroutine 2368 [IO wait, 6 minutes]:
internal/poll.runtime_pollWait(0x7fe83f328c80, 0x72)
    /opt/hostedtoolcache/go/1.22.6/x64/src/runtime/netpoll.go:345 +0x85
internal/poll.(*pollDesc).wait(0xc00020a120, 0x72, 0x0)
    /opt/hostedtoolcache/go/1.22.6/x64/src/internal/poll/fd_poll_runtime.go:84 +0xb1
internal/poll.(*pollDesc).waitRead(...)
    /opt/hostedtoolcache/go/1.22.6/x64/src/internal/poll/fd_poll_runtime.go:89
internal/poll.(*FD).Accept(0xc00020a100)
    /opt/hostedtoolcache/go/1.22.6/x64/src/internal/poll/fd_unix.go:611 +0x507
net.(*netFD).accept(0xc00020a100)
    /opt/hostedtoolcache/go/1.22.6/x64/src/net/fd_unix.go:172 +0x45
net.(*TCPListener).accept(0xc0000446c0)
    /opt/hostedtoolcache/go/1.22.6/x64/src/net/tcpsock_posix.go:159 +0x3e
net.(*TCPListener).Accept(0xc0000446c0)
    /opt/hostedtoolcache/go/1.22.6/x64/src/net/tcpsock.go:327 +0x65
google.golang.org/grpc/internal/transport.(*server).start(0xc0000242d0, 0xc0001e0680, 0x0, 0xc000000[180](https://github.com/grpc/grpc-go/actions/runs/10780783629/job/29897278416?pr=7498#step:8:181), 0x0)
    /home/runner/work/grpc-go/grpc-go/internal/transport/transport_test.go:362 +0x522
created by google.golang.org/grpc/internal/transport.setUpServerOnly in goroutine 2367
    /home/runner/work/grpc-go/grpc-go/internal/transport/transport_test.go:453 +0x325

goroutine 2392 [sync.Mutex.Lock, 6 minutes]:
sync.runtime_SemacquireMutex(0x4b6229?, 0x78?, 0xb66c05?)
    /opt/hostedtoolcache/go/1.22.6/x64/src/runtime/sema.go:77 +0x25
sync.(*Mutex).lockSlow(0xc000202870)
    /opt/hostedtoolcache/go/1.22.6/x64/src/sync/mutex.go:171 +0x213
sync.(*Mutex).Lock(0xc000202870)
    /opt/hostedtoolcache/go/1.22.6/x64/src/sync/mutex.go:90 +0x55
google.golang.org/grpc/internal/transport.(*http2Client).Close(0xc0002026c8, {0xd491c0, 0xc0000b8de0})
    /home/runner/work/grpc-go/grpc-go/internal/transport/http2_client.go:989 +0xe5
google.golang.org/grpc/internal/transport.(*http2Client).reader(0xc0002026c8, 0xc000408f00)
    /home/runner/work/grpc-go/grpc-go/internal/transport/http2_client.go:1672 +0xcf5
created by google.golang.org/grpc/internal/transport.newHTTP2Client in goroutine 2367
    /home/runner/work/grpc-go/grpc-go/internal/transport/http2_client.go:412 +0x33ab

goroutine 2393 [chan receive, 6 minutes]:
google.golang.org/grpc/internal/transport.(*hangingConn).Write(0xc000044780, {0xc000124040, 0x2a, 0x40})
    /home/runner/work/grpc-go/grpc-go/internal/transport/transport_test.go:2796 +0xb5
google.golang.org/grpc/internal/transport.(*bufWriter).Write(0xc000024aa0, {0xc000124040, 0x2a, 0x40})
    /home/runner/work/grpc-go/grpc-go/internal/transport/http_util.go:325 +0x1da
golang.org/x/net/http2.(*Framer).endWrite(0xc00015e540)
    /home/runner/go/pkg/mod/golang.org/x/net@v0.28.0/http2/frame.go:371 +0x19d
golang.org/x/net/http2.(*Framer).WriteGoAway(0xc00015e540, 0x1, 0x0, {0xc0001d4380, 0x19, 0x47928e?})
    /home/runner/go/pkg/mod/golang.org/x/net@v0.28.0/http2/frame.go:924 +0x599
google.golang.org/grpc/internal/transport.(*http2Client).outgoingGoAwayHandler(0xc000[202](https://github.com/grpc/grpc-go/actions/runs/10780783629/job/29897278416?pr=7498#step:8:203)6c8, 0xc00017e440)
    /home/runner/work/grpc-go/grpc-go/internal/transport/http2_client.go:531 +0x177
google.golang.org/grpc/internal/transport.(*loopyWriter).goAwayHandler(0xc00024c180, 0xc00017e440)
    /home/runner/work/grpc-go/grpc-go/internal/transport/controlbuf.go:852 +0x67
google.golang.org/grpc/internal/transport.(*loopyWriter).handle(0xc00024c180, {0xbbf300, 0xc00017e440})
    /home/runner/work/grpc-go/grpc-go/internal/transport/controlbuf.go:886 +0x7dc
google.golang.org/grpc/internal/transport.(*loopyWriter).run(0xc00024c180)
    /home/runner/work/grpc-go/grpc-go/internal/transport/controlbuf.go:579 +0x17c
google.golang.org/grpc/internal/transport.newHTTP2Client.func6()
    /home/runner/work/grpc-go/grpc-go/internal/transport/http2_client.go:470 +0x250
created by google.golang.org/grpc/internal/transport.newHTTP2Client in goroutine 2367
    /home/runner/work/grpc-go/grpc-go/internal/transport/http2_client.go:468 +0x3df4
FAIL    google.golang.org/grpc/internal/transport   420.030s
purnesh42H commented 1 month ago

should area be transport?

aranjans commented 1 month ago

yes, this should be transport. Updated.

eshitachandwani commented 1 month ago

https://github.com/grpc/grpc-go/actions/runs/10980130364/job/30485516546?pr=7658

aranjans commented 1 month ago

RCA: https://github.com/grpc/grpc-go/pull/7371 PR introduced a timeout logic to loopyWriter to exit but t.Close uses GetGoAwayReason to fetch the last goAway and it obtains t.mu.Lock(). Test/ClientCloseReturnsEarlyWhenGoAwayWriteHangs overrides the http/2 Write method and blocks forever, so in case t.Close() is called and we sent out goAwayFrame here and this gets executed before we write the goAway frame over the wire here, this forms a deadlock. Hence, in order to fix it we can fetch the goAwayReason first and then put the goAway frame into cbuf so to avoid such deadlock conditions.

We have this PR in review for the same.

easwars commented 1 month ago

https://github.com/grpc/grpc-go/actions/runs/11149119865/job/30987267909?pr=7695