pingcap / tiflow

This repo maintains DM (a data migration platform) and TiCDC (change data capture for TiDB)
Apache License 2.0
423 stars 282 forks source link

Both executor heartbeat sender goroutine and server master receiver goroutine hang #6490

Open amyangfei opened 2 years ago

amyangfei commented 2 years ago

What did you do?

run chaos test in https://github.com/CharlesCheung96/tiflow/actions/runs/2743964576

What did you expect to see?

No response

What did you see instead?

detail logs: chaos-base-logs.network-emulation-dataflow.zip

There are two problems

  1. executor hangs, in chaos-executor-0_goroutines.log
goroutine 474 [select, 11 minutes]:
google.golang.org/grpc/internal/transport.(*Stream).waitOnHeader(0xc001c125a0)
    google.golang.org/grpc@v1.46.2/internal/transport/transport.go:324 +0x7c
google.golang.org/grpc/internal/transport.(*Stream).RecvCompress(...)
    google.golang.org/grpc@v1.46.2/internal/transport/transport.go:339
google.golang.org/grpc.(*csAttempt).recvMsg(0xc001bdaa50, {0x377e000?, 0xc001c104e0}, 0x0?)
    google.golang.org/grpc@v1.46.2/stream.go:980 +0xbb
google.golang.org/grpc.(*clientStream).RecvMsg.func1(0x0?)
    google.golang.org/grpc@v1.46.2/stream.go:845 +0x25
google.golang.org/grpc.(*clientStream).withRetry(0xc001c12360, 0xc00010f8f8, 0xc00010f8c8)
    google.golang.org/grpc@v1.46.2/stream.go:709 +0xd3
google.golang.org/grpc.(*clientStream).RecvMsg(0xc001c12360, {0x377e000?, 0xc001c104e0?})
    google.golang.org/grpc@v1.46.2/stream.go:844 +0x11f
google.golang.org/grpc.invoke({0x415f7c8?, 0xc000cd6c00?}, {0x3966a4c?, 0x20?}, {0x37a6f20, 0xc001c104b0}, {0x377e000, 0xc001c104e0}, 0x0?, {0x0, ...})
    google.golang.org/grpc@v1.46.2/call.go:73 +0xd7
google.golang.org/grpc.(*ClientConn).Invoke(0xc0b04aaed7b7e3de?, {0x415f7c8?, 0xc000cd6c00?}, {0x3966a4c?, 0xc00010fb60?}, {0x37a6f20?, 0xc001c104b0?}, {0x377e000?, 0xc001c104e0?}, {0x0, ...})
    google.golang.org/grpc@v1.46.2/call.go:37 +0x265
github.com/pingcap/tiflow/engine/enginepb.(*masterClient).Heartbeat(0xc000b021b8, {0x415f7c8, 0xc000cd6c00}, 0xc00010fae8?, {0x0, 0x0, 0x0})
    github.com/pingcap/tiflow/engine/enginepb/master.pb.go:1496 +0xc9
github.com/pingcap/tiflow/engine/pkg/rpcutil.DoFailoverRPC[...]({0x415f7c8, 0xc000cd6c00}, 0x28, 0xc001c104b0?, 0x3ae15f8)
    github.com/pingcap/tiflow/engine/pkg/rpcutil/client.go:207 +0x29c
github.com/pingcap/tiflow/engine/client.(*MasterClientImpl).Heartbeat(0xc0b04ab3d7a95091?, {0x415f7c8?, 0xc000cd6c00?}, 0xc0b04aaef57ee940?, 0x9fdbb0f72?)
    github.com/pingcap/tiflow/engine/client/master_client.go:100 +0x3a
github.com/pingcap/tiflow/engine/executor.(*Server).keepHeartbeat(0xc00020e9c0, {0x415f7c8, 0xc000cd6c00})
    github.com/pingcap/tiflow/engine/executor/server.go:577 +0x367
github.com/pingcap/tiflow/engine/executor.(*Server).Run.func5()
    github.com/pingcap/tiflow/engine/executor/server.go:405 +0x25
golang.org/x/sync/errgroup.(*Group).Go.func1()
    golang.org/x/sync@v0.0.0-20220513210516-0976fa681c29/errgroup/errgroup.go:74 +0x64
created by golang.org/x/sync/errgroup.(*Group).Go
    golang.org/x/sync@v0.0.0-20220513210516-0976fa681c29/errgroup/errgroup.go:71 +0xa5
  1. server master hangs, in chaos-server-master-2_goroutines.log
goroutine 12931 [semacquire, 11 minutes]:
sync.runtime_SemacquireMutex(0x0?, 0x1?, 0xffffffffffffffff?)
    runtime/sema.go:71 +0x25
sync.(*Mutex).lockSlow(0xc0001d4018)
    sync/mutex.go:162 +0x165
sync.(*Mutex).Lock(...)
    sync/mutex.go:81
github.com/pingcap/tiflow/engine/servermaster.(*ExecutorManagerImpl).checkAliveImpl(0xc0001d4000)
    github.com/pingcap/tiflow/engine/servermaster/executor_manager.go:283 +0x52
github.com/pingcap/tiflow/engine/servermaster.(*ExecutorManagerImpl).Start.func1()
    github.com/pingcap/tiflow/engine/servermaster/executor_manager.go:265 +0x148
created by github.com/pingcap/tiflow/engine/servermaster.(*ExecutorManagerImpl).Start
    github.com/pingcap/tiflow/engine/servermaster/executor_manager.go:255 +0xaa

goroutine 12932 [semacquire, 11 minutes]:
sync.runtime_SemacquireMutex(0xc00434fd40?, 0x1?, 0x1?)
    runtime/sema.go:71 +0x25
sync.(*Mutex).lockSlow(0xc0001d4018)
    sync/mutex.go:162 +0x165
sync.(*Mutex).Lock(...)
    sync/mutex.go:81
github.com/pingcap/tiflow/engine/servermaster.(*ExecutorManagerImpl).HandleHeartbeat(0xc0001d4000, 0xc00436db90)
    github.com/pingcap/tiflow/engine/servermaster/executor_manager.go:120 +0x152
github.com/pingcap/tiflow/engine/servermaster.(*Server).Heartbeat(0xc0001a5500, {0x415f870, 0xc00436db60}, 0xc00436dc20?)
    github.com/pingcap/tiflow/engine/servermaster/server.go:228 +0x194
github.com/pingcap/tiflow/engine/enginepb._Master_Heartbeat_Handler.func1({0x415f870, 0xc00436db60}, {0x37a6f20?, 0xc00436db90})
    github.com/pingcap/tiflow/engine/enginepb/master.pb.go:1725 +0x78
github.com/grpc-ecosystem/go-grpc-prometheus.(*ServerMetrics).UnaryServerInterceptor.func1({0x415f870, 0xc00436db60}, {0x37a6f20, 0xc00436db90}, 0xc0030b1f20?, 0xc004315f68)
    github.com/grpc-ecosystem/go-grpc-prometheus@v1.2.0/server_metrics.go:107 +0x87
github.com/grpc-ecosystem/go-grpc-middleware.ChainUnaryServer.func1.1.1({0x415f870?, 0xc00436db60?}, {0x37a6f20?, 0xc00436db90?})
    github.com/grpc-ecosystem/go-grpc-middleware@v1.3.0/chain.go:25 +0x3a
go.etcd.io/etcd/server/v3/etcdserver/api/v3rpc.newUnaryInterceptor.func1({0x415f870, 0xc00436db60}, {0x37a6f20?, 0xc00436db90}, 0xc003136900?, 0xc00434cca0)
    go.etcd.io/etcd/server/v3@v3.5.2/etcdserver/api/v3rpc/interceptor.go:71 +0x270
github.com/grpc-ecosystem/go-grpc-middleware.ChainUnaryServer.func1.1.1({0x415f870?, 0xc00436db60?}, {0x37a6f20?, 0xc00436db90?})
    github.com/grpc-ecosystem/go-grpc-middleware@v1.3.0/chain.go:25 +0x3a
go.etcd.io/etcd/server/v3/etcdserver/api/v3rpc.newLogUnaryInterceptor.func1({0x415f870, 0xc00436db60}, {0x37a6f20, 0xc00436db90}, 0xc00434cc80, 0xc00434ccc0)
    go.etcd.io/etcd/server/v3@v3.5.2/etcdserver/api/v3rpc/interceptor.go:78 +0xc2
github.com/grpc-ecosystem/go-grpc-middleware.ChainUnaryServer.func1.1.1({0x415f870?, 0xc00436db60?}, {0x37a6f20?, 0xc00436db90?})
    github.com/grpc-ecosystem/go-grpc-middleware@v1.3.0/chain.go:25 +0x3a
github.com/grpc-ecosystem/go-grpc-middleware.ChainUnaryServer.func1({0x415f870, 0xc00436db60}, {0x37a6f20, 0xc00436db90}, 0xc003136af0?, 0x3361f40?)
    github.com/grpc-ecosystem/go-grpc-middleware@v1.3.0/chain.go:34 +0xbf
github.com/pingcap/tiflow/engine/enginepb._Master_Heartbeat_Handler({0x3834e20?, 0xc0001a5500}, {0x415f870, 0xc00436db60}, 0xc00440a060, 0xc00280c090)
    github.com/pingcap/tiflow/engine/enginepb/master.pb.go:1727 +0x138
google.golang.org/grpc.(*Server).processUnaryRPC(0xc00044c380, {0x4180d40, 0xc0005829c0}, 0xc00435d440, 0xc002818090, 0x6ecbcd8, 0x0)
    google.golang.org/grpc@v1.46.2/server.go:1283 +0xcfd
google.golang.org/grpc.(*Server).handleStream(0xc00044c380, {0x4180d40, 0xc0005829c0}, 0xc00435d440, 0x0)
    google.golang.org/grpc@v1.46.2/server.go:1620 +0xa1b
google.golang.org/grpc.(*Server).serveStreams.func1.2()
    google.golang.org/grpc@v1.46.2/server.go:922 +0x98
created by google.golang.org/grpc.(*Server).serveStreams.func1
    google.golang.org/grpc@v1.46.2/server.go:920 +0x28a

maybe caused by notifier, which holds the lock

goroutine 12912 [runnable]:
github.com/pingcap/tiflow/engine/pkg/notifier.(*Notifier[...]).Flush(0xc0000b4660, {0x415f7c8, 0xc00434fb80})
    github.com/pingcap/tiflow/engine/pkg/notifier/notifier.go:137 +0x85
github.com/pingcap/tiflow/engine/servermaster.(*ExecutorManagerImpl).WatchExecutors(0xc0001d4000, {0x415f7c8, 0xc00434fb80})
    github.com/pingcap/tiflow/engine/servermaster/executor_manager.go:336 +0x1ed
github.com/pingcap/tiflow/engine/pkg/externalresource/manager.(*DefaultGCCoordinator).initializeGC(0xc00434fb40, {0x415f7c8, 0xc00434fb80})
    github.com/pingcap/tiflow/engine/pkg/externalresource/manager/gc_coordinator.go:143 +0x6f
github.com/pingcap/tiflow/engine/pkg/externalresource/manager.(*DefaultGCCoordinator).Run(0xc003ae6f80?, {0x415f7c8, 0xc00434fb80})
    github.com/pingcap/tiflow/engine/pkg/externalresource/manager/gc_coordinator.go:78 +0xc7
github.com/pingcap/tiflow/engine/servermaster.(*Server).runLeaderService.func11()
    github.com/pingcap/tiflow/engine/servermaster/server.go:765 +0x34
golang.org/x/sync/errgroup.(*Group).Go.func1()
    golang.org/x/sync@v0.0.0-20220513210516-0976fa681c29/errgroup/errgroup.go:74 +0x64
created by golang.org/x/sync/errgroup.(*Group).Go
    golang.org/x/sync@v0.0.0-20220513210516-0976fa681c29/errgroup/errgroup.go:71 +0xa5

Versions of the cluster

Dataflow Engine version (run tiflow version):

master@https://github.com/pingcap/tiflow/commit/0c545a511fd9ef6264b13a7f1366ed7edec5a447

pr branch@https://github.com/CharlesCheung96/tiflow/commit/2ea08b94bc3d0937cdbfaf95dfeae17cb0eb39d5

CharlesCheung96 commented 2 years ago

/assign