Open amyangfei opened 2 years ago
run chaos test in https://github.com/CharlesCheung96/tiflow/actions/runs/2743964576
No response
detail logs: chaos-base-logs.network-emulation-dataflow.zip
There are two problems
chaos-executor-0_goroutines.log
goroutine 474 [select, 11 minutes]: google.golang.org/grpc/internal/transport.(*Stream).waitOnHeader(0xc001c125a0) google.golang.org/grpc@v1.46.2/internal/transport/transport.go:324 +0x7c google.golang.org/grpc/internal/transport.(*Stream).RecvCompress(...) google.golang.org/grpc@v1.46.2/internal/transport/transport.go:339 google.golang.org/grpc.(*csAttempt).recvMsg(0xc001bdaa50, {0x377e000?, 0xc001c104e0}, 0x0?) google.golang.org/grpc@v1.46.2/stream.go:980 +0xbb google.golang.org/grpc.(*clientStream).RecvMsg.func1(0x0?) google.golang.org/grpc@v1.46.2/stream.go:845 +0x25 google.golang.org/grpc.(*clientStream).withRetry(0xc001c12360, 0xc00010f8f8, 0xc00010f8c8) google.golang.org/grpc@v1.46.2/stream.go:709 +0xd3 google.golang.org/grpc.(*clientStream).RecvMsg(0xc001c12360, {0x377e000?, 0xc001c104e0?}) google.golang.org/grpc@v1.46.2/stream.go:844 +0x11f google.golang.org/grpc.invoke({0x415f7c8?, 0xc000cd6c00?}, {0x3966a4c?, 0x20?}, {0x37a6f20, 0xc001c104b0}, {0x377e000, 0xc001c104e0}, 0x0?, {0x0, ...}) google.golang.org/grpc@v1.46.2/call.go:73 +0xd7 google.golang.org/grpc.(*ClientConn).Invoke(0xc0b04aaed7b7e3de?, {0x415f7c8?, 0xc000cd6c00?}, {0x3966a4c?, 0xc00010fb60?}, {0x37a6f20?, 0xc001c104b0?}, {0x377e000?, 0xc001c104e0?}, {0x0, ...}) google.golang.org/grpc@v1.46.2/call.go:37 +0x265 github.com/pingcap/tiflow/engine/enginepb.(*masterClient).Heartbeat(0xc000b021b8, {0x415f7c8, 0xc000cd6c00}, 0xc00010fae8?, {0x0, 0x0, 0x0}) github.com/pingcap/tiflow/engine/enginepb/master.pb.go:1496 +0xc9 github.com/pingcap/tiflow/engine/pkg/rpcutil.DoFailoverRPC[...]({0x415f7c8, 0xc000cd6c00}, 0x28, 0xc001c104b0?, 0x3ae15f8) github.com/pingcap/tiflow/engine/pkg/rpcutil/client.go:207 +0x29c github.com/pingcap/tiflow/engine/client.(*MasterClientImpl).Heartbeat(0xc0b04ab3d7a95091?, {0x415f7c8?, 0xc000cd6c00?}, 0xc0b04aaef57ee940?, 0x9fdbb0f72?) github.com/pingcap/tiflow/engine/client/master_client.go:100 +0x3a github.com/pingcap/tiflow/engine/executor.(*Server).keepHeartbeat(0xc00020e9c0, {0x415f7c8, 0xc000cd6c00}) github.com/pingcap/tiflow/engine/executor/server.go:577 +0x367 github.com/pingcap/tiflow/engine/executor.(*Server).Run.func5() github.com/pingcap/tiflow/engine/executor/server.go:405 +0x25 golang.org/x/sync/errgroup.(*Group).Go.func1() golang.org/x/sync@v0.0.0-20220513210516-0976fa681c29/errgroup/errgroup.go:74 +0x64 created by golang.org/x/sync/errgroup.(*Group).Go golang.org/x/sync@v0.0.0-20220513210516-0976fa681c29/errgroup/errgroup.go:71 +0xa5
chaos-server-master-2_goroutines.log
goroutine 12931 [semacquire, 11 minutes]: sync.runtime_SemacquireMutex(0x0?, 0x1?, 0xffffffffffffffff?) runtime/sema.go:71 +0x25 sync.(*Mutex).lockSlow(0xc0001d4018) sync/mutex.go:162 +0x165 sync.(*Mutex).Lock(...) sync/mutex.go:81 github.com/pingcap/tiflow/engine/servermaster.(*ExecutorManagerImpl).checkAliveImpl(0xc0001d4000) github.com/pingcap/tiflow/engine/servermaster/executor_manager.go:283 +0x52 github.com/pingcap/tiflow/engine/servermaster.(*ExecutorManagerImpl).Start.func1() github.com/pingcap/tiflow/engine/servermaster/executor_manager.go:265 +0x148 created by github.com/pingcap/tiflow/engine/servermaster.(*ExecutorManagerImpl).Start github.com/pingcap/tiflow/engine/servermaster/executor_manager.go:255 +0xaa goroutine 12932 [semacquire, 11 minutes]: sync.runtime_SemacquireMutex(0xc00434fd40?, 0x1?, 0x1?) runtime/sema.go:71 +0x25 sync.(*Mutex).lockSlow(0xc0001d4018) sync/mutex.go:162 +0x165 sync.(*Mutex).Lock(...) sync/mutex.go:81 github.com/pingcap/tiflow/engine/servermaster.(*ExecutorManagerImpl).HandleHeartbeat(0xc0001d4000, 0xc00436db90) github.com/pingcap/tiflow/engine/servermaster/executor_manager.go:120 +0x152 github.com/pingcap/tiflow/engine/servermaster.(*Server).Heartbeat(0xc0001a5500, {0x415f870, 0xc00436db60}, 0xc00436dc20?) github.com/pingcap/tiflow/engine/servermaster/server.go:228 +0x194 github.com/pingcap/tiflow/engine/enginepb._Master_Heartbeat_Handler.func1({0x415f870, 0xc00436db60}, {0x37a6f20?, 0xc00436db90}) github.com/pingcap/tiflow/engine/enginepb/master.pb.go:1725 +0x78 github.com/grpc-ecosystem/go-grpc-prometheus.(*ServerMetrics).UnaryServerInterceptor.func1({0x415f870, 0xc00436db60}, {0x37a6f20, 0xc00436db90}, 0xc0030b1f20?, 0xc004315f68) github.com/grpc-ecosystem/go-grpc-prometheus@v1.2.0/server_metrics.go:107 +0x87 github.com/grpc-ecosystem/go-grpc-middleware.ChainUnaryServer.func1.1.1({0x415f870?, 0xc00436db60?}, {0x37a6f20?, 0xc00436db90?}) github.com/grpc-ecosystem/go-grpc-middleware@v1.3.0/chain.go:25 +0x3a go.etcd.io/etcd/server/v3/etcdserver/api/v3rpc.newUnaryInterceptor.func1({0x415f870, 0xc00436db60}, {0x37a6f20?, 0xc00436db90}, 0xc003136900?, 0xc00434cca0) go.etcd.io/etcd/server/v3@v3.5.2/etcdserver/api/v3rpc/interceptor.go:71 +0x270 github.com/grpc-ecosystem/go-grpc-middleware.ChainUnaryServer.func1.1.1({0x415f870?, 0xc00436db60?}, {0x37a6f20?, 0xc00436db90?}) github.com/grpc-ecosystem/go-grpc-middleware@v1.3.0/chain.go:25 +0x3a go.etcd.io/etcd/server/v3/etcdserver/api/v3rpc.newLogUnaryInterceptor.func1({0x415f870, 0xc00436db60}, {0x37a6f20, 0xc00436db90}, 0xc00434cc80, 0xc00434ccc0) go.etcd.io/etcd/server/v3@v3.5.2/etcdserver/api/v3rpc/interceptor.go:78 +0xc2 github.com/grpc-ecosystem/go-grpc-middleware.ChainUnaryServer.func1.1.1({0x415f870?, 0xc00436db60?}, {0x37a6f20?, 0xc00436db90?}) github.com/grpc-ecosystem/go-grpc-middleware@v1.3.0/chain.go:25 +0x3a github.com/grpc-ecosystem/go-grpc-middleware.ChainUnaryServer.func1({0x415f870, 0xc00436db60}, {0x37a6f20, 0xc00436db90}, 0xc003136af0?, 0x3361f40?) github.com/grpc-ecosystem/go-grpc-middleware@v1.3.0/chain.go:34 +0xbf github.com/pingcap/tiflow/engine/enginepb._Master_Heartbeat_Handler({0x3834e20?, 0xc0001a5500}, {0x415f870, 0xc00436db60}, 0xc00440a060, 0xc00280c090) github.com/pingcap/tiflow/engine/enginepb/master.pb.go:1727 +0x138 google.golang.org/grpc.(*Server).processUnaryRPC(0xc00044c380, {0x4180d40, 0xc0005829c0}, 0xc00435d440, 0xc002818090, 0x6ecbcd8, 0x0) google.golang.org/grpc@v1.46.2/server.go:1283 +0xcfd google.golang.org/grpc.(*Server).handleStream(0xc00044c380, {0x4180d40, 0xc0005829c0}, 0xc00435d440, 0x0) google.golang.org/grpc@v1.46.2/server.go:1620 +0xa1b google.golang.org/grpc.(*Server).serveStreams.func1.2() google.golang.org/grpc@v1.46.2/server.go:922 +0x98 created by google.golang.org/grpc.(*Server).serveStreams.func1 google.golang.org/grpc@v1.46.2/server.go:920 +0x28a
maybe caused by notifier, which holds the lock
goroutine 12912 [runnable]: github.com/pingcap/tiflow/engine/pkg/notifier.(*Notifier[...]).Flush(0xc0000b4660, {0x415f7c8, 0xc00434fb80}) github.com/pingcap/tiflow/engine/pkg/notifier/notifier.go:137 +0x85 github.com/pingcap/tiflow/engine/servermaster.(*ExecutorManagerImpl).WatchExecutors(0xc0001d4000, {0x415f7c8, 0xc00434fb80}) github.com/pingcap/tiflow/engine/servermaster/executor_manager.go:336 +0x1ed github.com/pingcap/tiflow/engine/pkg/externalresource/manager.(*DefaultGCCoordinator).initializeGC(0xc00434fb40, {0x415f7c8, 0xc00434fb80}) github.com/pingcap/tiflow/engine/pkg/externalresource/manager/gc_coordinator.go:143 +0x6f github.com/pingcap/tiflow/engine/pkg/externalresource/manager.(*DefaultGCCoordinator).Run(0xc003ae6f80?, {0x415f7c8, 0xc00434fb80}) github.com/pingcap/tiflow/engine/pkg/externalresource/manager/gc_coordinator.go:78 +0xc7 github.com/pingcap/tiflow/engine/servermaster.(*Server).runLeaderService.func11() github.com/pingcap/tiflow/engine/servermaster/server.go:765 +0x34 golang.org/x/sync/errgroup.(*Group).Go.func1() golang.org/x/sync@v0.0.0-20220513210516-0976fa681c29/errgroup/errgroup.go:74 +0x64 created by golang.org/x/sync/errgroup.(*Group).Go golang.org/x/sync@v0.0.0-20220513210516-0976fa681c29/errgroup/errgroup.go:71 +0xa5
Dataflow Engine version (run tiflow version):
tiflow version
master@https://github.com/pingcap/tiflow/commit/0c545a511fd9ef6264b13a7f1366ed7edec5a447
pr branch@https://github.com/CharlesCheung96/tiflow/commit/2ea08b94bc3d0937cdbfaf95dfeae17cb0eb39d5
/assign
What did you do?
run chaos test in https://github.com/CharlesCheung96/tiflow/actions/runs/2743964576
What did you expect to see?
No response
What did you see instead?
detail logs: chaos-base-logs.network-emulation-dataflow.zip
There are two problems
chaos-executor-0_goroutines.log
chaos-server-master-2_goroutines.log
maybe caused by notifier, which holds the lock
Versions of the cluster
Dataflow Engine version (run
tiflow version
):master@https://github.com/pingcap/tiflow/commit/0c545a511fd9ef6264b13a7f1366ed7edec5a447
pr branch@https://github.com/CharlesCheung96/tiflow/commit/2ea08b94bc3d0937cdbfaf95dfeae17cb0eb39d5