hanfei1991 / microcosm

a mini bench expreriment for a task runtime scheduler
8 stars 6 forks source link

server master panics when executor node restarts #357

Open amyangfei opened 2 years ago

amyangfei commented 2 years ago

Found in node failure test: https://github.com/hanfei1991/microcosm/pull/355

[2022/04/29 06:48:31.067 +00:00] [INFO] [master.go:296] ["Heartbeat Ping received"] [msg="{\"send-time\":1467104317866516,\"from-worker-id\":\"b2fb2bad-efdb-49a8-9150-b75b4bd44938\",\"epoch\":6}"] [master-id=dataflow-engine-job-manager]
[2022/04/29 06:48:31.829 +00:00] [INFO] [jobmanager.go:327] ["on worker status updated"] [worker-id=b2fb2bad-efdb-49a8-9150-b75b4bd44938] [status="{\"SeqID\":0,\"CreatedAt\":\"0001-01-01T00:00:00Z\",\"UpdatedAt\":\"0001-01-01T00:00:00Z\",\"ProjectID\":\"\",\"JobID\":\"\",\"ID\":\"\",\"Type\":0,\"code\":1,\"error-message\":\"\",\"ext-bytes\":\"eyIwOGQ3NWVkNC0yOGFkLTQyODAtYTViYi1jNjI3OTFhOWYzMTkiOnsiYnVzaW5lc3MtaWQiOjEsInRpY2siOjMwNCwiZXRjZC1jaGVja3BvaW50Ijp7InJldmlzaW9uIjoyMCwibXZjYyI6MSwidmFsdWUiOiJyYW5kb20tdmFsdWUtMSJ9fSwiNmUzNjI5MmMtN2ZiNy00NmMyLWFkOGYtZjVmMzJlZDMwYTY0Ijp7ImJ1c2luZXNzLWlkIjoyLCJ0aWNrIjozMDMsImV0Y2QtY2hlY2twb2ludCI6eyJyZXZpc2lvbiI6MjEsIm12Y2MiOjEsInZhbHVlIjoicmFuZG9tLXZhbHVlLTEifX0sImIxOWU4MGQ2LWFlZGItNDQ5NC04ZWQ5LWQ4ZDM1Nzc4OTY1OCI6eyJidXNpbmVzcy1pZCI6MCwidGljayI6MzA0LCJldGNkLWNoZWNrcG9pbnQiOnsicmV2aXNpb24iOjE5LCJtdmNjIjoxLCJ2YWx1ZSI6InJhbmRvbS12YWx1ZS0xIn19LCJlNTc5MDI4MC1hZGM1LTRjOTUtOGI5Ni1hZWQyZGY0ZWViY2IiOnsiYnVzaW5lc3MtaWQiOjMsInRpY2siOjMwMywiZXRjZC1jaGVja3BvaW50Ijp7InJldmlzaW9uIjoyMiwibXZjYyI6MSwidmFsdWUiOiJyYW5kb20tdmFsdWUtMSJ9fX0=\"}"]
[2022/04/29 06:48:34.068 +00:00] [INFO] [master.go:296] ["Heartbeat Ping received"] [msg="{\"send-time\":1467107317919901,\"from-worker-id\":\"b2fb2bad-efdb-49a8-9150-b75b4bd44938\",\"epoch\":6}"] [master-id=dataflow-engine-job-manager]
[2022/04/29 06:48:34.829 +00:00] [INFO] [jobmanager.go:327] ["on worker status updated"] [worker-id=b2fb2bad-efdb-49a8-9150-b75b4bd44938] [status="{\"SeqID\":0,\"CreatedAt\":\"0001-01-01T00:00:00Z\",\"UpdatedAt\":\"0001-01-01T00:00:00Z\",\"ProjectID\":\"\",\"JobID\":\"\",\"ID\":\"\",\"Type\":0,\"code\":1,\"error-message\":\"\",\"ext-bytes\":\"eyIwOGQ3NWVkNC0yOGFkLTQyODAtYTViYi1jNjI3OTFhOWYzMTkiOnsiYnVzaW5lc3MtaWQiOjEsInRpY2siOjM2NSwiZXRjZC1jaGVja3BvaW50Ijp7InJldmlzaW9uIjozMCwibXZjYyI6MiwidmFsdWUiOiJyYW5kb20tdmFsdWUtMiJ9fSwiNmUzNjI5MmMtN2ZiNy00NmMyLWFkOGYtZjVmMzJlZDMwYTY0Ijp7ImJ1c2luZXNzLWlkIjoyLCJ0aWNrIjozNjQsImV0Y2QtY2hlY2twb2ludCI6eyJyZXZpc2lvbiI6MzEsIm12Y2MiOjIsInZhbHVlIjoicmFuZG9tLXZhbHVlLTIifX0sImIxOWU4MGQ2LWFlZGItNDQ5NC04ZWQ5LWQ4ZDM1Nzc4OTY1OCI6eyJidXNpbmVzcy1pZCI6MCwidGljayI6MzY1LCJldGNkLWNoZWNrcG9pbnQiOnsicmV2aXNpb24iOjI5LCJtdmNjIjoyLCJ2YWx1ZSI6InJhbmRvbS12YWx1ZS0yIn19LCJlNTc5MDI4MC1hZGM1LTRjOTUtOGI5Ni1hZWQyZGY0ZWViY2IiOnsiYnVzaW5lc3MtaWQiOjMsInRpY2siOjM2MywiZXRjZC1jaGVja3BvaW50Ijp7InJldmlzaW9uIjozMiwibXZjYyI6MiwidmFsdWUiOiJyYW5kb20tdmFsdWUtMiJ9fX0=\"}"]
[2022/04/29 06:48:46.011 +00:00] [INFO] [master.go:296] ["Heartbeat Ping received"] [msg="{\"send-time\":1467119317981211,\"from-worker-id\":\"b2fb2bad-efdb-49a8-9150-b75b4bd44938\",\"epoch\":44}"] [master-id=dataflow-engine-job-manager]
[2022/04/29 06:48:46.012 +00:00] [INFO] [worker_manager.go:227] ["Worker discovered"] [master-id=dataflow-engine-job-manager] [worker-entry="{worker-id:b2fb2bad-efdb-49a8-9150-b75b4bd44938, executor-id:, state:1}"]
[2022/04/29 06:48:49.006 +00:00] [INFO] [master.go:296] ["Heartbeat Ping received"] [msg="{\"send-time\":1467122317298721,\"from-worker-id\":\"b2fb2bad-efdb-49a8-9150-b75b4bd44938\",\"epoch\":44}"] [master-id=dataflow-engine-job-manager]
[2022/04/29 06:48:49.006 +00:00] [PANIC] [worker_manager.go:223] ["Unexpected worker entry state"] [entry="{worker-id:b2fb2bad-efdb-49a8-9150-b75b4bd44938, executor-id:ed48e2ad-346b-4343-8710-e872c3e5954d, state:3}"] [stack="github.com/hanfei1991/microcosm/lib/master.(*WorkerManager).HandleHeartbeat\n\t/dataflow-engine/lib/master/worker_manager.go:223\ngithub.com/hanfei1991/microcosm/lib.(*DefaultBaseMaster).registerMessageHandlers.func1\n\t/dataflow-engine/lib/master.go:316\ngithub.com/pingcap/tiflow/pkg/p2p.(*MessageServer).AddHandler.func1\n\t/go/pkg/mod/github.com/pingcap/tiflow@v0.0.0-20220418100802-8c4f693f6456/pkg/p2p/server.go:431\ngithub.com/pingcap/tiflow/pkg/workerpool.(*defaultEventHandle).AddEvent.func2\n\t/go/pkg/mod/github.com/pingcap/tiflow@v0.0.0-20220418100802-8c4f693f6456/pkg/workerpool/pool_impl.go:140\ngithub.com/pingcap/tiflow/pkg/workerpool.(*worker).run\n\t/go/pkg/mod/github.com/pingcap/tiflow@v0.0.0-20220418100802-8c4f693f6456/pkg/workerpool/pool_impl.go:348\ngithub.com/pingcap/tiflow/pkg/workerpool.(*defaultPoolImpl).Run.func1\n\t/go/pkg/mod/github.com/pingcap/tiflow@v0.0.0-20220418100802-8c4f693f6456/pkg/workerpool/pool_impl.go:67\ngolang.org/x/sync/errgroup.(*Group).Go.func1\n\t/go/pkg/mod/golang.org/x/sync@v0.0.0-20210220032951-036812b2e83c/errgroup/errgroup.go:57"]
panic: Unexpected worker entry state

goroutine 473 [running]:
go.uber.org/zap/zapcore.(*CheckedEntry).Write(0xc00077a6c0, {0xc000647fc0, 0x1, 0x1})
        /go/pkg/mod/go.uber.org/zap@v1.21.0/zapcore/entry.go:232 +0x44c
go.uber.org/zap.(*Logger).Panic(0x1e0d2a8?, {0x1e4c644?, 0x1ca25a0?}, {0xc000647fc0, 0x1, 0x1})
        /go/pkg/mod/go.uber.org/zap@v1.21.0/logger.go:230 +0x59
github.com/hanfei1991/microcosm/lib/master.(*WorkerManager).HandleHeartbeat(0xc001ff8680, 0xc000dcf760, {0xc001e363f0, 0x24})
        /dataflow-engine/lib/master/worker_manager.go:223 +0x2a5
github.com/hanfei1991/microcosm/lib.(*DefaultBaseMaster).registerMessageHandlers.func1({0xc001e363f0, 0x24}, {0x1a25c40?, 0xc000dcf760})
        /dataflow-engine/lib/master.go:316 +0x486
github.com/pingcap/tiflow/pkg/p2p.(*MessageServer).AddHandler.func1({0xc0011cc6c0?, 0x4118a8?}, {0x1bce640?, 0xc000cb6bd0?})
        /go/pkg/mod/github.com/pingcap/tiflow@v0.0.0-20220418100802-8c4f693f6456/pkg/p2p/server.go:431 +0x222
github.com/pingcap/tiflow/pkg/workerpool.(*defaultEventHandle).AddEvent.func2({0xc000ee3e98?, 0xc000ee3d68?})
        /go/pkg/mod/github.com/pingcap/tiflow@v0.0.0-20220418100802-8c4f693f6456/pkg/workerpool/pool_impl.go:140 +0x30
github.com/pingcap/tiflow/pkg/workerpool.(*worker).run(0xc000c61710, {0x229fdf0, 0xc0011d81c0?})
        /go/pkg/mod/github.com/pingcap/tiflow@v0.0.0-20220418100802-8c4f693f6456/pkg/workerpool/pool_impl.go:348 +0x3e2
github.com/pingcap/tiflow/pkg/workerpool.(*defaultPoolImpl).Run.func1()
        /go/pkg/mod/github.com/pingcap/tiflow@v0.0.0-20220418100802-8c4f693f6456/pkg/workerpool/pool_impl.go:67 +0x28
golang.org/x/sync/errgroup.(*Group).Go.func1()
        /go/pkg/mod/golang.org/x/sync@v0.0.0-20210220032951-036812b2e83c/errgroup/errgroup.go:57 +0x67
created by golang.org/x/sync/errgroup.(*Group).Go
        /go/pkg/mod/golang.org/x/sync@v0.0.0-20210220032951-036812b2e83c/errgroup/errgroup.go:54 +0x8d

detail log: server_master_1.log

amyangfei commented 2 years ago

master3.log

[2022/05/05 18:21:42.457 +08:00] [INFO] [server.go:257] ["register executor"] [resp="err:<code:MasterNotReady > "] []
[2022/05/05 18:21:45.459 +08:00] [INFO] [server.go:140] [payload="address:\"127.0.0.1:10241\" capability:100 "] [request=RegisterExecutor]
[2022/05/05 18:21:45.459 +08:00] [INFO] [server.go:257] ["register executor"] [resp="err:<code:MasterNotReady > "] []
[2022/05/05 18:21:47.101 +08:00] [INFO] [executor_manager.go:196] ["check alive"] [exec=03a1c738-43b0-41a4-9e95-609403f8035c]
[2022/05/05 18:21:48.462 +08:00] [INFO] [server.go:257] ["register executor"] [resp="err:<code:MasterNotReady > "] []
[2022/05/05 18:21:51.000 +08:00] [INFO] [member.go:95] ["update server master members"] [members="[{\"is-serv-leader\":true,\"is-etcd-leader\":true,\"name\":\"server-master-3-ed025111-4c90-42eb-bdfd-41d56f9129e5\",\"advertise-addr\":\"127.0.0.1:10343\"},{\"is-serv-leader\":false,\"is-etcd-leader\":false,\"name\":\"server-master-1-4f10de3c-edbf-4258-be64-09113e2f4e5e\",\"advertise-addr\":\"127.0.0.1:10341\"},{\"is-serv-leader\":false,\"is-etcd-leader\":false,\"name\":\"server-master-2-d970b9bc-01c6-4c84-828e-fd9c56c9de10\",\"advertise-addr\":\"127.0.0.1:10342\"}]"]
[2022/05/05 18:21:51.104 +08:00] [INFO] [executor_manager.go:69] ["begin to remove executor"] [id=03a1c738-43b0-41a4-9e95-609403f8035c]
[2022/05/05 18:21:51.104 +08:00] [INFO] [capacity_impl.go:50] ["executor resource is unregistered"] [executor-id=03a1c738-43b0-41a4-9e95-609403f8035c]
[2022/05/05 18:21:51.104 +08:00] [INFO] [executor_manager.go:81] ["notify to offline exec"]
[2022/05/05 18:21:51.109 +08:00] [INFO] [jobmanager.go:279] ["recover job, move it to WaitAck job queue"] [job="{\"seq-id\":0,\"created-at\":\"0001-01-01T00:00:00Z\",\"updated-at\":\"0001-01-01T00:00:00Z\",\"project-id\":\"\",\"id\":\"849be699-1bd8-4a01-a2f0-b95a39d3d1b6\",\"type\":3,\"status\":2,\"node-id\":\"03a1c738-43b0-41a4-9e95-609403f8035c\",\"addr\":\"127.0.0.1:10241\",\"epoch\":1766,\"config\":\"ewogICAgImpvYi1uYW1lIjogInRlc3QtZmFrZS1qb2ItMiIsCiAgICAid29ya2VyLWNvdW50IjogMywKICAgICJ0YXJnZXQtdGljayI6IDYwMDAwLAogICAgImV0Y2Qtd2F0Y2gtZW5hYmxlIjogdHJ1ZSwKICAgICJldGNkLWVuZHBvaW50cyI6IFsiMTI3LjAuMC4xOjEyNDc5Il0sCiAgICAiZXRjZC13YXRjaC1wcmVmaXgiOiAiL2Zha2UtZGVtby90ZXN0LTIvIgp9Cg==\"}"]
[2022/05/05 18:21:51.473 +08:00] [INFO] [server.go:140] [payload="address:\"127.0.0.1:10241\" capability:100 "] [request=RegisterExecutor]
[2022/05/05 18:21:51.473 +08:00] [INFO] [server.go:257] ["register executor"] [resp=] []
[2022/05/05 18:21:51.473 +08:00] [INFO] [executor_manager.go:147] ["allocate new executor"] [req="address:\"127.0.0.1:10241\" capability:100 "]
[2022/05/05 18:21:51.473 +08:00] [INFO] [executor_manager.go:130] ["register executor"] [info="{\"type\":0,\"id\":\"562a8719-8466-408a-b0c9-54d4320459ae\",\"addr\":\"127.0.0.1:10241\",\"cap\":100}"]
[2022/05/05 18:21:51.473 +08:00] [INFO] [capacity_impl.go:41] ["executor resource is registered"] [executor-id=562a8719-8466-408a-b0c9-54d4320459ae] [capacity=100]
[2022/05/05 18:21:51.601 +08:00] [INFO] [executor_manager.go:196] ["check alive"] [exec=562a8719-8466-408a-b0c9-54d4320459ae]
[2022/05/05 18:21:51.719 +08:00] [INFO] [master.go:499] [CreateWorker] [worker-type=3] [worker-config="{\"seq-id\":0,\"created-at\":\"0001-01-01T00:00:00Z\",\"updated-at\":\"0001-01-01T00:00:00Z\",\"project-id\":\"\",\"id\":\"849be699-1bd8-4a01-a2f0-b95a39d3d1b6\",\"type\":3,\"status\":2,\"node-id\":\"03a1c738-43b0-41a4-9e95-609403f8035c\",\"addr\":\"127.0.0.1:10241\",\"epoch\":1766,\"config\":\"ewogICAgImpvYi1uYW1lIjogInRlc3QtZmFrZS1qb2ItMiIsCiAgICAid29ya2VyLWNvdW50IjogMywKICAgICJ0YXJnZXQtdGljayI6IDYwMDAwLAogICAgImV0Y2Qtd2F0Y2gtZW5hYmxlIjogdHJ1ZSwKICAgICJldGNkLWVuZHBvaW50cyI6IFsiMTI3LjAuMC4xOjEyNDc5Il0sCiAgICAiZXRjZC13YXRjaC1wcmVmaXgiOiAiL2Zha2UtZGVtby90ZXN0LTIvIgp9Cg==\"}"] [cost=1] [resources="[]"] [master-id=dataflow-engine-job-manager]
[2022/05/05 18:21:51.719 +08:00] [INFO] [job_fsm.go:208] ["tombstone job master doesn't receive heartbeat in time, recreate it"] [job="{\"WorkerHandle\":null,\"seq-id\":0,\"created-at\":\"0001-01-01T00:00:00Z\",\"updated-at\":\"0001-01-01T00:00:00Z\",\"project-id\":\"\",\"id\":\"849be699-1bd8-4a01-a2f0-b95a39d3d1b6\",\"type\":3,\"status\":2,\"node-id\":\"03a1c738-43b0-41a4-9e95-609403f8035c\",\"addr\":\"127.0.0.1:10241\",\"epoch\":1766,\"config\":\"ewogICAgImpvYi1uYW1lIjogInRlc3QtZmFrZS1qb2ItMiIsCiAgICAid29ya2VyLWNvdW50IjogMywKICAgICJ0YXJnZXQtdGljayI6IDYwMDAwLAogICAgImV0Y2Qtd2F0Y2gtZW5hYmxlIjogdHJ1ZSwKICAgICJldGNkLWVuZHBvaW50cyI6IFsiMTI3LjAuMC4xOjEyNDc5Il0sCiAgICAiZXRjZC13YXRjaC1wcmVmaXgiOiAiL2Zha2UtZGVtby90ZXN0LTIvIgp9Cg==\"}"]
[2022/05/05 18:21:51.720 +08:00] [INFO] [client_manager.go:61] ["client manager adds executor"] [id=562a8719-8466-408a-b0c9-54d4320459ae] [addr=127.0.0.1:10241]
[2022/05/05 18:21:51.722 +08:00] [PANIC] [worker_manager.go:319] ["worker already exists"] [worker-id=849be699-1bd8-4a01-a2f0-b95a39d3d1b6] [stack="github.com/hanfei1991/microcosm/lib/master.(*WorkerManager).BeforeStartingWorker\n\t/home/apple/work/code/microcosm/lib/master/worker_manager.go:319\ngithub.com/hanfei1991/microcosm/lib.(*DefaultBaseMaster).CreateWorker.func1.2\n\t/home/apple/work/code/microcosm/lib/master.go:558\ngithub.com/hanfei1991/microcosm/client.(*TaskDispatcher).DispatchTask\n\t/home/apple/work/code/microcosm/client/task_dispatcher.go:77\ngithub.com/hanfei1991/microcosm/lib.(*DefaultBaseMaster).CreateWorker.func1\n\t/home/apple/work/code/microcosm/lib/master.go:557"]
panic: worker already exists

goroutine 1101 [running]:
go.uber.org/zap/zapcore.(*CheckedEntry).Write(0xc000778000, {0xc000dffcc0, 0x1, 0x1})
        /home/apple/.gvm/pkgsets/go1.18/global/pkg/mod/go.uber.org/zap@v1.21.0/zapcore/entry.go:232 +0x44c
go.uber.org/zap.(*Logger).Panic(0x1b07720?, {0x1e435a9?, 0xc001e1d710?}, {0xc000dffcc0, 0x1, 0x1})
        /home/apple/.gvm/pkgsets/go1.18/global/pkg/mod/go.uber.org/zap@v1.21.0/logger.go:230 +0x59
github.com/hanfei1991/microcosm/lib/master.(*WorkerManager).BeforeStartingWorker(0xc000e49d40, {0xc001e1d710, 0x24}, {0xc001db5ad0, 0x24})
        /home/apple/work/code/microcosm/lib/master/worker_manager.go:319 +0x1fa
github.com/hanfei1991/microcosm/lib.(*DefaultBaseMaster).CreateWorker.func1.2()
        /home/apple/work/code/microcosm/lib/master.go:558 +0x34
github.com/hanfei1991/microcosm/client.(*TaskDispatcher).DispatchTask(0xc001d94900?, {0x22ae3c0, 0xc00099b020}, 0xc000c2a3c0, 0xc00066d2c0, 0xc002109fc0)
        /home/apple/work/code/microcosm/client/task_dispatcher.go:77 +0x12d
github.com/hanfei1991/microcosm/lib.(*DefaultBaseMaster).CreateWorker.func1()
        /home/apple/work/code/microcosm/lib/master.go:557 +0x74c
created by github.com/hanfei1991/microcosm/lib.(*DefaultBaseMaster).CreateWorker
        /home/apple/work/code/microcosm/lib/master.go:518 +0x87e