ArroyoSystems / arroyo

Distributed stream processing engine in Rust
https://arroyo.dev
Apache License 2.0
3.81k stars 220 forks source link

Arroyo Deployment Issue in Kubernetes #728

Open tjqc0512 opened 2 months ago

tjqc0512 commented 2 months ago

I deployed Arroyo in Kubernetes using Helm. After creating a pipeline, I manually deleted the worker pod, but the worker pod did not restart successfully. Arroyo version: 0.11.3; Kubernetes version: 1.23.4. The arroyo-controller logs reported the following error:

{"timestamp":"2024-08-30T02:15:47.893698Z","level":"INFO","fields":{"message":"state transition","job_id":"job_IEzecx5ee6","from":"Running","to":"Recovering","duration_ms":"301261"},"target":"arroyo_controller::states"}
{"timestamp":"2024-08-30T02:15:49.627405Z","level":"INFO","fields":{"message":"stopping job","job_id":"job_IEzecx5ee6"},"target":"arroyo_controller::states::recovering"}
{"timestamp":"2024-08-30T02:17:19.629363Z","level":"WARN","fields":{"message":"failed to stop job","error":"status: Cancelled, message: \"Timeout expired\", details: [], metadata: MetadataMap { headers: {} }\n\nCaused by:\n 0: transport error\n 1: Timeout expired","job_id":"job_IEzecx5ee6"},"target":"arroyo_controller::states::recovering"}
{"timestamp":"2024-08-30T02:17:19.679602Z","level":"INFO","fields":{"message":"state transition","job_id":"job_IEzecx5ee6","from":"Recovering","to":"Compiling","duration_ms":"91785"},"target":"arroyo_controller::states"}
{"timestamp":"2024-08-30T02:17:19.690315Z","level":"INFO","fields":{"message":"state transition","job_id":"job_IEzecx5ee6","from":"Compiling","to":"Scheduling","duration_ms":"10"},"target":"arroyo_controller::states"}
{"timestamp":"2024-08-30T02:17:19.710782Z","level":"INFO","fields":{"job_id":"job_IEzecx5ee6","message":"starting workers on k8s","replicas":1,"task_slots":1},"target":"arroyo_controller::schedulers::kubernetes"}
{"timestamp":"2024-08-30T02:17:19.711137Z","level":"INFO","fields":{"job_id":"job_IEzecx5ee6","message":"starting workers on k8s","replicas":1,"task_slots":1},"target":"arroyo_controller::schedulers::kubernetes"}
{"timestamp":"2024-08-30T02:17:19.711144Z","level":"INFO","fields":{"job_id":"job_IEzecx5ee6","message":"starting worker","pod":"my-arroyo-worker-job-iezecx5ee6-2-0"},"target":"arroyo_controller::schedulers::kubernetes"}
{"timestamp":"2024-08-30T02:17:34.109304Z","level":"INFO","fields":{"message":"Worker registered: RegisterWorkerReq { worker_id: 2793387234007737385, node_id: 0, job_id: \"job_IEzecx5ee6\", rpc_address: \"http://172.16.136.62:6900\", data_address: \"172.16.136.62:38869\", resources: Some(WorkerResources { slots: 8 }), slots: 1 } -- Some(172.16.136.62:44582)"},"target":"arroyo_controller"}
{"timestamp":"2024-08-30T02:17:34.109414Z","level":"INFO","fields":{"message":"connecting to worker","job_id":"job_IEzecx5ee6","worker_id":2793387234007737385,"rpc_address":"http://172.16.136.62:6900"},"target":"arroyo_controller::states::scheduling"}
{"timestamp":"2024-08-30T02:17:34.110356Z","level":"INFO","fields":{"message":"restoring checkpoint","job_id":"job_IEzecx5ee6","epoch":18,"min_epoch":12},"target":"arroyo_controller::states::scheduling"}
{"timestamp":"2024-08-30T02:17:34.111128Z","level":"INFO","fields":{"message":"starting execution on worker","job_id":"job_IEzecx5ee6","worker_id":2793387234007737385},"target":"arroyo_controller::states::scheduling"}
{"timestamp":"2024-08-30T02:17:34.115330Z","level":"ERROR","fields":{"message":"failed to start execution on worker","job_id":"job_IEzecx5ee6","worker_id":2793387234007737385,"attempt":0,"error":"Status { code: Cancelled, message: \"h2 protocol error: http2 error: stream error received: stream no longer needed\", source: Some(tonic::transport::Error(Transport, hyper::Error(Http2, Error { kind: Reset(StreamId(1), CANCEL, Remote) }))) }"},"target":"arroyo_controller::states::scheduling"}
{"timestamp":"2024-08-30T02:17:34.217581Z","level":"ERROR","fields":{"message":"failed to start execution on worker","job_id":"job_IEzecx5ee6","worker_id":2793387234007737385,"attempt":1,"error":"Status { code: Cancelled, message: \"h2 protocol error: http2 error: stream error received: stream no longer needed\", source: Some(tonic::transport::Error(Transport, hyper::Error(Http2, Error { kind: Reset(StreamId(3), CANCEL, Remote) }))) }"},"target":"arroyo_controller::states::scheduling"}
{"timestamp":"2024-08-30T02:17:34.319368Z","level":"ERROR","fields":{"message":"failed to start execution on worker","job_id":"job_IEzecx5ee6","worker_id":2793387234007737385,"attempt":2,"error":"Status { code: Cancelled, message: \"h2 protocol error: http2 error: stream error received: stream no longer needed\", source: Some(tonic::transport::Error(Transport, hyper::Error(Http2, Error { kind: Reset(StreamId(5), CANCEL, Remote) }))) }"},"target":"arroyo_controller::states::scheduling"}
{"timestamp":"2024-08-30T02:17:34.421507Z","level":"ERROR","fields":{"message":"failed to start execution on worker","job_id":"job_IEzecx5ee6","worker_id":2793387234007737385,"attempt":3,"error":"Status { code: Cancelled, message: \"h2 protocol error: http2 error: stream error received: stream no longer needed\", source: Some(tonic::transport::Error(Transport, hyper::Error(Http2, Error { kind: Reset(StreamId(7), CANCEL, Remote) }))) }"},"target":"arroyo_controller::states::scheduling"}
{"timestamp":"2024-08-30T02:17:34.523622Z","level":"ERROR","fields":{"message":"failed to start execution on worker","job_id":"job_IEzecx5ee6","worker_id":2793387234007737385,"attempt":4,"error":"Status { code: Cancelled, message: \"h2 protocol error: http2 error: stream error received: stream no longer needed\", source: Some(tonic::transport::Error(Transport, hyper::Error(Http2, Error { kind: Reset(StreamId(9), CANCEL, Remote) }))) }"},"target":"arroyo_controller::states::scheduling"}
{"timestamp":"2024-08-30T02:17:34.625573Z","level":"ERROR","fields":{"message":"failed to start execution on worker","job_id":"job_IEzecx5ee6","worker_id":2793387234007737385,"attempt":5,"error":"Status { code: Cancelled, message: \"h2 protocol error: http2 error: stream error received: stream no longer needed\", source: Some(tonic::transport::Error(Transport, hyper::Error(Http2, Error { kind: Reset(StreamId(11), CANCEL, Remote) }))) }"},"target":"arroyo_controller::states::scheduling"}
{"timestamp":"2024-08-30T02:17:34.727750Z","level":"ERROR","fields":{"message":"failed to start execution on worker","job_id":"job_IEzecx5ee6","worker_id":2793387234007737385,"attempt":6,"error":"Status { code: Cancelled, message: \"h2 protocol error: http2 error: stream error received: stream no longer needed\", source: Some(tonic::transport::Error(Transport, hyper::Error(Http2, Error { kind: Reset(StreamId(13), CANCEL, Remote) }))) }"},"target":"arroyo_controller::states::scheduling"}
{"timestamp":"2024-08-30T02:17:34.829990Z","level":"ERROR","fields":{"message":"failed to start execution on worker","job_id":"job_IEzecx5ee6","worker_id":2793387234007737385,"attempt":7,"error":"Status { code: Cancelled, message: \"h2 protocol error: http2 error: stream error received: stream no longer needed\", source: Some(tonic::transport::Error(Transport, hyper::Error(Http2, Error { kind: Reset(StreamId(15), CANCEL, Remote) }))) }"},"target":"arroyo_controller::states::scheduling"}
{"timestamp":"2024-08-30T02:17:34.932011Z","level":"ERROR","fields":{"message":"failed to start execution on worker","job_id":"job_IEzecx5ee6","worker_id":2793387234007737385,"attempt":8,"error":"Status { code: Cancelled, message: \"h2 protocol error: http2 error: stream error received: stream no longer needed\", source: Some(tonic::transport::Error(Transport, hyper::Error(Http2, Error { kind: Reset(StreamId(17), CANCEL, Remote) }))) }"},"target":"arroyo_controller::states::scheduling"}
{"timestamp":"2024-08-30T02:17:35.034056Z","level":"ERROR","fields":{"message":"failed to start execution on worker","job_id":"job_IEzecx5ee6","worker_id":2793387234007737385,"attempt":9,"error":"Status { code: Cancelled, message: \"h2 protocol error: http2 error: stream error received: stream no longer needed\", source: Some(tonic::transport::Error(Transport, hyper::Error(Http2, Error { kind: Reset(StreamId(19), CANCEL, Remote) }))) }"},"target":"arroyo_controller::states::scheduling"}
{"timestamp":"2024-08-30T02:17:35.135364Z","level":"ERROR","fields":{"message":"panicked at crates/arroyo-controller/src/states/scheduling.rs:488:21:\nFailed to start execution on workers WorkerId(2793387234007737385)","panic.file":"crates/arroyo-controller/src/states/scheduling.rs","panic.line":488,"panic.column":21},"target":"arroyo_server_common"}
{"timestamp":"2024-08-30T02:17:35.135414Z","level":"ERROR","fields":{"message":"fatal state error","job_id":"job_IEzecx5ee6","state":"Scheduling","error_message":"Failed to start cluster for pipeline","error":"task 13920 panicked"},"target":"arroyo_controller::states"}