risingwavelabs / risingwave

Best-in-class stream processing, analytics, and management. Perform continuous analytics, or build event-driven applications, real-time ETL pipelines, and feature stores in minutes. Unified streaming and batch. PostgreSQL compatible.
https://go.risingwave.com/slack
Apache License 2.0
6.88k stars 569 forks source link

Random CI failure due to MinIO (probably disk full) #17029

Closed xxchan closed 1 month ago

xxchan commented 4 months ago

Describe the bug

https://buildkite.com/risingwavelabs/pull-request/builds/50773#018fc8a5-5591-4d63-9287-ce36861e709b

Error message/log

2024-05-30T08:49:34.595358425Z ERROR risingwave_stream::task::stream_manager: actor exit with error actor_id=140 error=Executor error: Storage error: Hummock error: Foyer error: ObjectStore failed with IO error: s3 error: streaming error

Backtrace:
   0: capture
             at ./.cargo/registry/src/index.crates.io-6f17d22bba15001f/thiserror-ext-0.1.2/src/backtrace.rs:30:18
   1: thiserror_ext::ptr::ErrorBox<T,B>::new
             at ./.cargo/registry/src/index.crates.io-6f17d22bba15001f/thiserror-ext-0.1.2/src/ptr.rs:40:33
   2: <risingwave_storage::hummock::error::HummockError as core::convert::From<E>>::from
             at ./src/storage/src/hummock/error.rs:21:45
   3: <T as core::convert::Into<U>>::into
             at /rustc/4a0cc881dcc4d800f10672747f61a94377ff6662/library/core/src/convert/mod.rs:759:9
   4: risingwave_storage::hummock::error::HummockError::foyer_error
             at ./src/storage/src/hummock/error.rs:152:9
   5: core::ops::function::FnOnce::call_once
             at /rustc/4a0cc881dcc4d800f10672747f61a94377ff6662/library/core/src/ops/function.rs:250:5
   6: core::result::Result<T,E>::map_err
             at /rustc/4a0cc881dcc4d800f10672747f61a94377ff6662/library/core/src/result.rs:829:27
   7: {async_fn#0}
             at ./src/storage/src/hummock/block_cache.rs:103:33
   8: {async_fn#0}
             at ./src/storage/src/hummock/sstable_store.rs:488:57
   9: {async_fn#0}
             at ./src/storage/src/hummock/sstable/forward_sstable_iterator.rs:221:18
  10: {async_fn#0}
             at ./src/storage/src/hummock/sstable/forward_sstable_iterator.rs:284:45
  11: {async_fn#0}
             at ./src/storage/src/hummock/mod.rs:112:25
  12: {async_fn#0}
             at ./src/storage/src/hummock/store/version.rs:629:22
  13: {async_fn#0}
             at ./src/storage/src/hummock/store/local_hummock_storage.rs:133:14
  14: {async_fn#0}<risingwave_storage::hummock::store::local_hummock_storage::LocalHummockStorage>
             at ./src/storage/src/hummock/utils.rs:424:68
  15: {async_fn#0}
             at ./src/storage/src/hummock/store/local_hummock_storage.rs:422:26
  16: {async_fn#0}<risingwave_storage::hummock::store::local_hummock_storage::LocalHummockStorage, risingwave_storage::mem_table::MemtableLocalStateStore<risingwave_storage::memory::RangeKvStateStore<risingwave_storage::memory::sled::SledRangeKv>>>
             at ./src/storage/src/store_impl.rs:501:33
  17: {async_block#0}<risingwave_storage::store_impl::verify::VerifyStateStore<risingwave_storage::hummock::store::local_hummock_storage::LocalHummockStorage, risingwave_storage::mem_table::MemtableLocalStateStore<risingwave_storage::memory::RangeKvStateStore<risingwave_storage::memory::sled::SledRangeKv>>, ()>>
             at ./src/storage/src/store_impl.rs:1027:26
  18: poll<alloc::boxed::Box<(dyn core::future::future::Future<Output=core::result::Result<usize, risingwave_storage::error::StorageError>> + core::marker::Send), alloc::alloc::Global>>
             at /rustc/4a0cc881dcc4d800f10672747f61a94377ff6662/library/core/src/future/future.rs:123:9
  19: poll<core::pin::Pin<alloc::boxed::Box<(dyn core::future::future::Future<Output=core::result::Result<usize, risingwave_storage::error::StorageError>> + core::marker::Send), alloc::alloc::Global>>, true>
             at ./.cargo/registry/src/index.crates.io-6f17d22bba15001f/await-tree-0.2.1/src/future.rs:113:39
  20: {async_fn#0}<risingwave_storage::monitor::monitored_store::MonitoredStateStore<alloc::boxed::Box<dyn risingwave_storage::store_impl::boxed_state_store::DynamicDispatchedStateStore, alloc::alloc::Global>>, risingwave_common::util::value_encoding::BasicSerde, false, risingwave_stream::common::table::watermark::WatermarkBufferByEpoch<300>, false>
             at ./src/stream/src/common/table/state_table.rs:1342:34
  21: poll<risingwave_stream::common::table::state_table::{impl#5}::seal_current_epoch::{async_fn_env#0}<risingwave_storage::monitor::monitored_store::MonitoredStateStore<alloc::boxed::Box<dyn risingwave_storage::store_impl::boxed_state_store::DynamicDispatchedStateStore, alloc::alloc::Global>>, risingwave_common::util::value_encoding::BasicSerde, false, risingwave_stream::common::table::watermark::WatermarkBufferByEpoch<300>, false>>
             at ./.cargo/registry/src/index.crates.io-6f17d22bba15001f/tracing-0.1.40/src/instrument.rs:321:9
  22: {async_fn#0}<risingwave_storage::monitor::monitored_store::MonitoredStateStore<alloc::boxed::Box<dyn risingwave_storage::store_impl::boxed_state_store::DynamicDispatchedStateStore, alloc::alloc::Global>>, risingwave_common::util::value_encoding::BasicSerde, false, risingwave_stream::common::table::watermark::WatermarkBufferByEpoch<300>, false>
             at ./src/stream/src/common/table/state_table.rs:1197:18
  23: {async_fn#0}<risingwave_storage::monitor::monitored_store::MonitoredStateStore<alloc::boxed::Box<dyn risingwave_storage::store_impl::boxed_state_store::DynamicDispatchedStateStore, alloc::alloc::Global>>, risingwave_common::util::value_encoding::BasicSerde, false, risingwave_stream::common::table::watermark::WatermarkBufferByEpoch<300>, false>
             at ./src/stream/src/common/table/state_table.rs:1141:44
  24: {async_fn#0}<risingwave_common::hash::key_v2::HashKeyImpl<risingwave_common::hash::key_v2::StackStorage<8>, risingwave_common::hash::key::StackNullBitmap>, risingwave_storage::monitor::monitored_store::MonitoredStateStore<alloc::boxed::Box<dyn risingwave_storage::store_impl::boxed_state_store::DynamicDispatchedStateStore, alloc::alloc::Global>>>
             at ./src/stream/src/executor/join/hash_join.rs:500:40
  25: {async_fn#0}<risingwave_common::hash::key_v2::HashKeyImpl<risingwave_common::hash::key_v2::StackStorage<8>, risingwave_common::hash::key::StackNullBitmap>, risingwave_storage::monitor::monitored_store::MonitoredStateStore<alloc::boxed::Box<dyn risingwave_storage::store_impl::boxed_state_store::DynamicDispatchedStateStore, alloc::alloc::Global>>, 1>
             at ./src/stream/src/executor/hash_join.rs:619:37
  26: {coroutine#0}<risingwave_common::hash::key_v2::HashKeyImpl<risingwave_common::hash::key_v2::StackStorage<8>, risingwave_common::hash::key::StackNullBitmap>, risingwave_storage::monitor::monitored_store::MonitoredStateStore<alloc::boxed::Box<dyn risingwave_storage::store_impl::boxed_state_store::DynamicDispatchedStateStore, alloc::alloc::Global>>, 1>
             at ./src/stream/src/executor/hash_join.rs:585:52

To Reproduce

No response

Expected behavior

No response

How did you deploy RisingWave?

No response

The version of RisingWave

No response

Additional context

No response

xxchan commented 4 months ago

@MrCroxx any ideas?

xxchan commented 3 months ago

https://buildkite.com/risingwavelabs/pull-request/builds/53308#01907c6a-a4b0-4070-8054-3032ee352230

2024-07-04T06:40:28.471937055Z  WARN risingwave_connector::source::data_gen_util: failed to send next event to reader, exit
2024-07-04T06:40:28.596721091Z  WARN risingwave_connector::source::data_gen_util: failed to send next event to reader, exit
2024-07-04T06:40:32.619198746Z  WARN risingwave_storage::hummock::event_handler::hummock_event_handler: cannot acquire lock for all read version pending_count=1 total_count=24
2024-07-04T06:40:36.182351911Z  WARN risingwave_storage::hummock::event_handler::hummock_event_handler: cannot acquire lock for all read version pending_count=1 total_count=36
2024-07-04T06:40:36.767997247Z  WARN risingwave_storage::hummock::event_handler::hummock_event_handler: cannot acquire lock for all read version pending_count=1 total_count=36
2024-07-04T06:40:39.579552348Z  WARN risingwave_storage::hummock::event_handler::hummock_event_handler: cannot acquire lock for all read version pending_count=1 total_count=24
2024-07-04T06:40:40.924250286Z  WARN risingwave_storage::hummock::event_handler::hummock_event_handler: cannot acquire lock for all read version pending_count=1 total_count=26
2024-07-04T06:40:42.111405494Z  WARN risingwave_storage::hummock::event_handler::hummock_event_handler: cannot acquire lock for all read version pending_count=1 total_count=36
2024-07-04T06:40:42.268705798Z  WARN risingwave_storage::hummock::event_handler::hummock_event_handler: cannot acquire lock for all read version pending_count=1 total_count=24
2024-07-04T06:40:49.37329002Z  WARN risingwave_storage::hummock::event_handler::hummock_event_handler: cannot acquire lock for all read version pending_count=1 total_count=36
2024-07-04T06:40:51.270639726Z  WARN risingwave_storage::hummock::event_handler::hummock_event_handler: cannot acquire lock for all read version pending_count=1 total_count=36
2024-07-04T06:40:51.29197257Z  WARN risingwave_connector::source::data_gen_util: failed to send next event to reader, exit
2024-07-04T06:40:51.293766232Z  WARN risingwave_connector::source::data_gen_util: failed to send next event to reader, exit
2024-07-04T06:41:02.31908782Z  WARN risingwave_storage::hummock::event_handler::hummock_event_handler: cannot acquire lock for all read version pending_count=1 total_count=32
2024-07-04T06:41:18.789727389Z  WARN risingwave_connector::source::data_gen_util: failed to send next event to reader, exit
2024-07-04T06:41:18.85689259Z  WARN risingwave_connector::source::data_gen_util: failed to send next event to reader, exit
2024-07-04T06:41:19.071700416Z  WARN risingwave_storage::hummock::event_handler::hummock_event_handler: cannot acquire lock for all read version pending_count=1 total_count=16
2024-07-04T06:41:31.193326127Z  WARN risingwave_storage::hummock::event_handler::hummock_event_handler: cannot acquire lock for all read version pending_count=1 total_count=22
2024-07-04T06:41:39.820986635Z ERROR risingwave_object_store::object: read failed error=s3 error: streaming error: error reading a body from connection: end of file before message length reached
2024-07-04T06:41:39.821066353Z ERROR risingwave_storage::hummock::sstable_store: get_block_response meet error when read 6126329..6191796 from sst-270, total length: 13012117
2024-07-04T06:41:39.831488488Z  WARN risingwave_connector::source::data_gen_util: failed to send next event to reader, exit
2024-07-04T06:41:39.833972262Z ERROR risingwave_stream::task::stream_manager: actor exit with error actor_id=131 error=Executor error: exchange channel to downstream actor 127 closed unexpectedly
2024-07-04T06:41:39.868356033Z  WARN risingwave_connector::source::data_gen_util: failed to send next event to reader, exit
2024-07-04T06:41:39.868611179Z ERROR risingwave_stream::task::stream_manager: actor exit with error actor_id=129 error=Executor error: exchange channel to downstream actor 127 closed unexpectedly
2024-07-04T06:41:42.835122941Z  WARN risingwave_stream::task::barrier_manager: control stream reset with error error=gRPC request failed: Internal error: failed to collect barrier for epoch [6739728739991552, 6739728756375552]: Actor 129 exited unexpectedly: Executor error: exchange channel to downstream actor 127 closed unexpectedly
2024-07-04T06:41:39.863750569Z ERROR risingwave_stream::task::stream_manager: actor exit with error actor_id=128 error=Executor error: Storage error: Hummock error: Foyer error: channel closed

Backtrace:
   0: capture
             at ./.cargo/registry/src/index.crates.io-6f17d22bba15001f/thiserror-ext-0.1.2/src/backtrace.rs:30:18
   1: thiserror_ext::ptr::ErrorBox<T,B>::new
             at ./.cargo/registry/src/index.crates.io-6f17d22bba15001f/thiserror-ext-0.1.2/src/ptr.rs:40:33
   2: <risingwave_storage::hummock::error::HummockError as core::convert::From<E>>::from
             at ./src/storage/src/hummock/error.rs:22:45
   3: <T as core::convert::Into<U>>::into
             at /rustc/4a0cc881dcc4d800f10672747f61a94377ff6662/library/core/src/convert/mod.rs:759:9
   4: risingwave_storage::hummock::error::HummockError::foyer_error
             at ./src/storage/src/hummock/error.rs:162:9
   5: core::ops::function::FnOnce::call_once
             at /rustc/4a0cc881dcc4d800f10672747f61a94377ff6662/library/core/src/ops/function.rs:250:5
   6: core::result::Result<T,E>::map_err
             at /rustc/4a0cc881dcc4d800f10672747f61a94377ff6662/library/core/src/result.rs:829:27
   7: {async_fn#0}
             at ./src/storage/src/hummock/block_cache.rs:98:33
   8: {async_fn#0}
             at ./src/storage/src/hummock/sstable_store.rs:515:57
   9: {async_fn#0}
             at ./src/storage/src/hummock/sstable/forward_sstable_iterator.rs:221:18
  10: {async_fn#0}
             at ./src/storage/src/hummock/sstable/forward_sstable_iterator.rs:284:45
  11: {async_fn#0}
             at ./src/storage/src/hummock/mod.rs:112:25

hzxa21 commented 3 months ago

minio issue: https://buildkiteartifacts.com/28c1f7f6-3369-436d-b28d-1d0cb766d6c1/9eed51d0-eaaf-4ea3-9[…]2c3d0ad6e42aecc60b4c3c97976cba81a1143ca467cd8edbaa9ded66e

API: GetObject(bucket=hummock001, object=hummock_001/47/270.data)
Time: 06:41:39 UTC 07/04/2024
DeploymentID: 1b2b145c-896e-4004-88f3-f9a90a473644
RequestID: 17DEEF5F1EFAC676
RemoteHost: 127.0.0.1
Host: 127.0.0.1:9301
UserAgent: aws-sdk-rust/1.3.0 os/linux lang/rust/1.78.0-nightly
Error: Unable to write all the data to client: Storage resources are insufficient for the read operation hummock001/hummock_001/47/270.data (*fmt.wrapError)
       4: internal/logger/logger.go:258:logger.LogIf()
       3: cmd/object-handlers.go:548:cmd.objectAPIHandlers.getObjectHandler()
       2: cmd/object-handlers.go:605:cmd.objectAPIHandlers.GetObjectHandler()
       1: net/http/server.go:2122:http.HandlerFunc.ServeHTTP()

The disk is probably full.

xxchan commented 2 months ago

Encountered another error: https://buildkite.com/risingwavelabs/pull-request/builds/54649#0190fdb3-bb6e-4243-85c5-5d6998c9c0d2

minio:

API: PutBucketLifecycle(bucket=hummock001)
Time: 09:10:44 UTC 07/29/2024
DeploymentID: bfd605f5-4c3a-43dc-a647-a0a5f49b5f73
RequestID: 17E6A403E46F1B2B
RemoteHost: 127.0.0.1
Host: 127.0.0.1:9301
UserAgent: aws-sdk-rust/1.3.0 os/linux lang/rust/1.78.0-nightly
Error: The XML you provided was not well-formed or did not validate against our published schema (lifecycle.Error)
       5: internal/logger/logger.go:258:logger.LogIf()
       4: cmd/api-errors.go:2282:cmd.toAPIErrorCode()
       3: cmd/api-errors.go:2307:cmd.toAPIError()
       2: cmd/bucket-lifecycle-handlers.go:78:cmd.objectAPIHandlers.PutBucketLifecycleHandler()
       1: net/http/server.go:2122:http.HandlerFunc.ServeHTTP()

compute node:

2024-07-29T09:16:21.756735595Z ERROR risingwave_stream::task::stream_manager: actor exit with error actor_id=154 error=Executor error: Storage error: Hummock error: Foyer error: ObjectStore failed with IO error: Timeout error: Retry attempts exhausted for read. Please modify read_attempt_timeout_ms (current=16000) and read_retry_attempts (current=6) under [storage.object_store.retry] in the config accordingly if needed.

Backtrace:
   0: capture
             at ./.cargo/registry/src/index.crates.io-6f17d22bba15001f/thiserror-ext-0.1.2/src/backtrace.rs:30:18
   1: thiserror_ext::ptr::ErrorBox<T,B>::new
             at ./.cargo/registry/src/index.crates.io-6f17d22bba15001f/thiserror-ext-0.1.2/src/ptr.rs:40:33
   2: <risingwave_storage::hummock::error::HummockError as core::convert::From<E>>::from
             at ./src/storage/src/hummock/error.rs:22:45
   3: <T as core::convert::Into<U>>::into
             at /rustc/4a0cc881dcc4d800f10672747f61a94377ff6662/library/core/src/convert/mod.rs:759:9
   4: risingwave_storage::hummock::error::HummockError::foyer_error
             at ./src/storage/src/hummock/error.rs:162:9
   5: core::ops::function::FnOnce::call_once
             at /rustc/4a0cc881dcc4d800f10672747f61a94377ff6662/library/core/src/ops/function.rs:250:5
   6: core::result::Result<T,E>::map_err
             at /rustc/4a0cc881dcc4d800f10672747f61a94377ff6662/library/core/src/result.rs:829:27
   7: {async_fn#0}
             at ./src/storage/src/hummock/block_cache.rs:103:33
   8: {async_fn#0}
             at ./src/storage/src/hummock/sstable_store.rs:515:57
   9: {async_fn#0}
             at ./src/storage/src/hummock/sstable/forward_sstable_iterator.rs:221:18
  10: {async_fn#0}
             at ./src/storage/src/hummock/sstable/forward_sstable_iterator.rs:284:45

MrCroxx commented 1 month ago

No further occurrences have been observed. Closing for now; feel free to reopen if the error occurs again.