neondatabase / neon

Neon: Serverless Postgres. We separated storage and compute to offer autoscaling, code-like database branching, and scale to zero.
https://neon.tech
Apache License 2.0
14.83k stars 432 forks source link

safekeeper: `WAL removal task failed: No such file or directory` #8972

Open problame opened 1 month ago

problame commented 1 month ago

Two ocurences during this week's release-6594

Likely a race condition / lifecycle issue, e.g., timeline deletion?.

Maybe same condition as https://github.com/neondatabase/neon/issues/8974

https://neonprod.grafana.net/explore?schemaVersion=1&panes=%7B%22rhc%22:%7B%22datasource%22:%22grafanacloud-logs%22,%22queries%22:%5B%7B%22refId%22:%22A%22,%22expr%22:%22%7Bneon_service%3D%5C%22safekeeper%5C%22%7D%20%7C%3D%20%60WAL%20removal%20task%20failed:%20No%20such%20file%20or%20directory%20%60%22,%22queryType%22:%22range%22,%22datasource%22:%7B%22type%22:%22loki%22,%22uid%22:%22grafanacloud-logs%22%7D,%22editorMode%22:%22code%22%7D%5D,%22range%22:%7B%22from%22:%221725895485871%22,%22to%22:%221725895786037%22%7D%7D%7D&orgId=1

2024-09-09T15:28:45.729163Z  WARN manager{ttid=$TTID}: WAL removal task failed: No such file or directory (os error 2)
Stack backtrace:
   0: <core::result::Result<T,F> as core::ops::try_trait::FromResidual<core::result::Result<core::convert::Infallible,E>>>::from_residual
             at /rustc/eeb90cda1969383f56a2637cbd3037bdf598841c/library/core/src/result.rs:1989:27
      safekeeper::wal_storage::remove_segments_from_disk::{{closure}}
             at /home/nonroot/safekeeper/src/wal_storage.rs:538:23
      <safekeeper::wal_storage::PhysicalStorage as safekeeper::wal_storage::Storage>::remove_up_to::{{closure}}
             at /home/nonroot/safekeeper/src/wal_storage.rs:510:90
   1: <core::pin::Pin<P> as core::future::future::Future>::poll
             at /rustc/eeb90cda1969383f56a2637cbd3037bdf598841c/library/core/src/future/future.rs:123:9
      safekeeper::timeline_manager::Manager::update_wal_removal::{{closure}}::{{closure}}
             at /home/nonroot/safekeeper/src/timeline_manager.rs:537:29
      <tracing::instrument::Instrumented<T> as core::future::future::Future>::poll
             at /home/nonroot/.cargo/registry/src/index.crates.io-6f17d22bba15001f/tracing-0.1.37/src/instrument.rs:272:9
   2: tokio::runtime::task::core::Core<T,S>::poll::{{closure}}
             at /home/nonroot/.cargo/registry/src/index.crates.io-6f17d22bba15001f/tokio-1.37.0/src/runtime/task/core.rs:328:17
      tokio::loom::std::unsafe_cell::UnsafeCell<T>::with_mut
             at /home/nonroot/.cargo/registry/src/index.crates.io-6f17d22bba15001f/tokio-1.37.0/src/loom/std/unsafe_cell.rs:16:9
      tokio::runtime::task::core::Core<T,S>::poll
             at /home/nonroot/.cargo/registry/src/index.crates.io-6f17d22bba15001f/tokio-1.37.0/src/runtime/task/core.rs:317:30
   3: tokio::runtime::task::harness::poll_future::{{closure}}
             at /home/nonroot/.cargo/registry/src/index.crates.io-6f17d22bba15001f/tokio-1.37.0/src/runtime/task/harness.rs:485:19
      <core::panic::unwind_safe::AssertUnwindSafe<F> as core::ops::function::FnOnce<()>>::call_once
             at /rustc/eeb90cda1969383f56a2637cbd3037bdf598841c/library/core/src/panic/unwind_safe.rs:272:9
      std::panicking::try::do_call
             at /rustc/eeb90cda1969383f56a2637cbd3037bdf598841c/library/std/src/panicking.rs:557:40
      std::panicking::try
             at /rustc/eeb90cda1969383f56a2637cbd3037bdf598841c/library/std/src/panicking.rs:521:19
      std::panic::catch_unwind
             at /rustc/eeb90cda1969383f56a2637cbd3037bdf598841c/library/std/src/panic.rs:350:14
      tokio::runtime::task::harness::poll_future
             at /home/nonroot/.cargo/registry/src/index.crates.io-6f17d22bba15001f/tokio-1.37.0/src/runtime/task/harness.rs:473:18
      tokio::runtime::task::harness::Harness<T,S>::poll_inner
             at /home/nonroot/.cargo/registry/src/index.crates.io-6f17d22bba15001f/tokio-1.37.0/src/runtime/task/harness.rs:208:27
      tokio::runtime::task::harness::Harness<T,S>::poll
             at /home/nonroot/.cargo/registry/src/index.crates.io-6f17d22bba15001f/tokio-1.37.0/src/runtime/task/harness.rs:153:15
   4: tokio::runtime::task::LocalNotified<S>::run
             at /home/nonroot/.cargo/registry/src/index.crates.io-6f17d22bba15001f/tokio-1.37.0/src/runtime/task/mod.rs:427:9
      tokio::runtime::scheduler::current_thread::CoreGuard::block_on::{{closure}}::{{closure}}
             at /home/nonroot/.cargo/registry/src/index.crates.io-6f17d22bba15001f/tokio-1.37.0/src/runtime/scheduler/current_thread/mod.rs:700:30
      tokio::runtime::coop::with_budget
             at /home/nonroot/.cargo/registry/src/index.crates.io-6f17d22bba15001f/tokio-1.37.0/src/runtime/coop.rs:107:5
      tokio::runtime::coop::budget
             at /home/nonroot/.cargo/registry/src/index.crates.io-6f17d22bba15001f/tokio-1.37.0/src/runtime/coop.rs:73:5
      tokio::runtime::scheduler::current_thread::Context::run_task::{{closure}}
             at /home/nonroot/.cargo/registry/src/index.crates.io-6f17d22bba15001f/tokio-1.37.0/src/runtime/scheduler/current_thread/mod.rs:343:43
      tokio::runtime::scheduler::current_thread::Context::enter
             at /home/nonroot/.cargo/registry/src/index.crates.io-6f17d22bba15001f/tokio-1.37.0/src/runtime/scheduler/current_thread/mod.rs:404:19
      tokio::runtime::scheduler::current_thread::Context::run_task
             at /home/nonroot/.cargo/registry/src/index.crates.io-6f17d22bba15001f/tokio-1.37.0/src/runtime/scheduler/current_thread/mod.rs:343:23
      tokio::runtime::scheduler::current_thread::CoreGuard::block_on::{{closure}}
             at /home/nonroot/.cargo/registry/src/index.crates.io-6f17d22bba15001f/tokio-1.37.0/src/runtime/scheduler/current_thread/mod.rs:699:43
      tokio::runtime::scheduler::current_thread::CoreGuard::enter::{{closure}}
             at /home/nonroot/.cargo/registry/src/index.crates.io-6f17d22bba15001f/tokio-1.37.0/src/runtime/scheduler/current_thread/mod.rs:737:68
      tokio::runtime::context::scoped::Scoped<T>::set
             at /home/nonroot/.cargo/registry/src/index.crates.io-6f17d22bba15001f/tokio-1.37.0/src/runtime/context/scoped.rs:40:9
      tokio::runtime::context::set_scheduler::{{closure}}
             at /home/nonroot/.cargo/registry/src/index.crates.io-6f17d22bba15001f/tokio-1.37.0/src/runtime/context.rs:176:26
      std::thread::local::LocalKey<T>::try_with
             at /rustc/eeb90cda1969383f56a2637cbd3037bdf598841c/library/std/src/thread/local.rs:283:12
      std::thread::local::LocalKey<T>::with
             at /rustc/eeb90cda1969383f56a2637cbd3037bdf598841c/library/std/src/thread/local.rs:260:9
      tokio::runtime::context::set_scheduler
             at /home/nonroot/.cargo/registry/src/index.crates.io-6f17d22bba15001f/tokio-1.37.0/src/runtime/context.rs:176:17
      tokio::runtime::scheduler::current_thread::CoreGuard::enter
             at /home/nonroot/.cargo/registry/src/index.crates.io-6f17d22bba15001f/tokio-1.37.0/src/runtime/scheduler/current_thread/mod.rs:737:27
      tokio::runtime::scheduler::current_thread::CoreGuard::block_on
             at /home/nonroot/.cargo/registry/src/index.crates.io-6f17d22bba15001f/tokio-1.37.0/src/runtime/scheduler/current_thread/mod.rs:646:19
   5: tokio::runtime::scheduler::current_thread::CurrentThread::block_on::{{closure}}
             at /home/nonroot/.cargo/registry/src/index.crates.io-6f17d22bba15001f/tokio-1.37.0/src/runtime/scheduler/current_thread/mod.rs:175:28
      tokio::runtime::context::runtime::enter_runtime
             at /home/nonroot/.cargo/registry/src/index.crates.io-6f17d22bba15001f/tokio-1.37.0/src/runtime/context/runtime.rs:65:16
   6: tokio::runtime::scheduler::current_thread::CurrentThread::block_on
             at /home/nonroot/.cargo/registry/src/index.crates.io-6f17d22bba15001f/tokio-1.37.0/src/runtime/scheduler/current_thread/mod.rs:167:9
      tokio::runtime::runtime::Runtime::block_on
             at /home/nonroot/.cargo/registry/src/index.crates.io-6f17d22bba15001f/tokio-1.37.0/src/runtime/runtime.rs:349:47
   7: safekeeper::main
             at /home/nonroot/safekeeper/src/bin/safekeeper.rs:368:5
   8: core::ops::function::FnOnce::call_once
             at /rustc/eeb90cda1969383f56a2637cbd3037bdf598841c/library/core/src/ops/function.rs:250:5
      std::sys::backtrace::__rust_begin_short_backtrace
             at /rustc/eeb90cda1969383f56a2637cbd3037bdf598841c/library/std/src/sys/backtrace.rs:152:18
   9: std::rt::lang_start::{{closure}}
             at /rustc/eeb90cda1969383f56a2637cbd3037bdf598841c/library/std/src/rt.rs:162:18
  10: core::ops::function::impls::<impl core::ops::function::FnOnce<A> for &F>::call_once
             at /rustc/eeb90cda1969383f56a2637cbd3037bdf598841c/library/core/src/ops/function.rs:284:13
      std::panicking::try::do_call
             at /rustc/eeb90cda1969383f56a2637cbd3037bdf598841c/library/std/src/panicking.rs:557:40
      std::panicking::try
             at /rustc/eeb90cda1969383f56a2637cbd3037bdf598841c/library/std/src/panicking.rs:521:19
      std::panic::catch_unwind
             at /rustc/eeb90cda1969383f56a2637cbd3037bdf598841c/library/std/src/panic.rs:350:14
      std::rt::lang_start_internal::{{closure}}
             at /rustc/eeb90cda1969383f56a2637cbd3037bdf598841c/library/std/src/rt.rs:141:48
      std::panicking::try::do_call
             at /rustc/eeb90cda1969383f56a2637cbd3037bdf598841c/library/std/src/panicking.rs:557:40
      std::panicking::try
             at /rustc/eeb90cda1969383f56a2637cbd3037bdf598841c/library/std/src/panicking.rs:521:19
      std::panic::catch_unwind
             at /rustc/eeb90cda1969383f56a2637cbd3037bdf598841c/library/std/src/panic.rs:350:14
      std::rt::lang_start_internal
             at /rustc/eeb90cda1969383f56a2637cbd3037bdf598841c/library/std/src/rt.rs:141:20
  11: main
  12: <unknown>
  13: __libc_start_main
  14: _start
arssher commented 1 month ago

Yes, very likely a race with timeline deletion, it should be made graceful (waiting for bgw shutdown).