MaterializeInc / materialize

The data warehouse for operational workloads.
https://materialize.com
Other
5.66k stars 457 forks source link

ASan: Memory leaks in mz-storage::healthcheck::tests::health_operator_runner #26060

Open def- opened 3 months ago

def- commented 3 months ago

What version of Materialize are you using?

25352d110c2e (Pull Request #24888)

What is the issue?

==400==ERROR: LeakSanitizer: detected memory leaks

Direct leak of 64 byte(s) in 2 object(s) allocated from:
    #0 0xaaaaece6934c in malloc.part.9 /checkout/src/llvm-project/compiler-rt/lib/asan/asan_malloc_linux.cpp:69:3
    #1 0xaaaafb031b90 in <std::alloc::System as core::alloc::global::GlobalAlloc>::alloc /usr/local/lib/rustlib/src/rust/library/std/src/sys/unix/alloc.rs:14:13
    #2 0xaaaafaf256dc in __rdl_alloc /usr/local/lib/rustlib/src/rust/library/std/src/alloc.rs:394:13
    #3 0xaaaaedb964bc in alloc::alloc::alloc /usr/local/lib/rustlib/src/rust/library/alloc/src/alloc.rs:98:9
    #4 0xaaaaedb50a04 in <alloc::alloc::Global>::alloc_impl /usr/local/lib/rustlib/src/rust/library/alloc/src/alloc.rs:181:73
    #5 0xaaaaedba90d8 in <alloc::alloc::Global as core::alloc::Allocator>::allocate /usr/local/lib/rustlib/src/rust/library/alloc/src/alloc.rs:241:9
    #6 0xaaaaedb96130 in alloc::alloc::exchange_malloc /usr/local/lib/rustlib/src/rust/library/alloc/src/alloc.rs:330:11
    #7 0xaaaaeda0860c in <alloc::boxed::Box<mz_timely_util::builder_async::PressOnDropButton>>::new /usr/local/lib/rustlib/src/rust/library/alloc/src/boxed.rs:217:9
    #8 0xaaaaeda0860c in mz_storage::healthcheck::tests::health_operator_runner::{closure#1}::{closure#0}::{closure#0} /var/lib/buildkite-agent/builds/buildkite-builders-aarch64-585fc7f-i-044fe10e84d3606ae-1/materialize/tests/src/storage/src/healthcheck.rs:1105:39
    #9 0xaaaaee789d34 in <timely::dataflow::scopes::child::Child<timely::worker::Worker<timely_communication::allocator::generic::Generic>, ()> as timely::dataflow::scopes::Scope>::scoped::<mz_repr::timestamp::Timestamp, (), mz_storage::healthcheck::tests::health_operator_runner::{closure#1}::{closure#0}::{closure#0}> /cargo/git/checkouts/timely-dataflow-70b80d81d6cabd62/9208396/timely/src/dataflow/scopes/child.rs:129:13
    #10 0xaaaaeda07674 in mz_storage::healthcheck::tests::health_operator_runner::{closure#1}::{closure#0} /var/lib/buildkite-agent/builds/buildkite-builders-aarch64-585fc7f-i-044fe10e84d3606ae-1/materialize/tests/src/storage/src/healthcheck.rs:1101:21
    #11 0xaaaaee7ac9f8 in <timely::worker::Worker<timely_communication::allocator::generic::Generic>>::dataflow::<(), (), mz_storage::healthcheck::tests::health_operator_runner::{closure#1}::{closure#0}>::{closure#0} /cargo/git/checkouts/timely-dataflow-70b80d81d6cabd62/9208396/timely/src/worker.rs:567:74
    #12 0xaaaaee643c60 in <timely::worker::Worker<timely_communication::allocator::generic::Generic>>::dataflow_core::<(), (), <timely::worker::Worker<timely_communication::allocator::generic::Generic>>::dataflow<(), (), mz_storage::healthcheck::tests::health_operator_runner::{closure#1}::{closure#0}>::{closure#0}, alloc::boxed::Box<()>> /cargo/git/checkouts/timely-dataflow-70b80d81d6cabd62/9208396/timely/src/worker.rs:640:13
    #13 0xaaaaee645fcc in <timely::worker::Worker<timely_communication::allocator::generic::Generic>>::dataflow::<(), (), mz_storage::healthcheck::tests::health_operator_runner::{closure#1}::{closure#0}> /cargo/git/checkouts/timely-dataflow-70b80d81d6cabd62/9208396/timely/src/worker.rs:567:9
    #14 0xaaaaeda061c4 in mz_storage::healthcheck::tests::health_operator_runner::{closure#1} /var/lib/buildkite-agent/builds/buildkite-builders-aarch64-585fc7f-i-044fe10e84d3606ae-1/materialize/tests/src/storage/src/healthcheck.rs:1100:17
    #15 0xaaaaed93c4a8 in timely::execute::execute::<(), mz_storage::healthcheck::tests::health_operator_runner::{closure#1}>::{closure#1} /cargo/git/checkouts/timely-dataflow-70b80d81d6cabd62/9208396/timely/src/execute.rs:287:22
    #16 0xaaaaeeafe6e0 in timely_communication::initialize::initialize_from::<timely_communication::allocator::generic::GenericBuilder, (), timely::execute::execute<(), mz_storage::healthcheck::tests::health_operator_runner::{closure#1}>::{closure#1}>::{closure#0} /cargo/git/checkouts/timely-dataflow-70b80d81d6cabd62/9208396/communication/src/initialize.rs:316:33
    #17 0xaaaaee8d9ee0 in std::sys_common::backtrace::__rust_begin_short_backtrace::<timely_communication::initialize::initialize_from<timely_communication::allocator::generic::GenericBuilder, (), timely::execute::execute<(), mz_storage::healthcheck::tests::health_operator_runner::{closure#1}>::{closure#1}>::{closure#0}, ()> /usr/local/lib/rustlib/src/rust/library/std/src/sys_common/backtrace.rs:155:18
    #18 0xaaaaee7eba30 in <std::thread::Builder>::spawn_unchecked_::<timely_communication::initialize::initialize_from<timely_communication::allocator::generic::GenericBuilder, (), timely::execute::execute<(), mz_storage::healthcheck::tests::health_operator_runner::{closure#1}>::{closure#1}>::{closure#0}, ()>::{closure#1}::{closure#0} /usr/local/lib/rustlib/src/rust/library/std/src/thread/mod.rs:529:17
    #19 0xaaaaeeec4d80 in <core::panic::unwind_safe::AssertUnwindSafe<<std::thread::Builder>::spawn_unchecked_<timely_communication::initialize::initialize_from<timely_communication::allocator::generic::GenericBuilder, (), timely::execute::execute<(), mz_storage::healthcheck::tests::health_operator_runner::{closure#1}>::{closure#1}>::{closure#0}, ()>::{closure#1}::{closure#0}> as core::ops::function::FnOnce<()>>::call_once /usr/local/lib/rustlib/src/rust/library/core/src/panic/unwind_safe.rs:272:9
    #20 0xaaaaee6e3cbc in std::panicking::try::do_call::<core::panic::unwind_safe::AssertUnwindSafe<<std::thread::Builder>::spawn_unchecked_<timely_communication::initialize::initialize_from<timely_communication::allocator::generic::GenericBuilder, (), timely::execute::execute<(), mz_storage::healthcheck::tests::health_operator_runner::{closure#1}>::{closure#1}>::{closure#0}, ()>::{closure#1}::{closure#0}>, ()> /usr/local/lib/rustlib/src/rust/library/std/src/panicking.rs:552:40
    #21 0xaaaaee883598 in __rust_try mz_storage.e55f827222da49c1-cgu.12
    #22 0xaaaaedadad64 in std::panic::catch_unwind::<core::panic::unwind_safe::AssertUnwindSafe<<std::thread::Builder>::spawn_unchecked_<timely_communication::initialize::initialize_from<timely_communication::allocator::generic::GenericBuilder, (), timely::execute::execute<(), mz_storage::healthcheck::tests::health_operator_runner::{closure#1}>::{closure#1}>::{closure#0}, ()>::{closure#1}::{closure#0}>, ()> /usr/local/lib/rustlib/src/rust/library/std/src/panic.rs:142:14
    #23 0xaaaaed97dbf0 in <<std::thread::Builder>::spawn_unchecked_<timely_communication::initialize::initialize_from<timely_communication::allocator::generic::GenericBuilder, (), timely::execute::execute<(), mz_storage::healthcheck::tests::health_operator_runner::{closure#1}>::{closure#1}>::{closure#0}, ()>::{closure#1} as core::ops::function::FnOnce<()>>::call_once::{shim:vtable#0} /usr/local/lib/rustlib/src/rust/library/core/src/ops/function.rs:250:5
    #24 0xaaaafae79fbc in <alloc::boxed::Box<alloc::boxed::Box<dyn core::ops::function::FnOnce<(), Output = ()>>> as core::ops::function::FnOnce<()>>::call_once /usr/local/lib/rustlib/src/rust/library/alloc/src/boxed.rs:2015:9
    #25 0xaaaafae997ac in <std::sys::unix::thread::Thread>::new::thread_start /usr/local/lib/rustlib/src/rust/library/std/src/sys/unix/thread.rs:108:17
    #26 0xaaaaecdbb528 in asan_thread_start(void*) /checkout/src/llvm-project/compiler-rt/lib/asan/asan_interceptors.cpp:225:42
    #27 0xffffb9da7d58 in thread_start misc/../sysdeps/unix/sysv/linux/aarch64/clone.S:79
[...]

Full text: https://gist.github.com/def-/790b769c30199f3ed2c41d84abf5ebb7

Seen in https://buildkite.com/materialize/tests/builds/78436#018e41e2-9c7e-4553-9367-0d840ae9c863

Maybe this is just not cleaned up in the test, or maybe the same would happen in production.

ci-regexp: SUMMARY: AddressSanitizer: 2572 byte

guswynn commented 3 months ago

I suspect this is ASan being problematic with Rust; the PressOnDropButton is immediately put into a collection in the storage state, but there might be some weird reference loop in the health operator. I'll try to find time to look more closely.

benesch commented 3 months ago

Added to the storage mega tracker as a p2 flake.