rayon-rs / rayon

Rayon: A data parallelism library for Rust
Apache License 2.0
11.05k stars 501 forks source link

Stack overflow with rayon::scope and spawn #854

Open mdesmedt opened 3 years ago

mdesmedt commented 3 years ago

Here is a simple piece of code which causes a stack overflow with rayon 1.5.0:

use rayon::prelude::*;

/// Stand-in per-pixel workload: prints a greeting for one sentinel index and
/// otherwise does nothing (the sentinel check keeps the compiler from
/// optimizing the call away entirely).
fn do_work(index: u32) {
    match index {
        123015453 => println!("Hello world"),
        _ => {}
    }
}

// Minimal reproduction for rayon #854: a scope that spawns many tasks, each of
// which runs a nested parallel iterator, overflows the stack. The overflow
// comes from rayon's work-stealing: blocked joins steal and execute other
// pending jobs on the SAME stack, so the frames pile up (see trace below).
fn main() {
    rayon::scope(|s| {
        let num_blocks: u32 = 10000;
        for block in 0..num_blocks {
            // spawn() only queues the task; this loop does not wait, so all
            // 10000 tasks are enqueued in rapid succession.
            s.spawn(move |_| {
                let num_pixels: u32 = block;
                // Nested parallelism: this par_iter splits into many joins,
                // any of which may be stolen and run on the current stack.
                (0..num_pixels).into_par_iter().for_each(|index| {
                    do_work(index);
                }); // par_iter over pixels
            }); // spawn
        } // loop blocks
    }); // scope — only this point blocks until all spawns complete
}

The callstack repeats this pattern:

...
static void rayon_core::registry::WorkerThread::execute(struct rayon_core::job::JobRef)
void rayon_core::registry::WorkerThread::wait_until_cold(struct rayon_core::latch::CoreLatch *)
void rayon_core::registry::WorkerThread::wait_until<rayon_core::latch::SpinLatch>(struct rayon_core::latch::SpinLatch *)
void rayon_core::join::join_context::{{closure}}<closure-0,closure-1,tuple<>,tuple<>>(struct rayon_core::join::join_context::closure-0, struct rayon_core::registry::WorkerThread *, bool)
void rayon_core::registry::in_worker<closure-0,tuple<tuple<>, tuple<>>>(struct rayon_core::join::join_context::closure-0)
void rayon_core::join::join_context<closure-0,closure-1,tuple<>,tuple<>>(struct rayon::iter::plumbing::bridge_producer_consumer::helper::closure-0, struct rayon::iter::plumbing::bridge_producer_consumer::helper::closure-1)
static void rayon::iter::plumbing::bridge_producer_consumer::helper<rayon::range::IterProducer<u32>,rayon::iter::for_each::ForEachConsumer<closure-0>>(unsigned __int64, bool, struct rayon::iter::plumbing::LengthSplitter, struct rayon::range::IterProducer<u32>, struct rayon::iter::for_each::ForEachConsumer<closure-0>)
void rayon::iter::plumbing::bridge_producer_consumer::helper::{{closure}}<rayon::range::IterProducer<u32>,rayon::iter::for_each::ForEachConsumer<closure-0>>(struct rayon::iter::plumbing::bridge_producer_consumer::helper::closure-1, struct rayon_core::FnContext)
void rayon_core::join::join_context::call_b::{{closure}}<tuple<>,closure-1>(struct rayon_core::join::join_context::call_b::closure-0, bool)
void rayon_core::job::StackJob<rayon_core::latch::SpinLatch, closure-0, tuple<>>::run_inline<rayon_core::latch::SpinLatch,closure-0,tuple<>>(struct rayon_core::job::StackJob<rayon_core::latch::SpinLatch, closure-0, tuple<>>, bool)
void rayon_core::join::join_context::{{closure}}<closure-0,closure-1,tuple<>,tuple<>>(struct rayon_core::join::join_context::closure-0, struct rayon_core::registry::WorkerThread *, bool)
void rayon_core::registry::in_worker<closure-0,tuple<tuple<>, tuple<>>>(struct rayon_core::join::join_context::closure-0)
void rayon_core::join::join_context<closure-0,closure-1,tuple<>,tuple<>>(struct rayon::iter::plumbing::bridge_producer_consumer::helper::closure-0, struct rayon::iter::plumbing::bridge_producer_consumer::helper::closure-1)
static void rayon::iter::plumbing::bridge_producer_consumer::helper<rayon::range::IterProducer<u32>,rayon::iter::for_each::ForEachConsumer<closure-0>>(unsigned __int64, bool, struct rayon::iter::plumbing::LengthSplitter, struct rayon::range::IterProducer<u32>, struct rayon::iter::for_each::ForEachConsumer<closure-0>)
static void rayon::iter::plumbing::bridge_producer_consumer<rayon::range::IterProducer<u32>,rayon::iter::for_each::ForEachConsumer<closure-0>>(unsigned __int64, struct rayon::range::IterProducer<u32>, struct rayon::iter::for_each::ForEachConsumer<closure-0>)
void rayon::iter::plumbing::bridge::{{impl}}::callback<rayon::iter::for_each::ForEachConsumer<closure-0>,u32,rayon::range::IterProducer<u32>>(struct rayon::iter::plumbing::bridge::Callback<rayon::iter::for_each::ForEachConsumer<closure-0>>, struct rayon::range::IterProducer<u32>)
void rayon::range::{{impl}}::with_producer<rayon::iter::plumbing::bridge::Callback<rayon::iter::for_each::ForEachConsumer<closure-0>>>(struct rayon::range::Iter<u32>, struct rayon::iter::plumbing::bridge::Callback<rayon::iter::for_each::ForEachConsumer<closure-0>>)
void rayon::iter::plumbing::bridge<rayon::range::Iter<u32>,rayon::iter::for_each::ForEachConsumer<closure-0>>(struct rayon::range::Iter<u32>, struct rayon::iter::for_each::ForEachConsumer<closure-0>)
void rayon::range::{{impl}}::drive_unindexed<rayon::iter::for_each::ForEachConsumer<closure-0>>(struct rayon::range::Iter<u32>, struct rayon::iter::for_each::ForEachConsumer<closure-0>)
rayon::iter::for_each::for_each::h15817c5d0f369c94
rayon::iter::ParallelIterator::for_each::h26990b208229649d
void rayon_stack_overflow::main::{{closure}}::{{closure}}(struct rayon_stack_overflow::main::{{closure}}::closure-0, struct rayon_core::scope::Scope *)
void rayon_core::scope::{{impl}}::spawn::{{closure}}::{{closure}}<closure-0>(struct rayon_core::scope::{{impl}}::spawn::{{closure}}::closure-0)
void std::panic::{{impl}}::call_once<tuple<>,closure-0>(struct std::panic::AssertUnwindSafe<closure-0>)
static void std::panicking::try::do_call<std::panic::AssertUnwindSafe<closure-0>,tuple<>>(unsigned char *)
1400047C7 (@7ff7fab547c7..7ff7fab5482d:3)
union core::result::Result<tuple<>, alloc::boxed::Box<Any, alloc::alloc::Global>> std::panicking::try<tuple<>,std::panic::AssertUnwindSafe<closure-0>>(struct std::panic::AssertUnwindSafe<closure-0>)
union core::result::Result<tuple<>, alloc::boxed::Box<Any, alloc::alloc::Global>> std::panic::catch_unwind<std::panic::AssertUnwindSafe<closure-0>,tuple<>>(struct std::panic::AssertUnwindSafe<closure-0>)
union core::result::Result<tuple<>, alloc::boxed::Box<Any, alloc::alloc::Global>> rayon_core::unwind::halt_unwinding<closure-0,tuple<>>(struct rayon_core::scope::{{impl}}::spawn::{{closure}}::closure-0)
static core::option::Option rayon_core::scope::ScopeBase::execute_job_closure<closure-0,tuple<>>(struct rayon_core::scope::{{impl}}::spawn::{{closure}}::closure-0)
static void rayon_core::scope::ScopeBase::execute_job<closure-0>(struct rayon_core::scope::{{impl}}::spawn::{{closure}}::closure-0)
void rayon_core::scope::{{impl}}::spawn::{{closure}}<closure-0>(struct rayon_core::scope::{{impl}}::spawn::closure-0)
static void rayon_core::job::{{impl}}::execute<closure-0>(struct rayon_core::job::HeapJob<closure-0> *)
void rayon_core::job::JobRef::execute()
static void rayon_core::registry::WorkerThread::execute(struct rayon_core::job::JobRef)
...

I'm fairly new to Rust and rayon, so it might be that what I'm doing is wrong, but since my code isn't obviously recursing I wouldn't expect rayon to recurse internally either. This behavior poses a danger to users of the library, as it might work in trivial cases but overflow the stack in others.

cuviper commented 3 years ago

Rayon has implicit "recursion" due to work stealing. That is, whenever a rayon thread is blocked on the result from another rayon thread, it will look for other pending work to do in the meantime. That stolen work is executed directly from the same stack where it was blocked.

Your par_iter().for_each() becomes a bunch of nested joins, and each one of those may block if one half gets stolen to a new thread. Since stealing is somewhat random, the pool will have a mix of stolen joins and new spawns creating even more joins, and I can definitely see how that might get out of control. You're not doing anything wrong, but I'm not sure how to tame that.

The new in_place_scope in #844 would probably help, assuming you start from outside the thread pool. Then each spawn will go into the pool's external queue, whereas scope runs in the pool and pushes spawns into a thread's local queue. The local thread queues have priority over the external queue when it comes to work stealing, so that would let you prioritize finishing current par_iters before starting new spawns, for a bit less of a task explosion.

cuviper commented 3 years ago

Maybe we could also have some sort of spawn_external as a way of indicating lower priority, where we push to the external queue even if you call this from within the thread pool.

cuviper commented 3 years ago
    for block in 0..num_blocks {
        s.spawn(move |_| {
            let num_pixels: u32 = block;
            (0..num_pixels).into_par_iter().for_each(|index| {
                do_work(index);
            }); // par_iter over pixels
        }); // spawn
    } // loop blocks

BTW, what do you mean by this "loop blocks" comment? The spawns do not wait for completion here, so the loop will queue up all num_blocks iterations in rapid succession. Only the end of the scope will block for all of the spawns.

mdesmedt commented 3 years ago
    for block in 0..num_blocks {
        s.spawn(move |_| {
            let num_pixels: u32 = block;
            (0..num_pixels).into_par_iter().for_each(|index| {
                do_work(index);
            }); // par_iter over pixels
        }); // spawn
    } // loop blocks

BTW, what do you mean by this "loop blocks" comment? The spawns do not wait for completion here, so the loop will queue up all num_blocks iterations in rapid succession. Only the end of the scope will block for all of the spawns.

Hah. It's not a "block" in terms of concurrency :). I derived this snippet from my little "Ray Tracing in One Weekend" implementation, where I generate a list of "render blocks" to render in FIFO order. It's complete overkill for a bunch of spheres, really, but it looks pretty.

Thanks for your explanation. It seems this potential for stack explosions is inherent to rayon's current work-stealing architecture. Since I hit this in my code "in production", so to speak, a mitigation — or an alternative way of expressing the work so that this can't happen — would be great.

cuviper commented 3 years ago

Oh of course, you mean each "block" in 0..num_blocks. :sweat_smile: