Amanieu / parking_lot

Compact and efficient synchronization primitives for Rust. Also provides an API for creating custom synchronization primitives.
Apache License 2.0

A write rwlock deadlocks when there are two shared read locks present on another thread #212

Open koute opened 4 years ago

koute commented 4 years ago

This program deadlocks:

use parking_lot::RwLock;
use std::sync::atomic::{AtomicBool, Ordering};
use std::sync::Arc;

fn main() {
    let lock = Arc::new(RwLock::new(()));
    let running = Arc::new(AtomicBool::new(true));

    let l = lock.clone();
    let r = running.clone();
    let t0 = std::thread::spawn(move || {
        while r.load(Ordering::SeqCst) {
            let _a = l.read();
            let _b = l.read();
        }
    });

    let l = lock.clone();
    let r = running.clone();
    let t1 = std::thread::spawn(move || {
        while r.load(Ordering::SeqCst) {
            let _a = l.write();
        }
    });

    std::thread::sleep(std::time::Duration::from_millis(1000));

    running.store(false, Ordering::SeqCst);
    t0.join().unwrap();
    t1.join().unwrap();
}

If you comment out `let _b = l.read();` then everything is fine.

The exact same program using RwLock from std doesn't deadlock:

use std::sync::RwLock;
use std::sync::atomic::{AtomicBool, Ordering};
use std::sync::Arc;

fn main() {
    let lock = Arc::new(RwLock::new(()));
    let running = Arc::new(AtomicBool::new(true));

    let l = lock.clone();
    let r = running.clone();
    let t0 = std::thread::spawn(move || {
        while r.load(Ordering::SeqCst) {
            let _a = l.read().unwrap();
            let _b = l.read().unwrap();
        }
    });

    let l = lock.clone();
    let r = running.clone();
    let t1 = std::thread::spawn(move || {
        while r.load(Ordering::SeqCst) {
            let _a = l.write().unwrap();
        }
    });

    std::thread::sleep(std::time::Duration::from_millis(1000));

    running.store(false, Ordering::SeqCst);
    t0.join().unwrap();
    t1.join().unwrap();
}

Stack traces of the deadlock:

#0  0x00007f152040ce9d in syscall () from /usr/lib/libc.so.6
#1  0x0000561815ecab20 in parking_lot_core::thread_parker::imp::ThreadParker::futex_wait (self=0x7f151fd90660, ts=...) at /home/kou/.cargo/git/checkouts/parking_lot-10afde9c7055db59/fa294cd/core/src/thread_parker/linux.rs:111
#2  0x0000561815eca84a in <parking_lot_core::thread_parker::imp::ThreadParker as parking_lot_core::thread_parker::ThreadParkerT>::park (self=0x7f151fd90660)
    at /home/kou/.cargo/git/checkouts/parking_lot-10afde9c7055db59/fa294cd/core/src/thread_parker/linux.rs:65
#3  0x0000561815ec8d3d in parking_lot_core::parking_lot::park::{{closure}} (thread_data=0x7f151fd90640) at /home/kou/.cargo/git/checkouts/parking_lot-10afde9c7055db59/fa294cd/core/src/parking_lot.rs:611
#4  0x0000561815ec89b4 in parking_lot_core::parking_lot::with_thread_data (f=...) at /home/kou/.cargo/git/checkouts/parking_lot-10afde9c7055db59/fa294cd/core/src/parking_lot.rs:183
#5  parking_lot_core::parking_lot::park (key=94661465940480, validate=..., before_sleep=..., timed_out=..., park_token=..., timeout=...) at /home/kou/.cargo/git/checkouts/parking_lot-10afde9c7055db59/fa294cd/core/src/parking_lot.rs:576
#6  0x0000561815ec1341 in parking_lot::raw_rwlock::RawRwLock::lock_common (self=0x5618170d2200, timeout=..., token=..., try_lock=..., validate_flags=8)
    at /home/kou/.cargo/git/checkouts/parking_lot-10afde9c7055db59/fa294cd/src/raw_rwlock.rs:1103
#7  0x0000561815ec00f4 in parking_lot::raw_rwlock::RawRwLock::lock_shared_slow (self=0x5618170d2200, recursive=false, timeout=...) at /home/kou/.cargo/git/checkouts/parking_lot-10afde9c7055db59/fa294cd/src/raw_rwlock.rs:707
#8  0x0000561815ebaf52 in <parking_lot::raw_rwlock::RawRwLock as lock_api::rwlock::RawRwLock>::lock_shared (self=0x5618170d2200) at /home/kou/.cargo/git/checkouts/parking_lot-10afde9c7055db59/fa294cd/src/raw_rwlock.rs:109
#9  0x0000561815ebb6d6 in lock_api::rwlock::RwLock<R,T>::read (self=0x5618170d2200) at /home/kou/.cargo/git/checkouts/parking_lot-10afde9c7055db59/fa294cd/lock_api/src/rwlock.rs:337
#10 0x0000561815ebf8e1 in tst2::main::{{closure}} () at src/main.rs:14
#0  0x00007f152040ce9d in syscall () from /usr/lib/libc.so.6
#1  0x0000561815ecab20 in parking_lot_core::thread_parker::imp::ThreadParker::futex_wait (self=0x7f151fb8f660, ts=...) at /home/kou/.cargo/git/checkouts/parking_lot-10afde9c7055db59/fa294cd/core/src/thread_parker/linux.rs:111
#2  0x0000561815eca84a in <parking_lot_core::thread_parker::imp::ThreadParker as parking_lot_core::thread_parker::ThreadParkerT>::park (self=0x7f151fb8f660)
    at /home/kou/.cargo/git/checkouts/parking_lot-10afde9c7055db59/fa294cd/core/src/thread_parker/linux.rs:65
#3  0x0000561815ec9d99 in parking_lot_core::parking_lot::park::{{closure}} (thread_data=0x7f151fb8f640) at /home/kou/.cargo/git/checkouts/parking_lot-10afde9c7055db59/fa294cd/core/src/parking_lot.rs:611
#4  0x0000561815ec87ff in parking_lot_core::parking_lot::with_thread_data (f=...) at /home/kou/.cargo/git/checkouts/parking_lot-10afde9c7055db59/fa294cd/core/src/parking_lot.rs:183
#5  parking_lot_core::parking_lot::park (key=94661465940481, validate=..., before_sleep=..., timed_out=..., park_token=..., timeout=...) at /home/kou/.cargo/git/checkouts/parking_lot-10afde9c7055db59/fa294cd/core/src/parking_lot.rs:576
#6  0x0000561815ec0b3a in parking_lot::raw_rwlock::RawRwLock::wait_for_readers (self=0x5618170d2200, timeout=..., prev_value=0) at /home/kou/.cargo/git/checkouts/parking_lot-10afde9c7055db59/fa294cd/src/raw_rwlock.rs:1001
#7  0x0000561815ebfe3d in parking_lot::raw_rwlock::RawRwLock::lock_exclusive_slow (self=0x5618170d2200, timeout=...) at /home/kou/.cargo/git/checkouts/parking_lot-10afde9c7055db59/fa294cd/src/raw_rwlock.rs:632
#8  0x0000561815ebb09d in <parking_lot::raw_rwlock::RawRwLock as lock_api::rwlock::RawRwLock>::lock_exclusive (self=0x5618170d2200) at /home/kou/.cargo/git/checkouts/parking_lot-10afde9c7055db59/fa294cd/src/raw_rwlock.rs:73
#9  0x0000561815ebb706 in lock_api::rwlock::RwLock<R,T>::write (self=0x5618170d2200) at /home/kou/.cargo/git/checkouts/parking_lot-10afde9c7055db59/fa294cd/lock_api/src/rwlock.rs:369
#10 0x0000561815ebf9c2 in tst2::main::{{closure}} () at src/main.rs:22

Originally I was investigating why my upgradable rwlocks deadlock, and I wanted to create an issue with the following snippet which also deadlocks:

use parking_lot::lock_api::RwLockUpgradableReadGuard;
use parking_lot::RwLock;
use std::sync::atomic::{AtomicBool, Ordering};
use std::sync::Arc;

fn main() {
    let lock = Arc::new(RwLock::new(()));
    let running = Arc::new(AtomicBool::new(true));

    let l = lock.clone();
    let r = running.clone();
    let t0 = std::thread::spawn(move || {
        while r.load(Ordering::SeqCst) {
            let _a = l.read();
            let _b = l.read();
        }
    });

    let l = lock.clone();
    let r = running.clone();
    let t1 = std::thread::spawn(move || {
        while r.load(Ordering::SeqCst) {
            let a = l.upgradable_read();
            let _b = RwLockUpgradableReadGuard::upgrade(a);
        }
    });

    std::thread::sleep(std::time::Duration::from_millis(1000));

    running.store(false, Ordering::SeqCst);
    t0.join().unwrap();
    t1.join().unwrap();
}
Amanieu commented 4 years ago

This is normal: you are not allowed to acquire multiple read locks from the same thread, otherwise you may deadlock (this is specified in the docs). This happens because the rwlock will block readers if there is a pending writer, which is necessary to prevent writer starvation.

If you don't care about writer starvation then you can use the read_recursive function instead of read, which should avoid the deadlock in your code.

koute commented 4 years ago

Fair enough; however, I think this behavior is very surprising since:

  1. The RwLock from std doesn't work this way. (On Linux.)
  2. The deadlock is not guaranteed, so even though your code is incorrect everything might seemingly still work.
  3. I'm not sure how the deadlock detector is supposed to work, but from what I can see it doesn't detect this issue?

This makes this a very easy trap to fall into for someone who switches from std's RwLock to parking_lot's RwLock thinking it's a drop-in replacement.

Perhaps we could rename read_recursive to read, and read to read_non_recursive? I feel like the name should say more explicitly that it can deadlock.

Amanieu commented 4 years ago

  1. The RwLock from std doesn't work this way on Linux (pthread_rwlock_t) but does on Windows (SRWLock). This means that your code will work on Linux but will deadlock on Windows.

  2. Correct, but there isn't anything we can do about that.

  3. The deadlock detector should detect this issue, but only if a deadlock actually happens (i.e. it doesn't check lock ordering).

I disagree about making read_recursive the default. Preventing writer starvation is a very useful property and is what you want most of the time. Recursive read locks have always been undefined behavior on many platforms; I see no reason to change this (except that we deadlock instead of UB, which is safe).

koute commented 4 years ago

> I disagree about making read_recursive the default. Preventing writer starvation is a very useful property and is what you want most of the time. Recursive read locks have always been undefined behavior on many platforms; I see no reason to change this (except that we deadlock instead of UB, which is safe).

Would you be then also against getting rid of the default read altogether and only having explicitly named read_recursive and read_non_recursive?

DexterHaxxor commented 4 years ago

That would be breaking the "drop-in replacement" promise that this crate is trying to achieve.

Amanieu commented 4 years ago

The standard library makes no guarantees that recursive read locks won't deadlock. It is still a drop-in replacement; the problem is that your code is relying on unspecified behavior.

terrence2 commented 3 years ago

Okay, so let's say for the sake of argument that a friend of mine (maybe actually me) -- who has been doing multithreaded systems programming on linux for >20 years -- assumed (the dumb idiot) that they know everything and didn't bother reading the fine print when they dropped in parking_lot. Then, they went and wrote 150,000+ lines of code (https://gitlab.com/terrence_too/openfa/) that deeply depends on RwLocks and doesn't defend against recursive reads. And are now somewhat annoyed that their bulletproof Rust code (fearless concurrency!) is suffering from random deadlocks like it's 1998 all over again.

Is there a way that this poor, poor fool could add some panics or even printlns to parking_lot's RwLock implementation to catch recursive locking reliably at runtime instead of randomly? Ideally this user would be able to run their extensive test suite once to catch all the improper usage, instead of waiting for undebuggable bug reports to trickle in from angry users over the next 20 years.

It seems like it would make for a perfectly reasonable non-default feature flag, at the very least. I'm going to write the code for myself (my friend, I mean) anyway; would there be interest in taking it as a contribution? Any pointers on where might be a good place to detect the recursion?

bjorn3 commented 3 years ago

Does the deadlock_detection feature flag detect this case?

terrence2 commented 3 years ago

Thanks for pointing me at the deadlock_detection feature! I swear I read the entire readme at some point, but apparently not with enough attention.

I added the deadlock_detection feature to all instances of parking_lot in the project, reintroduced the deadlock I know about, then did a full cargo clean before re-running. I was able to reproduce the deadlock on my first try in a release build. I have not yet been able to reproduce in debug. Nary a peep from the deadlock detector, even when the deadlock actually occurred. Is there any environment variable or something else that I need to enable?

Amanieu commented 3 years ago

The way to use the deadlock detector (I realize this isn't actually documented, this should be fixed...) is to create a thread that loops between calling parking_lot_core::deadlock::check_deadlock and sleeping. If a deadlock has occurred then this function will return a list of threads that are stuck on a deadlock and backtraces for each thread.

However this will only catch a deadlock at runtime after it has actually happened, so this is probably not the solution you are looking for.

One thing you could try is to replace all RwLock with Mutex, which should force a deadlock for any case where you have a recursive read lock.

terrence2 commented 3 years ago

Thanks for the guidance on the deadlock detector! Does that mean that it doesn't have any performance overhead if I'm not running the background thread? Seems like it would be a really helpful tool to have available as a flag.

:thinking: Substituting Mutex is a really clever idea! A ton of work and I can't use it everywhere for performance reasons, but it would make the bits I need to audit tractable, if I can figure out how to cleanly separate those parts. Thanks for the idea!

Amanieu commented 3 years ago

Unfortunately it does have overhead, which is why it is disabled by default.

samdenty commented 1 year ago

Is it possible to implement an option to catch two readers existing at the same time and panic, without deadlock detection?

Deadlock detection will only catch the problem after it's occurred.

We should have a feature to catch bad usage before it occurs

bjorn3 commented 1 year ago

I believe the problem only occurs if another read lock is taken on the same thread while, in the meantime, another thread tries to take a write lock. If the write lock is missing or happens after the second read lock is taken, no deadlock occurs. You may have another mechanism that prevents the write lock from occurring before the second read lock, in which case panicking on the second read lock shouldn't happen IMHO.

samdenty commented 1 year ago

Yeah, it's a race condition with a writer in the middle of two readers. Are you saying we can fix this in the library or in user-land?

> You may have another mechanism that prevents the write lock from occurring before the second read lock

How would an implementation go about that without leading to writer starvation? We worked around this limitation by merging the two readers into one. I was hoping that adding panicking would let me find the other problematic places where two readers exist concurrently on the same thread, maybe under a new debugging feature similar to deadlock_detection?

Diagram of what's happening in our case:

![Diagram](https://user-images.githubusercontent.com/13242392/231287375-c16024d3-1bdc-4d9f-8ba0-ab11dad5b921.jpg)

samdenty commented 1 year ago

Additionally, there's https://github.com/BurtonQin/lockbud for finding these kinds of issues (it works with parking_lot). I didn't manage to get it working in my case, hence my need for runtime panicking.

bjorn3 commented 1 year ago

You may, for example, be exclusively using try_write on the writer side because you would write the value somewhere else if the lock couldn't be acquired, or you may only attempt a read lock when you know the writer thread is currently blocked on something else or not running at all. These are cases where you, as the programmer, know it is fine to have two read locks on the same thread, but parking_lot doesn't, and as such it would have to panic under your proposal.

Amanieu commented 1 year ago

You could implement this yourself as a wrapper around RwLock that keeps a thread-local HashSet of all the read locks owned by the current thread. That would allow you to panic if you detect an attempt to acquire a lock that is already owned.
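A sketch of such a wrapper, written against std::sync::RwLock so it is self-contained (CheckedRwLock and CheckedReadGuard are hypothetical names; the same shape works with parking_lot::RwLock, and a thread-local set of lock addresses stands in for the HashSet of owned read locks):

```rust
use std::cell::RefCell;
use std::collections::HashSet;
use std::ops::Deref;
use std::sync::{RwLock, RwLockReadGuard};

thread_local! {
    // Addresses of locks this thread currently holds a read guard on.
    static HELD_READS: RefCell<HashSet<usize>> = RefCell::new(HashSet::new());
}

pub struct CheckedRwLock<T> {
    inner: RwLock<T>,
}

pub struct CheckedReadGuard<'a, T> {
    guard: RwLockReadGuard<'a, T>,
    key: usize,
}

impl<T> CheckedRwLock<T> {
    pub fn new(value: T) -> Self {
        Self { inner: RwLock::new(value) }
    }

    pub fn read(&self) -> CheckedReadGuard<'_, T> {
        let key = self as *const Self as usize;
        HELD_READS.with(|held| {
            // Fail loudly now instead of deadlocking unpredictably later.
            assert!(
                held.borrow_mut().insert(key),
                "recursive read lock acquired on the same thread"
            );
        });
        CheckedReadGuard {
            guard: self.inner.read().unwrap(),
            key,
        }
    }
}

impl<T> Drop for CheckedReadGuard<'_, T> {
    fn drop(&mut self) {
        // Forget the lock address once the read guard is released.
        HELD_READS.with(|held| {
            held.borrow_mut().remove(&self.key);
        });
    }
}

impl<T> Deref for CheckedReadGuard<'_, T> {
    type Target = T;
    fn deref(&self) -> &T {
        &*self.guard
    }
}
```

Non-recursive use works as before; a second read() on the same lock from the same thread panics immediately, which is exactly the "catch it in the test suite" behavior asked for above.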