zama-ai / tfhe-rs

TFHE-rs: A Pure Rust implementation of the TFHE Scheme for Boolean and Integer Arithmetics Over Encrypted Data.
Other
920 stars 143 forks source link

BorrowMutError: 'already borrowed' while executing CompactFheBool operations within a rayon parallel iterator #993

Open 0xalexbel opened 7 months ago

0xalexbel commented 7 months ago

Unable to execute CompactFheBool operations within a rayon iterator. A BorrowMutError is randomly raised when executing a CompactFheBool operation inside a rayon parallel iterator. The bug arises when trying to access the thread-local RefCell that encapsulates the ShortintEngine. It occurs in both Release and Debug builds. The problem does not occur with FheBool or CompressedFheBool.

To Reproduce

  1. Copy/paste the code below
  2. Adjust the number of iterations in the main loop (100 is the default, may not be enough)
  3. Use either of the Cargo.toml files below

The main.rs file

use tfhe::set_server_key;
use tfhe::{
    prelude::FheEncrypt, CompactFheBool, CompactPublicKey,
};

use rayon::iter::IntoParallelRefIterator;
use rayon::iter::ParallelIterator;

// Minimal reproduction: encrypting `CompactFheBool` values from inside a rayon
// parallel iterator intermittently panics with `already borrowed: BorrowMutError`.
fn main() {
    // Panic at tfhe-0.5.3/src/shortint/engine/mod.rs:216:63
    // already borrowed: BorrowMutError
    let config = tfhe::ConfigBuilder::default().build();
    // `ck` is passed to `CompactPublicKey::try_new` below; `sk` is passed to `set_server_key`.
    let (ck, sk) = tfhe::generate_keys(config);

    // Hand a clone of the server key to the rayon pool: the closure given to
    // `rayon::broadcast` clones it again and calls `set_server_key` with that clone.
    // NOTE(review): presumably this installs the key once per worker thread — confirm
    // against the rayon `broadcast` and tfhe `set_server_key` documentation.
    let pool_sk = sk.clone();
    rayon::broadcast(move |_| {
        let thread_local_sk = pool_sk.clone();
        set_server_key(thread_local_sk);
    });
    // Also set the key from the thread that runs `main`.
    set_server_key(sk);

    let compact_public_key = CompactPublicKey::try_new(&ck).unwrap();

    // You may have to increase the number of iterations to trigger the problem.
    // 100 is enough on my machine (which is not very powerful).
    let v = vec![true; 100];
    // Each parallel task performs one compact encryption and discards the result.
    // Per the reported backtrace, the panic originates inside this `encrypt` call.
    v.par_iter().for_each(|x| {
        let _a_compact_fhe_bool = CompactFheBool::encrypt(*x, &compact_public_key);
    });
}

The Cargo.toml debug file

# Manifest for the debug reproduction (no profile overrides).
[package]
name = "tfhe_bug"
version = "0.1.0"
edition = "2021"

[dependencies]
tfhe = { version = "0.5.3", features = [ "boolean", "shortint", "integer", "x86_64-unix" ] }
# The reported backtrace shows rayon-1.9.0 / rayon-core-1.12.1 in the registry paths,
# so this requirement did not pin rayon to exactly 1.8.1.
rayon = { version = "1.8.1" }

The Cargo.toml release file

# Manifest for the "release" reproduction: same package and dependencies as the
# debug manifest, plus optimization overrides under the `dev` profile.
[package]
name = "tfhe_bug"
version = "0.1.0"
edition = "2021"

[dependencies]
tfhe = { version = "0.5.3", features = [ "boolean", "shortint", "integer", "x86_64-unix" ] }
# The reported backtrace shows rayon-1.9.0 / rayon-core-1.12.1 in the registry paths,
# so this requirement did not pin rayon to exactly 1.8.1.
rayon = { version = "1.8.1" }

# NOTE(review): per the Cargo profiles documentation, `package."*"` overrides apply to
# all dependencies but not to the workspace crate itself — confirm that this is the
# intended meaning of "release" here.
[profile.dev.package."*"]
opt-level = 3
debug = false
# NOTE(review): '...' looks like a placeholder rather than a real value — replace or
# remove before using this manifest.
split-debuginfo = '...'  # Platform-specific.
strip = "none"
debug-assertions = false
overflow-checks = false
incremental = false
codegen-units = 16

Logs

stack backtrace:
   0: rust_begin_unwind
             at /rustc/07dca489ac2d933c78d3c5158e3f43beefeb02ce/library/std/src/panicking.rs:645:5
   1: core::panicking::panic_fmt
             at /rustc/07dca489ac2d933c78d3c5158e3f43beefeb02ce/library/core/src/panicking.rs:72:14
   2: core::cell::panic_already_borrowed
             at /rustc/07dca489ac2d933c78d3c5158e3f43beefeb02ce/library/core/src/cell.rs:761:5
   3: core::cell::RefCell<T>::borrow_mut
             at /rustc/07dca489ac2d933c78d3c5158e3f43beefeb02ce/library/core/src/cell.rs:1051:25
   4: tfhe::shortint::engine::ShortintEngine::with_thread_local_mut::{{closure}}
             at /Users/alex/.cargo/registry/src/index.crates.io-6f17d22bba15001f/tfhe-0.5.3/src/shortint/engine/mod.rs:216:51
   5: std::thread::local::LocalKey<T>::try_with
             at /rustc/07dca489ac2d933c78d3c5158e3f43beefeb02ce/library/std/src/thread/local.rs:270:16
   6: std::thread::local::LocalKey<T>::with
             at /rustc/07dca489ac2d933c78d3c5158e3f43beefeb02ce/library/std/src/thread/local.rs:246:9
   7: tfhe::shortint::engine::ShortintEngine::with_thread_local_mut
             at /Users/alex/.cargo/registry/src/index.crates.io-6f17d22bba15001f/tfhe-0.5.3/src/shortint/engine/mod.rs:216:9
thread '<unnamed>' panicked at /Users/alex/.cargo/registry/src/index.crates.io-6f17d22bba15001f/tfhe-0.5.3/src/shortint/engine/mod.rs:216:63:
already borrowed: BorrowMutError
   8: tfhe::shortint::public_key::compact::CompactPublicKey::encrypt_iter
             at /Users/alex/.cargo/registry/src/index.crates.io-6f17d22bba15001f/tfhe-0.5.3/src/shortint/public_key/compact.rs:240:13
   9: tfhe::integer::public_key::compact::CompactPublicKey::encrypt_radix_compact
             at /Users/alex/.cargo/registry/src/index.crates.io-6f17d22bba15001f/tfhe-0.5.3/src/integer/public_key/compact.rs:74:23
  10: tfhe::integer::public_key::compact::CompactPublicKey::encrypt_iter_radix_compact
             at /Users/alex/.cargo/registry/src/index.crates.io-6f17d22bba15001f/tfhe-0.5.3/src/integer/public_key/compact.rs:100:24
  11: tfhe::integer::public_key::compact::CompactPublicKey::encrypt_slice_radix_compact
             at /Users/alex/.cargo/registry/src/index.crates.io-6f17d22bba15001f/tfhe-0.5.3/src/integer/public_key/compact.rs:86:9
  12: tfhe::high_level_api::keys::inner::IntegerCompactPublicKey::try_encrypt_compact
             at /Users/alex/.cargo/registry/src/index.crates.io-6f17d22bba15001f/tfhe-0.5.3/src/high_level_api/keys/inner.rs:228:9
  13: <tfhe::high_level_api::booleans::compact::CompactFheBool as tfhe::high_level_api::traits::FheTryEncrypt<bool,tfhe::high_level_api::keys::public::CompactPublicKey>>::try_encrypt
             at /Users/alex/.cargo/registry/src/index.crates.io-6f17d22bba15001f/tfhe-0.5.3/src/high_level_api/booleans/compact.rs:59:26
  14: <T as tfhe::high_level_api::traits::FheEncrypt<Clear,Key>>::encrypt
             at /Users/alex/.cargo/registry/src/index.crates.io-6f17d22bba15001f/tfhe-0.5.3/src/high_level_api/traits.rs:21:9
  15: tfhe_bug::main::{{closure}}
             at ./src/main.rs:26:35
  16: core::ops::function::impls::<impl core::ops::function::FnMut<A> for &F>::call_mut
             at /rustc/07dca489ac2d933c78d3c5158e3f43beefeb02ce/library/core/src/ops/function.rs:272:13
  17: <core::slice::iter::Iter<T> as core::iter::traits::iterator::Iterator>::for_each
             at /rustc/07dca489ac2d933c78d3c5158e3f43beefeb02ce/library/core/src/slice/iter/macros.rs:254:21
  18: <rayon::iter::for_each::ForEachConsumer<F> as rayon::iter::plumbing::Folder<T>>::consume_iter
             at /Users/alex/.cargo/registry/src/index.crates.io-6f17d22bba15001f/rayon-1.9.0/src/iter/for_each.rs:55:9
  19: rayon::iter::plumbing::Producer::fold_with
             at /Users/alex/.cargo/registry/src/index.crates.io-6f17d22bba15001f/rayon-1.9.0/src/iter/plumbing/mod.rs:110:9
  20: rayon::iter::plumbing::bridge_producer_consumer::helper
             at /Users/alex/.cargo/registry/src/index.crates.io-6f17d22bba15001f/rayon-1.9.0/src/iter/plumbing/mod.rs:438:13
  21: rayon::iter::plumbing::bridge_producer_consumer::helper::{{closure}}
             at /Users/alex/.cargo/registry/src/index.crates.io-6f17d22bba15001f/rayon-1.9.0/src/iter/plumbing/mod.rs:427:21
  22: rayon_core::join::join_context::call_b::{{closure}}
             at /Users/alex/.cargo/registry/src/index.crates.io-6f17d22bba15001f/rayon-core-1.12.1/src/join/mod.rs:129:25
  23: rayon_core::job::JobResult<T>::call::{{closure}}
             at /Users/alex/.cargo/registry/src/index.crates.io-6f17d22bba15001f/rayon-core-1.12.1/src/job.rs:218:41
  24: <core::panic::unwind_safe::AssertUnwindSafe<F> as core::ops::function::FnOnce<()>>::call_once
             at /rustc/07dca489ac2d933c78d3c5158e3f43beefeb02ce/library/core/src/panic/unwind_safe.rs:272:9
  25: std::panicking::try::do_call
             at /rustc/07dca489ac2d933c78d3c5158e3f43beefeb02ce/library/std/src/panicking.rs:552:40
  26: ___rust_try

Configuration:

tmontaigu commented 7 months ago

Hello, thanks for the detailed report. We are going to investigate and see what we can do about it.

Sadly for now I don't have any workaround other than not using par_iter when encrypting Compact ciphertexts

IceTDrinker commented 2 months ago

There is potentially a way to fix this by using a Mutex instead of a RefCell. I'm not sure it will be great, or that it won't become a source of deadlocks, so we will have to test it and make sure we understand what rayon does with tasks.

IceTDrinker commented 2 months ago

but a Mutex essentially defeats the thread local storage so, not great

IceTDrinker commented 2 months ago

and it deadlocks of course, it's the well known rayon bug from here https://github.com/rayon-rs/rayon/issues/592

IceTDrinker commented 2 months ago

using this issue as a bit of a notepad on that issue

The problem arises when there are nested rayon calls, IIRC, as the recent example/addition proposed in https://github.com/rayon-rs/rayon/issues/592 (e.g. https://github.com/rayon-rs/rayon/issues/592#issuecomment-2177270078) for a fully blocking thread pool seems to indicate.

in our case some threads are stealing some tasks from other threads where the engine has already been borrowed, I'm still unclear on the exact succession of events

could be

could be

IceTDrinker commented 2 months ago

example log

looks to be the first case 🤔

Thread #ThreadId(1), borrow cell: 0x7f40d3d6dc40
Thread #ThreadId(1), stops borrow cell
Thread #ThreadId(1), borrow cell: 0x7f40d3d6dc40
Thread #ThreadId(1), stops borrow cell
Thread #ThreadId(1), borrow cell: 0x7f40d3d6dc40
Thread #ThreadId(1), stops borrow cell
Thread #ThreadId(8), borrow cell: 0x7f40d02fc1c0
Thread #ThreadId(13), borrow cell: 0x7f40bb7fb1c0
Thread #ThreadId(6), borrow cell: 0x7f40d07041c0
Thread #ThreadId(11), borrow cell: 0x7f40bbbfd1c0
Thread #ThreadId(5), borrow cell: 0x7f40d09081c0
Thread #ThreadId(7), borrow cell: 0x7f40d05001c0
Thread #ThreadId(4), borrow cell: 0x7f40d0b091c0
Thread #ThreadId(3), borrow cell: 0x7f40d0d0a1c0
Thread #ThreadId(2), borrow cell: 0x7f40d0f0b1c0
Thread #ThreadId(10), borrow cell: 0x7f40bbdfe1c0
Thread #ThreadId(12), borrow cell: 0x7f40bb9fc1c0
Thread #ThreadId(9), borrow cell: 0x7f40bbfff1c0
Thread #ThreadId(4), stops borrow cell
Thread #ThreadId(4), borrow cell: 0x7f40d0b091c0
Thread #ThreadId(5), stops borrow cell
Thread #ThreadId(5), borrow cell: 0x7f40d09081c0
Thread #ThreadId(12), stops borrow cell
Thread #ThreadId(12), borrow cell: 0x7f40bb9fc1c0
Thread #ThreadId(11), stops borrow cell
Thread #ThreadId(11), borrow cell: 0x7f40bbbfd1c0
Thread #ThreadId(3), stops borrow cell
Thread #ThreadId(3), borrow cell: 0x7f40d0d0a1c0
Thread #ThreadId(8), stops borrow cell
Thread #ThreadId(8), borrow cell: 0x7f40d02fc1c0
Thread #ThreadId(10), stops borrow cell
Thread #ThreadId(10), borrow cell: 0x7f40bbdfe1c0
Thread #ThreadId(13), stops borrow cell
Thread #ThreadId(9), stops borrow cell
Thread #ThreadId(9), borrow cell: 0x7f40bbfff1c0
Thread #ThreadId(13), borrow cell: 0x7f40bb7fb1c0
Thread #ThreadId(2), stops borrow cell
Thread #ThreadId(2), borrow cell: 0x7f40d0f0b1c0
Thread #ThreadId(6), stops borrow cell
Thread #ThreadId(6), borrow cell: 0x7f40d07041c0
Thread #ThreadId(4), stops borrow cell
Thread #ThreadId(4), borrow cell: 0x7f40d0b091c0
Thread #ThreadId(5), stops borrow cell
Thread #ThreadId(5), borrow cell: 0x7f40d09081c0
Thread #ThreadId(12), stops borrow cell
Thread #ThreadId(12), borrow cell: 0x7f40bb9fc1c0
Thread #ThreadId(11), stops borrow cell
Thread #ThreadId(11), borrow cell: 0x7f40bbbfd1c0
Thread #ThreadId(8), stops borrow cell
Thread #ThreadId(8), borrow cell: 0x7f40d02fc1c0
Thread #ThreadId(3), stops borrow cell
Thread #ThreadId(3), borrow cell: 0x7f40d0d0a1c0
Thread #ThreadId(10), stops borrow cell
Thread #ThreadId(10), borrow cell: 0x7f40bbdfe1c0
Thread #ThreadId(13), stops borrow cell
Thread #ThreadId(13), borrow cell: 0x7f40bb7fb1c0
Thread #ThreadId(9), stops borrow cell
Thread #ThreadId(9), borrow cell: 0x7f40bbfff1c0
Thread #ThreadId(2), stops borrow cell
Thread #ThreadId(2), borrow cell: 0x7f40d0f0b1c0
Thread #ThreadId(6), stops borrow cell
Thread #ThreadId(6), borrow cell: 0x7f40d07041c0
Thread #ThreadId(4), stops borrow cell
Thread #ThreadId(4), borrow cell: 0x7f40d0b091c0
Thread #ThreadId(5), stops borrow cell
Thread #ThreadId(5), borrow cell: 0x7f40d09081c0
Thread #ThreadId(12), stops borrow cell
Thread #ThreadId(12), borrow cell: 0x7f40bb9fc1c0
Thread #ThreadId(11), stops borrow cell
Thread #ThreadId(11), borrow cell: 0x7f40bbbfd1c0
Thread #ThreadId(8), stops borrow cell
Thread #ThreadId(8), borrow cell: 0x7f40d02fc1c0
Thread #ThreadId(3), stops borrow cell
Thread #ThreadId(3), borrow cell: 0x7f40d0d0a1c0
Thread #ThreadId(10), stops borrow cell
Thread #ThreadId(10), borrow cell: 0x7f40bbdfe1c0
Thread #ThreadId(13), stops borrow cell
Thread #ThreadId(13), borrow cell: 0x7f40bb7fb1c0
Thread #ThreadId(9), stops borrow cell
Thread #ThreadId(9), borrow cell: 0x7f40bbfff1c0
Thread #ThreadId(2), stops borrow cell
Thread #ThreadId(2), borrow cell: 0x7f40d0f0b1c0
Thread #ThreadId(7), stops borrow cell
Thread #ThreadId(7), borrow cell: 0x7f40d05001c0
Thread #ThreadId(6), stops borrow cell
Thread #ThreadId(6), borrow cell: 0x7f40d07041c0
Thread #ThreadId(4), stops borrow cell
Thread #ThreadId(4), borrow cell: 0x7f40d0b091c0
Thread #ThreadId(5), stops borrow cell
Thread #ThreadId(5), borrow cell: 0x7f40d09081c0
Thread #ThreadId(12), stops borrow cell
Thread #ThreadId(12), borrow cell: 0x7f40bb9fc1c0
Thread #ThreadId(11), stops borrow cell
Thread #ThreadId(11), borrow cell: 0x7f40bbbfd1c0
Thread #ThreadId(8), stops borrow cell
Thread #ThreadId(8), borrow cell: 0x7f40d02fc1c0
Thread #ThreadId(3), stops borrow cell
Thread #ThreadId(3), borrow cell: 0x7f40d0d0a1c0
Thread #ThreadId(10), stops borrow cell
Thread #ThreadId(10), borrow cell: 0x7f40bbdfe1c0
Thread #ThreadId(13), stops borrow cell
Thread #ThreadId(13), borrow cell: 0x7f40bb7fb1c0
Thread #ThreadId(9), stops borrow cell
Thread #ThreadId(9), borrow cell: 0x7f40bbfff1c0
Thread #ThreadId(2), stops borrow cell
Thread #ThreadId(2), borrow cell: 0x7f40d0f0b1c0
Thread #ThreadId(7), stops borrow cell
Thread #ThreadId(7), borrow cell: 0x7f40d05001c0
Thread #ThreadId(6), stops borrow cell
Thread #ThreadId(6), borrow cell: 0x7f40d07041c0
Thread #ThreadId(4), stops borrow cell
Thread #ThreadId(4), borrow cell: 0x7f40d0b091c0
Thread #ThreadId(5), stops borrow cell
Thread #ThreadId(5), borrow cell: 0x7f40d09081c0
Thread #ThreadId(12), stops borrow cell
Thread #ThreadId(12), borrow cell: 0x7f40bb9fc1c0
Thread #ThreadId(11), stops borrow cell
Thread #ThreadId(11), borrow cell: 0x7f40bbbfd1c0
Thread #ThreadId(8), stops borrow cell
Thread #ThreadId(8), borrow cell: 0x7f40d02fc1c0
Thread #ThreadId(3), stops borrow cell
Thread #ThreadId(3), borrow cell: 0x7f40d0d0a1c0
Thread #ThreadId(10), stops borrow cell
Thread #ThreadId(10), borrow cell: 0x7f40bbdfe1c0
Thread #ThreadId(13), stops borrow cell
Thread #ThreadId(9), stops borrow cell
Thread #ThreadId(9), borrow cell: 0x7f40bbfff1c0
Thread #ThreadId(2), stops borrow cell
Thread #ThreadId(2), borrow cell: 0x7f40d0f0b1c0
Thread #ThreadId(7), stops borrow cell
Thread #ThreadId(7), borrow cell: 0x7f40d05001c0
Thread #ThreadId(6), stops borrow cell
Thread #ThreadId(6), borrow cell: 0x7f40d07041c0
Thread #ThreadId(13), borrow cell: 0x7f40bb7fb1c0
Thread #ThreadId(4), stops borrow cell
Thread #ThreadId(4), borrow cell: 0x7f40d0b091c0
Thread #ThreadId(2), borrow cell: 0x7f40d0f0b1c0
thread '<unnamed>' panicked at tfhe/src/shortint/engine/mod.rs:219:45:
already borrowed: BorrowMutError
note: run with `RUST_BACKTRACE=1` environment variable to display a backtrace
Thread #ThreadId(2), borrow cell: 0x7f40d0f0b1c0
thread '<unnamed>' panicked at tfhe/src/shortint/engine/mod.rs:219:45:
already borrowed: BorrowMutError
Thread #ThreadId(2), borrow cell: 0x7f40d0f0b1c0
thread '<unnamed>' panicked at tfhe/src/shortint/engine/mod.rs:219:45:
already borrowed: BorrowMutError
Thread #ThreadId(12), stops borrow cell
Thread #ThreadId(12), borrow cell: 0x7f40bb9fc1c0
Thread #ThreadId(8), stops borrow cell
Thread #ThreadId(8), borrow cell: 0x7f40d02fc1c0
Thread #ThreadId(11), stops borrow cell
Thread #ThreadId(11), borrow cell: 0x7f40bbbfd1c0
Thread #ThreadId(3), stops borrow cell
Thread #ThreadId(3), borrow cell: 0x7f40d0d0a1c0
Thread #ThreadId(10), stops borrow cell
Thread #ThreadId(5), stops borrow cell
Thread #ThreadId(5), borrow cell: 0x7f40d09081c0
Thread #ThreadId(9), stops borrow cell
Thread #ThreadId(9), borrow cell: 0x7f40bbfff1c0
Thread #ThreadId(7), stops borrow cell
Thread #ThreadId(7), borrow cell: 0x7f40d05001c0
Thread #ThreadId(2), stops borrow cell
Thread #ThreadId(2), borrow cell: 0x7f40d0f0b1c0
Thread #ThreadId(3), borrow cell: 0x7f40d0d0a1c0
thread '<unnamed>' panicked at tfhe/src/shortint/engine/mod.rs:219:45:
already borrowed: BorrowMutError
Thread #ThreadId(3), borrow cell: 0x7f40d0d0a1c0
thread '<unnamed>' panicked at tfhe/src/shortint/engine/mod.rs:219:45:
already borrowed: BorrowMutError
Thread #ThreadId(13), stops borrow cell
Thread #ThreadId(13), borrow cell: 0x7f40bb7fb1c0
Thread #ThreadId(9), borrow cell: 0x7f40bbfff1c0
thread '<unnamed>' panicked at tfhe/src/shortint/engine/mod.rs:219:45:
already borrowed: BorrowMutError
Thread #ThreadId(9), borrow cell: 0x7f40bbfff1c0
thread '<unnamed>' panicked at tfhe/src/shortint/engine/mod.rs:219:45:
already borrowed: BorrowMutError
Thread #ThreadId(4), stops borrow cell
Thread #ThreadId(4), borrow cell: 0x7f40d0b091c0
Thread #ThreadId(3), stops borrow cell
Thread #ThreadId(3), borrow cell: 0x7f40d0d0a1c0
Thread #ThreadId(7), borrow cell: 0x7f40d05001c0
thread '<unnamed>' panicked at tfhe/src/shortint/engine/mod.rs:219:45:
already borrowed: BorrowMutError
Thread #ThreadId(11), stops borrow cell
Thread #ThreadId(11), borrow cell: 0x7f40bbbfd1c0
Thread #ThreadId(3), borrow cell: 0x7f40d0d0a1c0
thread '<unnamed>' panicked at tfhe/src/shortint/engine/mod.rs:219:45:
already borrowed: BorrowMutError
Thread #ThreadId(3), borrow cell: 0x7f40d0d0a1c0
thread '<unnamed>' panicked at tfhe/src/shortint/engine/mod.rs:219:45:
already borrowed: BorrowMutError
Thread #ThreadId(3), borrow cell: 0x7f40d0d0a1c0
thread '<unnamed>' panicked at tfhe/src/shortint/engine/mod.rs:219:45:
already borrowed: BorrowMutError
Thread #ThreadId(3), borrow cell: 0x7f40d0d0a1c0
thread '<unnamed>' panicked at tfhe/src/shortint/engine/mod.rs:219:45:
already borrowed: BorrowMutError
Thread #ThreadId(12), stops borrow cell
Thread #ThreadId(12), borrow cell: 0x7f40bb9fc1c0
Thread #ThreadId(3), borrow cell: 0x7f40d0d0a1c0
thread '<unnamed>' panicked at tfhe/src/shortint/engine/mod.rs:219:45:
already borrowed: BorrowMutError
Thread #ThreadId(8), stops borrow cell
Thread #ThreadId(8), borrow cell: 0x7f40d02fc1c0
Thread #ThreadId(7), stops borrow cell
Thread #ThreadId(7), borrow cell: 0x7f40d05001c0
Thread #ThreadId(11), borrow cell: 0x7f40bbbfd1c0
thread '<unnamed>' panicked at tfhe/src/shortint/engine/mod.rs:219:45:
already borrowed: BorrowMutError
Thread #ThreadId(13), stops borrow cell
Thread #ThreadId(6), stops borrow cell
Thread #ThreadId(4), stops borrow cell
Thread #ThreadId(5), stops borrow cell
Thread #ThreadId(2), stops borrow cell
Thread #ThreadId(11), stops borrow cell
Thread #ThreadId(8), stops borrow cell
Thread #ThreadId(9), stops borrow cell
Thread #ThreadId(7), stops borrow cell
Thread #ThreadId(12), stops borrow cell
Thread #ThreadId(3), stops borrow cell