Question about a performance issue

emwalker commented 4 months ago

We are hoping to investigate the relative performance of a Rust/Magnus version of a cache class that is used in our runtime. For a ~ 7s pure ruby benchmark, I am seeing a 1-3s degredation in run times with Rust (i.e., 8-10s).

cache.rs

```rust use fxhash::FxBuildHasher; use indexmap::IndexMap; use magnus::{ exception, function, method, value::{BoxValue, ReprValue}, Error, Module, Object, RArray, RModule, Ruby, Value, }; use std::cell::RefCell; type Index = IndexMap; // Class owned by Ruby code, external to Magnus type Handle = BoxValue; #[magnus::wrap(class = "Mutator")] struct Mutator { inner: Handle, } // SAFETY: Probably ok because only seen in Ruby thread? // Opaque> seems to require that Magnus own the class being held in the cache in order to be able // to call `val.funcall(...)` unsafe impl Send for Mutator {} // SAFETY: Mutator wraps its CRuby type in a BoxValue unsafe impl magnus::IntoValueFromNative for Mutator {} impl Mutator { fn init(ruby: &Ruby, module: &RModule) -> Result<(), Error> { let _class = module.define_class("Mutator", ruby.class_object())?; Ok(()) } } impl Clone for Mutator { fn clone(&self) -> Self { Self { inner: BoxValue::new(self.inner.as_value()), } } } impl TryFrom for Mutator { type Error = Error; fn try_from(value: Value) -> Result { Ok(Self { inner: BoxValue::new(value), }) } } impl From<&Mutator> for Handle { fn from(mutator: &Mutator) -> Self { BoxValue::new(mutator.inner.as_value()) } } struct IndexData { field_1: String, field_2: String, } impl TryFrom for IndexData { type Error = Error; fn try_from(mutator: Mutator) -> Result { let obj: Value = *mutator.inner; let field_1: String = obj.funcall("field_1", ())?; let field_2: String = obj.funcall("field_2", ())?; Ok(Self { field_2, field_1 }) } } struct Inner { by_field_1: Index>, by_field_2: Index>, finalized: bool, find_by_field_1: Index, find_by_field_2: Index<(String, String), Mutator>, } impl Inner { fn with_capacity(n: usize) -> Self { let hasher = FxBuildHasher::default(); Self { by_field_1: Index::with_capacity_and_hasher(n, hasher.clone()), by_field_2: Index::with_capacity_and_hasher(n, hasher.clone()), finalized: false, find_by_field_1: Index::with_capacity_and_hasher(n, hasher.clone()), find_by_field_2: Index::with_capacity_and_hasher(n, hasher), } } fn add(&mut self, value: Value) -> Result<(), Error> { let mutator: Mutator = Mutator::try_from(value)?; let data: IndexData = mutator.clone().try_into()?; self.by_field_1 .entry(data.field_1.clone()) .or_default() .push(mutator.clone()); self.by_field_2 .entry(data.field_2) .or_default() .push(mutator.clone()); self.find_by_field_1.insert(data.field_1, mutator.clone()); self.find_by_field_2.insert( (data.field_2.clone(), data.field_3.clone()), mutator.clone(), ); Ok(()) } fn len(&self) -> usize { self.find_by_field_1.len() } fn by_field_1(&self, field_1: String) -> Vec { self.by_field_1 .get(&field_1) .map(|vec| vec.iter().map(Handle::from).collect()) .unwrap_or_default() } fn by_field_2(&self, field_2: String) -> Vec { self.by_field_2 .get(&field_2) .map(|vec| vec.iter().map(Handle::from).collect()) .unwrap_or_default() } fn finalize(&mut self) { self.finalized = true; } fn find_by_field_1(&self, field_1: String) -> Option { self.find_by_field_1.get(&field_1).map(Handle::from) } fn find_by_field_2(&self, field_2: String, field_3: String) -> Option { self.find_by_field_2 .get(&(field_2, field_3)) .map(Handle::from) } fn is_empty(&self) -> bool { self.find_by_field_1.is_empty() } fn remove(&mut self, mutator: Value) -> Result<(), Error> { let mutator: Mutator = mutator.try_into()?; let data: IndexData = mutator.clone().try_into()?; self.by_field_2.shift_remove(&data.field_2); self.by_field_1.shift_remove(&data.field_1); self.find_by_field_1.shift_remove(&data.field_1); self.find_by_field_2 .shift_remove(&(data.field_2, data.field_3)); Ok(()) } fn to_a(&self) -> Vec { self.find_by_field_1.values().map(Handle::from).collect() } } #[magnus::wrap(class = "Cache")] struct Cache(RefCell); impl Cache { fn init(ruby: &Ruby, module: &RModule) -> Result<(), Error> { let class = module.define_class("Cache", ruby.class_object())?; class.define_singleton_method("build", function!(Cache::build, 1))?; class.define_method("rust_add", method!(Cache::rust_add, 1))?; class.define_method("empty?", method!(Cache::is_empty, 0))?; class.define_method("finalize", method!(Cache::finalize, 0))?; class.define_method("present?", method!(Cache::is_present, 0))?; class.define_method("remove", method!(Cache::remove, 1))?; class.define_method("rust_by_field_1", method!(Cache::rust_by_field_1, 1))?; class.define_method("rust_by_field_2", method!(Cache::rust_by_field_2, 1))?; class.define_method( "rust_find_by_field_1", method!(Cache::rust_find_by_field_2, 2), )?; class.define_method( "rust_find_by_field_2", method!(Cache::rust_find_by_field_2, 2), )?; class.define_method("size", method!(Cache::size, 0))?; class.define_method("to_a", method!(Cache::to_a, 0))?; Ok(()) } fn build(items: RArray) -> Result { let mut cache = Inner::with_capacity(items.len() + 100); for value in items.into_iter() { cache.add(value)?; } Ok(Self(RefCell::new(cache))) } fn rust_add(&self, value: Value) -> Result<(), Error> { self.check_finalized()?; self.0.borrow_mut().add(value)?; Ok(()) } fn check_finalized(&self) -> Result<(), Error> { if self.0.borrow().finalized { return Err(Error::new( exception::frozen_error(), "attempt to modify frozen cache", )); } Ok(()) } fn finalize(&self) { self.0.borrow_mut().finalize() } fn is_empty(&self) -> bool { self.0.borrow().is_empty() } fn is_present(&self) -> bool { !self.is_empty() } fn remove(&self, mutator: Value) -> Result<(), Error> { self.check_finalized()?; self.0.borrow_mut().remove(mutator) } fn rust_by_field_1(&self, field_1: String) -> Vec { self.0.borrow().by_field_1(field_1) } fn rust_by_field_2(&self, field_2: String) -> Vec { self.0.borrow().by_field_2(field_2) } fn rust_find_by_field_2(&self, field_2: String, field_3: Option) -> Option { self.0.borrow().find_by_field_2(field_2, field_3.into()) } fn size(&self) -> usize { self.0.borrow().len() } fn to_a(&self) -> Vec { self.0.borrow().to_a() } } // Called by another module pub fn init(ruby: &Ruby, module: &RModule) -> Result<(), Error> { Mutator::init(ruby, module)?; Cache::init(ruby, module)?; Ok(()) } ```

I'm pretty sure I'm doing something wrong here. In addition to adding 1-3s to the benchmark, there is a prolonged pause afterwards along with a high level of CPU activity, sometimes holding up the exit of the script for many seconds. Perhaps this is GC activity?

Just hoping something will stand out. Thanks in advance if you notice anything.

matsadler commented 4 months ago

All Ruby types in Magnus (Value, RString, etc) are effectively pointers to where Ruby is actually storing the data (this is how Ruby's C api works, you only ever get pointers, Ruby holds on to the data). So you can copy them (as in pass by value) with no performance penalty. This is why all methods on Ruby objects in Magnus take an owned self rather than a &self reference. Usually Rust will do the copy for you when calling a method, even if you've got a reference to a value.

BoxValue is a pointer to a Ruby Value on the allocated on the heap. So if a normal value is a pointer to the real data then BoxValue is a pointer to a pointer to the real data. Normally Ruby's GC can't see values on the heap, so when BoxValue is created it calls a method from Ruby's C API to let Ruby know that the thing it's pointing to is a Ruby value. And then when BoxValue is dropped it calls Ruby to say the memory it was pointing as is no longer a Ruby value. All this adds up to making BoxValue very inefficient. It's expensive to create because of the allocation and having to call Ruby's C API. It's expensive to call methods on the value, because of the extra pointer indirection. And it's expensive to drop because of the call to Ruby. It's really only meant for occasional one off use where for some reason you really really need to put a Ruby value on the heap for a short amount of time.

I've adjusted your example to remove use of BoxValue and Mutator: (I had to adjust a few other things to get your example compiling)

cache.rs

```rust use fxhash::FxBuildHasher; use indexmap::IndexMap; use magnus::{ exception, function, gc, method, value::{Opaque, ReprValue}, DataTypeFunctions, Error, Module, Object, RArray, RModule, RString, Ruby, TypedData, Value, }; use std::cell::RefCell; type Index = IndexMap; struct Inner { by_field_1: Index>>, by_field_2: Index>>, finalized: bool, find_by_field_1: Index>, find_by_field_2: Index<(String, String), Opaque>, } impl Inner { fn with_capacity(n: usize) -> Self { let hasher = FxBuildHasher::default(); Self { by_field_1: Index::with_capacity_and_hasher(n, hasher.clone()), by_field_2: Index::with_capacity_and_hasher(n, hasher.clone()), finalized: false, find_by_field_1: Index::with_capacity_and_hasher(n, hasher.clone()), find_by_field_2: Index::with_capacity_and_hasher(n, hasher), } } fn add(&mut self, value: Value) -> Result<(), Error> { let field_1: String = value.funcall("field_1", ())?; let field_2: String = value.funcall("field_2", ())?; let field_3: String = value.funcall("field_3", ())?; self.by_field_1 .entry(field_1.clone()) .or_default() .push(value.into()); self.by_field_2 .entry(field_2.clone()) .or_default() .push(value.into()); self.find_by_field_1.insert(field_1, value.into()); self.find_by_field_2 .insert((field_2, field_3), value.into()); Ok(()) } fn len(&self) -> usize { self.find_by_field_1.len() } fn by_field_1(&self, ruby: &Ruby, field_1: RString) -> Result { Ok(ruby.ary_from_iter( unsafe { self.by_field_1.get(field_1.as_str()?) } .cloned() .unwrap_or_default(), )) } fn by_field_2(&self, ruby: &Ruby, field_2: RString) -> Result { Ok(ruby.ary_from_iter( unsafe { self.by_field_2.get(field_2.as_str()?) } .cloned() .unwrap_or_default(), )) } fn finalize(&mut self) { self.finalized = true; } fn find_by_field_1(&self, field_1: RString) -> Result>, Error> { Ok(unsafe { self.find_by_field_1.get(field_1.as_str()?) }.copied()) } fn find_by_field_2(&self, field_2: String, field_3: String) -> Option> { self.find_by_field_2.get(&(field_2, field_3)).copied() } fn is_empty(&self) -> bool { self.find_by_field_1.is_empty() } fn remove(&mut self, value: Value) -> Result<(), Error> { let field_1: RString = value.funcall("field_1", ())?; let field_2: String = value.funcall("field_2", ())?; let field_3: String = value.funcall("field_3", ())?; unsafe { let s = field_1.as_str()?; self.by_field_1.shift_remove(s); self.find_by_field_1.shift_remove(s); } self.by_field_2.shift_remove(&field_2); self.find_by_field_2.shift_remove(&(field_2, field_3)); Ok(()) } fn to_a(&self) -> RArray { self.find_by_field_1.values().cloned().collect() } } #[derive(TypedData)] #[magnus(class = "Example::Cache", free_immediately, mark)] struct Cache(RefCell); impl DataTypeFunctions for Cache { fn mark(&self, marker: &gc::Marker) { // the fields `by_field_1` etc should all contain the same Ruby values // so marking just the first should get them all for val in self.0.borrow().by_field_1.values().flatten() { marker.mark(*val); } } } impl Cache { fn init(ruby: &Ruby, module: &RModule) -> Result<(), Error> { let class = module.define_class("Cache", ruby.class_object())?; class.define_singleton_method("build", function!(Cache::build, 1))?; class.define_method("rust_add", method!(Cache::rust_add, 1))?; class.define_method("empty?", method!(Cache::is_empty, 0))?; class.define_method("finalize", method!(Cache::finalize, 0))?; class.define_method("present?", method!(Cache::is_present, 0))?; class.define_method("remove", method!(Cache::remove, 1))?; class.define_method("rust_by_field_1", method!(Cache::rust_by_field_1, 1))?; class.define_method("rust_by_field_2", method!(Cache::rust_by_field_2, 1))?; class.define_method( "rust_find_by_field_1", method!(Cache::rust_find_by_field_1, 1), )?; class.define_method( "rust_find_by_field_2", method!(Cache::rust_find_by_field_2, 2), )?; class.define_method("size", method!(Cache::size, 0))?; class.define_method("to_a", method!(Cache::to_a, 0))?; Ok(()) } fn build(items: RArray) -> Result { let mut cache = Inner::with_capacity(items.len() + 100); for value in items.into_iter() { cache.add(value)?; } Ok(Self(RefCell::new(cache))) } fn rust_add(&self, value: Value) -> Result<(), Error> { self.check_finalized()?; self.0.borrow_mut().add(value)?; Ok(()) } fn check_finalized(&self) -> Result<(), Error> { if self.0.borrow().finalized { return Err(Error::new( exception::frozen_error(), "attempt to modify frozen cache", )); } Ok(()) } fn finalize(&self) { self.0.borrow_mut().finalize() } fn is_empty(&self) -> bool { self.0.borrow().is_empty() } fn is_present(&self) -> bool { !self.is_empty() } fn remove(&self, value: Value) -> Result<(), Error> { self.check_finalized()?; self.0.borrow_mut().remove(value) } fn rust_by_field_1(ruby: &Ruby, this: &Self, field_1: RString) -> Result { this.0.borrow().by_field_1(ruby, field_1) } fn rust_by_field_2(ruby: &Ruby, this: &Self, field_2: RString) -> Result { this.0.borrow().by_field_2(ruby, field_2) } fn rust_find_by_field_1(&self, field_1: RString) -> Result>, Error> { self.0.borrow().find_by_field_1(field_1) } fn rust_find_by_field_2( &self, field_2: String, field_3: Option, ) -> Option> { self.0 .borrow() .find_by_field_2(field_2, field_3.unwrap_or(String::from("default"))) } fn size(&self) -> usize { self.0.borrow().len() } fn to_a(&self) -> RArray { self.0.borrow().to_a() } } #[magnus::init] fn init(ruby: &Ruby) -> Result<(), Error> { let module = ruby.define_module("Example")?; Cache::init(ruby, &module)?; Ok(()) } ```

Converting Ruby strings to Rust strings is also moderately expensive. Ruby strings can be in many encodings, Rust strings are always UTF-8, so the conversion has to do at minimum a UTF-8 validity check. It will also have to copy the string out from where Ruby is storing it to memory that Rust owns. In my example above I tweaked things a little to pass strings as Ruby strings, and get &str references out of the Ruby strings where possible.

There's probably some other places to win a little performance, but these were the obvious ones.

Even with these changes I'd be surprised if it's much faster than a well written pure Ruby version. The performance is largely going to be down to the underlying data structures. Ruby's built in data structures (Array, Hash) are written in C and have had a couple of decades of optimisation to be really really good at storing Ruby objects.

Ruby's Hash preserves insertion order. I don't know how much of a performance penalty that is, there might be some cases where Rust's HashMap can store and retrieve Ruby objects faster, but as you're using IndexMap here I don't know if you can give up preserving insertion order for your use case.

emwalker commented 4 months ago

I am grateful for the helpful discussion. It sounds like BoxValue is an escape hatch. I will get your changes working and see what the benchmark looks like.

I was having a hard time using Opaque<Value>. I think I was trying to use Opaque<Obj<Value>>. Magnus wanted the value to implement TypedData. It seems from your update of my code that that won't be needed after all.

(I had to adjust a few other things to get your example compiling)

My apologies. I attempted to strike a balance between giving something nontrivial that might disclose some subtle issue, on one hand, and putting company code up on the internet, on the other.

Even with these changes I'd be surprised if it's much faster than a well written pure Ruby version.

This is what people have been telling me. I've been guardedly optimistic that we can get some significant gains by using Rust. But if that is possible, it's probably going to be tricky (still curious to try).

emwalker commented 4 months ago

Seeing an occasional BorrowError. I just remembered that we have some recursive calls on the Ruby side, some of which will mutate the cache, and Ruby's shared mutable access probably isn't playing well with the Cache(RefCell:new(inner)) pattern in this case. And it's possible that a GC pass might happen when calling code has a mutable borrow, causing a panic when the DataTypeFunctions::mark method attempts to obtain an immutable reference to inner. I'll have to think about this.

emwalker commented 4 months ago

After consulting on the BorrowErrors, we tentatively thought that this approach might be ok:

impl DataTypeFunctions for Cache {
    fn mark(&self, marker: &gc::Marker) {
        let inner = self.0.as_ptr();

        // SAFETY: We're on the Ruby thread, and Ruby stops the world during a
        // GC pass, and we're making Ruby-specific changes, so there is no risk
        // of a data race with other Rust code.
        unsafe {
            for val in (*inner).find_by_id.values() {
                marker.mark(*val);
            }
        }
    }
}

matsadler commented 3 months ago

I think that'll probably work, because you're right, the code that would be trying to mutate that value couldn't be running at that time, there's no danger of reading memory that's actively being written to. However, my understanding is that it's breaking the rules that the Rust compiler is expecting you to uphold, and so is not sound, and Rust could potentially break your code with future optimisations.

I believe the compiler is allowed to assume that the mere existence of an &mut reference means nothing else will read or write that data, regardless of whether or not the &mut reference is actively being used. This means a sufficiently smart compiler could optimise away your mark function when that &mut reference exists, because according to that rule there's no way that mark could be called while the &mut reference exists.

Changing the other borrow/borrow_muts to also use as_ptr would let the compiler know that it can't assume the references are unique, so remove the theoretical danger of optimisations messing with your code. But then you'd be using raw pointers everywhere which is very error prone, and there's no point using RefCell (but maybe I should add an api to Magnus to get the raw pointer out of a wrapped object for cases where that really is the best option).

One potential cause of the BorrowError is the way this code is structured:

impl Inner {
    fn add(&mut self, value: Value) -> Result<(), Error> {
        let field_1: String = value.funcall("field_1", ())?;
        let field_2: String = value.funcall("field_2", ())?;
        let field_3: String = value.funcall("field_3", ())?;

        self.by_field_1
            .entry(field_1.clone())
            .or_default()
            .push(value.into());
        self.by_field_2
            .entry(field_2.clone())
            .or_default()
            .push(value.into());

        self.find_by_field_1.insert(field_1, value.into());
        self.find_by_field_2
            .insert((field_2, field_3), value.into());

        Ok(())
    }
}

impl Cache {
    fn rust_add(&self, value: Value) -> Result<(), Error> {
        self.check_finalized()?;
        self.0.borrow_mut().add(value)?;
        Ok(())
    }
}

The RefCell containing Inner is borrowed, and then there are calls out to Ruby methods. Any call to Ruby could potentially trigger the GC, which could then call your mark function, which then tries to do a borrow, and then it all goes wrong.

If you did this instead:

impl Inner {
    fn add(&mut self, field_1: String, field_2: String, field_3: String) -> Result<(), Error> {
        self.by_field_1
            .entry(field_1.clone())
            .or_default()
            .push(value.into());
        self.by_field_2
            .entry(field_2.clone())
            .or_default()
            .push(value.into());

        self.find_by_field_1.insert(field_1, value.into());
        self.find_by_field_2
            .insert((field_2, field_3), value.into());

        Ok(())
    }
}

impl Cache {
    fn rust_add(&self, value: Value) -> Result<(), Error> {
        self.check_finalized()?;
        let field_1: String = value.funcall("field_1", ())?;
        let field_2: String = value.funcall("field_2", ())?;
        let field_3: String = value.funcall("field_3", ())?;
        self.0.borrow_mut().add(field_1, field_2, field_3)?;
        Ok(())
    }
}

Then there's no chance of triggering the GC while the RefCell is borrowed.

emwalker commented 3 months ago

Very helpful discussion. It makes sense that the Rust compiler might proceed to optimize the mark method away at some point, given that the unsafe code in my example might overlap with a mutable borrow. I'll also avoid calling out to Ruby in the interior of a RefCell to avoid triggering a GC pass.

matsadler / magnus

Question about a performance issue #116