rust-lang / rust

MIR: global performance regression with Vec resize() or extend_from_slice(), since 1.12 #40267

Closed: supercurio closed this issue 1 month ago

supercurio commented 7 years ago

This is a bug introduced in rustc 1.12.0 with MIR enabled. It was originally reported, imprecisely, as #40044.

If resize() or extend_from_slice() is called on any vector anywhere in the code, algorithms operating on other, unrelated vectors run slower. The performance impact varies with the compiler target and CPU characteristics.

Real-world examples, measured on an audio DSP algorithm:

armv7

Raspberry Pi 3, Rustc 1.12.0: 14% slower
Raspberry Pi 3, Rustc 1.15.1: 11% slower
Nexus 5, Rustc 1.15: 1% slower

arm

Raspberry Pi, Rustc 1.15.1: 8% slower

x86-64

Core i5-750, Rustc 1.12.1 or Rustc 1.15.1: 0.4% slower

When compiling using Rustc 1.12.0 with MIR disabled, the performance regression does not occur.

Test and demo project

https://github.com/supercurio/rust-issue-mir-vec-slowdown

arielb1 commented 7 years ago

Can confirm.

Semi-minified code (written so as not to depend on formatters, which generate tons of junk IR; plz fix @eddyb by introducing ...):

#![crate_type="rlib"]

/// Audio sample rate for the test set, used for realtime speed
/// calculation
const SAMPLE_RATE: f64 = 48000.0;
/// Total length of samples the filter benchmarks are ran on
const SAMPLE_COUNT: u64 = 524288;
/// Select how many IIR filters should be applied consecutively
/// on each buffer during the benchmark
const FILTER_COUNT: usize = 100;
const BUFFER_LEN: usize = 128;

/// 2nd order biquad filter
#[derive(Copy)]
struct Biquad {
    b0: f64,
    b1: f64,
    b2: f64,
    a1: f64,
    a2: f64,

    x1: f64,
    x2: f64,
    y1: f64,
    y2: f64,
}

impl Clone for Biquad {
    fn clone(&self) -> Biquad {
        *self
    }
}

impl Biquad {
    fn new() -> Biquad {
        Biquad {
            b0: 0.0,
            b1: 0.0,
            b2: 0.0,
            a1: 0.0,
            a2: 0.0,
            x1: 0.0,
            x2: 0.0,
            y1: 0.0,
            y2: 0.0,
        }
    }
}

fn iir(buf: &mut [f64], bq: &mut Biquad) {
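    // Direct Form I biquad: y[n] = b0*x[n] + b1*x[n-1] + b2*x[n-2]
    //                              - a1*y[n-1] - a2*y[n-2]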
    for i in 0..buf.len() {
        let x = buf[i];
        buf[i] = (bq.b0 * x) + (bq.b1 * bq.x1) + (bq.b2 * bq.x2) - (bq.a1 * bq.y1) -
                 (bq.a2 * bq.y2);

        bq.x2 = bq.x1;
        bq.x1 = x;

        bq.y2 = bq.y1;
        bq.y1 = buf[i];
    }
}

#[cfg(slow)]
pub fn foo() {
    println!("Create an empty vector, resized then discarded");
    let mut vec_test: Vec<f64> = Vec::new();
    vec_test.resize(1234, 0.0);
}

#[inline(never)]
pub fn sample(buffer_len: usize) {
    let buffer_count = SAMPLE_COUNT / buffer_len as u64;

    for _ in 0..10 {
        let mut buf = vec![0.0; buffer_len];
        let mut biquads = [Biquad::new(); FILTER_COUNT];
        for _ in 0..buffer_count {
            for f in 0..FILTER_COUNT {
                iir(buf.as_mut_slice(), &mut biquads[f]);
            }
        }
    }
}
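
For timing this outside the original demo project, a minimal driver along the following lines should work. This is a hypothetical sketch, not part of the snippet above: it assumes the #![crate_type="rlib"] line is dropped so the file builds as a binary, and that the slow path is enabled by passing --cfg slow to rustc.

use std::time::Instant;

fn main() {
    // With --cfg slow, `foo` adds a second caller of Vec's internal
    // extend_with_element (via resize), which is what ends up perturbing
    // the optimization of `sample` (see the inlining analysis further down).
    #[cfg(slow)]
    foo();

    let start = Instant::now();
    sample(BUFFER_LEN);
    let elapsed = start.elapsed();
    let secs = elapsed.as_secs() as f64 + elapsed.subsec_nanos() as f64 * 1e-9;

    // Realtime ratio: seconds of audio processed (10 passes over
    // SAMPLE_COUNT samples at SAMPLE_RATE) per second of wall clock.
    let audio_secs = 10.0 * SAMPLE_COUNT as f64 / SAMPLE_RATE;
    println!("{:.3}s elapsed, {:.1}x realtime", secs, audio_secs / secs);
}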

The fast inner loop looks like:


bb7.i:                                            ; preds = %bb7.i, %bb22
  %.in87 = phi double [ %.pre14.i, %bb22 ], [ %236, %bb7.i ]
  %232 = phi double [ %.pre.i, %bb22 ], [ %249, %bb7.i ]
  %iter.sroa.0.013.i = phi i64 [ 0, %bb22 ], [ %234, %bb7.i ]
  %233 = phi <2 x double> [ %226, %bb22 ], [ %248, %bb7.i ]
  %234 = add nuw i64 %iter.sroa.0.013.i, 1
  %235 = getelementptr inbounds double, double* %212, i64 %iter.sroa.0.013.i
  %236 = load double, double* %235, align 8
  %237 = fmul double %232, %227
  %238 = fmul double %236, %228
  %239 = fmul double %.in87, %229
  %240 = fadd double %238, %239
  %241 = fmul <2 x double> %233, %231
  %242 = extractelement <2 x double> %241, i32 0
  %243 = fadd double %240, %242
  %244 = extractelement <2 x double> %241, i32 1
  %245 = fsub double %243, %244
  %246 = fsub double %245, %237
  store double %246, double* %235, align 8
  %exitcond.i = icmp eq i64 %234, %local_len.sroa.4.0.lcssa26.i.i
  %247 = insertelement <2 x double> undef, double %.in87, i32 0
  %248 = insertelement <2 x double> %247, double %246, i32 1
  %249 = extractelement <2 x double> %233, i32 1
  br i1 %exitcond.i, label %bb17.loopexit, label %bb7.i
bb17.loopexit:                                    ; preds = %bb7.i
  store double %236, double* %217, align 8
  %216 = add nuw nsw i64 %iter4.sroa.0.080, 1
  store double %.in87, double* %218, align 8
  store double %246, double* %219, align 8
  store double %249, double* %220, align 8
  %exitcond = icmp eq i64 %216, 100
  br i1 %exitcond, label %bb12.loopexit.loopexit, label %bb22

And the slow one looks like this (notice the additional, un-extracted stores; it looks like some aliasing issue):

bb7.i:                                            ; preds = %bb7.i, %bb22
  %.in = phi double [ %.pre14.i, %bb22 ], [ %239, %bb7.i ]
  %235 = phi double [ %.pre.i, %bb22 ], [ %250, %bb7.i ]
  %iter.sroa.0.013.i = phi i64 [ 0, %bb22 ], [ %237, %bb7.i ]
  %236 = phi <2 x double> [ %231, %bb22 ], [ %252, %bb7.i ]
  %237 = add nuw i64 %iter.sroa.0.013.i, 1
  %238 = getelementptr inbounds double, double* %buf.sroa.0.0.copyload, i64 %iter.sroa.0.013.i
  %239 = load double, double* %238, align 8
  %240 = fmul double %235, %.pre
  %241 = fmul double %239, %.pre86
  %242 = fmul double %.in, %.pre87
  %243 = fadd double %241, %242
  %244 = fmul <2 x double> %236, %233
  %245 = extractelement <2 x double> %244, i32 0
  %246 = fadd double %243, %245
  %247 = extractelement <2 x double> %244, i32 1
  %248 = fsub double %246, %247
  %249 = fsub double %248, %240
  store double %249, double* %238, align 8
  store double %.in, double* %223, align 8
  store double %239, double* %222, align 8
  %250 = extractelement <2 x double> %236, i32 1
  store double %250, double* %225, align 8
  store double %249, double* %224, align 8
  %exitcond.i = icmp eq i64 %237, %buf.sroa.8.0.copyload
  %251 = insertelement <2 x double> undef, double %.in, i32 0
  %252 = insertelement <2 x double> %251, double %249, i32 1
  br i1 %exitcond.i, label %bb17.backedge, label %bb7.i
bb17.backedge:                                    ; preds = %bb7.i
  %234 = add nuw nsw i64 %iter4.sroa.0.080, 1
  %exitcond = icmp eq i64 %234, 100
  br i1 %exitcond, label %bb12.loopexit.loopexit, label %bb22
arielb1 commented 7 years ago

The 4 extra stores are to the biquads array, and the normal store is to the vector returned from __rust_allocate. It seems that LLVM fails to SROA buf for some reason, which prevents it from noticing the noalias coming from __rust_allocate.
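
For what it's worth, the aliasing can also be sidestepped at the source level by keeping the filter state in locals for the duration of the loop, so the per-iteration stores to the Biquad fields disappear entirely. Here is a hypothetical rewrite of iir along those lines (not something proposed in this thread, just an illustration of the aliasing problem):

fn iir_local_state(buf: &mut [f64], bq: &mut Biquad) {
    // Copy the coefficients and the filter state into locals; the loop
    // body then only touches locals and the buffer, so LLVM does not
    // have to assume that a store into `buf` clobbers the filter state.
    let Biquad { b0, b1, b2, a1, a2, .. } = *bq;
    let (mut x1, mut x2, mut y1, mut y2) = (bq.x1, bq.x2, bq.y1, bq.y2);
    for sample in buf.iter_mut() {
        let x = *sample;
        let y = (b0 * x) + (b1 * x1) + (b2 * x2) - (a1 * y1) - (a2 * y2);
        *sample = y;
        x2 = x1;
        x1 = x;
        y2 = y1;
        y1 = y;
    }
    // Write the updated state back once, after the loop.
    bq.x1 = x1;
    bq.x2 = x2;
    bq.y1 = y1;
    bq.y2 = y2;
}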

bluss commented 7 years ago

resize's grow loop has already been patched with SetLenOnDrop for an aliasing issue, maybe that now backfires?
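
For context, SetLenOnDrop is roughly the following shape (a simplified sketch of the libcollections/vec.rs helper, not the verbatim source): the running length lives in a local and is only written back to the vector's len field on drop, so the hot loop does not have to store through self on every iteration.

struct SetLenOnDrop<'a> {
    len: &'a mut usize,
    local_len: usize,
}

impl<'a> SetLenOnDrop<'a> {
    fn new(len: &'a mut usize) -> Self {
        SetLenOnDrop { local_len: *len, len: len }
    }

    fn increment_len(&mut self, increment: usize) {
        // Only the local copy is updated inside the loop.
        self.local_len += increment;
    }
}

impl<'a> Drop for SetLenOnDrop<'a> {
    fn drop(&mut self) {
        // The real length is written exactly once, on scope exit,
        // even if cloning an element panics partway through.
        *self.len = self.local_len;
    }
}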

arielb1 commented 7 years ago

@bluss

The problem is the missing SROA, which seems to result from an un-inlined call to extend_with_element in Vec::from_elem (extend_with_element is a long function that is mostly optimized away once inlined, but it only gets inlined when Vec::from_elem is its only caller).

In the resize call, this happens with Vec::reserve instead. I imagine we would want to move the reserve out of extend_with_element into resize, or something along those lines.

arielb1 commented 7 years ago

Of course, an interesting question is why things are "fast" with non-MIR trans.

arielb1 commented 7 years ago

This patch fixes the slowness:

diff --git a/src/libcollections/vec.rs b/src/libcollections/vec.rs
index 3134e3c2ce..1e7c4feed3 100644
--- a/src/libcollections/vec.rs
+++ b/src/libcollections/vec.rs
@@ -1206,17 +1206,15 @@ impl<T: Clone> Vec<T> {
         let len = self.len();

         if new_len > len {
-            self.extend_with_element(new_len - len, value);
+            self.reserve(new_len - len);
+            unsafe { self.extend_with_element(new_len - len, value) };
         } else {
             self.truncate(new_len);
         }
     }

     /// Extend the vector by `n` additional clones of `value`.
-    fn extend_with_element(&mut self, n: usize, value: T) {
-        self.reserve(n);
-
-        unsafe {
+    unsafe fn extend_with_element(&mut self, n: usize, value: T) {
             let mut ptr = self.as_mut_ptr().offset(self.len() as isize);
             // Use SetLenOnDrop to work around bug where compiler
             // may not realize the store through `ptr` through self.set_len()
@@ -1238,7 +1236,6 @@ impl<T: Clone> Vec<T> {
             }

             // len set by scope guard
-        }
     }

     /// Clones and appends all elements in a slice to the `Vec`.
@@ -1345,7 +1342,11 @@ impl<T: PartialEq> Vec<T> {
 #[stable(feature = "rust1", since = "1.0.0")]
 pub fn from_elem<T: Clone>(elem: T, n: usize) -> Vec<T> {
     let mut v = Vec::with_capacity(n);
-    v.extend_with_element(n, elem);
+    unsafe {
+        // This is safe because we allocated the Vec with enough
+        // capacity.
+        v.extend_with_element(n, elem);
+    }
     v
 }

I still want to check what pre-MIR trans is doing in this context to ensure we are not missing anything important.

supercurio commented 7 years ago

Thanks @arielb1 and @bluss! I'll try this patch also with other similar code where the same issue was also observed.

supercurio commented 7 years ago

@arielb1 I wasn't able to confirm that the patch fixed the slowness: I got identical results with a patched rustc 1.15.1 x86-64 built from source, running on a Core i5-750. It's my first time testing a rustc built from source, though, so I'll double-check. I'll also test on a Raspberry Pi 3.

supercurio commented 7 years ago

Now confirmed working with the patched rustc 1.15.1 x86-64 on the i5-750!

My mistake in the previous test might have been re-compiling and running x.py dist on an already-compiled rustc repo after applying the patch; maybe not all the necessary components were rebuilt that way. Compilation on the Pi 3 is under way, and I look forward to confirming on that platform as well :)

supercurio commented 7 years ago

Confirmed the fix is successful on Raspberry Pi 3 also. Thank you very much @arielb1 , the performance is fantastic!

bluss commented 7 years ago

Thanks, the confirmation of the fix is valuable information. This should stay open until a fix is merged.

arielb1 commented 7 years ago

It seems that the IR created by MIR contains cross-basic-block memsets that confuse LLVM. Running opt-3.9 -S -memcpyopt -bdce -lcssa -licm on the slow code makes it fast again, and I can't find memcpyopt & bdce on our pass list.

arielb1 commented 7 years ago

Paradoxically, the reason LICM makes things fast is that it replaces code in the style of:

  %37 = getelementptr inbounds %Biquad, %Biquad* %35, i64 1
  %38 = bitcast %Biquad* %37 to i8*
  call void @llvm.memset.p0i8.i64(i8* %38, i8 0, i64 72, i32 8, i1 false)
  %39 = getelementptr inbounds %Biquad, %Biquad* %37, i64 1
  %40 = bitcast %Biquad* %39 to i8*
  call void @llvm.memset.p0i8.i64(i8* %40, i8 0, i64 72, i32 8, i1 false)
  %41 = getelementptr inbounds %Biquad, %Biquad* %39, i64 1
  %42 = bitcast %Biquad* %41 to i8*
  call void @llvm.memset.p0i8.i64(i8* %42, i8 0, i64 72, i32 8, i1 false)
  %43 = getelementptr inbounds %Biquad, %Biquad* %41, i64 1
  %44 = bitcast %Biquad* %43 to i8*

with code in the style of:

  call void @llvm.memset.p0i8.i64(i8* %19, i8 0, i64 144, i32 8, i1 false)
  call void @llvm.memset.p0i8.i64(i8* %18, i8 0, i64 72, i32 8, i1 false)
  call void @llvm.memset.p0i8.i64(i8* %21, i8 0, i64 72, i32 8, i1 false)
  call void @llvm.memset.p0i8.i64(i8* %23, i8 0, i64 72, i32 8, i1 false)
  call void @llvm.memset.p0i8.i64(i8* %25, i8 0, i64 72, i32 8, i1 false)
  call void @llvm.memset.p0i8.i64(i8* %27, i8 0, i64 72, i32 8, i1 false)
  call void @llvm.memset.p0i8.i64(i8* %29, i8 0, i64 72, i32 8, i1 false)
  call void @llvm.memset.p0i8.i64(i8* %31, i8 0, i64 72, i32 8, i1 false)
  call void @llvm.memset.p0i8.i64(i8* %33, i8 0, i64 72, i32 8, i1 false)
  call void @llvm.memset.p0i8.i64(i8* %35, i8 0, i64 72, i32 8, i1 false)
  call void @llvm.memset.p0i8.i64(i8* %37, i8 0, i64 72, i32 8, i1 false)
  call void @llvm.memset.p0i8.i64(i8* %39, i8 0, i64 72, i32 8, i1 false)
  call void @llvm.memset.p0i8.i64(i8* %41, i8 0, i64 72, i32 8, i1 false)
  call void @llvm.memset.p0i8.i64(i8* %43, i8 0, i64 72, i32 8, i1 false)
  call void @llvm.memset.p0i8.i64(i8* %45, i8 0, i64 72, i32 8, i1 false)
  call void @llvm.memset.p0i8.i64(i8* %47, i8 0, i64 72, i32 8, i1 false)
  call void @llvm.memset.p0i8.i64(i8* %49, i8 0, i64 72, i32 8, i1 false)
  call void @llvm.memset.p0i8.i64(i8* %51, i8 0, i64 72, i32 8, i1 false)
  call void @llvm.memset.p0i8.i64(i8* %53, i8 0, i64 72, i32 8, i1 false)
  call void @llvm.memset.p0i8.i64(i8* %55, i8 0, i64 72, i32 8, i1 false)
  call void @llvm.memset.p0i8.i64(i8* %57, i8 0, i64 72, i32 8, i1 false)
  call void @llvm.memset.p0i8.i64(i8* %59, i8 0, i64 72, i32 8, i1 false)
  call void @llvm.memset.p0i8.i64(i8* %61, i8 0, i64 72, i32 8, i1 false)
  call void @llvm.memset.p0i8.i64(i8* %63, i8 0, i64 72, i32 8, i1 false)
  call void @llvm.memset.p0i8.i64(i8* %65, i8 0, i64 72, i32 8, i1 false)
  call void @llvm.memset.p0i8.i64(i8* %67, i8 0, i64 72, i32 8, i1 false)
  call void @llvm.memset.p0i8.i64(i8* %69, i8 0, i64 72, i32 8, i1 false)
  call void @llvm.memset.p0i8.i64(i8* %71, i8 0, i64 72, i32 8, i1 false)
  call void @llvm.memset.p0i8.i64(i8* %73, i8 0, i64 72, i32 8, i1 false)
  call void @llvm.memset.p0i8.i64(i8* %75, i8 0, i64 72, i32 8, i1 false)
  call void @llvm.memset.p0i8.i64(i8* %77, i8 0, i64 72, i32 8, i1 false)
  call void @llvm.memset.p0i8.i64(i8* %79, i8 0, i64 72, i32 8, i1 false)

which LLVM, surprisingly enough, handles better. I suppose MemCpyOpt needs a "merge adjacent memsets" optimization.
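
For reference, the 72-byte memsets above are the zero-initialization of the individual Biquad elements (nine f64 fields, 9 * 8 = 72 bytes) of the benchmark's stack array. A hypothetical illustration tying the IR back to the source (make_filters is a name introduced here, not in the original code):

// The benchmark zero-initializes FILTER_COUNT (100) Biquads in a
// single array expression.
fn make_filters() -> [Biquad; FILTER_COUNT] {
    // Ideally this would lower to one 100 * 72 = 7200-byte memset
    // rather than 100 adjacent 72-byte ones.
    [Biquad::new(); FILTER_COUNT]
}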

QIvan commented 4 years ago

Hi! Sorry, is there any news about this problem in 2020? Thanks!

clubby789 commented 1 month ago

#131953 suggests that the fix proposed here no longer has a noticeable impact (only a few changed results, all within the noise threshold), which indicates we've gotten better at optimising this over the last 8 years.