Can confirm.
Semi-minified code (so as not to depend on formatters, which generate tons of junk IR - plz fix @eddyb by introducing ...):
#![crate_type="rlib"]
/// Audio sample rate for the test set, used for realtime speed
/// calculation
const SAMPLE_RATE: f64 = 48000.0;
/// Total length of samples the filter benchmarks are ran on
const SAMPLE_COUNT: u64 = 524288;
/// Select how many IIR filters should be applied consecutively
/// on each buffer during the benchmark
const FILTER_COUNT: usize = 100;
const BUFFER_LEN: usize = 128;
/// 2nd order biquad filter
#[derive(Copy)]
struct Biquad {
b0: f64,
b1: f64,
b2: f64,
a1: f64,
a2: f64,
x1: f64,
x2: f64,
y1: f64,
y2: f64,
}
impl Clone for Biquad {
fn clone(&self) -> Biquad {
*self
}
}
impl Biquad {
fn new() -> Biquad {
Biquad {
b0: 0.0,
b1: 0.0,
b2: 0.0,
a1: 0.0,
a2: 0.0,
x1: 0.0,
x2: 0.0,
y1: 0.0,
y2: 0.0,
}
}
}
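/// Apply one Direct Form I biquad pass in place:
/// y[n] = b0*x[n] + b1*x[n-1] + b2*x[n-2] - a1*y[n-1] - a2*y[n-2]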
fn iir(buf: &mut [f64], bq: &mut Biquad) {
for i in 0..buf.len() {
let x = buf[i];
buf[i] = (bq.b0 * x) + (bq.b1 * bq.x1) + (bq.b2 * bq.x2) - (bq.a1 * bq.y1) -
(bq.a2 * bq.y2);
bq.x2 = bq.x1;
bq.x1 = x;
bq.y2 = bq.y1;
bq.y1 = buf[i];
}
}
#[cfg(slow)]
pub fn foo() {
println!("Create an empty vector, resized then discarded");
let mut vec_test: Vec<f64> = Vec::new();
vec_test.resize(1234, 0.0);
}
#[inline(never)]
pub fn sample(buffer_len: usize) {
let buffer_count = SAMPLE_COUNT / buffer_len as u64;
for _ in 0..10 {
let mut buf = vec![0.0; buffer_len];
let mut biquads = [Biquad::new(); FILTER_COUNT];
for _ in 0..buffer_count {
for f in 0..FILTER_COUNT {
iir(buf.as_mut_slice(), &mut biquads[f]);
}
}
}
}
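For reference, a hypothetical driver for the reproduction above (not part of the minified crate, which is an rlib; the real harness lives in the linked demo project). It shows how the otherwise-unused SAMPLE_RATE and BUFFER_LEN constants would feed the realtime-speed calculation mentioned in their doc comments:
fn main() {
    let start = std::time::Instant::now();
    sample(BUFFER_LEN);
    let elapsed = start.elapsed();
    let secs = elapsed.as_secs() as f64 + elapsed.subsec_nanos() as f64 * 1e-9;
    // sample() makes 10 passes over SAMPLE_COUNT samples at SAMPLE_RATE Hz.
    let audio_secs = 10.0 * SAMPLE_COUNT as f64 / SAMPLE_RATE;
    println!("{:.1}x realtime", audio_secs / secs);
}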
The fast inner loop looks like:
bb7.i: ; preds = %bb7.i, %bb22
%.in87 = phi double [ %.pre14.i, %bb22 ], [ %236, %bb7.i ]
%232 = phi double [ %.pre.i, %bb22 ], [ %249, %bb7.i ]
%iter.sroa.0.013.i = phi i64 [ 0, %bb22 ], [ %234, %bb7.i ]
%233 = phi <2 x double> [ %226, %bb22 ], [ %248, %bb7.i ]
%234 = add nuw i64 %iter.sroa.0.013.i, 1
%235 = getelementptr inbounds double, double* %212, i64 %iter.sroa.0.013.i
%236 = load double, double* %235, align 8
%237 = fmul double %232, %227
%238 = fmul double %236, %228
%239 = fmul double %.in87, %229
%240 = fadd double %238, %239
%241 = fmul <2 x double> %233, %231
%242 = extractelement <2 x double> %241, i32 0
%243 = fadd double %240, %242
%244 = extractelement <2 x double> %241, i32 1
%245 = fsub double %243, %244
%246 = fsub double %245, %237
store double %246, double* %235, align 8
%exitcond.i = icmp eq i64 %234, %local_len.sroa.4.0.lcssa26.i.i
%247 = insertelement <2 x double> undef, double %.in87, i32 0
%248 = insertelement <2 x double> %247, double %246, i32 1
%249 = extractelement <2 x double> %233, i32 1
br i1 %exitcond.i, label %bb17.loopexit, label %bb7.i
bb17.loopexit: ; preds = %bb7.i
store double %236, double* %217, align 8
%216 = add nuw nsw i64 %iter4.sroa.0.080, 1
store double %.in87, double* %218, align 8
store double %246, double* %219, align 8
store double %249, double* %220, align 8
%exitcond = icmp eq i64 %216, 100
br i1 %exitcond, label %bb12.loopexit.loopexit, label %bb22
And the slow one looks like this (notice the additional stores that are not sunk out of the loop - looks like some aliasing issue):
bb7.i: ; preds = %bb7.i, %bb22
%.in = phi double [ %.pre14.i, %bb22 ], [ %239, %bb7.i ]
%235 = phi double [ %.pre.i, %bb22 ], [ %250, %bb7.i ]
%iter.sroa.0.013.i = phi i64 [ 0, %bb22 ], [ %237, %bb7.i ]
%236 = phi <2 x double> [ %231, %bb22 ], [ %252, %bb7.i ]
%237 = add nuw i64 %iter.sroa.0.013.i, 1
%238 = getelementptr inbounds double, double* %buf.sroa.0.0.copyload, i64 %iter.sroa.0.013.i
%239 = load double, double* %238, align 8
%240 = fmul double %235, %.pre
%241 = fmul double %239, %.pre86
%242 = fmul double %.in, %.pre87
%243 = fadd double %241, %242
%244 = fmul <2 x double> %236, %233
%245 = extractelement <2 x double> %244, i32 0
%246 = fadd double %243, %245
%247 = extractelement <2 x double> %244, i32 1
%248 = fsub double %246, %247
%249 = fsub double %248, %240
store double %249, double* %238, align 8
store double %.in, double* %223, align 8
store double %239, double* %222, align 8
%250 = extractelement <2 x double> %236, i32 1
store double %250, double* %225, align 8
store double %249, double* %224, align 8
%exitcond.i = icmp eq i64 %237, %buf.sroa.8.0.copyload
%251 = insertelement <2 x double> undef, double %.in, i32 0
%252 = insertelement <2 x double> %251, double %249, i32 1
br i1 %exitcond.i, label %bb17.backedge, label %bb7.i
bb17.backedge: ; preds = %bb7.i
%234 = add nuw nsw i64 %iter4.sroa.0.080, 1
%exitcond = icmp eq i64 %234, 100
br i1 %exitcond, label %bb12.loopexit.loopexit, label %bb22
The 4 extra stores are to the biquads array, and the normal store is to the vector returned from __rust_allocate. It seems that LLVM fails to SROA buf for some reason, which prevents it from noticing the noalias coming from __rust_allocate.
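Not from the thread, but for illustration: a source-level sketch of the transformation LLVM performs in the fast case. Copying the filter state into locals before the loop and writing it back afterwards keeps the per-sample updates in registers, so the result no longer depends on LLVM proving that buf and the Biquad don't alias:
fn iir_hoisted(buf: &mut [f64], bq: &mut Biquad) {
    // Hoist the state into locals so the loop body never stores to *bq.
    let (mut x1, mut x2, mut y1, mut y2) = (bq.x1, bq.x2, bq.y1, bq.y2);
    for sample in buf.iter_mut() {
        let x = *sample;
        let y = bq.b0 * x + bq.b1 * x1 + bq.b2 * x2 - bq.a1 * y1 - bq.a2 * y2;
        *sample = y;
        x2 = x1;
        x1 = x;
        y2 = y1;
        y1 = y;
    }
    // Write the state back once, after the loop.
    bq.x1 = x1;
    bq.x2 = x2;
    bq.y1 = y1;
    bq.y2 = y2;
}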
resize's grow loop has already been patched with SetLenOnDrop to work around an aliasing issue; maybe that now backfires?
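For context, a simplified sketch of the SetLenOnDrop guard referenced here, reconstructed from memory of libcollections/vec.rs (details may differ). The idea is that the element-writing loop only bumps a local counter, and the vector's len field is written back once on drop, so the loop body contains no store that could be taken to alias the element pointer:
struct SetLenOnDrop<'a> {
    len: &'a mut usize,
    local_len: usize,
}

impl<'a> SetLenOnDrop<'a> {
    fn new(len: &'a mut usize) -> Self {
        SetLenOnDrop { local_len: *len, len: len }
    }

    fn increment_len(&mut self, increment: usize) {
        self.local_len += increment;
    }
}

impl<'a> Drop for SetLenOnDrop<'a> {
    fn drop(&mut self) {
        // Single write-back of the length, after all elements are in place.
        *self.len = self.local_len;
    }
}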
@bluss
The problem is the missing SROA, which seems to result from an un-inlined call to extend_with_element in Vec::from_elem (extend_with_element is a long function that is actually mostly optimized out after it is inlined, but it is only inlined because Vec::from_elem is its only caller). In the resize call this happens with Vec::reserve instead. I imagine we would want to move the reserve out of extend_with_element into resize or something.
Of course, an interesting question is why things are "fast" on non-MIR trans.
This patch fixes the slowness:
diff --git a/src/libcollections/vec.rs b/src/libcollections/vec.rs
index 3134e3c2ce..1e7c4feed3 100644
--- a/src/libcollections/vec.rs
+++ b/src/libcollections/vec.rs
@@ -1206,17 +1206,15 @@ impl<T: Clone> Vec<T> {
let len = self.len();
if new_len > len {
- self.extend_with_element(new_len - len, value);
+ self.reserve(new_len - len);
+ unsafe { self.extend_with_element(new_len - len, value) };
} else {
self.truncate(new_len);
}
}
/// Extend the vector by `n` additional clones of `value`.
- fn extend_with_element(&mut self, n: usize, value: T) {
- self.reserve(n);
-
- unsafe {
+ unsafe fn extend_with_element(&mut self, n: usize, value: T) {
let mut ptr = self.as_mut_ptr().offset(self.len() as isize);
// Use SetLenOnDrop to work around bug where compiler
// may not realize the store through `ptr` through self.set_len()
@@ -1238,7 +1236,6 @@ impl<T: Clone> Vec<T> {
}
// len set by scope guard
- }
}
/// Clones and appends all elements in a slice to the `Vec`.
@@ -1345,7 +1342,11 @@ impl<T: PartialEq> Vec<T> {
#[stable(feature = "rust1", since = "1.0.0")]
pub fn from_elem<T: Clone>(elem: T, n: usize) -> Vec<T> {
let mut v = Vec::with_capacity(n);
- v.extend_with_element(n, elem);
+ unsafe {
+ // This is safe because we allocated the Vec with enough
+ // capacity.
+ v.extend_with_element(n, elem);
+ }
v
}
I'll still want to check what pre-MIR is doing in this context to ensure we are not missing anything important.
Thanks @arielb1 and @bluss! I'll also try this patch with other similar code where the same issue was observed.
@arielb1 I wasn't able to confirm that the patch fixed the slowness: I got identical results with a patched rustc 1.15.1 (x86-64, built from source) running on a Core i5-750. It's my first time testing a rustc built from source though, so I'll double-check. I'll test on the Raspberry Pi 3 as well.
Now confirmed working with the patched rustc 1.15.1 x86-64 on the i5-750!
My mistake in the previous test might have been to re-compile and x.py dist an already-compiled rustc repo after applying the patch; maybe not all the necessary components were rebuilt that way. Compilation on the Pi 3 is under way, and I look forward to confirming on that platform as well :)
Confirmed the fix is successful on the Raspberry Pi 3 as well. Thank you very much @arielb1, the performance is fantastic!
Thanks, the confirmation of the fix is valuable information. This should stay open until a fix is merged in.
It seems that the IR created by MIR contains cross-basic-block memsets that confuse LLVM. Running opt-3.9 -S -memcpyopt -bdce -lcssa -licm on the slow code makes it fast again, and I can't find memcpyopt & bdce on our pass list.
Paradoxically, the reason LICM makes things fast is that it replaces code in the style of:
%37 = getelementptr inbounds %Biquad, %Biquad* %35, i64 1
%38 = bitcast %Biquad* %37 to i8*
call void @llvm.memset.p0i8.i64(i8* %38, i8 0, i64 72, i32 8, i1 false)
%39 = getelementptr inbounds %Biquad, %Biquad* %37, i64 1
%40 = bitcast %Biquad* %39 to i8*
call void @llvm.memset.p0i8.i64(i8* %40, i8 0, i64 72, i32 8, i1 false)
%41 = getelementptr inbounds %Biquad, %Biquad* %39, i64 1
%42 = bitcast %Biquad* %41 to i8*
call void @llvm.memset.p0i8.i64(i8* %42, i8 0, i64 72, i32 8, i1 false)
%43 = getelementptr inbounds %Biquad, %Biquad* %41, i64 1
%44 = bitcast %Biquad* %43 to i8*
with code in the style of:
call void @llvm.memset.p0i8.i64(i8* %19, i8 0, i64 144, i32 8, i1 false)
call void @llvm.memset.p0i8.i64(i8* %18, i8 0, i64 72, i32 8, i1 false)
call void @llvm.memset.p0i8.i64(i8* %21, i8 0, i64 72, i32 8, i1 false)
call void @llvm.memset.p0i8.i64(i8* %23, i8 0, i64 72, i32 8, i1 false)
call void @llvm.memset.p0i8.i64(i8* %25, i8 0, i64 72, i32 8, i1 false)
call void @llvm.memset.p0i8.i64(i8* %27, i8 0, i64 72, i32 8, i1 false)
call void @llvm.memset.p0i8.i64(i8* %29, i8 0, i64 72, i32 8, i1 false)
call void @llvm.memset.p0i8.i64(i8* %31, i8 0, i64 72, i32 8, i1 false)
call void @llvm.memset.p0i8.i64(i8* %33, i8 0, i64 72, i32 8, i1 false)
call void @llvm.memset.p0i8.i64(i8* %35, i8 0, i64 72, i32 8, i1 false)
call void @llvm.memset.p0i8.i64(i8* %37, i8 0, i64 72, i32 8, i1 false)
call void @llvm.memset.p0i8.i64(i8* %39, i8 0, i64 72, i32 8, i1 false)
call void @llvm.memset.p0i8.i64(i8* %41, i8 0, i64 72, i32 8, i1 false)
call void @llvm.memset.p0i8.i64(i8* %43, i8 0, i64 72, i32 8, i1 false)
call void @llvm.memset.p0i8.i64(i8* %45, i8 0, i64 72, i32 8, i1 false)
call void @llvm.memset.p0i8.i64(i8* %47, i8 0, i64 72, i32 8, i1 false)
call void @llvm.memset.p0i8.i64(i8* %49, i8 0, i64 72, i32 8, i1 false)
call void @llvm.memset.p0i8.i64(i8* %51, i8 0, i64 72, i32 8, i1 false)
call void @llvm.memset.p0i8.i64(i8* %53, i8 0, i64 72, i32 8, i1 false)
call void @llvm.memset.p0i8.i64(i8* %55, i8 0, i64 72, i32 8, i1 false)
call void @llvm.memset.p0i8.i64(i8* %57, i8 0, i64 72, i32 8, i1 false)
call void @llvm.memset.p0i8.i64(i8* %59, i8 0, i64 72, i32 8, i1 false)
call void @llvm.memset.p0i8.i64(i8* %61, i8 0, i64 72, i32 8, i1 false)
call void @llvm.memset.p0i8.i64(i8* %63, i8 0, i64 72, i32 8, i1 false)
call void @llvm.memset.p0i8.i64(i8* %65, i8 0, i64 72, i32 8, i1 false)
call void @llvm.memset.p0i8.i64(i8* %67, i8 0, i64 72, i32 8, i1 false)
call void @llvm.memset.p0i8.i64(i8* %69, i8 0, i64 72, i32 8, i1 false)
call void @llvm.memset.p0i8.i64(i8* %71, i8 0, i64 72, i32 8, i1 false)
call void @llvm.memset.p0i8.i64(i8* %73, i8 0, i64 72, i32 8, i1 false)
call void @llvm.memset.p0i8.i64(i8* %75, i8 0, i64 72, i32 8, i1 false)
call void @llvm.memset.p0i8.i64(i8* %77, i8 0, i64 72, i32 8, i1 false)
call void @llvm.memset.p0i8.i64(i8* %79, i8 0, i64 72, i32 8, i1 false)
which LLVM, surprisingly enough, handles better. I suppose MemCpyOpt needs a "merge adjacent memsets" optimization.
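For reference, those 72-byte memsets correspond to zero-initializing the biquads array in sample() above; a small check using the types defined earlier (my own note, not from the thread):
fn memset_origin() {
    // A Biquad is nine f64 fields with no padding, i.e. 9 * 8 = 72 bytes,
    // matching the i64 72 length in each memset call; the 100-element array
    // spans 7200 adjacent bytes that could in principle be a single memset.
    assert_eq!(std::mem::size_of::<Biquad>(), 72);
    let _biquads = [Biquad::new(); FILTER_COUNT];
}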
Hi! Sorry, is there any news about this problem in 2020? Thanks!
A bug introduced in rustc 1.12.0 with MIR enabled. It was originally reported, imprecisely, as #40044.
If anywhere in the code a vector is either resized with resize() or extended with extend_from_slice(), algorithms operating on any other vector will run slower. The performance impact varies with the compilation target and CPU characteristics.
Real-world examples, measured on an audio DSP algorithm:

armv7
Raspberry Pi 3, Rustc 1.12.0: 14% slower
Raspberry Pi 3, Rustc 1.15.1: 11% slower
Nexus 5, Rustc 1.15: 1% slower

arm
Raspberry Pi, Rustc 1.15.1: 8% slower

x86-64
Core i5-750, Rustc 1.12.1 or Rustc 1.15.1: 0.4% slower
When compiling using Rustc 1.12.0 with MIR disabled, the performance regression does not occur.
Test and demo project:
https://github.com/supercurio/rust-issue-mir-vec-slowdown