rust-lang / rust

Empowering everyone to build reliable and efficient software.
https://www.rust-lang.org

Big performance problem with closed intervals looping #45222

Open leonardo-m opened 6 years ago

leonardo-m commented 6 years ago

In my code I've essentially stopped using loops over ranges closed on the right (the old ... syntax, recently written as ..=) because they cause performance problems. Here is a simple example that shows the problem:

#![feature(inclusive_range_syntax)]
#![allow(private_no_mangle_fns)]

#[inline(never)]
#[no_mangle]
fn foo1(n: u64) -> u64 {
    let mut count = 0;
    for _ in 0 .. n {
        for j in (0 .. n + 1).rev() {
            count += j;
        }
    }
    count
}

#[inline(never)]
#[no_mangle]
fn foo2(n: u64) -> u64 {
    let mut count = 0;
    for _ in 0 .. n {
        for j in (0 ..= n).rev() {
            count += j;
        }
    }
    count
}

fn main() {
    let n: u64 = std::env::args().nth(1).unwrap().parse().unwrap();
    let what: u32 = std::env::args().nth(2).unwrap().parse().unwrap();

    match what {
        1 => println!("{}", foo1(n)),
        2 => println!("{}", foo2(n)),
        _ => panic!(),
    }
}

Compiled with the latest nightly:

rustc 1.22.0-nightly (d6d711dd8 2017-10-10)
binary: rustc
commit-hash: d6d711dd8f7ad5885294b8e1f0009a23dc1f8b1f
commit-date: 2017-10-10
host: x86_64-pc-windows-gnu
release: 1.22.0-nightly
LLVM version: 4.0

Compiled with: rustc -O test.rs

Running it with foo1 takes 0.02 seconds:

...>elaps test 100000 1
500005000000000

Running it with foo2 takes about 13.65 seconds:

...>elaps test 100000 2
500005000000000
The asm I am seeing using "--emit asm":

```asm
foo1:
    testq   %rcx, %rcx
    je      .LBB5_1
    movq    %rcx, %r8
    imulq   %r8, %r8
    leaq    -1(%rcx), %rdx
    movq    %rcx, %rax
    mulq    %rdx
    shldq   $63, %rax, %rdx
    subq    %rdx, %r8
    cmpq    $3, %rcx
    jbe     .LBB5_3
    movq    %rcx, %rdx
    andq    $-4, %rdx
    je      .LBB5_3
    movd    %r8, %xmm0
    pshufd  $68, %xmm0, %xmm2
    leaq    -4(%rdx), %r9
    movl    %r9d, %eax
    shrl    $2, %eax
    incl    %eax
    andq    $3, %rax
    je      .LBB5_8
    cmpq    $-1, %rcx
    pxor    %xmm0, %xmm0
    pxor    %xmm3, %xmm3
    je      .LBB5_11
    movdqa  %xmm2, %xmm3
.LBB5_11:
    negq    %rax
    xorl    %r10d, %r10d
    pxor    %xmm1, %xmm1
    .p2align 4, 0x90
.LBB5_12:
    paddq   %xmm3, %xmm0
    paddq   %xmm3, %xmm1
    addq    $4, %r10
    incq    %rax
    jne     .LBB5_12
    jmp     .LBB5_13
.LBB5_3:
    xorl    %eax, %eax
    xorl    %edx, %edx
.LBB5_4:
    xorl    %r9d, %r9d
    cmpq    $-1, %rcx
    cmoveq  %r9, %r8
    .p2align 4, 0x90
.LBB5_5:
    incq    %rdx
    addq    %r8, %rax
    cmpq    %rcx, %rdx
    jb      .LBB5_5
.LBB5_19:
    retq
.LBB5_1:
    xorl    %eax, %eax
    retq
.LBB5_8:
    xorl    %r10d, %r10d
    pxor    %xmm0, %xmm0
    pxor    %xmm1, %xmm1
.LBB5_13:
    cmpq    $12, %r9
    jb      .LBB5_18
    cmpq    $-1, %rcx
    pxor    %xmm3, %xmm3
    je      .LBB5_16
    movdqa  %xmm2, %xmm3
.LBB5_16:
    movq    %rdx, %rax
    subq    %r10, %rax
    .p2align 4, 0x90
.LBB5_17:
    paddq   %xmm3, %xmm0
    paddq   %xmm3, %xmm1
    paddq   %xmm3, %xmm0
    paddq   %xmm3, %xmm1
    paddq   %xmm3, %xmm0
    paddq   %xmm3, %xmm1
    paddq   %xmm3, %xmm0
    paddq   %xmm3, %xmm1
    addq    $-16, %rax
    jne     .LBB5_17
.LBB5_18:
    paddq   %xmm1, %xmm0
    pshufd  $78, %xmm0, %xmm1
    paddq   %xmm0, %xmm1
    movd    %xmm1, %rax
    cmpq    %rcx, %rdx
    jne     .LBB5_4
    jmp     .LBB5_19

foo2:
    pushq   %rsi
    pushq   %rdi
    pushq   %rbx
    testq   %rcx, %rcx
    je      .LBB6_1
    testb   $1, %cl
    jne     .LBB6_4
    xorl    %eax, %eax
    xorl    %r8d, %r8d
    cmpq    $1, %rcx
    jne     .LBB6_11
    jmp     .LBB6_23
.LBB6_1:
    xorl    %eax, %eax
    jmp     .LBB6_23
.LBB6_4:
    xorl    %r8d, %r8d
    movq    $-1, %r9
    xorl    %r10d, %r10d
    movq    %rcx, %r11
    xorl    %eax, %eax
    jmp     .LBB6_5
    .p2align 4, 0x90
.LBB6_8:
    addq    %r11, %rax
    movq    %rdi, %r10
    movq    %rdx, %r11
.LBB6_5:
    cmpq    %r11, %r10
    movl    $1, %esi
    cmovbq  %r9, %rsi
    cmoveq  %r8, %rsi
    testq   %rsi, %rsi
    movl    $1, %edi
    movl    $0, %edx
    je      .LBB6_8
    cmpq    $-1, %rsi
    jne     .LBB6_9
    leaq    -1(%r11), %rdx
    movq    %r10, %rdi
    jmp     .LBB6_8
.LBB6_9:
    movl    $1, %r8d
    cmpq    $1, %rcx
    je      .LBB6_23
.LBB6_11:
    xorl    %r9d, %r9d
    movq    $-1, %r10
    .p2align 4, 0x90
.LBB6_12:
    xorl    %r11d, %r11d
    movq    %rcx, %rdx
    jmp     .LBB6_13
    .p2align 4, 0x90
.LBB6_16:
    addq    %rdx, %rax
    movq    %rbx, %r11
    movq    %rsi, %rdx
.LBB6_13:
    cmpq    %rdx, %r11
    movl    $1, %edi
    cmovbq  %r10, %rdi
    cmoveq  %r9, %rdi
    testq   %rdi, %rdi
    movl    $1, %ebx
    movl    $0, %esi
    je      .LBB6_16
    cmpq    $-1, %rdi
    jne     .LBB6_17
    leaq    -1(%rdx), %rsi
    movq    %r11, %rbx
    jmp     .LBB6_16
    .p2align 4, 0x90
.LBB6_17:
    addq    $2, %r8
    xorl    %r11d, %r11d
    movq    %rcx, %rdx
    jmp     .LBB6_18
    .p2align 4, 0x90
.LBB6_21:
    addq    %rdx, %rax
    movq    %rbx, %r11
    movq    %rsi, %rdx
.LBB6_18:
    cmpq    %rdx, %r11
    movl    $1, %edi
    cmovbq  %r10, %rdi
    cmoveq  %r9, %rdi
    testq   %rdi, %rdi
    movl    $1, %ebx
    movl    $0, %esi
    je      .LBB6_21
    cmpq    $-1, %rdi
    jne     .LBB6_22
    leaq    -1(%rdx), %rsi
    movq    %r11, %rbx
    jmp     .LBB6_21
    .p2align 4, 0x90
.LBB6_22:
    cmpq    %rcx, %r8
    jb      .LBB6_12
.LBB6_23:
    popq    %rbx
    popq    %rdi
    popq    %rsi
    retq
```
ExpHP commented 6 years ago

Picking the low-hanging fruit for minimization:

#![feature(inclusive_range_syntax)]
#![allow(private_no_mangle_fns)]

#[inline(never)]
#[no_mangle]
fn triangle_exc(n: u64) -> u64 {
    let mut count = 0;
    for j in (0 .. n + 1) {
        count += j;
    }
    count
}

#[inline(never)]
#[no_mangle]
fn triangle_inc(n: u64) -> u64 {
    let mut count = 0;
    for j in 0 ..= n {
        count += j;
    }
    count
}

fn main() {
    let n: u64 = std::env::args().nth(1).unwrap().parse().unwrap();

    println!("{}", triangle_exc(n));
    println!("{}", triangle_inc(n));
}

Good:

//-----------------------------------------
       │     0000000000007300 <triangle_exc>:
       │     triangle_exc():
       │       inc    %rdi
       │     ↓ je     8c
       │       cmp    $0x3,%rdi
       │     ↓ jbe    7b
       │       mov    %rdi,%rcx
       │       and    $0xfffffffffffffffc,%rcx
       │     ↓ je     7b
       │       lea    -0x4(%rcx),%rax
       │       mov    %eax,%esi
       │       shr    $0x2,%esi
       │       inc    %esi
       │       and    $0x3,%rsi
       │     ↓ je     8f
       │       neg    %rsi
       │       mov    $0x1,%edx
       │       movq   %rdx,%xmm0
       │       pslldq $0x8,%xmm0
       │       pxor   %xmm2,%xmm2
       │       xor    %edx,%edx
       │       movdqa _fini+0x44,%xmm3
       │       movdqa _fini+0x54,%xmm4
       │       pxor   %xmm1,%xmm1
       │       data16 nopw %cs:0x0(%rax,%rax,1)
       │ 60:   paddq  %xmm0,%xmm2
       │       paddq  %xmm0,%xmm1
       │       paddq  %xmm3,%xmm1
       │       add    $0x4,%rdx
       │       paddq  %xmm4,%xmm0
       │       inc    %rsi
       │     ↑ jne    60
       │ 7b:   xor    %eax,%eax
       │       xor    %ecx,%ecx
       │       nop
       │ 80:   add    %rcx,%rax
       │       inc    %rcx
       │       cmp    %rdi,%rcx
       │     ↑ jb     80
       │ 8b: ← retq
       │ 8c:   xor    %eax,%eax
       │     ← retq
       │ 8f:   xor    %edx,%edx
       │       mov    $0x1,%esi
       │       movq   %rsi,%xmm0
       │       pslldq $0x8,%xmm0
       │       pxor   %xmm2,%xmm2
       │       pxor   %xmm1,%xmm1
       │ a8:   cmp    $0xc,%rax
       │       movdqa %xmm2,%xmm6
       │     ↓ jb     106
       │       mov    %rcx,%rax
       │       sub    %rdx,%rax
       │       movdqa _fini+0x64,%xmm3
       │       movdqa _fini+0x74,%xmm4
       │       movdqa _fini+0x84,%xmm5
       │ d0:   paddq  %xmm0,%xmm2
       │       paddq  %xmm0,%xmm1
 20.00 │       movdqa %xmm0,%xmm6
 10.00 │       paddq  %xmm6,%xmm6
       │       paddq  %xmm6,%xmm1
 10.00 │       paddq  %xmm2,%xmm6
 30.00 │       paddq  %xmm0,%xmm6
       │       paddq  %xmm0,%xmm1
 10.00 │       paddq  %xmm3,%xmm6
       │       paddq  %xmm4,%xmm1
       │       paddq  %xmm5,%xmm0
 20.00 │       movdqa %xmm6,%xmm2
       │     ↑ jne    d0
       │106:   paddq  %xmm1,%xmm6
       │       pshufd $0x4e,%xmm6,%xmm0
       │       paddq  %xmm6,%xmm0
       │       movq   %xmm0,%rax
       │       cmp    %rcx,%rdi
       │     ↑ jne    80
       │     ↑ jmpq   8b

Bad:

       │    0000000000007430 <triangle_inc>:
       │    triangle_inc():
       │      xor    %r8d,%r8d
       │      mov    $0xffffffffffffffff,%r9
       │      xor    %r10d,%r10d
       │      xor    %eax,%eax
       │    ↓ jmp    29
       │      data16 data16 data16 data16 data16 nopw %cs:0x0(%rax,%rax,1)
       │20:   add    %r10,%rax
  1.79 │      mov    %rdx,%rdi
  2.98 │      mov    %rsi,%r10
 17.86 │29:   cmp    %rdi,%r10
       │      mov    $0x1,%ecx
  7.14 │      cmovb  %r9,%rcx
 16.07 │      cmove  %r8,%rcx
  3.57 │      test   %rcx,%rcx
  1.79 │      mov    $0x0,%edx
  1.79 │      mov    $0x1,%esi
 14.29 │    ↑ je     20
       │      cmp    $0xffffffffffffffff,%rcx
       │    ↓ jne    57
  2.98 │      lea    0x1(%r10),%rsi
  3.57 │      mov    %rdi,%rdx
 26.19 │    ↑ jmp    20
       │57: ← retq

Lost some kind of fast path?

arthurprs commented 6 years ago

Apparently LLVM is much happier optimizing the open interval with SIMD.

kennytm commented 6 years ago

Referring to the example in https://github.com/rust-lang/rust/issues/45222#issuecomment-335998038, the first function is recognized by LLVM's loop-vectorize pass, while the second isn't.

$ rustc +nightly --crate-type staticlib -C debuginfo=1 -C codegen-units=1 -C opt-level=3 -C panic=abort -C target-cpu=native -C remark=loop-vectorize 1.rs

note: optimization analysis for loop-vectorize at 1.rs:9:0: loop not vectorized: loop control flow is not understood by vectorizer

note: optimization missed for loop-vectorize at 1.rs:9:0: loop not vectorized

After vectorization the .. loop becomes just a * (a+1) / 2, which explains the huge time difference.
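
For reference, the closed form being referred to looks like this (illustrative only; overflow is ignored and this is not the exact expression LLVM emits):

```rust
// Illustrative: what the vectorized/strength-reduced exclusive-range loop
// effectively computes for `(0..n + 1).sum::<u64>()`.
fn triangle(n: u64) -> u64 {
    n * (n + 1) / 2
}
```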

If we black-box the j in count += j when benchmarking, the result becomes more realistic: a 2.6× slowdown rather than a 680× one. A sketch of the black-boxing is shown after the timings below:

$ time ./1 100000 1
500005000000000

real    0m5.680s
user    0m5.645s
sys 0m0.012s

$ time ./1 100000 2
500005000000000

real    0m15.088s
user    0m15.015s
sys 0m0.026s
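
One way the black-boxing described above could look; this is a sketch, the exact harness was not posted in the thread:

```rust
#![feature(test)]
extern crate test;

// Sketch: black_box(j) keeps LLVM from collapsing the inner loop into a
// closed-form expression, so the loop overhead itself is what gets measured.
#[inline(never)]
fn foo2_blackboxed(n: u64) -> u64 {
    let mut count = 0;
    for _ in 0..n {
        for j in (0..=n).rev() {
            count += test::black_box(j);
        }
    }
    count
}
```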

We may see if upgrading to LLVM 6 can help this case.

Also note that the two pieces of code are not equivalent: the n+1 version cannot properly handle the case n == u64::max_value() (although rare).

kennytm commented 6 years ago

I've checked again using the LLVM 6 build. The performance is improved, but there is still a large gap (2.6× → 2.0×).

Vectorization is still not recognized for 0 ..= n with the naked j.

Timings:

```sh
$ rustc +nightly -C codegen-units=1 -C opt-level=3 -C target-cpu=native 1.rs
$ time ./1 30000 2
13500450000000

real    0m5.389s
user    0m5.136s
sys     0m0.012s

$ time ./1 30000 1
13500450000000

real    0m2.195s
user    0m2.192s
sys     0m0.000s

$ rustc +38bd38147d2fa21f8a684b019fc0763adf8fd436 -C codegen-units=1 -C opt-level=3 -C target-cpu=native 1.rs
$ time ./1 30000 2
13500450000000

real    0m4.445s
user    0m4.332s
sys     0m0.000s

$ time ./1 30000 1
13500450000000

real    0m2.330s
user    0m2.184s
sys     0m0.016s
```
scottmcm commented 6 years ago

This might just be the classic external-iteration-is-slower-sometimes problem. Note that even (0..=n).sum() is currently generating unfortunate code.

Fix for internal iteration is up at https://github.com/rust-lang/rust/pull/48012
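
For illustration, the external-iteration loop versus the internal-iteration equivalent that the linked PR targets (a sketch, not code from the PR):

```rust
// External iteration: the `for` loop drives RangeInclusive::next(), which has
// to re-check the "am I finished?" state on every pass.
fn sum_external(n: u64) -> u64 {
    let mut count = 0;
    for j in 0..=n {
        count += j;
    }
    count
}

// Internal iteration: sum() goes through fold/try_fold, which a range type can
// implement as a plain bounded loop without carrying that extra state.
fn sum_internal(n: u64) -> u64 {
    (0..=n).sum()
}
```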

ollie27 commented 6 years ago

I believe this can be fixed by adding an extra field to RangeInclusive like this:

struct FixedRangeInclusive {
    start: u64,
    end: u64,
    done: bool,
}

fn fixed_range_inclusive(start: u64, end: u64) -> FixedRangeInclusive {
    FixedRangeInclusive {
        start,
        end,
        done: false,
    }
}

impl Iterator for FixedRangeInclusive {
    type Item = u64;
    fn next(&mut self) -> Option<Self::Item> {
        if !self.done {
            if self.start == self.end {
                self.done = true;
            }
            let new = self.start.wrapping_add(1);
            Some(std::mem::replace(&mut self.start, new))
        } else {
            None
        }
    }
}

Check out the assembly on the playground.
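
A hypothetical driver for that comparison (the functions below are not from the playground link, just one way to surface the codegen side by side):

```rust
// Hypothetical: sum via the proposed iterator so its assembly can be compared
// against the same sum over the standard `0..=n`.
#[no_mangle]
pub fn triangle_fixed(n: u64) -> u64 {
    fixed_range_inclusive(0, n).sum()
}

#[no_mangle]
pub fn triangle_std(n: u64) -> u64 {
    (0..=n).sum()
}
```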

kennytm commented 6 years ago

The current two-field RangeInclusive follows from rust-lang/rfcs#1980. The done field was the original design in RFC 1192 but was changed due to https://github.com/rust-lang/rfcs/pull/1192#issuecomment-137864421.

ollie27 commented 6 years ago

I'm aware that RangeInclusive has gone through many different designs, but the current design was clearly not chosen with performance in mind. Ideally RangeInclusive wouldn't have any public fields, so these kinds of changes could be made easily.

ExpHP commented 6 years ago

Of course ideally RangeInclusive wouldn't have any public fields so these kind of changes can be made easily.

This would be jarring, given that Range does have public data members.

It is unfortunate. Were this not the case I could almost picture something like this:

pub struct RangeInclusive<T> {
    // NOTE: not pub
    start: T, // actually, these should probably be ManuallyDrop<T> 
    end: T,   // or union MaybeUninit<T> { value: T, empty: () }
    done: bool,
}

impl<T> RangeInclusive<T> {
    // Expose an API that matches the functionality of the enum type
    #[inline] pub fn new(start: T, end: T) -> Self { ... }
    #[inline] pub fn new_done() -> Self { ... }
    #[inline] pub fn endpoints(&self) -> Option<(&T, &T)> { ... }
    #[inline] pub fn endpoints_mut(&mut self) -> Option<(&mut T, &mut T)> { ... }
    #[inline] pub fn into_endpoints(self) -> Option<(T, T)> { ... }
}

and it seems to me (note: haven't tested) that this should optimize just as well as the three-field struct, since it IS the three-field struct (just with statically enforced usage patterns). But I suppose that, even then, it would seem questionable to have a standard library type that simulates an enum (rather than being one) solely for performance concerns.


Edit: I misread somewhat and thought that the enum was the current proposal.

scottmcm commented 6 years ago

@leonardo-m Can you share some non-trivial examples where the current form is a problem? https://github.com/rust-lang/rust/pull/48012 will make it so that the example here is fine if written the easier way, count += (0 ..= n).sum().

For simple things like sums of iota, it's easy to get suboptimal codegen from all kinds of different iterators. For example, using for x in (0..3).chain(3..x) to add things up, instead of folding the same iterator, has the identical problem to the one raised here: https://godbolt.org/g/tYt7TX
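
The comparison behind that link is roughly of this shape (assumed; the exact snippet lives on godbolt):

```rust
// External iteration over a chained iterator...
pub fn chain_for_loop(n: u32) -> u32 {
    let mut total = 0;
    for v in (0..3u32).chain(3..n) {
        total += v;
    }
    total
}

// ...versus folding the same iterator, which typically optimizes better
// because Chain implements fold/try_fold directly.
pub fn chain_fold(n: u32) -> u32 {
    (0..3u32).chain(3..n).fold(0, |acc, v| acc + v)
}
```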

Edit: https://github.com/rust-lang/rust/pull/48057 has also improved things in recent nightlies, though it's still not perfect.

kennytm commented 6 years ago

For the record: enabling Polly (#50044) doesn't fix the issue.

Stargateur commented 6 years ago

@ollie27 Just a note: if you are going to use a bool, your example always performs two tests. Something like this might be faster, because it only needs the second test for the last two values:

impl Iterator for FixedRangeInclusive {
    type Item = u64;
    fn next(&mut self) -> Option<Self::Item> {
        if self.start < self.end {
            let new = self.start.wrapping_add(1);
            Some(std::mem::replace(&mut self.start, new))
        } else if !self.done {
            self.done = true;
            Some(self.start)
        } else {
            None
        }
    }
}
leonardo-m commented 5 years ago

To answer issue #56516, this is a first example of the performance problem; better examples could follow. The code example is from: http://ericniebler.com/2014/04/27/range-comprehensions/

fn triples() -> impl Iterator<Item=(u32, u32, u32)> {
    (1 ..).flat_map(|z| (1 .. z + 1)
                        .flat_map(move |x| (x .. z + 1u32)
                                           .filter(move |&y| x.pow(2) + y.pow(2) == z.pow(2))
                                           .map(move |y| (x, y, z))))
}

fn main() {
    let result: u32 = triples().take(3_000).map(|(x, y, z)| x + y + z).sum();
    println!("{}", result); // 10650478, about 2.8 seconds.
}

If I replace the open intervals with closed ones:

fn triples() -> impl Iterator<Item=(u32, u32, u32)> {
    (1 ..).flat_map(|z| (1u32 ..= z)
                        .flat_map(move |x| (x ..= z)
                                           .filter(move |&y| x.pow(2) + y.pow(2) == z.pow(2))
                                           .map(move |y| (x, y, z))))
}

fn main() {
    let result: u32 = triples().take(3_000).map(|(x, y, z)| x + y + z).sum();
    println!("{}", result);
}

For the second version I am seeing a run-time of about 6 seconds.

leonardo-m commented 5 years ago

Lot of discussion here:

https://old.reddit.com/r/rust/comments/ab7hsi/comparing_pythagorean_triples_in_c_d_and_rust/

frol commented 5 years ago

#58122 has significantly improved the situation!

My benchmark, based on the @leonardo-m snippet:

```rust
#![feature(test)]
extern crate test;

use test::Bencher;

fn triples_exclusive() -> impl Iterator<Item = (u32, u32, u32)> {
    (1u32..).flat_map(|z| {
        (1..z + 1).flat_map(move |x| {
            (x..z + 1)
                .filter(move |&y| x.pow(2) + y.pow(2) == z.pow(2))
                .map(move |y| (x, y, z))
        })
    })
}

fn triples_inclusive() -> impl Iterator<Item = (u32, u32, u32)> {
    (1u32..).flat_map(|z| {
        (1..=z).flat_map(move |x| {
            (x..=z)
                .filter(move |&y| x.pow(2) + y.pow(2) == z.pow(2))
                .map(move |y| (x, y, z))
        })
    })
}

#[bench]
fn range_exclusive(b: &mut Bencher) {
    b.iter(|| triples_exclusive().take(1_000).map(|(x, y, z)| x + y + z).sum::<u32>());
}

#[bench]
fn range_inclusive(b: &mut Bencher) {
    b.iter(|| triples_inclusive().take(1_000).map(|(x, y, z)| x + y + z).sum::<u32>());
}
```

Here are the results on my laptop:

Run 1:

test range_exclusive ... bench: 169,132,205 ns/iter (+/- 9,048,718)
test range_inclusive ... bench: 173,425,958 ns/iter (+/- 16,198,459)

Run 2:

test range_exclusive ... bench: 172,896,754 ns/iter (+/- 12,204,477)
test range_inclusive ... bench: 174,383,382 ns/iter (+/- 12,032,995)

Run 3:

test range_exclusive ... bench: 169,798,685 ns/iter (+/- 11,155,761)
test range_inclusive ... bench: 174,589,161 ns/iter (+/- 13,358,378)

The inclusive range is still consistently slower than the exclusive variant. Some might say the difference is not that significant, but avoiding ..= may end up becoming a "clever" optimization trick, like so many of them in the C++ world. Also, in my opinion, the existence of this performance difference goes against the "zero-cost" claims, so I think this issue should be kept open.

/cc @matthieu-m

leonardo-m commented 5 years ago

The percentage performance difference I see in my code is quite a bit larger than the 1-2% difference shown above, so better benchmarks are necessary.

matthieu-m commented 5 years ago

@frol

I agree that the performance drop between exclusive and inclusive ranges is annoying and should ideally be eliminated if possible; however, I don't see it as a failure of zero-overhead abstraction.

If you were hand-coding an inclusive range with a loop, you would also need to handle either the starting or the ending bound specially, leading to different codegen than for exclusive ranges.
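
A minimal sketch of that special-casing, assuming start <= end; the end bound is checked before the increment so that end == u64::MAX cannot overflow:

```rust
// Hand-written inclusive loop: the exit test runs on the end bound *before*
// the increment, which is exactly the extra branch/state `..=` has to carry.
fn sum_inclusive_by_hand(start: u64, end: u64) -> u64 {
    let mut total = 0u64;
    let mut i = start;
    loop {
        total = total.wrapping_add(i);
        if i == end {
            break;
        }
        i += 1;
    }
    total
}
```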

For me there are two issues:

- The API is water under the bridge for stability reasons, so we need to concentrate on the codegen issues.
- I think the next step should be involving folks knowledgeable about LLVM, who could understand why the optimizer fails so hard and either point to a specific pattern to avoid or improve LLVM so that it doesn't.

I won't be following this up, as I have other projects, so I invite anybody interested to pick up the flag and carry on.


@leonardo-m

For these benchmarks specifically? In my benchmarks I was seeing less than 1% between exclusive and inclusive, which is consistent with what is reported here.

Note that using external iteration (a for loop) may not optimize as well as using internal iteration.

Stargateur commented 5 years ago

@frol

Also, in my opinion, the existence of this performance difference goes against the "zero-cost" claims

I disagree: 0..5 is not the same thing as 0..=4. The zero-cost claim is about the difference versus manually writing an inclusive loop yourself, e.g. iterating a u8 from 0 to 255.
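
For illustration, the u8 case mentioned above (a sketch): covering every u8 value without ..= needs either a wider loop variable or an explicit break, because i <= 255 is always true for a u8.

```rust
// Manual "0 to 255 inclusive" loop: the break on u8::MAX replaces the
// condition that an exclusive range cannot express for this type.
fn sum_all_u8_manual() -> u32 {
    let mut total = 0u32;
    let mut i = 0u8;
    loop {
        total += u32::from(i);
        if i == u8::MAX {
            break;
        }
        i += 1;
    }
    total
}

// The same thing written with an inclusive range.
fn sum_all_u8_inclusive() -> u32 {
    (0u8..=u8::MAX).map(u32::from).sum()
}
```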

saethlin commented 2 years ago

:wave: It's been more than a year since the last comment on this issue.

The original examples have been well optimized since inclusive ranges were stabilized. They now look like this: https://godbolt.org/z/oh9WbbacE

example::triangle_exc:
        cmp     rdi, -1
        je      .LBB0_1
        lea     rcx, [rdi - 1]
        mov     rax, rdi
        mul     rcx
        shld    rdx, rax, 63
        add     rdx, rdi
        mov     rax, rdx
        ret
.LBB0_1:
        xor     edx, edx
        mov     rax, rdx
        ret

example::triangle_inc:
        xor     eax, eax
        xor     ecx, ecx
.LBB1_1:
        mov     rdx, rcx
        cmp     rcx, rdi
        adc     rcx, 0
        add     rax, rdx
        cmp     rdx, rdi
        jae     .LBB1_3
        cmp     rcx, rdi
        jbe     .LBB1_1
.LBB1_3:
        ret

Eric Niebler's Pythagorean triples benchmark also behaves differently from what was reported here. Run with the current nightly on a recent x86_64 CPU and the default release profile, there is a consistent (small) preference for inclusive ranges:

test range_exclusive ... bench: 109,594,481 ns/iter (+/- 581,839)
test range_inclusive ... bench: 108,711,065 ns/iter (+/- 502,620)

But the situation is muddied by adding any kind of optimization flag. Here's codegen-units = 1:

test range_exclusive ... bench: 109,135,628 ns/iter (+/- 556,099)
test range_inclusive ... bench: 160,119,738 ns/iter (+/- 180,715)

And here's lto = true:

test range_exclusive ... bench: 161,099,657 ns/iter (+/- 156,865)
test range_inclusive ... bench: 136,806,833 ns/iter (+/- 422,545)

Timings are similarly unstable with the latest stable, 1.56.

The basic result holds up across architectures too; here's the default release profile on an aarch64 laptop:

test range_exclusive ... bench: 429,999,840 ns/iter (+/- 8,341)
test range_inclusive ... bench: 388,264,759 ns/iter (+/- 7,984)

Overall I think different benchmarks are needed here. If anyone has complaints about inclusive range performance, I would like to see a benchmark that demonstrates a consistent regression on a recent compiler.

All these were run on the current nightly (which has NewPM enabled by default):

binary: rustc
commit-hash: 91b931926fd49fc97d1e39f2b8206abf1d77ce7d
commit-date: 2021-10-23
host: x86_64-unknown-linux-gnu
release: 1.58.0-nightly
LLVM version: 13.0.0
Stargateur commented 2 years ago

The following question on Stack Overflow shows a big performance hit from using RangeInclusive:

From:

use num_integer::Roots;

fn main() {
    let v = 984067;

    for i in 1..=v {
        ramanujan(i)
    }
}

fn ramanujan(m: i32) {
    let maxcube = m.cbrt();
    let mut res1 = 0;
    let mut res2 = 0;
    let mut _res3 = 0;
    let mut _res4 = 0;

    for i in 1..=maxcube {
        for j in 1..=maxcube {
            if i * i * i + j * j * j == m {
                res1 = i;
                res2 = j;
                break;
            }
        }
    }

    for k in 1..=maxcube {
        for l in 1..=maxcube {
            if k == res1 || k == res2 || l == res1 || l == res2 {
                continue;
            }
            if k * k * k + l * l * l == m {
                _res3 = k;
                _res4 = l;
                break;
            }
        }
    }
}

To:

use num_integer::Roots;

fn main() {
    let v = 984067;
    for i in 1..=v {
        ramanujan(i)
    }
}

fn ramanujan(m: i32) {
    let maxcube = m.cbrt() + 1;
    let mut res1 = 0;
    let mut res2 = 0;
    let mut res3 = 0;
    let mut res4 = 0;

    for i in 1..maxcube {
        for j in 1..maxcube {
            if i * i * i + j * j * j == m {
                res1 = i;
                res2 = j;
                break;
            }
        }
    }

    for k in 1..maxcube {
        for l in 1..maxcube {
            if k == res1 || k == res2 || l == res1 || l == res2 {
                continue;
            }
            if k * k * k + l * l * l == m {
                res3 = k;
                res4 = l;
                break;
            }
        }
    }
}

Result:

rustc 1.57.0 (f1edd0429 2021-11-29)
From: 0.01s user 0.01s system 0% cpu 18.108 total
To: 0.00s user 0.01s system 0% cpu 3.549 total

rustc 1.59.0-nightly (efec54529 2021-12-04)
From: 0.01s user 0.00s system 0% cpu 17.993 total
To: 0.00s user 0.01s system 0% cpu 3.494 total

Thus I don't know exactly whether this is related to this issue. But since this issue is more or less "problems with RangeInclusive performance", tell me if I should open a separate issue for it instead.

saethlin commented 2 years ago

I minimized the above example into:

pub fn example(m: i32, max: i32) -> i32 {
    for i in 1..(max + 1) {
        if i * i == m { 
            return i;
        }   
    }   
    0   
}

pub fn example_inc(m: i32, max: i32) -> i32 {
    for i in 1..=max {
        if i * i == m { 
            return i;
        }   
    }   
    0   
}

Godbolt link: https://godbolt.org/z/nqTveYvzx (updated shortly after posting to make the two functions semantically identical)

I see a 2x runtime difference between the two, using this program to execute them, commenting out the one I don't want:

#![feature(test)]
extern crate test;
use test::bench::black_box;

use num_integer::Roots;

fn main() {
    let v = 984067;

    for i in 1..=v {
        black_box(example(i, i.cbrt()));
    }

    for i in 1..=v {
        black_box(example_inc(i, i.cbrt()));
    }
}
Stargateur commented 2 years ago
use std::ops::ControlFlow;

pub fn example_try_fold(m: i32, max: i32) -> i32 {
    match (1..=max).try_fold(0, |acc, i: i32| {
        if i * i == m {
            ControlFlow::Break(i)
        } else {
            ControlFlow::Continue(acc)
        }
    }) {
        ControlFlow::Break(i) | ControlFlow::Continue(i) => i,
    }
}

produces more or less the same asm as your example().

leonardo-m commented 2 years ago

LLVM is sometimes able to optimize well enough when there is a single inclusive loop. The real problem appears when you have two or three nested inclusive loops; in that case LLVM produces inefficient loops. So I've mostly banned inclusive loops from my Rust code.
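
For illustration, the kind of shape being described (a made-up example, not a benchmark from this thread):

```rust
// Two nested inclusive loops: in reports like the one above, LLVM handles the
// single-loop case but loses the optimization once loops like these nest.
pub fn nested_inclusive(n: u64) -> u64 {
    let mut count = 0u64;
    for i in 0..=n {
        for j in i..=n {
            count = count.wrapping_add(i ^ j);
        }
    }
    count
}
```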

jorgeperezlara commented 2 months ago

I have experienced this first hand while writing a benchmark (see my Reddit post).

May I suggest adding an unsafe inclusive range for this kind of thing? I could use exclusive ranges, of course, but I'd want the choice to manually disable these checks for the case when uint_MAX is reached (as with most of Rust's safety abstractions).

kennytm commented 2 months ago

I think #123741 can fix this once and for all (by making RangeInclusive::into_iter() return an optimized-for-iteration structure). No need to introduce an unsafe range.
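
For context, a rough sketch of that idea; the names and layout here are assumed, not taken from the linked proposal:

```rust
// Sketch: IntoIterator for the range hands back a dedicated iterator type,
// so the extra "exhausted" flag never has to live in the range itself.
struct InclusiveIter {
    next: u64,
    end: u64,
    exhausted: bool,
}

impl Iterator for InclusiveIter {
    type Item = u64;
    fn next(&mut self) -> Option<u64> {
        if self.exhausted {
            return None;
        }
        let value = self.next;
        if value == self.end {
            self.exhausted = true;
        } else {
            self.next += 1;
        }
        Some(value)
    }
}
```

This is essentially the three-field design discussed earlier in the thread, moved out of RangeInclusive itself and into the iterator it produces.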