Comparison of Option<NonZeroU*> is not fully optimized

kennytm commented 6 years ago

Compare:

#![feature(nonzero)]
use std::num::NonZeroU32;

pub fn f(a: Option<NonZeroU32>, b: Option<NonZeroU32>) -> bool {
    a < b
}
pub fn g(a: u32, b: u32) -> bool {
    a < b
}

The functions f and g have equivalent effect, and should compile to the same code. However, the translated f is much more complex, involving needless comparison of the discriminant.

LLVM IR and ASM (in release mode)

```llvm ; playground::f ; Function Attrs: norecurse nounwind readnone uwtable define zeroext i1 @_ZN10playground1f17hd628eff49d2ff60dE(i32, i32) unnamed_addr #0 { start: %2 = icmp ne i32 %0, 0 %3 = icmp ne i32 %1, 0 %4 = xor i1 %2, %3 br i1 %4, label %bb7.i, label %bb6.i bb6.i: ; preds = %start %5 = icmp ult i32 %0, %1 %spec.select = and i1 %2, %5 ret i1 %spec.select bb7.i: ; preds = %start %6 = xor i1 %2, true %7 = and i1 %3, %6 ret i1 %7 } ; playground::g ; Function Attrs: norecurse nounwind readnone uwtable define zeroext i1 @_ZN10playground1g17hea1c394facd055aeE(i32 %a, i32 %b) unnamed_addr #0 { start: %0 = icmp ult i32 %a, %b ret i1 %0 } ``` ```asm playground::f: test edi, edi setne al test esi, esi setne cl xor cl, al je .LBB0_1 test esi, esi setne cl test edi, edi sete al and al, cl ret .LBB0_1: test edi, edi setne cl cmp edi, esi setb al and al, cl ret playground::g: cmp edi, esi setb al ret ```

Note that specializing PartialOrd etc for Option<NonZero*> is not a valid solution, since custom wrapper types of NonZero* will still have the same problem.

#[derive(PartialOrd, PartialEq)]
pub struct S(NonZeroU32);
pub fn f(a: Option<S>, b: Option<S>) -> bool {
    a < b
}

cc #49137.

scottmcm commented 6 years ago

oli-obk commented 6 years ago

I'm assuming we need to change the layout here

https://github.com/rust-lang/rust/blob/5ebf74851d685f75abec7ef4e805f75fc301460c/src/librustc_trans/mir/rvalue.rs#L435

In order to pass a layout that knows that the discriminant is not just an integer, but a subset of an integer.

Also maybe the discriminant intrinsic must be adjusted: https://github.com/rust-lang/rust/blob/31ae4f9fde9b23100c26e5642030128a7e1444ef/src/librustc_trans/intrinsic.rs#L406

Maybe we should add compiler-internal support for subsetting integers? So TypeVariants::Int also has a range field that we can set?

cc @eddyb cc @rust-lang/wg-codegen

hanna-kruppe commented 6 years ago

I don't see how layout factors in, Option<NonZeroU32> is already represented as a single 32 bit integer with 0 as None. The generic PartialOrd is just not fully optimized for this monomorphization. That's an optimization LLVM could just do, it just currently doesn't.

hanna-kruppe commented 6 years ago

To clarify, the Alive link is just to confirm the overall transformation is correct within LLVM IR. Obviously this very specific pattern should not be added verbatim to instcombine (it wouldn't even help since I replaced the branching with select to make Alive accept it). But since this optimization applies only to particular monomorphizations and we can't reasonably special case this in rustc, we should really report this as a LLVM bug, so that they can look into which combination of passes could be improved to catch this.

oli-obk commented 6 years ago

Oh... I assumed that we're missing some assume annotations that would allow this optimization from happening by itself.

eddyb commented 6 years ago

What's happening in <Option as PartialOrd>::le, even? Is this the output of auto-derive? What's the actual code that optimizes to the shown LLVM IR? I'm wondering if this is a derive inefficiency.

kennytm commented 3 years ago

as of 1.57-nightly, the ASM generated for f is becoming even worse.

; playground::f
; Function Attrs: norecurse nounwind nonlazybind readnone uwtable willreturn
define zeroext i1 @_ZN10playground1f17h98eba4f4ca80f23cE(i32 %0, i32 %1) unnamed_addr #0 {
start:
  %2 = icmp ne i32 %0, 0
  %3 = icmp ne i32 %1, 0
  %4 = xor i1 %2, %3
  br i1 %4, label %bb2.i.i, label %bb1.i.i

bb2.i.i:                                          ; preds = %start
  %5 = xor i1 %2, true
  %_3.i.i.i.i = and i1 %3, %5
  %.0.i.i.i.i = select i1 %_3.i.i.i.i, i8 -1, i8 1
  br label %_ZN4core3cmp10PartialOrd2lt17h869f0d8a39125a29E.exit

bb1.i.i:                                          ; preds = %start
  %.not.i.i = icmp eq i32 %0, 0
  %.not5.i.i = icmp eq i32 %1, 0
  %or.cond.i.i = or i1 %.not.i.i, %.not5.i.i
  br i1 %or.cond.i.i, label %_ZN4core3cmp10PartialOrd2lt17h869f0d8a39125a29E.exit, label %bb5.i.i

bb5.i.i:                                          ; preds = %bb1.i.i
  %_3.i.i.i.i.i = icmp ult i32 %0, %1
  %_6.i.i.i.i.i = icmp ne i32 %0, %1
  %spec.select.i.i.i.i.i = zext i1 %_6.i.i.i.i.i to i8
  %.0.i.i.i.i.i = select i1 %_3.i.i.i.i.i, i8 -1, i8 %spec.select.i.i.i.i.i
  br label %_ZN4core3cmp10PartialOrd2lt17h869f0d8a39125a29E.exit

_ZN4core3cmp10PartialOrd2lt17h869f0d8a39125a29E.exit: ; preds = %bb2.i.i, %bb1.i.i, %bb5.i.i
  %.2.i.i = phi i8 [ %.0.i.i.i.i, %bb2.i.i ], [ 0, %bb1.i.i ], [ %.0.i.i.i.i.i, %bb5.i.i ]
  %cond.i = icmp eq i8 %.2.i.i, -1
  ret i1 %cond.i
}

example::f:
        test    edi, edi
        setne   al
        test    esi, esi
        setne   cl
        cmp     al, cl
        je      .LBB0_2
        test    edi, edi
        setne   al
        add     al, al
        add     al, -1
        test    esi, esi
        movzx   ecx, al
        mov     eax, 1
        cmovne  eax, ecx
.LBB0_5:
        cmp     al, -1
        sete    al
        ret
.LBB0_2:
        xor     eax, eax
        test    edi, edi
        je      .LBB0_5
        test    esi, esi
        je      .LBB0_5
        xor     ecx, ecx
        cmp     edi, esi
        setne   cl
        mov     eax, 255
        cmovae  eax, ecx
        cmp     al, -1
        sete    al
        ret

the regression happens on 1.52.0 which upgraded to LLVM 12.

the GCC backend produces less instructions, but still far from ideal:

example::f:
        xor     eax, eax
        test    esi, esi
        je      .L6
        test    edi, edi
        jne     .L12
        mov     eax, 1
        ret
.L12:
        cmp     edi, esi
        setb    al
.L6:
        ret

example::g:
        cmp     edi, esi
        setb    al
        ret

saethlin commented 2 years ago

Most of the above comments are about PartialOrd, but this also afflicts PartialEq. Including on the latest nightly with NewPM.

pub fn h(a: Option<NonZeroU32>, b: Option<NonZeroU32>) -> bool {
    a == b
}

example::h:
        test    edi, edi
        setne   al
        test    esi, esi
        setne   cl
        xor     cl, al
        je      .LBB0_2
        xor     eax, eax
        ret
.LBB0_2:
        test    esi, esi
        sete    al
        test    edi, edi
        sete    cl
        or      cl, al
        cmp     edi, esi
        sete    al
        or      al, cl
        ret

IMO the fact that LLVM can't do the expected optimization to just compare as two u32s is more egregious.

JakobDegen commented 2 years ago

Also not great is this one, but then there's

#[derive(Eq, PartialEq)]
pub enum A {
    A(u8),
    B(u8),
    C(u8),
    D(u8),
    E(u8)
}

pub fn cmp(a: A, b: A) -> bool {
    a == b
}

which currently compiles to what I'm fairly sure is a jump table that all goes to the same place:

example::cmp:
        cmp     dil, dl
        jne     .LBB0_1
        movzx   eax, dil
        lea     rdx, [rip + .LJTI0_0]
        movsxd  rax, dword ptr [rdx + 4*rax]
        add     rax, rdx
        jmp     rax
.LBB0_3:
        cmp     sil, cl
        sete    al
        ret
.LBB0_1:
        xor     eax, eax
        ret
.LJTI0_0:
        .long   .LBB0_3-.LJTI0_0
        .long   .LBB0_3-.LJTI0_0
        .long   .LBB0_3-.LJTI0_0
        .long   .LBB0_3-.LJTI0_0
        .long   .LBB0_3-.LJTI0_0

Edit: Turns out these are unrelated Edit edit: A local fix of the issue on the rustc side shows that LLVM misses the opt even when its allowed, so they are once more related.

JakobDegen commented 2 years ago

I investigated a little and filed an LLVM bug: llvm/llvm-project#52622

curiouscat2018 commented 2 years ago

I don't think we have sufficient information to solve the problem on llvm side. Comparing the two functions:

pub fn less_than_unsigned(a: Option<NonZeroU32>, b: Option<NonZeroU32>) -> bool {
    a < b
}

pub fn less_than_signed(a: Option<NonZeroI32>, b: Option<NonZeroI32>) -> bool {
    a < b
}

llvm-ir generated are almost same, except one line difference of ugt and sgt instruction. However only the first function can be reduced to u32 comparison. The second one can not because None is smaller than any value of Some(NonZeroI32), including zero or negative numbers. Given this fact, how can we know which function the llvm-ir is coming from?

This problem might be solved on std, if we can have a specific PartialEq definition for Option<NonZeroU32>:

impl PartialOrd for Option<NonZeroU32> {
    fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
        unsafe {
            let a = std::intrinsics::transmute::<&Option<NonZeroU32>, &u32>(self);
            let b = std::intrinsics::transmute::<&Option<NonZeroU32>, &u32>(other);
            a.partial_cmp(b)
        }
    }
}

In order to do this, we need to use specification feature and mark the derived partial_cmp function for Option default. Yet the derive macro does not accept parameters like defaultness=default, which is entirely another topic.

cc @eddyb @oli-obk

JakobDegen commented 2 years ago

@curiouscat2018 I'm not sure what you mean. alive2 says that the transformation would be correct for the unsigned case, so LLVM has all the information it needs to continue optimizing

curiouscat2018 commented 2 years ago

@JakobDegen yes the unsigned case can be correctly transformed, but the signed case can not if you try on alive2. Given only llvm-ir, how can we tell whether it's from signed or unsigned case? alive2 verifies whether the transformation is correct, but not how the transformation can be done. My point is, we might not have enough information to do optimization on llvm side. Does this clarification help ?

JakobDegen commented 2 years ago

I don't understand what you mean. If alive2 says that the transformation is legal, then by definition LLVM has all the information it needs to optimize. This is what an optimizer should do after all: We give it a slow version of the LLVM IR, and it turns it into the fast version that is equivalent. There's nothing else we should need to do to help it.

Specifically, with respect to "how can we tell whether it's from [the] signed or unsigned case?": There's no need to tell where the code came from. Optimizers don't work backwards to try and understand the origin of the code, they work forwards from what they have to try and simplify it further. Of course, sometimes that's really hard, but this is not one of those times. In the LLVM IR from before, it's sound to replace %.not5.i.i with false for example.

saethlin commented 2 years ago

I found a way around this: https://godbolt.org/z/bh9dfWdc1

For anyone on mobile, we have an

#[derive(PartialEq, Clone, Copy)]
pub enum SbTag {
    Tagged(NonZeroU64),
    Untagged,
}

For the purposes of codegen this is an Option<NonZeroU64>. Replacing the derived PartialEq with the below code produces the desired codegen for comparison of SbTag.

impl SbTag {
    fn as_u64(&self) -> u64 {
        match self {
            SbTag::Tagged(tag) => tag.get(),
            SbTag::Untagged => 0,
        }
    }
}

impl PartialEq for SbTag {
    fn eq(&self, other: &Self) -> bool {
        self.as_u64() == other.as_u64()
    }
}

I think the culprit here is how LLVM is understanding the exact arrangement of the logic in the derived PartialEq. Which is this:

    fn eq(&self, other: &SbTag) -> bool {
        {
            let __self_vi = ::core::intrinsics::discriminant_value(&*self);
            let __arg_1_vi = ::core::intrinsics::discriminant_value(&*other);
            if true && __self_vi == __arg_1_vi {
                match (&*self, &*other) {
                    (&SbTag::Tagged(ref __self_0), &SbTag::Tagged(ref __arg_1_0)) => {
                        (*__self_0) == (*__arg_1_0)
                    }
                    _ => true,
                }
            } else {
                false
            }
        }
    }

Adjusting the behavior of the derived PartialEq to match what I did by hand might be a bit tricky. But at least it's clear that there is a way to fix this just in Rust?

curiouscat2018 commented 2 years ago

@saethlin I think we can do similar change for Option. Manual override of PartialEq trait for Option<NonZeroU*> should work. @JakobDegen any idea on this?

saethlin commented 2 years ago

@nnethercote is changing how a lot of the built-in derives work, aiming at improving compile time. I think it would be best to re-evaluate this after his PRs land, just to make sure this is still a problem with them.

nikic commented 2 years ago

For reference, alive link for the simplest case (PartialEq of Option): https://alive2.llvm.org/ce/z/DMtwsK

I've been staring at this, but don't really see any obvious sequence of simple transforms that would lead to the desired result. It would be viable to fold select i1 %cmp1, i1 %cmp2, i1 false to either %cmp1 or %cmp2, but that doesn't make things substantially simpler. Having a xor as dominating condition is tough.

nnethercote commented 2 years ago

My changes don't affect eq for enums, other than getting rid of the redundant true && in the discriminant check. (My changes affect eq for structs a lot more.)

scottmcm commented 1 year ago

@nikic Any ideas if we rewrote Option::eq to give you a different dominating condition? Like we could try putting the payload comparison behind an &, maybe https://rust.godbolt.org/z/vKMa4zzsx. Alive again says the transformation is legal https://alive2.llvm.org/ce/z/PuZ-SZ, though I also don't see an obvious path to get there.

EDIT: Actually, I found one thing that opt doesn't do today (https://github.com/llvm/llvm-project/issues/58624), at least.

mlindner commented 2 months ago

kennytm commented 2 months ago

as of the current latest Rust (1.82-nightly), thanks to #122024,

use std::num::NonZeroU32;
#[no_mangle]
pub fn f(a: Option<NonZeroU32>, b: Option<NonZeroU32>) -> bool {
    a < b
}
#[no_mangle]
pub fn g(a: u32, b: u32) -> bool {
    a < b
}

; Function Attrs: mustprogress nofree norecurse nosync nounwind nonlazybind willreturn memory(none) uwtable
define noundef zeroext i1 @f(i32 noundef %0, i32 noundef %1) unnamed_addr #0 {
start:
  %2 = icmp eq i32 %0, 0
  %3 = icmp ne i32 %1, 0
  %4 = icmp ult i32 %0, %1
  %_3.sroa.0.0 = select i1 %2, i1 %3, i1 %4
  ret i1 %_3.sroa.0.0
}
; Function Attrs: mustprogress nofree norecurse nosync nounwind nonlazybind willreturn memory(none) uwtable
define noundef zeroext i1 @g(i32 noundef %a, i32 noundef %b) unnamed_addr #0 {
start:
  %_0 = icmp ult i32 %a, %b
  ret i1 %_0
}

f:                                      # @f
# %bb.0:
    xor eax, eax
    test    esi, esi
    setne   al
    xor ecx, ecx
    cmp edi, esi
    setb    cl
    test    edi, edi
    cmovne  eax, ecx
                                        # kill: def $al killed $al killed $eax
    ret
                                        # -- End function

g:                                      # @g
# %bb.0:
    cmp edi, esi
    setb    al
    ret
                                        # -- End function

That is, there still remains a redundant a == None comparison

if a == 0 {
    b != 0
} else {
    a < b
}

danilaml commented 2 months ago

@kennytm looks like missed optimization in upstream llvm: https://alive2.llvm.org/ce/z/gFYb4q

rust-lang / rust

Comparison of Option<NonZeroU*> is not fully optimized #49892