llvm / llvm-project

The LLVM Project is a collection of modular and reusable compiler and toolchain technologies.
http://llvm.org
Other
28.78k stars 11.9k forks source link

(a & C0) != 0 || (a & C1) != 0 Doesn't Always Become (a & (C0 | C1)) != 0 #92333

Open Geotale opened 5 months ago

Geotale commented 5 months ago

Example code:

#[no_mangle]
pub fn src(a: u8) -> u8 {
    1 + ((a & 4 != 0 || a & 1 != 0) as u8)
}

#[no_mangle]
pub fn tgt(a: u8) -> u8 {
    1 + ((a & 5 != 0) as u8)
}

Resulting in the IR:

define noundef i8 @src(i8 noundef %a) unnamed_addr {
start:
  %_4 = and i8 %a, 4
  %0 = icmp eq i8 %_4, 0
  %1 = and i8 %a, 1
  %2 = add nuw nsw i8 %1, 1
  %_3.sroa.0.0 = select i1 %0, i8 %2, i8 2
  ret i8 %_3.sroa.0.0
}

define noundef i8 @tgt(i8 noundef %a) unnamed_addr {
start:
  %_4 = and i8 %a, 5
  %_3.not = icmp eq i8 %_4, 0
  %_0 = select i1 %_3.not, i8 1, i8 2
  ret i8 %_0
}

This specific pattern is never caught, but the cause for it is due to the original comparisons lying on different branches which InstCombine didn't account for In many situations, missing this optimization causes multiple unnecessary branches to occur (the code I was testing with did (a & !3) + 4 * ((a & 2 != 0 && (a & 4 != 0 || a & 1 != 0)) as u8), which similarly never gets the chance to optimize this away due to the separate branches, but they remain on different branches the entire time!!)

Tested with nightly Rustc on Godbolt Godbolt: https://godbolt.org/z/K5GP5dGq1 Alive2: https://alive2.llvm.org/ce/z/evhsnx

wangbyby commented 5 months ago

Maybe the type cast caused the missing opt? If code like this

#[no_mangle]
pub fn src(a: u8) -> bool {
    let b = (a & 4 != 0 || a & 1 != 0) ;
    b
}

#[no_mangle]
pub fn tgt(a: u8) -> bool {
    a & 5 != 0
}

Would resulting:

@tgt = unnamed_addr alias i1 (i8), ptr @src

define noundef zeroext i1 @src(i8 noundef %a) unnamed_addr {
start:
  %0 = and i8 %a, 5
  %b.0 = icmp ne i8 %0, 0
  ret i1 %b.0
}

Godbolt: https://godbolt.org/z/79YoYd1GE

Geotale commented 5 months ago

With that change the same branching thing occurs, but it seems like the second run of InstCombine could catch the pattern of

  %_3 = and i8 %a, 4
  %0 = icmp eq i8 %_3, 0
  %1 = and i8 %a, 1
  %2 = icmp ne i8 %1, 0
  %b.sroa.0.0 = select i1 %0, i1 %2, i1 true
  ret i1 %b.sroa.0.0

but not the case using add instead :o

wangbyby commented 4 months ago

And I test C++ code

int src(int num) {
    return 2 + (((num & 4) != 0 )|| ((num & 1) != 0));
}

with below output

define dso_local noundef i32 @src(int)(i32 noundef %num) local_unnamed_addr {
entry:
  %0 = and i32 %num, 5
  %.not = icmp eq i32 %0, 0
  %add = select i1 %.not, i32 2, i32 3
  ret i32 %add
}

https://godbolt.org/z/dMfnfnf6z


And print ir in rustc

define noundef i8 @src(i8 noundef %a, i8 noundef %_c) unnamed_addr {
  %_5 = and i8 %a, 4
  %0 = icmp eq i8 %_5, 0
  %1 = and i8 %a, 1
  %2 = add nuw nsw i8 %1, 1
  %_4.sroa.0.0 = select i1 %0, i8 %2, i8 2
  ret i8 %_4.sroa.0.0
}

https://godbolt.org/z/T9vhEnv4n

So, I think maybe the rustc do some Constant Propagation and cause the missed opt?

Geotale commented 4 months ago

Comparing the very initial IRs could give a hint I think? :o

define noundef i32 @src(i32 noundef %a) unnamed_addr {
start:
  %_3 = alloca [1 x i8], align 1
  call void @llvm.lifetime.start.p0(i64 1, ptr %_3)
  %_4 = and i32 %a, 4
  %0 = icmp eq i32 %_4, 0
  br i1 %0, label %bb2, label %bb1

bb2:                                              ; preds = %start
  %_5 = and i32 %a, 1
  %1 = icmp ne i32 %_5, 0
  %2 = zext i1 %1 to i8
  store i8 %2, ptr %_3, align 1
  br label %bb3

bb1:                                              ; preds = %start
  store i8 1, ptr %_3, align 1
  br label %bb3

bb3:                                              ; preds = %bb1, %bb2
  %3 = load i8, ptr %_3, align 1
  %4 = trunc i8 %3 to i1
  %_2 = zext i1 %4 to i32
  call void @llvm.lifetime.end.p0(i64 1, ptr %_3)
  %_0 = add i32 1, %_2
  ret i32 %_0
}

in Rust, vs C++'s

define dso_local noundef i32 @src(i32 noundef %num) {
entry:
  %num.addr = alloca i32, align 4
  store i32 %num, ptr %num.addr, align 4
  %0 = load i32, ptr %num.addr, align 4
  %and = and i32 %0, 4
  %cmp = icmp ne i32 %and, 0
  br i1 %cmp, label %lor.end, label %lor.rhs

lor.rhs:                                          ; preds = %entry
  %1 = load i32, ptr %num.addr, align 4
  %and1 = and i32 %1, 1
  %cmp2 = icmp ne i32 %and1, 0
  br label %lor.end

lor.end:                                          ; preds = %lor.rhs, %entry
  %2 = phi i1 [ true, %entry ], [ %cmp2, %lor.rhs ]
  %conv = zext i1 %2 to i32
  %add = add nsw i32 2, %conv
  ret i32 %add
}

Due to C++ using phi, SimplifyCFGPass can run before even SROA, leaving Rust's with multiple labels while C++'s only has one, which is quickly simplified into the optimal code by InstCombine in C++. The big difference is that by the time Rust's is a single label it's the entire addition-select thing, vs C++ where it's something more expected for InstCombine to handle I think ^^

define dso_local noundef i32 @src(i32 noundef %num) local_unnamed_addr {
entry:
  %and = and i32 %num, 4
  %cmp = icmp ne i32 %and, 0
  %and1 = and i32 %num, 1
  %cmp2 = icmp ne i32 %and1, 0
  %0 = select i1 %cmp, i1 true, i1 %cmp2
  %conv = zext i1 %0 to i32
  %add = add nuw nsw i32 2, %conv
  ret i32 %add
}