Open Geotale opened 5 months ago
Maybe the type cast caused the missing opt? If code like this
#[no_mangle]
pub fn src(a: u8) -> bool {
let b = (a & 4 != 0 || a & 1 != 0) ;
b
}
#[no_mangle]
pub fn tgt(a: u8) -> bool {
a & 5 != 0
}
Would resulting:
@tgt = unnamed_addr alias i1 (i8), ptr @src
define noundef zeroext i1 @src(i8 noundef %a) unnamed_addr {
start:
%0 = and i8 %a, 5
%b.0 = icmp ne i8 %0, 0
ret i1 %b.0
}
Godbolt: https://godbolt.org/z/79YoYd1GE
With that change the same branching thing occurs, but it seems like the second run of InstCombine could catch the pattern of
%_3 = and i8 %a, 4
%0 = icmp eq i8 %_3, 0
%1 = and i8 %a, 1
%2 = icmp ne i8 %1, 0
%b.sroa.0.0 = select i1 %0, i1 %2, i1 true
ret i1 %b.sroa.0.0
but not the case using add
instead :o
And I test C++ code
int src(int num) {
return 2 + (((num & 4) != 0 )|| ((num & 1) != 0));
}
with below output
define dso_local noundef i32 @src(int)(i32 noundef %num) local_unnamed_addr {
entry:
%0 = and i32 %num, 5
%.not = icmp eq i32 %0, 0
%add = select i1 %.not, i32 2, i32 3
ret i32 %add
}
https://godbolt.org/z/dMfnfnf6z
And print ir in rustc
define noundef i8 @src(i8 noundef %a, i8 noundef %_c) unnamed_addr {
%_5 = and i8 %a, 4
%0 = icmp eq i8 %_5, 0
%1 = and i8 %a, 1
%2 = add nuw nsw i8 %1, 1
%_4.sroa.0.0 = select i1 %0, i8 %2, i8 2
ret i8 %_4.sroa.0.0
}
https://godbolt.org/z/T9vhEnv4n
So, I think maybe the rustc do some Constant Propagation and cause the missed opt?
Comparing the very initial IRs could give a hint I think? :o
define noundef i32 @src(i32 noundef %a) unnamed_addr {
start:
%_3 = alloca [1 x i8], align 1
call void @llvm.lifetime.start.p0(i64 1, ptr %_3)
%_4 = and i32 %a, 4
%0 = icmp eq i32 %_4, 0
br i1 %0, label %bb2, label %bb1
bb2: ; preds = %start
%_5 = and i32 %a, 1
%1 = icmp ne i32 %_5, 0
%2 = zext i1 %1 to i8
store i8 %2, ptr %_3, align 1
br label %bb3
bb1: ; preds = %start
store i8 1, ptr %_3, align 1
br label %bb3
bb3: ; preds = %bb1, %bb2
%3 = load i8, ptr %_3, align 1
%4 = trunc i8 %3 to i1
%_2 = zext i1 %4 to i32
call void @llvm.lifetime.end.p0(i64 1, ptr %_3)
%_0 = add i32 1, %_2
ret i32 %_0
}
in Rust, vs C++'s
define dso_local noundef i32 @src(i32 noundef %num) {
entry:
%num.addr = alloca i32, align 4
store i32 %num, ptr %num.addr, align 4
%0 = load i32, ptr %num.addr, align 4
%and = and i32 %0, 4
%cmp = icmp ne i32 %and, 0
br i1 %cmp, label %lor.end, label %lor.rhs
lor.rhs: ; preds = %entry
%1 = load i32, ptr %num.addr, align 4
%and1 = and i32 %1, 1
%cmp2 = icmp ne i32 %and1, 0
br label %lor.end
lor.end: ; preds = %lor.rhs, %entry
%2 = phi i1 [ true, %entry ], [ %cmp2, %lor.rhs ]
%conv = zext i1 %2 to i32
%add = add nsw i32 2, %conv
ret i32 %add
}
Due to C++ using phi, SimplifyCFGPass can run before even SROA, leaving Rust's with multiple labels while C++'s only has one, which is quickly simplified into the optimal code by InstCombine in C++. The big difference is that by the time Rust's is a single label it's the entire addition-select thing, vs C++ where it's something more expected for InstCombine to handle I think ^^
define dso_local noundef i32 @src(i32 noundef %num) local_unnamed_addr {
entry:
%and = and i32 %num, 4
%cmp = icmp ne i32 %and, 0
%and1 = and i32 %num, 1
%cmp2 = icmp ne i32 %and1, 0
%0 = select i1 %cmp, i1 true, i1 %cmp2
%conv = zext i1 %0 to i32
%add = add nuw nsw i32 2, %conv
ret i32 %add
}
Example code:
Resulting in the IR:
This specific pattern is never caught, but the cause for it is due to the original comparisons lying on different branches which InstCombine didn't account for In many situations, missing this optimization causes multiple unnecessary branches to occur (the code I was testing with did
(a & !3) + 4 * ((a & 2 != 0 && (a & 4 != 0 || a & 1 != 0)) as u8)
, which similarly never gets the chance to optimize this away due to the separate branches, but they remain on different branches the entire time!!)Tested with nightly Rustc on Godbolt Godbolt: https://godbolt.org/z/K5GP5dGq1 Alive2: https://alive2.llvm.org/ce/z/evhsnx