Open Absoler opened 6 months ago
I observed that this behavior is introduced by the X86 DAG->DAG Instruction Selection pass. Before the pass:
define dso_local void @func_bad(i64 %0, i64 %1) local_unnamed_addr #1 {
%3 = load <2 x i64>, ptr @value, align 16
%4 = extractelement <2 x i64> %3, i64 0
%5 = lshr i64 %4, 32
%6 = trunc i64 %5 to i32
store <2 x i64> %3, ptr @global_s, align 16
store i32 %6, ptr @copy, align 4, !tbaa !6
%7 = icmp eq i32 %6, 0
br i1 %7, label %11, label %8
8: ; preds = %2
%9 = load i32, ptr @other, align 4, !tbaa !6
%10 = add nsw i32 %9, -1
store i32 %10, ptr @other, align 4, !tbaa !6
br label %11
11: ; preds = %8, %2
ret void
After instruction selection:
bb.0 (%ir-block.2):
successors: %bb.2(0x30000000), %bb.1(0x50000000); %bb.2(37.50%), %bb.1(62.50%)
%2:vr128 = MOVAPSrm $rip, 1, $noreg, @value, $noreg :: (dereferenceable load (s128) from @value)
%3:gr64 = MOV64rm $rip, 1, $noreg, @value, $noreg :: (dereferenceable load (s64) from @value, align 16)
%4:gr64 = SHR64ri %3:gr64(tied-def 0), 32, implicit-def dead $eflags
%5:gr32 = COPY %4.sub_32bit:gr64
MOVAPSmr $rip, 1, $noreg, @global_s, $noreg, killed %2:vr128 :: (store (s128) into @global_s)
MOV32mr $rip, 1, $noreg, @copy, $noreg, killed %5:gr32 :: (store (s32) into @copy, !tbaa !6)
TEST64rr %4:gr64, %4:gr64, implicit-def $eflags
JCC_1 %bb.2, 4, implicit $eflags
JMP_1 %bb.1
bb.1 (%ir-block.8):
; predecessors: %bb.0
successors: %bb.2(0x80000000); %bb.2(100.00%)
DEC32m $rip, 1, $noreg, @other, $noreg, implicit-def dead $eflags :: (store (s32) into @other, !tbaa !6), (dereferenceable load (s32) from @other, !tbaa !6)
bb.2 (%ir-block.11):
; predecessors: %bb.0, %bb.1
RET 0
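It seems that instruction selection lowers the `extractelement` to a separate `MOV64rm` load from `@value` instead of reusing the vector already loaded by `MOVAPSrm`, so the redundant load is not optimized away.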
I have found a test case (reduced from https://github.com/llvm/llvm-project/issues/83812) which shows a missed optimization in clang-17.0.6 at -O2.
clang-15 through clang-17 fail to reuse the value loaded from `global_s`, while clang-14 could. Here is a comparison of the generated binaries: clang-17 ( https://godbolt.org/z/Gx98vzoYP ), clang-14:
We can see that in clang-17's output the load at 0x1147 could be removed; it would be better to reuse the value in %xmm0, just as clang-14's output does at 0x401128.