llvm / llvm-project

The LLVM Project is a collection of modular and reusable compiler and toolchain technologies.
http://llvm.org
Other
27.92k stars 11.52k forks source link

[degradation] repeat load operation when returning a struct #84277

Open Absoler opened 6 months ago

Absoler commented 6 months ago

I have found a test case (reduced from https://github.com/llvm/llvm-project/issues/83812) which shows a missed optimization in clang-17.0.6 at -O2:

/* Aggregate used by the reproducer: two int fields followed by a long long. */
struct S0 {
   int  f0;
   int  f1;      /* the field func_bad reads back after the struct copy */
   long long  f2;
};

/* Globals touched by the reproducer. */
int copy = 0;                        /* receives local_s.f1 in func_bad */
struct S0 global_s = {1, 1, 1};      /* destination of the struct store in func_bad */
struct S0 value = {2, 2, 2};         /* source struct returned by get_struct() */
int other = 1;                       /* decremented in func_bad when copy is non-zero */

/* Returns the global `value` by value. */
struct S0 get_struct() { return value; }

/*
 * Reproducer for the redundant load.
 *
 * Assigns the result of get_struct() to local_s and then to global_s in one
 * chained assignment, stores local_s.f1 into `copy`, and decrements `other`
 * when `copy` is non-zero. The parameter `d` is not used in the body.
 */
void func_bad(struct S0 d) {
  struct S0 local_s;
  /* Chained assignment: local_s is written first, then copied to global_s. */
  global_s = local_s = get_struct();
  /* Re-read of a field from the struct that was just copied. */
  copy = local_s.f1;
  if (copy)
    --other;
}

clang-15 through clang-17 fail to reuse the value already loaded from `value` (the one that is stored to `global_s`), while clang-14 does. Here is a comparison of the generated binaries. clang-17: ( https://godbolt.org/z/Gx98vzoYP )

0000000000001140 <func_bad>:
    1140:   movaps 0x2ef9(%rip),%xmm0        # 4040 <value>
    1147:   mov    0x2ef2(%rip),%rax        # 4040 <value>
func_bad():
    114e:   shr    $0x20,%rax
    1152:   movaps %xmm0,0x2ed7(%rip)        # 4030 <global_s>
    1159:   mov    %eax,0x2ef9(%rip)        # 4058 <copy>
    115f:   je     1167 <func_bad+0x27>
    1161:   decl   0x2ee9(%rip)        # 4050 <other>

clang-14:

0000000000401120 <func_bad>:
  401120:   movdqa 0x2f18(%rip),%xmm0        # 404040 <value>
func_bad():
  401128:   movq   %xmm0,%rax
  40112d:   shr    $0x20,%rax
  401131:   movdqa %xmm0,0x2ef7(%rip)        # 404030 <global_s>
  401139:   mov    %eax,0x2f19(%rip)        # 404058 <copy>
  40113f:   je     401148 <func_bad+0x28>
  401141:   addl   $0xffffffff,0x2f08(%rip)        # 404050 <other>

We can see that in clang-17's output the load at 0x1147 could be removed; it would be better to reuse the value in %xmm0, just as clang-14's output does at 0x401128.

Absoler commented 6 months ago

I observed that this behavior appears after X86 DAG->DAG Instruction Selection. Before:

define dso_local void @func_bad(i64 %0, i64 %1) local_unnamed_addr #1 {
  %3 = load <2 x i64>, ptr @value, align 16
  %4 = extractelement <2 x i64> %3, i64 0
  %5 = lshr i64 %4, 32
  %6 = trunc i64 %5 to i32
  store <2 x i64> %3, ptr @global_s, align 16
  store i32 %6, ptr @copy, align 4, !tbaa !6
  %7 = icmp eq i32 %6, 0
  br i1 %7, label %11, label %8

8:                                                ; preds = %2
  %9 = load i32, ptr @other, align 4, !tbaa !6
  %10 = add nsw i32 %9, -1
  store i32 %10, ptr @other, align 4, !tbaa !6
  br label %11

11:                                               ; preds = %8, %2
  ret void

after:

bb.0 (%ir-block.2):
  successors: %bb.2(0x30000000), %bb.1(0x50000000); %bb.2(37.50%), %bb.1(62.50%)

  %2:vr128 = MOVAPSrm $rip, 1, $noreg, @value, $noreg :: (dereferenceable load (s128) from @value)
  %3:gr64 = MOV64rm $rip, 1, $noreg, @value, $noreg :: (dereferenceable load (s64) from @value, align 16)
  %4:gr64 = SHR64ri %3:gr64(tied-def 0), 32, implicit-def dead $eflags
  %5:gr32 = COPY %4.sub_32bit:gr64
  MOVAPSmr $rip, 1, $noreg, @global_s, $noreg, killed %2:vr128 :: (store (s128) into @global_s)
  MOV32mr $rip, 1, $noreg, @copy, $noreg, killed %5:gr32 :: (store (s32) into @copy, !tbaa !6)
  TEST64rr %4:gr64, %4:gr64, implicit-def $eflags
  JCC_1 %bb.2, 4, implicit $eflags
  JMP_1 %bb.1

bb.1 (%ir-block.8):
; predecessors: %bb.0
  successors: %bb.2(0x80000000); %bb.2(100.00%)

  DEC32m $rip, 1, $noreg, @other, $noreg, implicit-def dead $eflags :: (store (s32) into @other, !tbaa !6), (dereferenceable load (s32) from @other, !tbaa !6)

bb.2 (%ir-block.11):
; predecessors: %bb.0, %bb.1

  RET 0

It seems that the translation from extractelement to MOV64rm is where the optimization is missed.