llvm / llvm-project

The LLVM Project is a collection of modular and reusable compiler and toolchain technologies.
http://llvm.org
Other
29.4k stars 12.15k forks source link

Optimizations (regarding Loop Unswitch) that LLVM may have missed #66855

Open ZY546 opened 1 year ago

ZY546 commented 1 year ago

Hello, we found some optimizations (regarding Loop Unswitch) that LLVM may have missed. We would greatly appreicate if you can take a look and let us know you think.

Here are two examples:

Example 1: https://godbolt.org/z/czcPThGrT

extern int var_17;
extern int var_29;
extern int var_32;
extern int var_33;

void test(int var_0, int var_1, int var_2, int var_3, int var_4, int var_5, int var_6, int var_7, int var_8, int var_9, int var_10, int var_11, int var_12, int var_13) {
    for (int i_0 = ((var_9) - (1181835238))/*2*/; i_0 < ((var_4) + (1034010885))/*11*/; i_0 += 1/*1*/) 
    {
        if (var_12)
        {
            var_29 += (var_4 ? (var_2 ? var_0 : var_5) : var_12) ? (var_13 ? var_2 : var_6 ? (-1897050213) : var_6) : (var_1 ? var_5 ? (-296177307) : (-12666761) : 921754019);
            var_32 += (var_12 ? var_0 : var_4) ? (var_2 ? 2097151 : (var_3 ? var_8 : var_0)) : var_2;
            var_33 += ((var_5 ? (-1) : (-1550420726)) ? (var_3 ? 2147483647 : var_6) : 2147483647) ? (var_0 ? (var_5 ? var_8 : var_9) : 699136968) : (var_2 ? var_7 : (var_7 ? -1 : 133169152));
            var_17 += var_13 ? (var_5 ? var_1 : 2147483625) : var_4;
        }

    }
}

Because var_12 is a loop invariant so we can hoist the if condition out of the loop, as a result, the if condition will only be evaluated once. In particular, if var_12 is false, the for loop doesn't need to be executed at all.

But LLVM misses this optimization with a trunk build (https://godbolt.org/z/czcPThGrT). Also, this example works as expected on LLVM 16.

The result of a trunk build:

for.body:
  %i_0.0157 = phi i32 [ %sub, %for.body.lr.ph ], [ %add97, %for.inc ]
  %add29142156 = phi i32 [ %var_29.promoted, %for.body.lr.ph ], [ %add29141, %for.inc ]
  %add50134144155 = phi i32 [ %var_32.promoted, %for.body.lr.ph ], [ %add50134143, %for.inc ]
  %add85149154 = phi i32 [ %var_33.promoted, %for.body.lr.ph ], [ %add85148, %for.inc ]
  %add96151153 = phi i32 [ %var_17.promoted, %for.body.lr.ph ], [ %add96150, %for.inc ]
  br i1 %tobool.not, label %for.inc, label %if.then

if.then:
  %add29 = add nsw i32 %add29142156, %cond28
  store i32 %add29, ptr @var_29, align 4
  br i1 %tobool35.not, label %cond.end48, label %cond.true36
test(int, int, int, int, int, int, int, int, int, int, int, int, int, int):                  # @test(int, int, int, int, int, int, int, int, int, int, int, int, int, int)
        push    rbp
        push    r15
        push    r14
        push    r13
        push    r12
        push    rbx
        mov     ebx, dword ptr [rsp + 80]
        lea     eax, [rbx - 1181835238]
        lea     r10d, [r8 + 1034010885]
        cmp     eax, r10d
        jge     .LBB0_13
        mov     eax, dword ptr [rsp + 104]
        mov     ebp, dword ptr [rsp + 72]
        mov     r14d, dword ptr [rsp + 64]
        mov     r11d, dword ptr [rsp + 56]
        mov     r12, qword ptr [rip + var_17@GOTPCREL]
        test    edx, edx
        mov     r13d, edi
        cmove   r13d, r9d
        test    r11d, r11d
        mov     r10d, -1897050213
        cmove   r10d, r11d
        or      r11d, ecx
        test    ecx, ecx
        mov     ecx, ebp
        cmove   ecx, edi
        mov     dword ptr [rsp - 4], ecx        # 4-byte Spill
        xor     r15d, r15d
        mov     ecx, r14d
        neg     ecx
        sbb     r15d, r15d
        or      r15d, 133169152
        mov     dword ptr [rsp - 8], r15d       # 4-byte Spill
        test    r9d, r9d
        mov     ecx, -12666761
        mov     r15d, -296177307
        cmove   r15d, ecx
        cmove   ebp, ebx
        mov     r9d, dword ptr [r12]
        mov     r14d, 2147483625
        cmovne  r14d, esi
        test    esi, esi
        mov     esi, 921754019
        cmovne  esi, r15d
        mov     rcx, qword ptr [rip + var_33@GOTPCREL]
        mov     r12d, dword ptr [rcx]
        cmp     dword ptr [rsp + 112], 0
        cmovne  r10d, edx
        cmove   r14d, r8d
        test    r13d, r13d
        mov     r13, qword ptr [rip + var_32@GOTPCREL]
        cmovne  esi, r10d
        test    r8d, r8d
        cmove   esi, r10d
        test    edi, edi
        mov     r15d, 699136968
        cmovne  r15d, ebp
        mov     ecx, dword ptr [r13]
        sub     r8d, ebx
        mov     rbp, qword ptr [rip + var_29@GOTPCREL]
        add     r8d, -2079121173
        mov     ebx, dword ptr [rbp]
        jmp     .LBB0_2
.LBB0_7:                                # %cond.end48
        add     ecx, edx
        mov     dword ptr [r13], ecx
        test    r11d, r11d
        je      .LBB0_8
.LBB0_10:                               # %cond.true64
        mov     r10d, r15d
.LBB0_11:                               # %cond.end83
        add     r12d, r10d
        mov     r10, qword ptr [rip + var_33@GOTPCREL]
        mov     dword ptr [r10], r12d
        add     r9d, r14d
        mov     r10, qword ptr [rip + var_17@GOTPCREL]
        mov     dword ptr [r10], r9d
.LBB0_12:                               # %for.inc
        dec     r8d
        je      .LBB0_13
.LBB0_2:                                # %for.body
        test    eax, eax
        je      .LBB0_12
        add     ebx, esi
        mov     dword ptr [rbp], ebx
        test    edi, edi
        je      .LBB0_7
        test    edx, edx
        je      .LBB0_5
        add     ecx, 2097151
        mov     dword ptr [r13], ecx
        mov     r10d, dword ptr [rsp + 64]
        test    r11d, r11d
        jne     .LBB0_10
        jmp     .LBB0_11
.LBB0_5:                                # %cond.end48.thread132
        add     ecx, dword ptr [rsp - 4]        # 4-byte Folded Reload
        mov     dword ptr [r13], ecx
        test    r11d, r11d
        jne     .LBB0_10
        jmp     .LBB0_6
.LBB0_8:                                # %cond.false75
        mov     r10d, dword ptr [rsp + 64]
        test    edx, edx
        jne     .LBB0_11
.LBB0_6:                                # %cond.false78
        mov     r10d, dword ptr [rsp - 8]       # 4-byte Reload
        jmp     .LBB0_11
.LBB0_13:                               # %for.cond.cleanup
        pop     rbx
        pop     r12
        pop     r13
        pop     r14
        pop     r15
        pop     rbp
        ret

Example 2: https://godbolt.org/z/1Mz7b7d6a

extern int var_21;
extern int var_25;
extern int var_28;
extern int var_29;

void test(int var_0, int var_1, int var_2, int var_3, int var_4, int var_5, int var_6, int var_7, int var_8, int var_9, int var_10, int var_11, int var_12, int var_13, int var_14, int var_15, int var_16, int var_17) {
    for (int i_0 = 0/*0*/; i_0 < ((((((((var_17)) ? (((819565140) + (var_9))) : ((((var_16)) ? (2141366111) : (var_17)))))) ? (((((((var_4)) ? (var_16) : (var_17)))) ? (779921519) : (((var_1) + (var_8))))) : (((((((var_10)) ? (var_12) : (var_5)))) ? ((((var_15)) ? (var_16) : (var_12))) : ((((var_3)) ? (var_0) : (var_11))))))) - (779921500))/*19*/; i_0 += 4/*4*/) 
    {
        var_21 += var_17;
        if (var_3)
        {
            var_25 += var_17 ? -212529188 : (((var_6) ? (var_5 ? 2029341098 : var_13) : var_11));
            var_28 += var_8 ? ((var_10 ? (var_13 - var_1) : var_14 ? var_17 : var_8)) : (var_6 ? 942541005 : var_1 / var_9);
            var_29 += var_2 ? var_6 : ((var_9 ? var_8 : (var_11 ? var_2 : var_15)));
        }

    }
}

Similarly, var_3 is an loop invariant and should be hoisted out of the loop as well.

The result with a trunk build (main parts):

for.body:
  %i_0.0146 = phi i32 [ 0, %for.body.lr.ph ], [ %add98, %for.inc ]
  %add42130145 = phi i32 [ %var_21.promoted, %for.body.lr.ph ], [ %add42, %for.inc ]
  %add59126132144 = phi i32 [ %var_25.promoted, %for.body.lr.ph ], [ %add59126131, %for.inc ]
  %add81137143 = phi i32 [ %var_28.promoted, %for.body.lr.ph ], [ %add81136, %for.inc ]
  %add97139142 = phi i32 [ %var_29.promoted, %for.body.lr.ph ], [ %add97138, %for.inc ]
  %add42 = add nsw i32 %add42130145, %var_17
  br i1 %tobool33.not, label %for.inc, label %if.then

if.then:
  br i1 %brmerge.not, label %cond.end57.thread, label %cond.end57
.LBB0_10:                               # %for.inc
        add     r11d, r14d
        add     r15d, 4
        cmp     r15d, edi
        jge     .LBB0_5
.LBB0_2:                                # %for.body
        test    r13d, r13d
        je      .LBB0_10

Thank you very much for your time and effort! We look forward to hearing from you.

XChy commented 1 year ago

That's a missed compilation for not hoisting the inner if. An Alive2 proof for the first example.