[InstCombine] Performance degradation caused by more branch after commit b5a9361 and #66740

cyyself commented 3 months ago

Statement

Commit b5a9361c90ca43c715780ab4f7422fbc9d3a067b and PR #66740 (commit a7f962c00745c8e28991379985fcd6b51ac0d671) caused more likely branches generated. Result in 37% or more end-to-end severe performance degradation on both AMD Zen 4 and Intel Raptor Lake running Verilator generated C++ codes for XiangShan. As we can compare these two results: https://github.com/cyyself/llvm-project/commit/a2489e3a8d85c95ed0adab605311da1339550c7a#commitcomment-144413753 https://github.com/cyyself/llvm-project/commit/40529f8114e2139748ce075539573e36c2bb281b#commitcomment-144413665.

Reduced reproducer

https://godbolt.org/z/TYsnxGf5a

Look at the following code:

struct a_struct {
    bool b_1, b_2, b_3;
    unsigned int cv_1, cv_2, cv_3;
    unsigned int cav_1, cav_2, cav_3;
    unsigned int d;
};

void some_func(a_struct &a) {
    if (a.b_1 & (8U == (0x1f & a.cv_1))) [[unlikely]] {
        a.d = a.cav_1;
    }
    else if (a.b_2 & (7U == (0x1f & a.cv_2))) [[unlikely]] {
        a.d = a.cav_2;
    }
    else if (a.b_3 & (6U == (0x1f & a.cv_3))) [[unlikely]] {
        a.d = a.cav_3;
    }
}

Compile: clang++ -O3 -c test.cpp -S

We will get this code on the x86-64 target on LLVM main (at least for commit da0c8b275564f814a53a5c19497669ae2d99538d):

_Z9some_funcR8a_struct:                 # @_Z9some_funcR8a_struct
    .cfi_startproc
# %bb.0:
    movl    4(%rdi), %eax
    andl    $31, %eax
    cmpl    $8, %eax
    jne .LBB0_3
# %bb.1:
    cmpb    $0, (%rdi)
    jne .LBB0_2
.LBB0_3:
    movl    8(%rdi), %eax
    andl    $31, %eax
    cmpl    $7, %eax
    jne .LBB0_6
...

As we can see, if the condition a.b_1 is false, LLVM will generate a conditional jump to the next if even when we have an unlikely hint. Even worse, these conditional jumps are likely to be taken, causing the CPU to flush the pipeline if the branch predictor isn't working for its small size.

But if we revert commit b5a9361c90ca43c715780ab4f7422fbc9d3a067b and PR #66740 (commit a7f962c00745c8e28991379985fcd6b51ac0d671) or directly use the LLVM release version < 16. We will get the code like this:

_Z9some_funcR8a_struct:                 # @_Z9some_funcR8a_struct
    .cfi_startproc
# %bb.0:
    movl    4(%rdi), %eax
    andl    $31, %eax
    cmpl    $8, %eax
    sete    %al
    testb   %al, (%rdi)
    jne .LBB0_1
# %bb.2:
    movl    8(%rdi), %eax
    andl    $31, %eax
...

It reduced the number of branches by half and is very friendly for CPU branch predictors since there are no likely branches.

dtcxzyw commented 3 months ago

We will get this code on the x86-64 target on LLVM main

I cannot reproduce this with 324fea9baa902b2bff7b644fade080f98a8c543b

 .text
        .file   "test.cpp"
        .globl  _Z9some_funcR8a_struct          # -- Begin function _Z9some_funcR8a_struct
        .p2align        4, 0x90
        .type   _Z9some_funcR8a_struct,@function
_Z9some_funcR8a_struct:                 # @_Z9some_funcR8a_struct
        .cfi_startproc
# %bb.0:                                # %entry
        cmpb    $0, (%rdi)
        je      .LBB0_3
# %bb.1:                                # %entry
        movl    4(%rdi), %eax
        andl    $31, %eax
        cmpl    $8, %eax
        je      .LBB0_2
.LBB0_3:                                # %if.else
        cmpb    $0, 1(%rdi)
        je      .LBB0_6
# %bb.4:                                # %if.else
        movl    8(%rdi), %eax
        andl    $31, %eax
        cmpl    $7, %eax
        je      .LBB0_5
.LBB0_6:                                # %if.else12
        cmpb    $0, 2(%rdi)
        je      .LBB0_10
# %bb.7:                                # %if.else12
        movl    12(%rdi), %eax
        andl    $31, %eax
        cmpl    $6, %eax
        je      .LBB0_8
.LBB0_10:                               # %if.end23
        retq
.LBB0_2:
        movl    $16, %eax
        movl    (%rdi,%rax), %eax
        movl    %eax, 28(%rdi)
        retq
.LBB0_5:
        movl    $20, %eax
        movl    (%rdi,%rax), %eax
        movl    %eax, 28(%rdi)
        retq
.LBB0_8:
        movl    $24, %eax
        movl    (%rdi,%rax), %eax
        movl    %eax, 28(%rdi)
        retq
.Lfunc_end0:
        .size   _Z9some_funcR8a_struct, .Lfunc_end0-_Z9some_funcR8a_struct
        .cfi_endproc
                                        # -- End function
        .ident  "clang version 19.0.0git"
        .section        ".note.GNU-stack","",@progbits
        .addrsig

cyyself commented 3 months ago

We will get this code on the x86-64 target on LLVM main

I cannot reproduce this with 324fea9

Actually, you successfully reproduced. Please look carefully.

dtcxzyw commented 3 months ago

SDAGBuilder converts and/or into branches here:

https://github.com/llvm/llvm-project/blob/9d2f81ea85e410da9d79b9dfad9b9769525e4387/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp#L2801-L2825

dtcxzyw commented 3 months ago

As a workaround, can you try with -mllvm --jump-is-expensive?

cyyself commented 3 months ago

As a workaround, can you try with -mllvm --jump-is-expensive?

Confirmed. The performance of my workload is approximately to the result of reverted these two commit now.

llvm / llvm-project

[InstCombine] Performance degradation caused by more branch after commit b5a9361 and #66740 #99835

Statement

Reduced reproducer