llvm / llvm-project

The LLVM Project is a collection of modular and reusable compiler and toolchain technologies.
http://llvm.org
Other
27.92k stars 11.52k forks source link

Identical instructions after non-taken conditional jump and at start of taken jump should be hoisted before conditional jump #104646

Open Validark opened 3 weeks ago

Validark commented 3 weeks ago

I have encountered this many times, but here's how I triggered this behavior recently:

export fn foo(
    a: u64,
    b: u64,
    c: u64,
    d: u64,
    e: u64,
    f: u64,
) @Vector(4, u64) {
    var all_ored = a | b | c | d;

    var tween_bits = @as(u64, 0);
    var start_bits = @as(u64, 0);
    var end_bits = @as(u64, 0);

    while (all_ored != 0) {
        const start_bit = all_ored & (~all_ored +% 1); // the lowest set bit of all bitstrings in vec
        start_bits |= start_bit;
        const after_start_bit = ~blsmsk(all_ored); // all bits after the start_bit (exclusive)

        var target_with_end_bit: u64 = if ((a & start_bit) != 0)
            a
        else if ((b & start_bit) != 0)
            b
        else
            e;

        target_with_end_bit ^= f;
        target_with_end_bit &= after_start_bit;
        const end_bit = target_with_end_bit & (~target_with_end_bit +% 1);
        const after_end_bit = ~blsmsk(target_with_end_bit);
        end_bits |= end_bit;
        tween_bits |= ~(~end_bit +% (start_bit << 1));
        all_ored &= after_end_bit;
    }

    const tween_bits_vec: @Vector(4, u64) = @splat(tween_bits);
    const vec = @as(@Vector(4, u64), .{ a, b, c, d }) & ~tween_bits_vec;
    const final_vec = ((((vec << @splat(1)) & tween_bits_vec) +% tween_bits_vec) ^ tween_bits_vec) | vec;
    return final_vec;
}

fn blsmsk(x: u64) u64 {
    return (x ^ (x - 1));
}
foo:
        mov     rax, rsi
        or      rax, rdi
        or      rax, rdx
        or      rax, rcx
        je      .LBB0_1
        push    r15
        push    r14
        push    rbx
        xor     r10d, r10d
.LBB0_4:
        mov     r11, rax
        neg     r11
        mov     rbx, rax
        mov     r14, rsi
        and     rbx, r11
        xor     r11, rax
        test    rbx, rsi
        cmove   r14, r8
        test    rbx, rdi
        cmovne  r14, rdi
        add     rbx, rbx
        xor     r14, r9
        and     r14, r11
        mov     r11, r14
        neg     r11
        mov     r15, r14
        and     r15, r11
        xor     r11, r14
        sub     r15, rbx
        or      r10, r15
        and     rax, r11
        jne     .LBB0_4
        pop     rbx
        pop     r14
        pop     r15
        jmp     .LBB0_2
.LBB0_1:
        xor     r10d, r10d
.LBB0_2:
        vmovq   xmm0, rcx
        vmovq   xmm2, rdx
        vmovq   xmm3, rdi
        vpbroadcastq    ymm1, r10
        vpunpcklqdq     xmm0, xmm2, xmm0
        vmovq   xmm2, rsi
        vpunpcklqdq     xmm2, xmm3, xmm2
        vinserti128     ymm0, ymm2, xmm0, 1
        vpandn  ymm2, ymm1, ymm0
        vpaddq  ymm0, ymm2, ymm2
        vpand   ymm0, ymm0, ymm1
        vpaddq  ymm0, ymm0, ymm1
        vpternlogq      ymm0, ymm2, ymm1, 222
        ret
foo:
        mov     rax, rsi
        or      rax, rdi
        or      rax, rdx
        or      rax, rcx
+       xor     r10d, r10d
-       je      .LBB0_1
+       je      .LBB0_2
        push    r15
        push    r14
        push    rbx
-       xor     r10d, r10d
.LBB0_4:
        mov     r11, rax
        neg     r11
        mov     rbx, rax
        mov     r14, rsi
        and     rbx, r11
        xor     r11, rax
        test    rbx, rsi
        cmove   r14, r8
        test    rbx, rdi
        cmovne  r14, rdi
        add     rbx, rbx
        xor     r14, r9
        and     r14, r11
        mov     r11, r14
        neg     r11
        mov     r15, r14
        and     r15, r11
        xor     r11, r14
        sub     r15, rbx
        or      r10, r15
        and     rax, r11
        jne     .LBB0_4
        pop     rbx
        pop     r14
        pop     r15
-       jmp     .LBB0_2
-.LBB0_1:
-       xor     r10d, r10d
.LBB0_2:
        vmovq   xmm0, rcx
        vmovq   xmm2, rdx
        vmovq   xmm3, rdi
        vpbroadcastq    ymm1, r10
        vpunpcklqdq     xmm0, xmm2, xmm0
        vmovq   xmm2, rsi
        vpunpcklqdq     xmm2, xmm3, xmm2
        vinserti128     ymm0, ymm2, xmm0, 1
        vpandn  ymm2, ymm1, ymm0
        vpaddq  ymm0, ymm2, ymm2
        vpand   ymm0, ymm0, ymm1
        vpaddq  ymm0, ymm0, ymm1
        vpternlogq      ymm0, ymm2, ymm1, 222
        ret

Zig godbolt link

; ModuleID = 'BitcodeBuffer'
source_filename = "llvm_code"
target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-linux-musl"

; Function Attrs: nofree norecurse nosync nounwind memory(none) uwtable
define dso_local <4 x i64> @foo(i64 %0, i64 %1, i64 %2, i64 %3, i64 %4, i64 %5) local_unnamed_addr #0 {
  %7 = or i64 %1, %0
  %8 = or i64 %7, %2
  %9 = or i64 %8, %3
  %.not16 = icmp eq i64 %9, 0
  br i1 %.not16, label %._crit_edge, label %.lr.ph.preheader

.lr.ph.preheader:                                 ; preds = %6
  br label %.lr.ph

.lr.ph:                                           ; preds = %.lr.ph.preheader, %.lr.ph
  %.sroa.0.018 = phi i64 [ %24, %.lr.ph ], [ %9, %.lr.ph.preheader ]
  %.sroa.05.017 = phi i64 [ %23, %.lr.ph ], [ 0, %.lr.ph.preheader ]
  %10 = sub i64 0, %.sroa.0.018
  %11 = and i64 %.sroa.0.018, %10
  %12 = xor i64 %.sroa.0.018, %10
  %13 = and i64 %11, %0
  %.not14 = icmp eq i64 %13, 0
  %14 = and i64 %11, %1
  %.not15 = icmp eq i64 %14, 0
  %. = select i1 %.not15, i64 %4, i64 %1
  %15 = select i1 %.not14, i64 %., i64 %0
  %16 = xor i64 %15, %5
  %17 = and i64 %16, %12
  %18 = sub i64 0, %17
  %19 = and i64 %17, %18
  %20 = xor i64 %17, %18
  %21 = shl i64 %11, 1
  %22 = sub i64 %19, %21
  %23 = or i64 %22, %.sroa.05.017
  %24 = and i64 %20, %.sroa.0.018
  %.not = icmp eq i64 %24, 0
  br i1 %.not, label %._crit_edge, label %.lr.ph

._crit_edge:                                      ; preds = %.lr.ph, %6
  %.sroa.05.0.lcssa = phi i64 [ 0, %6 ], [ %23, %.lr.ph ]
  %25 = insertelement <1 x i64> poison, i64 %.sroa.05.0.lcssa, i64 0
  %26 = shufflevector <1 x i64> %25, <1 x i64> poison, <4 x i32> zeroinitializer
  %27 = insertelement <4 x i64> poison, i64 %0, i64 0
  %28 = insertelement <4 x i64> %27, i64 %1, i64 1
  %29 = insertelement <4 x i64> %28, i64 %2, i64 2
  %30 = insertelement <4 x i64> %29, i64 %3, i64 3
  %31 = xor <4 x i64> %26, <i64 -1, i64 -1, i64 -1, i64 -1>
  %32 = and <4 x i64> %30, %31
  %33 = shl <4 x i64> %32, <i64 1, i64 1, i64 1, i64 1>
  %34 = and <4 x i64> %33, %26
  %35 = add <4 x i64> %34, %26
  %36 = xor <4 x i64> %35, %26
  %37 = or <4 x i64> %36, %32
  ret <4 x i64> %37
}

attributes #0 = { nofree norecurse nosync nounwind memory(none) uwtable "frame-pointer"="none" "target-cpu"="znver4" "target-features"="-16bit-mode,-32bit-mode,-3dnow,-3dnowa,+64bit,+adx,+aes,+allow-light-256-bit,-amx-bf16,-amx-complex,-amx-fp16,-amx-int8,-amx-tile,+avx,-avx10.1-256,-avx10.1-512,+avx2,+avx512bf16,+avx512bitalg,+avx512bw,+avx512cd,+avx512dq,-avx512er,+avx512f,-avx512fp16,+avx512ifma,-avx512pf,+avx512vbmi,+avx512vbmi2,+avx512vl,+avx512vnni,-avx512vp2intersect,+avx512vpopcntdq,-avxifma,-avxneconvert,-avxvnni,-avxvnniint16,-avxvnniint8,+bmi,+bmi2,+branchfusion,-ccmp,-cf,-cldemote,+clflushopt,+clwb,+clzero,+cmov,-cmpccxadd,+crc32,+cx16,+cx8,-egpr,-enqcmd,-ermsb,+evex512,+f16c,-false-deps-getmant,-false-deps-lzcnt-tzcnt,-false-deps-mulc,-false-deps-mullq,-false-deps-perm,-false-deps-popcnt,-false-deps-range,-fast-11bytenop,+fast-15bytenop,-fast-7bytenop,+fast-bextr,-fast-gather,-fast-hops,+fast-lzcnt,+fast-movbe,+fast-scalar-fsqrt,+fast-scalar-shift-masks,-fast-shld-rotate,-fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle,+fast-vector-fsqrt,-fast-vector-shift-masks,-faster-shift-than-shuffle,+fma,-fma4,+fsgsbase,+fsrm,+fxsr,+gfni,-harden-sls-ijmp,-harden-sls-ret,-hreset,-idivl-to-divb,-idivq-to-divl,+invpcid,-kl,-lea-sp,-lea-uses-ag,-lvi-cfi,-lvi-load-hardening,-lwp,+lzcnt,+macrofusion,+mmx,+movbe,-movdir64b,-movdiri,+mwaitx,-ndd,-no-bypass-delay,-no-bypass-delay-blend,-no-bypass-delay-mov,-no-bypass-delay-shuffle,+nopl,-pad-short-functions,+pclmul,-pconfig,+pku,+popcnt,-ppx,-prefer-128-bit,-prefer-256-bit,-prefer-mask-registers,-prefer-movmsk-over-vtest,-prefer-no-gather,-prefer-no-scatter,-prefetchi,-prefetchwt1,+prfchw,-ptwrite,-push2pop2,-raoint,+rdpid,+rdpru,+rdrnd,+rdseed,-retpoline,-retpoline-external-thunk,-retpoline-indirect-branches,-retpoline-indirect-calls,-rtm,+sahf,+sbb-dep-breaking,-serialize,-seses,-sgx,+sha,-sha512,+shstk,-slow-3ops-lea,-slow-incdec,-slow-lea,-slow-pmaddwd,-slow-pmulld,+slow-shld,-slow-two-mem-ops,-slow-unaligned-mem-16,-slow-unaligned-mem-32,-sm3,-sm4,-soft-float,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+sse4a,-sse-unaligned-mem,+ssse3,-tagged-globals,-tbm,-tsxldtrk,-tuning-fast-imm-vector-shift,-uintr,-use-glm-div-sqrt-costs,-use-slm-arith-costs,-usermsr,+vaes,+vpclmulqdq,+vzeroupper,-waitpkg,+wbnoinvd,-widekl,+x87,-xop,+xsave,+xsavec,+xsaveopt,+xsaves" }

LLVM godbolt link

topperc commented 3 weeks ago
foo:
        mov     rax, rsi
        or      rax, rdi
        or      rax, rdx
        or      rax, rcx
+       xor     r10d, r10d
-       je      .LBB0_1
+       je      .LBB0_2
        push    r15
        push    r14
        push    rbx
-       xor     r10d, r10d
.LBB0_4:

xor overwrites the flags need by the je, but it could be placed above the or

Validark commented 3 weeks ago

xor overwrites the flags need by the je, but it could be placed above the or

Good point! I didn't even think about that, haha.