llvm / llvm-project

The LLVM Project is a collection of modular and reusable compiler and toolchain technologies.
http://llvm.org
Other
27.81k stars 11.45k forks source link

[aarch64] Spurious optimization of `cmtst+bif+bif` to `shl+cmlt+bif+shl+cmlt+bif` #92269

Open Validark opened 3 months ago

Validark commented 3 months ago

https://zig.godbolt.org/z/Mf86PTv48

const std = @import("std");

fn expand8xu8To16xu4AsByteVector(vec: @Vector(8, u8)) @Vector(16, u8) {
    return std.simd.interlace(.{ vec & @as(@Vector(8, u8), @splat(0xF)), vec >> @splat(4) });
}

fn sel(vec: anytype) @Vector(@typeInfo(@TypeOf(vec)).vector.len, u8) {
    const false_vec: @Vector(16, u8) = @splat(0);
    const true_vec = ~false_vec;
    return @select(u8, vec, true_vec, false_vec);
}

fn bsl2(vec1: anytype, vec2: @TypeOf(vec1), vec3: @TypeOf(vec1)) @TypeOf(vec1) {
    return (vec1 & (vec2 ^ vec3)) ^ vec3;
}

fn bsl(vec1: anytype, vec2: @TypeOf(vec1), vec3: @TypeOf(vec1)) @TypeOf(vec1) {
    return (vec1 & vec2) | (~vec1 & vec3);
}

export fn bad_bsl(x: u64, buffer1_half1: @Vector(16, u8), buffer1_half2: @Vector(16, u8), buffer2_half1: @Vector(16, u8), buffer2_half2: @Vector(16, u8)) @Vector(32, u8) {
    const splatted = blk: {
        // try to avoid https://github.com/llvm/llvm-project/issues/92211
        var y: u64 = x; // spread out each bit of `x` into a nibble of `y`
        // zig fmt: off
        // start positions:   0b0000000000000000000000000000000000000000000000001111111111111111;
        y = (y | (y << 24)); //  & 0b0000000000000000000000001111111100000000000000000000000011111111 this AND is optimized out
        y = (y | (y << 12)) & 0b0000000000001111000000000000111100000000000011110000000000001111;
        y = (y | (y <<  6)); //  & 0b0000001100000011000000110000001100000011000000110000001100000011 this AND is optimized out
        y = (y | (y <<  3)) & 0b0001000100010001000100010001000100010001000100010001000100010001;
        // zig fmt: on
        break :blk y;
    };

    const selector_compressed = sel(expand8xu8To16xu4AsByteVector(@bitCast(splatted)) != @as(@Vector(16, u8), @splat(0)));
    const selectors: [2]@Vector(16, u8) = @bitCast(std.simd.interlace(.{ selector_compressed, selector_compressed }));

    return @bitCast([2]@Vector(16, u8){
        bsl(
            selectors[0],
            buffer1_half1,
            buffer2_half1,
        ),
        bsl(
            selectors[1],
            buffer1_half2,
            buffer2_half2,
        ),
    });
}

Compiled on aarch64:

bad_bsl:
        orr     x9, x0, x0, lsl #24
        orr     x9, x9, x9, lsl #12
        and     x9, x9, #0xf000f000f000f
        orr     x9, x9, x9, lsl #6
        orr     x9, x9, x9, lsl #3
        and     x9, x9, #0x1111111111111111
        fmov    d4, x9
        movi    v5.8b, #15
        and     v5.8b, v4.8b, v5.8b
        ushr    v4.8b, v4.8b, #4
        zip1    v4.16b, v5.16b, v4.16b
        zip2    v5.16b, v4.16b, v4.16b
        zip1    v4.16b, v4.16b, v4.16b
        shl     v4.16b, v4.16b, #7
        cmlt    v4.16b, v4.16b, #0
        bif     v0.16b, v2.16b, v4.16b
        shl     v2.16b, v5.16b, #7
        cmlt    v2.16b, v2.16b, #0
        bif     v1.16b, v3.16b, v2.16b
        stp     q0, q1, [x8]
        ret

If you swap the `bsl` <--> `bsl2` names (so that `bad_bsl` uses the XOR-based `(vec1 & (vec2 ^ vec3)) ^ vec3` definition instead), you get:

bad_bsl:
        orr     x9, x0, x0, lsl #24
        orr     x9, x9, x9, lsl #12
        and     x9, x9, #0xf000f000f000f
        orr     x9, x9, x9, lsl #6
        orr     x9, x9, x9, lsl #3
        and     x9, x9, #0x1111111111111111
        fmov    d4, x9
        movi    v5.8b, #15
        and     v5.8b, v4.8b, v5.8b
        ushr    v4.8b, v4.8b, #4
        zip1    v4.16b, v5.16b, v4.16b
        cmtst   v4.16b, v4.16b, v4.16b
        zip2    v5.16b, v4.16b, v4.16b
        zip1    v4.16b, v4.16b, v4.16b
        bif     v0.16b, v2.16b, v4.16b
        bif     v1.16b, v3.16b, v5.16b
        stp     q0, q1, [x8]
        ret

arm32 has the same problem with the `and`/`andn` definition of `bsl`. The XOR-based definition would probably exhibit the same problem there once https://github.com/llvm/llvm-project/issues/92267 is resolved.

bad_bsl:
        push    {r11, lr}
        lsl     r1, r3, #24
        orr     r12, r2, r2, lsl #12
        vmov.i8 d16, #0xf
        orr     r1, r1, r2, lsr #8
        movw    lr, #15
        movt    lr, #15
        orr     r1, r1, r3
        lsl     r3, r1, #12
        orr     r2, r3, r2, lsr #20
        and     r3, r12, lr
        orr     r3, r3, r3, lsl #6
        orr     r1, r2, r1
        and     r1, r1, lr
        orr     r2, r3, r3, lsl #3
        orr     r1, r1, r1, lsl #6
        movw    r3, #4369
        orr     r1, r1, r1, lsl #3
        movt    r3, #4369
        and     r2, r2, r3
        and     r1, r1, r3
        vmov    d17, r2, r1
        add     r1, sp, #40
        vand    d18, d17, d16
        vshr.u8 d19, d17, #4
        vld1.64 {d20, d21}, [r1]
        add     r1, sp, #56
        vld1.64 {d22, d23}, [r1]
        add     r1, sp, #8
        vzip.8  d18, d19
        vorr    q8, q9, q9
        vld1.64 {d24, d25}, [r1]
        add     r1, sp, #24
        vzip.8  q9, q8
        vshl.i8 q9, q9, #7
        vshl.i8 q8, q8, #7
        vshr.s8 q9, q9, #7
        vshr.s8 q8, q8, #7
        vbsl    q9, q12, q10
        vld1.64 {d20, d21}, [r1]
        vbsl    q8, q10, q11
        vst1.8  {d18, d19}, [r0:128]!
        vst1.64 {d16, d17}, [r0:128]
        pop     {r11, pc}
define dso_local void @bad_bsl(ptr noalias nocapture nonnull writeonly sret(<32 x i8>) %0, i64 %1, <16 x i8> %2, <16 x i8> %3, <16 x i8> %4, <16 x i8> %5) local_unnamed_addr {
Entry:
  %6 = shl i64 %1, 24
  %7 = or i64 %6, %1
  %8 = shl i64 %7, 12
  %9 = or i64 %8, %7
  %10 = and i64 %9, 4222189076152335
  %11 = mul nuw nsw i64 %10, 65
  %12 = mul nuw nsw i64 %10, 520
  %13 = or i64 %12, %11
  %14 = and i64 %13, 1229782938247303441
  %15 = bitcast i64 %14 to <8 x i8>
  %16 = and <8 x i8> %15, <i8 15, i8 15, i8 15, i8 15, i8 15, i8 15, i8 15, i8 15>
  %17 = lshr <8 x i8> %15, <i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4>
  %18 = shufflevector <8 x i8> %16, <8 x i8> %17, <16 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11, i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
  %19 = icmp ne <16 x i8> %18, zeroinitializer
  %20 = zext <16 x i1> %19 to <16 x i8>
  %21 = shufflevector <16 x i8> %20, <16 x i8> poison, <32 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3, i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7, i32 8, i32 8, i32 9, i32 9, i32 10, i32 10, i32 11, i32 11, i32 12, i32 12, i32 13, i32 13, i32 14, i32 14, i32 15, i32 15>
  %.sroa.09.0.vec.extract = shufflevector <32 x i8> %21, <32 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %22 = trunc <16 x i8> %.sroa.09.0.vec.extract to <16 x i1>
  %23 = select <16 x i1> %22, <16 x i8> %2, <16 x i8> %4
  %.sroa.09.16.vec.extract = shufflevector <32 x i8> %21, <32 x i8> poison, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
  %24 = trunc <16 x i8> %.sroa.09.16.vec.extract to <16 x i1>
  %25 = select <16 x i1> %24, <16 x i8> %3, <16 x i8> %5
  %.sroa.010.0.vecblend = shufflevector <16 x i8> %23, <16 x i8> poison, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
  %.sroa.010.16.vec.expand = shufflevector <16 x i8> %25, <16 x i8> poison, <32 x i32> <i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %.sroa.010.16.vecblend = shufflevector <32 x i8> %.sroa.010.0.vecblend, <32 x i8> %.sroa.010.16.vec.expand, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
  store <32 x i8> %.sroa.010.16.vecblend, ptr %0, align 32
  ret void
}

declare void @llvm.dbg.value(metadata, metadata, metadata) #1

declare void @llvm.dbg.declare(metadata, metadata, metadata) #1
llvmbot commented 3 months ago

@llvm/issue-subscribers-backend-aarch64

Author: Niles Salter (Validark)

https://zig.godbolt.org/z/Tqb34G6h4 ```zig const std = @import("std"); fn expand8xu8To16xu4AsByteVector(vec: @Vector(8, u8)) @Vector(16, u8) { return std.simd.interlace(.{ vec & @as(@Vector(8, u8), @splat(0xF)), vec >> @splat(4) }); } fn sel(vec: anytype) @Vector(@typeInfo(@TypeOf(vec)).Vector.len, u8) { const false_vec: @Vector(16, u8) = @splat(0); const true_vec = ~false_vec; return @select(u8, vec, true_vec, false_vec); } fn bsl2(vec1: anytype, vec2: @TypeOf(vec1), vec3: @TypeOf(vec1)) @TypeOf(vec1) { return (vec1 & (vec2 ^ vec3)) ^ vec3; } fn bsl(vec1: anytype, vec2: @TypeOf(vec1), vec3: @TypeOf(vec1)) @TypeOf(vec1) { return (vec1 & vec2) | (~vec1 & vec3); } export fn bad_bsl(x: u64, buffer1_half1: @Vector(16, u8), buffer1_half2: @Vector(16, u8), buffer2_half1: @Vector(16, u8), buffer2_half2: @Vector(16, u8)) @Vector(32, u8) { const splatted = blk: { // try to avoid https://github.com/llvm/llvm-project/issues/92211 var y: u64 = x; // spread out each bit of `x` into a nibble of `y` // zig fmt: off // start positions: 0b0000000000000000000000000000000000000000000000001111111111111111; y = (y | (y << 24)); // & 0b0000000000000000000000001111111100000000000000000000000011111111 this AND is optimized out y = (y | (y << 12)) & 0b0000000000001111000000000000111100000000000011110000000000001111; y = (y | (y << 6)); // & 0b0000001100000011000000110000001100000011000000110000001100000011 this AND is optimized out y = (y | (y << 3)) & 0b0001000100010001000100010001000100010001000100010001000100010001; // zig fmt: on break :blk y; }; const selector_compressed = sel(expand8xu8To16xu4AsByteVector(@bitCast(splatted)) != @as(@Vector(16, u8), @splat(0))); const selectors: [2]@Vector(16, u8) = @bitCast(std.simd.interlace(.{ selector_compressed, selector_compressed })); return @bitCast([2]@Vector(16, u8){ bsl( selectors[0], buffer1_half1, buffer2_half1, ), bsl( selectors[1], buffer1_half2, buffer2_half2, ), }); } ``` Compiled on aarch64: ```asm bad_bsl: orr x9, x0, x0, lsl #24 
orr x9, x9, x9, lsl #12 and x9, x9, #0xf000f000f000f orr x9, x9, x9, lsl #6 orr x9, x9, x9, lsl #3 and x9, x9, #0x1111111111111111 fmov d4, x9 movi v5.8b, #15 and v5.8b, v4.8b, v5.8b ushr v4.8b, v4.8b, #4 zip1 v4.16b, v5.16b, v4.16b zip2 v5.16b, v4.16b, v4.16b zip1 v4.16b, v4.16b, v4.16b shl v4.16b, v4.16b, #7 cmlt v4.16b, v4.16b, #0 bif v0.16b, v2.16b, v4.16b shl v2.16b, v5.16b, #7 cmlt v2.16b, v2.16b, #0 bif v1.16b, v3.16b, v2.16b stp q0, q1, [x8] ret ``` If you switch the `bsl` <--> `bsl2` names, you get: ```asm bad_bsl: orr x9, x0, x0, lsl #24 orr x9, x9, x9, lsl #12 and x9, x9, #0xf000f000f000f orr x9, x9, x9, lsl #6 orr x9, x9, x9, lsl #3 and x9, x9, #0x1111111111111111 fmov d4, x9 movi v5.8b, #15 and v5.8b, v4.8b, v5.8b ushr v4.8b, v4.8b, #4 zip1 v4.16b, v5.16b, v4.16b cmtst v4.16b, v4.16b, v4.16b zip2 v5.16b, v4.16b, v4.16b zip1 v4.16b, v4.16b, v4.16b bif v0.16b, v2.16b, v4.16b bif v1.16b, v3.16b, v5.16b stp q0, q1, [x8] ret ``` arm32 has the same problem with the `and/andn` definition of `bsl`. The `XOR` definition would probably have the same problem once https://github.com/llvm/llvm-project/issues/92267 is resolved. ```asm bad_bsl: push {r11, lr} lsl r1, r3, #24 orr r12, r2, r2, lsl #12 vmov.i8 d16, #0xf orr r1, r1, r2, lsr #8 movw lr, #15 movt lr, #15 orr r1, r1, r3 lsl r3, r1, #12 orr r2, r3, r2, lsr #20 and r3, r12, lr orr r3, r3, r3, lsl #6 orr r1, r2, r1 and r1, r1, lr orr r2, r3, r3, lsl #3 orr r1, r1, r1, lsl #6 movw r3, #4369 orr r1, r1, r1, lsl #3 movt r3, #4369 and r2, r2, r3 and r1, r1, r3 vmov d17, r2, r1 add r1, sp, #40 vand d18, d17, d16 vshr.u8 d19, d17, #4 vld1.64 {d20, d21}, [r1] add r1, sp, #56 vld1.64 {d22, d23}, [r1] add r1, sp, #8 vzip.8 d18, d19 vorr q8, q9, q9 vld1.64 {d24, d25}, [r1] add r1, sp, #24 vzip.8 q9, q8 vshl.i8 q9, q9, #7 vshl.i8 q8, q8, #7 vshr.s8 q9, q9, #7 vshr.s8 q8, q8, #7 vbsl q9, q12, q10 vld1.64 {d20, d21}, [r1] vbsl q8, q10, q11 vst1.8 {d18, d19}, [r0:128]! 
vst1.64 {d16, d17}, [r0:128] pop {r11, pc} ``` ```llvm define dso_local void @bad_bsl(ptr noalias nocapture nonnull writeonly sret(<32 x i8>) %0, i64 %1, <16 x i8> %2, <16 x i8> %3, <16 x i8> %4, <16 x i8> %5) local_unnamed_addr { Entry: %6 = shl i64 %1, 24 %7 = or i64 %6, %1 %8 = shl i64 %7, 12 %9 = or i64 %8, %7 %10 = and i64 %9, 4222189076152335 %11 = mul nuw nsw i64 %10, 65 %12 = mul nuw nsw i64 %10, 520 %13 = or i64 %12, %11 %14 = and i64 %13, 1229782938247303441 %15 = bitcast i64 %14 to <8 x i8> %16 = and <8 x i8> %15, <i8 15, i8 15, i8 15, i8 15, i8 15, i8 15, i8 15, i8 15> %17 = lshr <8 x i8> %15, <i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4> %18 = shufflevector <8 x i8> %16, <8 x i8> %17, <16 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11, i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15> %19 = icmp ne <16 x i8> %18, zeroinitializer %20 = zext <16 x i1> %19 to <16 x i8> %21 = shufflevector <16 x i8> %20, <16 x i8> poison, <32 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3, i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7, i32 8, i32 8, i32 9, i32 9, i32 10, i32 10, i32 11, i32 11, i32 12, i32 12, i32 13, i32 13, i32 14, i32 14, i32 15, i32 15> %.sroa.09.0.vec.extract = shufflevector <32 x i8> %21, <32 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> %22 = trunc <16 x i8> %.sroa.09.0.vec.extract to <16 x i1> %23 = select <16 x i1> %22, <16 x i8> %2, <16 x i8> %4 %.sroa.09.16.vec.extract = shufflevector <32 x i8> %21, <32 x i8> poison, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31> %24 = trunc <16 x i8> %.sroa.09.16.vec.extract to <16 x i1> %25 = select <16 x i1> %24, <16 x i8> %3, <16 x i8> %5 %.sroa.010.0.vecblend = shufflevector <16 x i8> %23, <16 x i8> poison, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 
6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison> %.sroa.010.16.vec.expand = shufflevector <16 x i8> %25, <16 x i8> poison, <32 x i32> <i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> %.sroa.010.16.vecblend = shufflevector <32 x i8> %.sroa.010.0.vecblend, <32 x i8> %.sroa.010.16.vec.expand, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63> store <32 x i8> %.sroa.010.16.vecblend, ptr %0, align 32 ret void } declare void @llvm.dbg.value(metadata, metadata, metadata) #1 declare void @llvm.dbg.declare(metadata, metadata, metadata) #1 ```