llvm / llvm-project

The LLVM Project is a collection of modular and reusable compiler and toolchain technologies.
http://llvm.org
Other
28.55k stars 11.8k forks source link

[AArch64] Missed if-conversion and vectorisation opportunity (tsvc, s124) #71521

Open sjoerdmeijer opened 11 months ago

sjoerdmeijer commented 11 months ago

We are generating a lot of code with Clang for a loop that contains an if-then-else statement: the result uses predicated instructions, which don't seem to be necessary judging by GCC's codegen. For this kernel, s124 in TSVC, we are about 60% behind.

Compile this input with -O3 -mcpu=neoverse-v2 -ffast-math:

/*
 * Global data for the reproducer: five 32000-element float vectors and four
 * 256x256 float matrices, declared with a 64-byte alignment attribute.
 * Only a, b, c, d and e are accessed by the loop in s124; aa, bb and cc are
 * merely passed to dummy(), and tt is not referenced in this snippet.
 */
__attribute__((aligned(64))) float a[32000],b[32000],c[32000],d[32000],e[32000],
                                   aa[256][256],bb[256][256],cc[256][256],tt[256][256];

/*
 * External function, declared but not defined in this snippet. It receives
 * the five vectors, three of the matrices and one float.
 * NOTE(review): presumably an opaque call that keeps the compiler from
 * discarding or hoisting the kernel loop -- confirm against the TSVC sources.
 */
int dummy(float[32000], float[32000], float[32000], float[32000], float[32000], float[256][256], float[256][256], float[256][256], float);

/*
 * Reproducer kernel for the missed if-conversion/vectorisation report.
 *
 * Repeats 100000 times: for every i in [0, 32000) store either
 * b[i] + d[i] * e[i] (when b[i] > 0) or c[i] + d[i] * e[i] (otherwise)
 * into a[j], then call dummy() on the global arrays.
 *
 * func_args is never read. struct args_t is not declared in this snippet.
 * NOTE(review): the function is declared to return float but contains no
 * return statement; left as-is because this is the exact input that was
 * compiled for the report.
 */
float s124(struct args_t * func_args)
{
    int j;
    for (int nl = 0; nl < 100000; nl++) {
        /* j is reset for each outer iteration and incremented exactly once per
         * inner iteration on both branches, so j == i at every store below. */
        j = -1;
        for (int i = 0; i < 32000; i++) {
            /* The two arms differ only in the addend (b[i] vs. c[i]); the
             * product d[i] * e[i] and the destination a[j] are the same. */
            if (b[i] > (float)0.) {
                j++;
                a[j] = b[i] + d[i] * e[i];
            } else {
                j++;
                a[j] = c[i] + d[i] * e[i];
            }
        }
        /* Call into the externally defined dummy() after each full pass. */
        dummy(a, b, c, d, e, aa, bb, cc, 0.);
    }
}

Clang's codegen:

.LBB0_3:                                //   Parent Loop BB0_2 Depth=1
        ld1w    { z0.s }, p2/z, [x12, x9, lsl #2]
        ld1w    { z1.s }, p2/z, [x25, x9, lsl #2]
        ld1w    { z2.s }, p2/z, [x24, x9, lsl #2]
        asr     x10, x8, #30
        add     x8, x8, x20
        add     x11, x23, x10
        ld1w    { z3.s }, p2/z, [x26, x9, lsl #2]
        ld1w    { z4.s }, p2/z, [x22, x9, lsl #2]
        ld1w    { z5.s }, p2/z, [x19, x9, lsl #2]
        fcmgt   p0.s, p2/z, z0.s, #0.0
        fcmgt   p1.s, p2/z, z1.s, #0.0
        sel     z0.s, p0, z0.s, z2.s
        sel     z1.s, p1, z1.s, z3.s
        ld1w    { z2.s }, p2/z, [x21, x9, lsl #2]
        ld1w    { z3.s }, p2/z, [x27, x9, lsl #2]
        add     x9, x9, x15
        cmp     x28, x9
        fmla    z0.s, p2/m, z4.s, z2.s
        fmla    z1.s, p2/m, z5.s, z3.s
        st1b    { z0.b }, p3, [x23, x10]
        st1w    { z1.s }, p2, [x11, x14, lsl #2]
        b.ne    .LBB0_3

vs. GCC's codegen:

.L2:
        ldr     q30, [x23, x0]
        ldr     q27, [x22, x0]
        ldr     q28, [x21, x0]
        ldr     q31, [x20, x0]
        fcmgt   v29.4s, v30.4s, 0
        fmla    v30.4s, v27.4s, v28.4s
        fmla    v31.4s, v27.4s, v28.4s
        bit     v31.16b, v30.16b, v29.16b
        str     q31, [x19, x0]
        add     x0, x0, 16
        cmp     x0, x24
        bne     .L2

See also: https://godbolt.org/z/nb6xYxxKo

llvmbot commented 11 months ago

@llvm/issue-subscribers-backend-aarch64

Author: Sjoerd Meijer (sjoerdmeijer)

We are generating a lot of code with Clang for a loop that contains an if-then statement resulting in predicated instructions, which don't seem to be necessary looking at GCC's codegen. For this kernel s124 in TSVC, we are about 60% behind.

Compile this input with `-O3 -mcpu=neoverse-v2 -ffast-math`:

```
__attribute__((aligned(64))) float a[32000],b[32000],c[32000],d[32000],e[32000],
                                   aa[256][256],bb[256][256],cc[256][256],tt[256][256];

int dummy(float[32000], float[32000], float[32000], float[32000], float[32000], float[256][256], float[256][256], float[256][256], float);

float s124(struct args_t * func_args)
{
    int j;
    for (int nl = 0; nl < 100000; nl++) {
        j = -1;
        for (int i = 0; i < 32000; i++) {
            if (b[i] > (float)0.) {
                j++;
                a[j] = b[i] + d[i] * e[i];
            } else {
                j++;
                a[j] = c[i] + d[i] * e[i];
            }
        }
        dummy(a, b, c, d, e, aa, bb, cc, 0.);
    }
}
```

Clang's codegen:

```
.LBB0_3:                                //   Parent Loop BB0_2 Depth=1
        ld1w    { z0.s }, p2/z, [x12, x9, lsl #2]
        ld1w    { z1.s }, p2/z, [x25, x9, lsl #2]
        ld1w    { z2.s }, p2/z, [x24, x9, lsl #2]
        asr     x10, x8, #30
        add     x8, x8, x20
        add     x11, x23, x10
        ld1w    { z3.s }, p2/z, [x26, x9, lsl #2]
        ld1w    { z4.s }, p2/z, [x22, x9, lsl #2]
        ld1w    { z5.s }, p2/z, [x19, x9, lsl #2]
        fcmgt   p0.s, p2/z, z0.s, #0.0
        fcmgt   p1.s, p2/z, z1.s, #0.0
        sel     z0.s, p0, z0.s, z2.s
        sel     z1.s, p1, z1.s, z3.s
        ld1w    { z2.s }, p2/z, [x21, x9, lsl #2]
        ld1w    { z3.s }, p2/z, [x27, x9, lsl #2]
        add     x9, x9, x15
        cmp     x28, x9
        fmla    z0.s, p2/m, z4.s, z2.s
        fmla    z1.s, p2/m, z5.s, z3.s
        st1b    { z0.b }, p3, [x23, x10]
        st1w    { z1.s }, p2, [x11, x14, lsl #2]
        b.ne    .LBB0_3
```

vs. GCC's codegen:

```
.L2:
        ldr     q30, [x23, x0]
        ldr     q27, [x22, x0]
        ldr     q28, [x21, x0]
        ldr     q31, [x20, x0]
        fcmgt   v29.4s, v30.4s, 0
        fmla    v30.4s, v27.4s, v28.4s
        fmla    v31.4s, v27.4s, v28.4s
        bit     v31.16b, v30.16b, v29.16b
        str     q31, [x19, x0]
        add     x0, x0, 16
        cmp     x0, x24
        bne     .L2
```

See also: https://godbolt.org/z/nb6xYxxKo