[AArch64] VLA slower than VLS (tsvc, s1111)

sjoerdmeijer commented 10 months ago

Clang is about 25% slower than GCC 12 on Grace for the TSVC kernel s1111. The difference seems to be related to Clang generating a vector-length-agnostic (VLA) loop, whereas GCC generates a simpler vector-length-specific (VLS) loop.

Compile this input with -O3 -mcpu=neoverse-v2 -ffast-math:

__attribute__((aligned(64))) float a[32000],b[32000],c[32000],d[32000],e[32000],
                                   aa[256][256],bb[256][256],cc[256][256],tt[256][256];

int dummy(float[32000], float[32000], float[32000], float[32000], float[32000], float[256][256], float[256][256], float[256][256], float);

float s1111()
{
    for (int nl = 0; nl < 2*100000; nl++) {
        for (int i = 0; i < 32000/2; i++) {
            a[2*i] = c[i] * b[i] + d[i] * b[i] + c[i] * c[i] + d[i] * b[i] + d[i] * c[i];
        }
        dummy(a, b, c, d, e, aa, bb, cc, 0.);
    }
}

Clang's codegen:

.LBB0_3:                                //   Parent Loop BB0_2 Depth=1
        ld1w    { z4.s }, p0/z, [x19, x8, lsl #2]
        ld1w    { z2.s }, p0/z, [x21, x8, lsl #2]
        ld1w    { z3.s }, p0/z, [x20, x8, lsl #2]
        add     x8, x8, x23
        cmp     x28, x8
        fadd    z5.s, z4.s, z4.s
        fmul    z5.s, z3.s, z5.s
        fadd    z3.s, z3.s, z2.s
        fadd    z3.s, z3.s, z4.s
        lsl     z4.d, z0.d, #1
        add     z0.d, z0.d, z6.d
        fmad    z2.s, p0/m, z3.s, z5.s
        lsl     z3.d, z1.d, #1
        add     z1.d, z1.d, z6.d
        uunpklo z5.d, z2.s
        uunpkhi z2.d, z2.s
        st1w    { z5.d }, p1, [x22, z4.d, lsl #2]
        st1w    { z2.d }, p1, [x22, z3.d, lsl #2]
        b.ne    .LBB0_3

vs. GCC's codegen:

.L3:
        ldr     q29, [x25, x0]
        mov     x7, x4
        add     x6, x4, 16
        add     x5, x4, 24
        add     x4, x4, 32
        ldr     q30, [x24, x0]
        ldr     q28, [x23, x0]
        add     x0, x0, 16
        mov     v31.16b, v29.16b
        fmla    v31.4s, v30.4s, v27.4s
        fadd    v30.4s, v30.4s, v29.4s
        fmul    v31.4s, v31.4s, v28.4s
        fmla    v31.4s, v30.4s, v29.4s
        str     s31, [x7], 8
        st1     {v31.s}[1], [x7]
        st1     {v31.s}[2], [x6]
        st1     {v31.s}[3], [x5]
        cmp     x0, x26
        bne     .L3

TODO: Root cause analysis.

llvmbot commented 10 months ago

@llvm/issue-subscribers-backend-aarch64

Author: Sjoerd Meijer (sjoerdmeijer)

We are about 25% behind with Clang compared to GCC12 on Grace for kernel s1111 in TSVC. The difference seems to be related to Clang generating a VLA loop, and GCC a simpler VLS loop.

Compile this input with `-O3 -mcpu=neoverse-v2 -ffast-math`:

```
__attribute__((aligned(64))) float a[32000],b[32000],c[32000],d[32000],e[32000],
                                   aa[256][256],bb[256][256],cc[256][256],tt[256][256];

int dummy(float[32000], float[32000], float[32000], float[32000], float[32000], float[256][256], float[256][256], float[256][256], float);

float s1111()
{
    for (int nl = 0; nl < 2*100000; nl++) {
        for (int i = 0; i < 32000/2; i++) {
            a[2*i] = c[i] * b[i] + d[i] * b[i] + c[i] * c[i] + d[i] * b[i] + d[i] * c[i];
        }
        dummy(a, b, c, d, e, aa, bb, cc, 0.);
    }
}
```

Clang's codegen:

```
.LBB0_3:                                // Parent Loop BB0_2 Depth=1
        ld1w    { z4.s }, p0/z, [x19, x8, lsl #2]
        ld1w    { z2.s }, p0/z, [x21, x8, lsl #2]
        ld1w    { z3.s }, p0/z, [x20, x8, lsl #2]
        add     x8, x8, x23
        cmp     x28, x8
        fadd    z5.s, z4.s, z4.s
        fmul    z5.s, z3.s, z5.s
        fadd    z3.s, z3.s, z2.s
        fadd    z3.s, z3.s, z4.s
        lsl     z4.d, z0.d, #1
        add     z0.d, z0.d, z6.d
        fmad    z2.s, p0/m, z3.s, z5.s
        lsl     z3.d, z1.d, #1
        add     z1.d, z1.d, z6.d
        uunpklo z5.d, z2.s
        uunpkhi z2.d, z2.s
        st1w    { z5.d }, p1, [x22, z4.d, lsl #2]
        st1w    { z2.d }, p1, [x22, z3.d, lsl #2]
        b.ne    .LBB0_3
```

vs. GCC's codegen:

```
.L3:
        ldr     q29, [x25, x0]
        mov     x7, x4
        add     x6, x4, 16
        add     x5, x4, 24
        add     x4, x4, 32
        ldr     q30, [x24, x0]
        ldr     q28, [x23, x0]
        add     x0, x0, 16
        mov     v31.16b, v29.16b
        fmla    v31.4s, v30.4s, v27.4s
        fadd    v30.4s, v30.4s, v29.4s
        fmul    v31.4s, v31.4s, v28.4s
        fmla    v31.4s, v30.4s, v29.4s
        str     s31, [x7], 8
        st1     {v31.s}[1], [x7]
        st1     {v31.s}[2], [x6]
        st1     {v31.s}[3], [x5]
        cmp     x0, x26
        bne     .L3
```

See also: https://godbolt.org/z/fnfa9eb3E

TODO: Root cause analysis.

vfdff commented 8 months ago

LLVM can also generate VLS-style code with `-mllvm --scalable-vectorization=off`: https://godbolt.org/z/v7caodEh8

llvm / llvm-project

[AArch64] VLA slower than VLS (tsvc, s1111) #71524