llvm / llvm-project

The LLVM Project is a collection of modular and reusable compiler and toolchain technologies.
http://llvm.org
Other
28.95k stars 11.94k forks source link

[AArch64] VLA slower than VLS (tsvc, s176) #71523

Open sjoerdmeijer opened 1 year ago

sjoerdmeijer commented 1 year ago

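Clang generates a VLA-style (vector-length-agnostic, SVE) vector loop, whereas GCC generates a VLS (vector-length-specific, NEON) vector loop. As a result, the Clang-generated code appears to be about 50% slower. To reproduce, compile this input with -O3 -mcpu=neoverse-v2 -ffast-math: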

/*
 * Global data for the reproducer: five 32000-element float arrays and four
 * 256x256 float matrices. The leading aligned(64) attribute is part of the
 * declaration specifiers, so it applies to every array declared in this one
 * declaration. Only a, b and c are accessed by the s176 loop nest; d, e, aa,
 * bb and cc are only passed to dummy(), and tt is not referenced by the code
 * visible here.
 */
__attribute__((aligned(64))) float a[32000],b[32000],c[32000],d[32000],e[32000],
                                   aa[256][256],bb[256][256],cc[256][256],tt[256][256];

/*
 * Prototype only; no definition is visible here. The array parameters are
 * adjusted to pointers as usual in C, and the final parameter is a plain float.
 * NOTE(review): presumably an opaque external call that stops the compiler
 * from discarding the benchmark loops -- confirm against the TSVC sources.
 */
int dummy(float[32000], float[32000], float[32000], float[32000], float[32000], float[256][256], float[256][256], float[256][256], float);

/*
 * s176: the loop nest whose vectorized code is compared in this issue.
 *
 * For every j in [0, 16000) and every i in [0, m) it accumulates
 *     a[i] += b[i + m - j - 1] * c[j]
 * and calls dummy() once after each full (j, i) sweep.
 *
 * NOTE(review): the function is declared to return float but contains no
 * return statement; a caller that uses the result has undefined behavior.
 * It is left as-is here because this is the exact input the quoted codegen
 * was produced from.
 */
float s176()
{
    int m = 32000/2;  /* 16000, the same value as the j loop bound below */
    /* Integer division: 4*(100000/32000) == 4*3 == 12 outer iterations. */
    for (int nl = 0; nl < 4*(100000/32000); nl++) {
        for (int j = 0; j < (32000/2); j++) {
            /* c[j] is invariant in this innermost loop; a and b are both
               walked with unit stride in i, and the window read from b
               starts one element lower each time j increases. */
            for (int i = 0; i < m; i++) {
                a[i] += b[i+m-j-1] * c[j];
            }
        }
        /* 0. is a double literal; the prototype converts it to float. */
        dummy(a, b, c, d, e, aa, bb, cc, 0.);
    }

}

Clang's codegen:

.LBB0_5:                                //   Parent Loop BB0_2 Depth=1
        ld1w    { z2.s }, p0/z, [x12]
        ld1b    { z3.b }, p1/z, [x12, x28]
        ld1w    { z4.s }, p0/z, [x11]
        ld1b    { z5.b }, p1/z, [x11, x28]
        add     x12, x12, x23
        subs    x10, x10, x22
        fmad    z2.s, p0/m, z1.s, z4.s
        fmad    z3.s, p0/m, z1.s, z5.s
        st1w    { z2.s }, p0, [x11]
        st1b    { z3.b }, p1, [x11, x28]
        add     x11, x11, x23
        b.ne    .LBB0_5

vs. GCC's codegen:

.L3:
        ldr     q29, [x8, x0]
        ldr     q31, [x19, x0]
        ldr     q30, [x9, x0]
        fmla    v31.4s, v28.4s, v29.4s
        fmla    v31.4s, v27.4s, v30.4s
        str     q31, [x19, x0]
        add     x0, x0, 16
        cmp     x0, x22
        bne     .L3

See also: https://godbolt.org/z/64nhv1o6z

TODO: Root cause analysis.

llvmbot commented 1 year ago

@llvm/issue-subscribers-backend-aarch64

Author: Sjoerd Meijer (sjoerdmeijer)

Clang generates a VLA style vector loop, and GCC a VLS vector loop. It looks like we are about 50% slower as a result. Compile this input with `-O3 -mcpu=neoverse-v2 -ffast-math`:

```
__attribute__((aligned(64))) float a[32000],b[32000],c[32000],d[32000],e[32000],
                                   aa[256][256],bb[256][256],cc[256][256],tt[256][256];

int dummy(float[32000], float[32000], float[32000], float[32000], float[32000], float[256][256], float[256][256], float[256][256], float);

float s176()
{
    int m = 32000/2;
    for (int nl = 0; nl < 4*(100000/32000); nl++) {
        for (int j = 0; j < (32000/2); j++) {
            for (int i = 0; i < m; i++) {
                a[i] += b[i+m-j-1] * c[j];
            }
        }
        dummy(a, b, c, d, e, aa, bb, cc, 0.);
    }

}
```

Clang's codegen:

```
.LBB0_5:                                //   Parent Loop BB0_2 Depth=1
        ld1w    { z2.s }, p0/z, [x12]
        ld1b    { z3.b }, p1/z, [x12, x28]
        ld1w    { z4.s }, p0/z, [x11]
        ld1b    { z5.b }, p1/z, [x11, x28]
        add     x12, x12, x23
        subs    x10, x10, x22
        fmad    z2.s, p0/m, z1.s, z4.s
        fmad    z3.s, p0/m, z1.s, z5.s
        st1w    { z2.s }, p0, [x11]
        st1b    { z3.b }, p1, [x11, x28]
        add     x11, x11, x23
        b.ne    .LBB0_5
```

vs. GCC's codegen:

```
.L3:
        ldr     q29, [x8, x0]
        ldr     q31, [x19, x0]
        ldr     q30, [x9, x0]
        fmla    v31.4s, v28.4s, v29.4s
        fmla    v31.4s, v27.4s, v30.4s
        str     q31, [x19, x0]
        add     x0, x0, 16
        cmp     x0, x22
        bne     .L3
```

See also: https://godbolt.org/z/64nhv1o6z

TODO: Root cause analysis.