[AArch64] suboptimal vectorisation (tsvc, s128)

llvm / llvm-project

The LLVM Project is a collection of modular and reusable compiler and toolchain technologies.

Other

28.55k stars 11.8k forks source link

Clang (tip of tree) is 120% behind GCC on kernel s128 from TSVC. To reproduce, compile the following input with `-O3 -ffast-math -mcpu=neoverse-v2`:

/* Global float array declared with the aligned(64) attribute.
 * x is not referenced by the code shown in this snippet. */
__attribute__((aligned(64))) float x[32000];

/* Global arrays declared with the aligned(64) attribute:
 * a, b, c, d, e hold 32000 floats each; aa, bb, cc, tt are 256x256 floats.
 * In the code shown, the kernel reads/writes a, b, c and d directly; e, aa,
 * bb and cc are only passed to dummy(), and tt is not referenced. */
__attribute__((aligned(64))) float a[32000],b[32000],c[32000],d[32000],e[32000],
                                   aa[256][256],bb[256][256],cc[256][256],tt[256][256];

/* Prototype only; the definition of dummy() is not part of this snippet. */
int dummy(float[32000], float[32000], float[32000], float[32000], float[32000], float[256][256], float[256][256], float[256][256], float);

/*
 * Reproducer kernel (s128 from TSVC) for the vectorisation issue reported here.
 *
 * func_args is not used by the body shown.
 * NOTE(review): `struct args_t` is not declared in the visible snippet, and the
 * function is declared to return float but has no return statement -- presumably
 * both were trimmed from the original benchmark; confirm before reusing this code.
 */
float s128(struct args_t * func_args)
{
    int j, k;
    /* Repeat the whole kernel 2*100000 times. */
    for (int nl = 0; nl < 2*100000; nl++) {
        j = -1;
        /* 32000/2 iterations; k takes the values 0, 2, 4, ... (k == 2*i)
         * and j is always k + 1 at the end of an iteration. */
        for (int i = 0; i < 32000/2; i++) {
            k = j + 1;
            /* Stride-2 read of b, unit-stride read of d and write of a. */
            a[i] = b[k] - d[i];
            j = k + 1;
            /* Stride-2 write back to b (only even-indexed elements are
             * updated), using the a[i] just stored and a stride-2 read of c. */
            b[k] = a[i] + c[k];
        }
        /* dummy() is defined elsewhere; presumably it keeps the compiler from
         * optimising away the repeated outer iterations -- TODO confirm. */
        dummy(a, b, c, d, e, aa, bb, cc, 1.);
    }   
}

GCC's codegen:

.L3:
        ld2     {v28.4s - v29.4s}, [x0]
        mov     x6, x0
        add     x5, x0, 16
        add     x4, x0, 24
        add     x0, x0, 32
        ldr     q27, [x8], 16
        ld2     {v30.4s - v31.4s}, [x7], 32
        fsub    v28.4s, v28.4s, v27.4s
        fadd    v30.4s, v28.4s, v30.4s
        str     q28, [x9], 16
        str     s30, [x6], 8
        st1     {v30.s}[1], [x6]
        st1     {v30.s}[2], [x5]
        st1     {v30.s}[3], [x4]
        cmp     x7, x23
        bne     .L3

Clang's codegen:

.LBB0_2:                                //   Parent Loop BB0_1 Depth=1
        mov     z2.d, z0.d
        add     z2.d, z2.d, #1                  // =0x1
        adr     z3.d, [z7.d, z2.d, lsl #2]
        fmov    x9, d3
        ld2w    { z3.s, z4.s }, p0/z, [x9]
        ld1w    { z5.s }, p0/z, [x21, x8, lsl #2]
        fmov    x9, d2
        fsub    z3.s, z3.s, z5.s
        st1w    { z3.s }, p0, [x22, x8, lsl #2]
        add     x8, x8, x28
        ld2w    { z4.s, z5.s }, p0/z, [x20, x9, lsl #2]
        add     x9, x19, #4
        cmp     x23, x8
        fadd    z2.s, z4.s, z3.s
        uunpklo z3.d, z2.s
        uunpkhi z2.d, z2.s
        st1w    { z3.d }, p1, [x9, z0.d, lsl #2]
        add     z0.d, z0.d, z6.d
        st1w    { z2.d }, p1, [x9, z1.d, lsl #2]
        add     z1.d, z1.d, z6.d
        b.ne    .LBB0_2

See also:

https://godbolt.org/z/154McGMve

Todo: root cause analysis

@llvm/issue-subscribers-backend-aarch64

Author: Sjoerd Meijer (sjoerdmeijer)

Clang tip of tree is 120% behind GCC for kernel s128 in TSVC. Compile this input with `-O3 -ffast-math -mcpu=neoverse-v2`:

```
__attribute__((aligned(64))) float x[32000];

__attribute__((aligned(64))) float a[32000],b[32000],c[32000],d[32000],e[32000],
                                   aa[256][256],bb[256][256],cc[256][256],tt[256][256];

int dummy(float[32000], float[32000], float[32000], float[32000], float[32000], float[256][256], float[256][256], float[256][256], float);

float s128(struct args_t * func_args)
{
    int j, k;
    for (int nl = 0; nl < 2*100000; nl++) {
        j = -1;
        for (int i = 0; i < 32000/2; i++) {
            k = j + 1;
            a[i] = b[k] - d[i];
            j = k + 1;
            b[k] = a[i] + c[k];
        }
        dummy(a, b, c, d, e, aa, bb, cc, 1.);
    }
}
```

GCC's codegen:

```
.L3:
        ld2     {v28.4s - v29.4s}, [x0]
        mov     x6, x0
        add     x5, x0, 16
        add     x4, x0, 24
        add     x0, x0, 32
        ldr     q27, [x8], 16
        ld2     {v30.4s - v31.4s}, [x7], 32
        fsub    v28.4s, v28.4s, v27.4s
        fadd    v30.4s, v28.4s, v30.4s
        str     q28, [x9], 16
        str     s30, [x6], 8
        st1     {v30.s}[1], [x6]
        st1     {v30.s}[2], [x5]
        st1     {v30.s}[3], [x4]
        cmp     x7, x23
        bne     .L3
```

Clang's codegen:

```
.LBB0_2:                                //   Parent Loop BB0_1 Depth=1
        mov     z2.d, z0.d
        add     z2.d, z2.d, #1                  // =0x1
        adr     z3.d, [z7.d, z2.d, lsl #2]
        fmov    x9, d3
        ld2w    { z3.s, z4.s }, p0/z, [x9]
        ld1w    { z5.s }, p0/z, [x21, x8, lsl #2]
        fmov    x9, d2
        fsub    z3.s, z3.s, z5.s
        st1w    { z3.s }, p0, [x22, x8, lsl #2]
        add     x8, x8, x28
        ld2w    { z4.s, z5.s }, p0/z, [x20, x9, lsl #2]
        add     x9, x19, #4
        cmp     x23, x8
        fadd    z2.s, z4.s, z3.s
        uunpklo z3.d, z2.s
        uunpkhi z2.d, z2.s
        st1w    { z3.d }, p1, [x9, z0.d, lsl #2]
        add     z0.d, z0.d, z6.d
        st1w    { z2.d }, p1, [x9, z1.d, lsl #2]
        add     z1.d, z1.d, z6.d
        b.ne    .LBB0_2
```

See also: https://godbolt.org/z/154McGMve

Todo: root cause analysis

llvm / llvm-project

[AArch64] suboptimal vectorisation (tsvc, s128) #71512