llvm / llvm-project

The LLVM Project is a collection of modular and reusable compiler and toolchain technologies.
http://llvm.org
Other
27.18k stars 11.13k forks source link

[AArch64] Missed fmla vectorisation opportunity (tsvc, s2275) #71520

Open sjoerdmeijer opened 8 months ago

sjoerdmeijer commented 8 months ago

Clang is roughly 3x (about 300%) slower than GCC 12 for kernel s2275 in TSVC.

Compile this input with -O3 -mcpu=neoverse-v2 -ffast-math:

__attribute__((aligned(64))) float a[32000],b[32000],c[32000],d[32000],e[32000],
                                   aa[256][256],bb[256][256],cc[256][256],tt[256][256];

int dummy(float[32000], float[32000], float[32000], float[32000], float[32000], float[256][256], float[256][256], float[256][256], float);

float s2275(struct args_t * func_args)
{
    for (int nl = 0; nl < 100*(100000/256); nl++) {
        for (int i = 0; i < 256; i++) {
            for (int j = 0; j < 256; j++) {
                aa[j][i] = aa[j][i] + bb[j][i] * cc[j][i];
            }
            a[i] = b[i] + c[i] * d[i];
        }
        dummy(a, b, c, d, e, aa, bb, cc, 0.);
    }
}

Clang's codegen:

.LBB0_2:                                //   Parent Loop BB0_1 Depth=1
        mov     x12, xzr
.LBB0_3:                                //   Parent Loop BB0_1 Depth=1
        add     x14, x10, x12
        add     x13, x9, x12
        ldr     s2, [x14]
        ldr     s3, [x14, #1024]
        add     x14, x11, x12
        ldr     s0, [x13]
        ldr     s1, [x13, #1024]
        add     x12, x12, #2048
        ldr     s4, [x14]
        ldr     s5, [x14, #1024]
        cmp     x12, #64, lsl #12               // =262144
        fmadd   s0, s4, s2, s0
        fmadd   s1, s5, s3, s1
        str     s0, [x13]
        str     s1, [x13, #1024]
        b.ne    .LBB0_3
        ldr     s0, [x22, x8, lsl #2]
        ldr     s1, [x23, x8, lsl #2]
        ldr     s2, [x24, x8, lsl #2]
        add     x11, x11, #4
        add     x10, x10, #4
        add     x9, x9, #4
        fmadd   s0, s2, s1, s0
        str     s0, [x25, x8, lsl #2]
        add     x8, x8, #1
        cmp     x8, #256
        b.ne    .LBB0_2

vs. GCC's codegen:

.L6:
        mov     x0, 0
.L3:
        ldr     q29, [x10, x0]
        ldr     q30, [x9, x0]
        ldr     q31, [x8, x0]
        fmla    v31.4s, v29.4s, v30.4s
        str     q31, [x8, x0]
        add     x0, x0, 1024
        cmp     x0, 262144
        bne     .L3
        ldr     q29, [x27, x11]
        add     x8, x8, 16
        add     x10, x10, 16
        add     x9, x9, 16
        ldr     q30, [x26, x11]
        ldr     q31, [x25, x11]
        fmla    v31.4s, v29.4s, v30.4s
        str     q31, [x19, x11]
        add     x11, x11, 16
        cmp     x11, 1024
        bne     .L6

See also: https://godbolt.org/z/8E3fexn5o

TODO: Root cause analysis.

llvmbot commented 8 months ago

@llvm/issue-subscribers-backend-aarch64

Author: Sjoerd Meijer (sjoerdmeijer)

We are a lot behind (300%) for kernel s2275 in TSVC compared to GCC 12. Compile this input with -O3 -mcpu=neoverse-v2 -ffast-math: ``` __attribute__((aligned(64))) float a[32000],b[32000],c[32000],d[32000],e[32000], aa[256][256],bb[256][256],cc[256][256],tt[256][256]; int dummy(float[32000], float[32000], float[32000], float[32000], float[32000], float[256][256], float[256][256], float[256][256], float); float s2275(struct args_t * func_args) { for (int nl = 0; nl < 100*(100000/256); nl++) { for (int i = 0; i < 256; i++) { for (int j = 0; j < 256; j++) { aa[j][i] = aa[j][i] + bb[j][i] * cc[j][i]; } a[i] = b[i] + c[i] * d[i]; } dummy(a, b, c, d, e, aa, bb, cc, 0.); } } ``` Clang's codegen: ``` .LBB0_2: // Parent Loop BB0_1 Depth=1 mov x12, xzr .LBB0_3: // Parent Loop BB0_1 Depth=1 add x14, x10, x12 add x13, x9, x12 ldr s2, [x14] ldr s3, [x14, #1024] add x14, x11, x12 ldr s0, [x13] ldr s1, [x13, #1024] add x12, x12, #2048 ldr s4, [x14] ldr s5, [x14, #1024] cmp x12, #64, lsl #12 // =262144 fmadd s0, s4, s2, s0 fmadd s1, s5, s3, s1 str s0, [x13] str s1, [x13, #1024] b.ne .LBB0_3 ldr s0, [x22, x8, lsl #2] ldr s1, [x23, x8, lsl #2] ldr s2, [x24, x8, lsl #2] add x11, x11, #4 add x10, x10, #4 add x9, x9, #4 fmadd s0, s2, s1, s0 str s0, [x25, x8, lsl #2] add x8, x8, #1 cmp x8, #256 b.ne .LBB0_2 ``` vs. GCC's codegen: ``` .L6: mov x0, 0 .L3: ldr q29, [x10, x0] ldr q30, [x9, x0] ldr q31, [x8, x0] fmla v31.4s, v29.4s, v30.4s str q31, [x8, x0] add x0, x0, 1024 cmp x0, 262144 bne .L3 ldr q29, [x27, x11] add x8, x8, 16 add x10, x10, 16 add x9, x9, 16 ldr q30, [x26, x11] ldr q31, [x25, x11] fmla v31.4s, v29.4s, v30.4s str q31, [x19, x11] add x11, x11, 16 cmp x11, 1024 bne .L6 ``` See also: https://godbolt.org/z/8E3fexn5o TODO: Root cause analysis.
fhahn commented 1 month ago

This would require interchanging the loops so that the memory accesses are consecutive in the inner loop.

fhahn commented 1 month ago

(this is also independent of AArch64)