[AArch64] Missed fmla vectorisation of a doubly nested loop (tsvc, s235)

llvm / llvm-project

The LLVM Project is a collection of modular and reusable compiler and toolchain technologies.

Other

29.32k stars 12.11k forks source link

GCC 12 vectorises the statements in both the outer and inner loops, whereas Clang doesn't vectorise at all. As a result, Clang is about 90% behind GCC on kernel s235 in TSVC.

Compile this input with -O3 -mcpu=neoverse-v2 -ffast-math:

/* Global working arrays for the kernel, declared with 64-byte alignment:
   five 1-D arrays of 32000 floats and four 256x256 float matrices.
   s235 below only touches a, b, c, aa and bb; the rest are passed to dummy()
   or unused in this reproducer. */
__attribute__((aligned(64))) float a[32000],b[32000],c[32000],d[32000],e[32000],
                                   aa[256][256],bb[256][256],cc[256][256],tt[256][256];

/* External function, declared but not defined here. s235 calls it once per
   repetition with the global arrays; presumably it exists so the compiler
   cannot assume the arrays are unchanged or dead between repetitions
   (NOTE(review): confirm against the TSVC harness). */
int dummy(float[32000], float[32000], float[32000], float[32000], float[32000], float[256][256], float[256][256], float[256][256], float);

/*
 * Reproducer kernel s235: a doubly nested loop inside a repetition loop.
 *
 * For each column i (0..255):
 *   a[i]     += b[i] * c[i]
 *   aa[j][i]  = aa[j-1][i] + bb[j][i] * a[i]   for j = 1..255
 *
 * The j loop carries a dependence (row j reads the value just written to
 * row j-1 of the same column), so it cannot be vectorised along j as written.
 * Different values of i touch disjoint elements, so the work can be
 * vectorised across i, i.e. the outer of the two loops.
 *
 * Returns aa[1][2].
 */
float s235()
{
    /* Repetition loop: 200*(100000/256) = 78000 iterations (integer division). */
    for (int nl = 0; nl < 200*(100000/256); nl++) {
        for (int i = 0; i < 256; i++) {
            /* Per-column scale factor; invariant across the j loop below. */
            a[i] += b[i] * c[i];
            /* Walks down column i: consecutive j are 256 floats (1024 bytes)
               apart in memory because the row index varies fastest here. */
            for (int j = 1; j < 256; j++) {
                aa[j][i] = aa[j-1][i] + bb[j][i] * a[i];
            }
        }
        /* Opaque external call between repetitions; the last argument is the
           double literal 0. converted to the float parameter. */
        dummy(a, b, c, d, e, aa, bb, cc, 0.);
    }
  return aa[1][2];
}

Clang's scalar codegen:

// Clang output for the i/j loop nest of s235 (scalar, one float at a time).
// Outer loop body: x8 = i. x23 is the base of a[] (loaded and stored back);
// x21/x22 are the bases of b[] and c[] (which is which is not visible here);
// x20 indexed by i supplies the initial aa[j-1][i], i.e. row 0 of aa.
// x9/x10 are per-column cursors advanced by 4 bytes per i; presumably they
// point at aa[0][i] and bb[0][i] (set up outside this snippet).
.LBB0_2:                                //   Parent Loop BB0_1 Depth=1
        ldr     s0, [x21, x8, lsl #2]           // s0 = first multiplicand (b[i] or c[i])
        ldr     s1, [x22, x8, lsl #2]           // s1 = second multiplicand (c[i] or b[i])
        ldr     s2, [x23, x8, lsl #2]           // s2 = a[i]
        mov     w11, #255                       // =0xff; remaining j iterations (j = 1..255)
        mov     x12, x9                         // x12 = store cursor for column i of aa
        mov     x13, x10                        // x13 = load cursor for column i of bb
        fmadd   s0, s1, s0, s2                  // s0 = a[i] + b[i]*c[i]
        ldr     s1, [x20, x8, lsl #2]           // s1 = aa[0][i], the running aa[j-1][i]
        str     s0, [x23, x8, lsl #2]           // write back a[i]
// Inner j loop, unrolled 3x (255 = 3 * 85). Each row is 256 floats = 1024
// bytes, so offsets #1024/#2048/#3072 address the next three rows of the column.
// s1 carries aa[j-1][i] from one fmadd to the next: a serial dependence chain.
.LBB0_3:                                //   Parent Loop BB0_1 Depth=1
        ldr     s2, [x13, #1024]                // s2 = bb[j][i]
        subs    x11, x11, #3                    // three rows consumed; flags feed the b.ne below
        fmadd   s1, s2, s0, s1                  // s1 = aa[j-1][i] + bb[j][i]*a[i]
        ldr     s2, [x13, #2048]                // s2 = bb[j+1][i]
        str     s1, [x12, #1024]                // aa[j][i] = s1
        fmadd   s1, s2, s0, s1                  // s1 = aa[j][i] + bb[j+1][i]*a[i]
        ldr     s2, [x13, #3072]                // s2 = bb[j+2][i]
        add     x13, x13, #3072                 // advance bb cursor by three rows
        str     s1, [x12, #2048]                // aa[j+1][i] = s1
        fmadd   s1, s2, s0, s1                  // s1 = aa[j+1][i] + bb[j+2][i]*a[i]
        str     s1, [x12, #3072]                // aa[j+2][i] = s1
        add     x12, x12, #3072                 // advance aa cursor by three rows
        b.ne    .LBB0_3                         // loop until the subs above reaches zero
        add     x8, x8, #1                      // i++
        add     x10, x10, #4                    // next column of bb (one float)
        add     x9, x9, #4                      // next column of aa (one float)
        cmp     x8, #256                        // 256 columns in total
        b.ne    .LBB0_2

vs. GCC's vector code:

// GCC output for the same loop nest, vectorised across i: each outer
// iteration handles 4 consecutive columns in one 128-bit register (4 floats).
// x11 = byte offset of the current column group (step 16, limit 1024 = 256 floats).
// x21 is the base of a[] (loaded and stored back); x2/x28 are the bases of
// b[] and c[] (which is which is not visible here).
// x8 is the aa store pointer for the current column group; x22 is presumably
// the matching base into bb (both set up outside this snippet).
.L4:
        add     x10, x22, x11                   // x10 = bb load base for this column group
        sub     x9, x8, #1024                   // x9 = one row (1024 bytes) above x8: aa[j-1]
        ldr     q29, [x21, x11]                 // v29 = a[i..i+3]
        mov     x0, 0                           // x0 = row byte offset for the inner loop
        ldr     q30, [x2, x11]                  // v30 = first multiplicand, 4 lanes
        ldr     q31, [x28, x11]                 // v31 = second multiplicand, 4 lanes
        fmla    v29.4s, v30.4s, v31.4s          // v29 += v30 * v31  (a[i] += b[i]*c[i])
        str     q29, [x21, x11]                 // write back a[i..i+3]
// Inner j loop: one row per iteration (row stride 1024 bytes). The serial
// dependence along j remains, but it now covers 4 columns at once.
.L3:
        ldr     q30, [x10, x0]                  // v30 = bb[j][i..i+3]
        ldr     q31, [x9, x0]                   // v31 = aa[j-1][i..i+3]
        fmla    v31.4s, v30.4s, v29.4s          // v31 += bb[j] * a  -> new aa[j]
        str     q31, [x8, x0]                   // aa[j][i..i+3] = v31
        add     x0, x0, 1024                    // next row
        cmp     x0, x19                         // x19 = end offset, set outside this snippet
        bne     .L3
        add     x11, x11, 16                    // next group of 4 columns
        add     x8, x8, 16                      // advance aa store pointer by 4 floats
        cmp     x11, 1024                       // all 256 columns done?
        bne     .L4

TODO: Root cause analysis.

@llvm/issue-subscribers-backend-aarch64

Author: Sjoerd Meijer (sjoerdmeijer)

GCC12 vectorises the statements in both the outer and inner loop. Clang doesn't do any vectorisation. As a result, we are about 90% behind for kernel s235 in TSVC.

Compile this input with `-O3 -mcpu=neoverse-v2 -ffast-math`:

```
__attribute__((aligned(64))) float a[32000],b[32000],c[32000],d[32000],e[32000],
                                   aa[256][256],bb[256][256],cc[256][256],tt[256][256];

int dummy(float[32000], float[32000], float[32000], float[32000], float[32000], float[256][256], float[256][256], float[256][256], float);

float s235()
{
    for (int nl = 0; nl < 200*(100000/256); nl++) {
        for (int i = 0; i < 256; i++) {
            a[i] += b[i] * c[i];
            for (int j = 1; j < 256; j++) {
                aa[j][i] = aa[j-1][i] + bb[j][i] * a[i];
            }
        }
        dummy(a, b, c, d, e, aa, bb, cc, 0.);
    }
  return aa[1][2];
}
```

Clang's scalar codegen:

```
.LBB0_2:                                //   Parent Loop BB0_1 Depth=1
        ldr     s0, [x21, x8, lsl #2]
        ldr     s1, [x22, x8, lsl #2]
        ldr     s2, [x23, x8, lsl #2]
        mov     w11, #255                       // =0xff
        mov     x12, x9
        mov     x13, x10
        fmadd   s0, s1, s0, s2
        ldr     s1, [x20, x8, lsl #2]
        str     s0, [x23, x8, lsl #2]
.LBB0_3:                                //   Parent Loop BB0_1 Depth=1
        ldr     s2, [x13, #1024]
        subs    x11, x11, #3
        fmadd   s1, s2, s0, s1
        ldr     s2, [x13, #2048]
        str     s1, [x12, #1024]
        fmadd   s1, s2, s0, s1
        ldr     s2, [x13, #3072]
        add     x13, x13, #3072
        str     s1, [x12, #2048]
        fmadd   s1, s2, s0, s1
        str     s1, [x12, #3072]
        add     x12, x12, #3072
        b.ne    .LBB0_3
        add     x8, x8, #1
        add     x10, x10, #4
        add     x9, x9, #4
        cmp     x8, #256
        b.ne    .LBB0_2
```

vs. GCC's vector code:

```
.L4:
        add     x10, x22, x11
        sub     x9, x8, #1024
        ldr     q29, [x21, x11]
        mov     x0, 0
        ldr     q30, [x2, x11]
        ldr     q31, [x28, x11]
        fmla    v29.4s, v30.4s, v31.4s
        str     q29, [x21, x11]
.L3:
        ldr     q30, [x10, x0]
        ldr     q31, [x9, x0]
        fmla    v31.4s, v30.4s, v29.4s
        str     q31, [x8, x0]
        add     x0, x0, 1024
        cmp     x0, x19
        bne     .L3
        add     x11, x11, 16
        add     x8, x8, 16
        cmp     x11, 1024
        bne     .L4
```

See also: https://godbolt.org/z/5fG1bffqz

TODO: Root cause analysis.

llvm / llvm-project

[AArch64] Missed fmla vectorisation of a doubly nested loop (tsvc, s235) #71522