llvm / llvm-project

The LLVM Project is a collection of modular and reusable compiler and toolchain technologies.
http://llvm.org
Other
29.32k stars 12.11k forks source link

[AArch64] Missed doubly nested loop fmla vectorisation (tsvc, s235) #71522

Open sjoerdmeijer opened 1 year ago

sjoerdmeijer commented 1 year ago

GCC 12 vectorises the statements in both the outer and inner loop. Clang doesn't do any vectorisation. As a result, Clang is about 90% behind GCC for kernel s235 in TSVC.

Compile this input with -O3 -mcpu=neoverse-v2 -ffast-math:

/* Global data for the kernel: five 1-D arrays of 32000 floats and four
 * 256x256 float matrices, all declared under one aligned(64) attribute.
 * Only a, b, c, aa and bb take part in the computation below; d, e and cc
 * are merely passed to dummy(), and tt is not referenced at all. */
__attribute__((aligned(64))) float a[32000],b[32000],c[32000],d[32000],e[32000],
                                   aa[256][256],bb[256][256],cc[256][256],tt[256][256];

/* Declared only; no definition is visible here.
 * NOTE(review): presumably an opaque call that stops the compiler from
 * eliminating or merging the repeated kernel runs -- confirm against the
 * TSVC harness. */
int dummy(float[32000], float[32000], float[32000], float[32000], float[32000], float[256][256], float[256][256], float[256][256], float);

/* TSVC kernel s235: a doubly nested loop (i outer, j inner) repeated nl times.
 * Returns aa[1][2] after the last repetition. */
float s235()
{
    /* Repetition loop: 200*(100000/256) = 200*390 = 78000 runs (integer division). */
    for (int nl = 0; nl < 200*(100000/256); nl++) {
        for (int i = 0; i < 256; i++) {
            /* Outer-loop statement: independent for each i. */
            a[i] += b[i] * c[i];
            /* Inner-loop statement: a recurrence along j. Row j of column i
             * needs row j-1 of the same column, so the j iterations are serial
             * for a fixed i, while different columns i do not depend on each
             * other. Consecutive j accesses are one full row (256 floats,
             * 1024 bytes) apart. */
            for (int j = 1; j < 256; j++) {
                aa[j][i] = aa[j-1][i] + bb[j][i] * a[i];
            }
        }
        dummy(a, b, c, d, e, aa, bb, cc, 0.);
    }
  return aa[1][2];
}

Clang's scalar codegen:

// Excerpt of Clang's output covering the i loop (.LBB0_2) and the j loop
// (.LBB0_3) of s235. Everything is scalar: single-precision s registers and
// fmadd only. Base registers are initialised outside this excerpt, so the
// register-to-array mapping in the comments is inferred from the C source.
// NOTE(review): x21/x22 presumably hold b and c (order not determinable here),
// x23 a, x20 aa, x9 an aa column pointer and x10 a bb column pointer -- confirm
// against the full listing.
//
// i-loop header: x8 = i. Computes a[i] += b[i] * c[i].
.LBB0_2:                                //   Parent Loop BB0_1 Depth=1
        ldr     s0, [x21, x8, lsl #2]           // first multiplicand, element i
        ldr     s1, [x22, x8, lsl #2]           // second multiplicand, element i
        ldr     s2, [x23, x8, lsl #2]           // addend: old a[i] (stored back to x23 below)
        mov     w11, #255                       // =0xff  (j-loop trip count: j = 1..255)
        mov     x12, x9                         // store pointer for this column
        mov     x13, x10                        // load pointer for this column
        fmadd   s0, s1, s0, s2                  // s0 = s2 + s1 * s0  -> new a[i]
        ldr     s1, [x20, x8, lsl #2]           // seed of the recurrence: aa[0][i]
        str     s0, [x23, x8, lsl #2]           // write a[i] back
// j loop, unrolled by 3: the 255 iterations run as 85 trips, x11 counting
// down by 3 per trip. s1 carries the previous row's value (aa[j-1][i]) in a
// register from step to step, so it is never reloaded from memory.
// Offsets are multiples of 1024 bytes = one row of 256 floats.
.LBB0_3:                                //   Parent Loop BB0_1 Depth=1
        ldr     s2, [x13, #1024]                // bb value, 1st unrolled step
        subs    x11, x11, #3                    // sets the flags tested by b.ne below
        fmadd   s1, s2, s0, s1                  // s1 = s1 + s2 * s0
        ldr     s2, [x13, #2048]                // bb value, 2nd unrolled step
        str     s1, [x12, #1024]                // store result of 1st step
        fmadd   s1, s2, s0, s1                  // s1 = s1 + s2 * s0
        ldr     s2, [x13, #3072]                // bb value, 3rd unrolled step
        add     x13, x13, #3072                 // advance load pointer by 3 rows
        str     s1, [x12, #2048]                // store result of 2nd step
        fmadd   s1, s2, s0, s1                  // s1 = s1 + s2 * s0
        str     s1, [x12, #3072]                // store result of 3rd step
        add     x12, x12, #3072                 // advance store pointer by 3 rows
        b.ne    .LBB0_3                         // loop until x11 reaches 0
// i-loop latch: move to the next column (one float = 4 bytes) and repeat
// for 256 columns.
        add     x8, x8, #1
        add     x10, x10, #4
        add     x9, x9, #4
        cmp     x8, #256
        b.ne    .LBB0_2

vs. GCC's vector code:

// Excerpt of GCC's output for the same two loops. Both statements use 128-bit
// q registers and fmla on .4s lanes, i.e. four consecutive i values are
// processed at once; the j recurrence stays sequential, one row per trip.
// Base registers and the bound in x19 are set up outside this excerpt.
// NOTE(review): x21 presumably holds a, x2/x28 b and c (order not determinable
// here), x22 a bb row base and x8 an aa row pointer -- confirm against the
// full listing.
//
// i-loop header: x11 = byte offset of the current group of four columns.
.L4:
        add     x10, x22, x11                   // load pointer for this column group
        sub     x9, x8, #1024                   // one row (1024 bytes) before the store pointer x8
        ldr     q29, [x21, x11]                 // four old a[] values (stored back to x21 below)
        mov     x0, 0                           // row byte offset for the j loop
        ldr     q30, [x2, x11]                  // four values of the first multiplicand
        ldr     q31, [x28, x11]                 // four values of the second multiplicand
        fmla    v29.4s, v30.4s, v31.4s          // v29 += v30 * v31  -> new a[i..i+3]
        str     q29, [x21, x11]                 // write a[i..i+3] back
// j loop: one row per trip, four columns per instruction. Unlike the scalar
// Clang code, the previous row is reloaded from memory through x9.
.L3:
        ldr     q30, [x10, x0]                  // four bb values of the current row
        ldr     q31, [x9, x0]                   // four aa values of the previous row
        fmla    v31.4s, v30.4s, v29.4s          // v31 += v30 * v29
        str     q31, [x8, x0]                   // four aa values of the current row
        add     x0, x0, 1024                    // next row (256 floats)
        cmp     x0, x19                         // x19: loop bound, set outside this excerpt
        bne     .L3
// i-loop latch: advance by 16 bytes (four floats); 1024 bytes = all 256 columns.
        add     x11, x11, 16
        add     x8, x8, 16
        cmp     x11, 1024
        bne     .L4

See also: https://godbolt.org/z/5fG1bffqz

TODO: Root cause analysis.

llvmbot commented 1 year ago

@llvm/issue-subscribers-backend-aarch64

Author: Sjoerd Meijer (sjoerdmeijer)

GCC12 vectorises the statements in both the outer and inner loop. Clang doesn't do any vectorisation. As a result, we are about 90% behind for kernel s235 in TSVC.

Compile this input with `-O3 -mcpu=neoverse-v2 -ffast-math`:

```
__attribute__((aligned(64))) float a[32000],b[32000],c[32000],d[32000],e[32000],
                                   aa[256][256],bb[256][256],cc[256][256],tt[256][256];

int dummy(float[32000], float[32000], float[32000], float[32000], float[32000], float[256][256], float[256][256], float[256][256], float);

float s235()
{
    for (int nl = 0; nl < 200*(100000/256); nl++) {
        for (int i = 0; i < 256; i++) {
            a[i] += b[i] * c[i];
            for (int j = 1; j < 256; j++) {
                aa[j][i] = aa[j-1][i] + bb[j][i] * a[i];
            }
        }
        dummy(a, b, c, d, e, aa, bb, cc, 0.);
    }
  return aa[1][2];
}
```

Clang's scalar codegen:

```
.LBB0_2:                                //   Parent Loop BB0_1 Depth=1
        ldr     s0, [x21, x8, lsl #2]
        ldr     s1, [x22, x8, lsl #2]
        ldr     s2, [x23, x8, lsl #2]
        mov     w11, #255                       // =0xff
        mov     x12, x9
        mov     x13, x10
        fmadd   s0, s1, s0, s2
        ldr     s1, [x20, x8, lsl #2]
        str     s0, [x23, x8, lsl #2]
.LBB0_3:                                //   Parent Loop BB0_1 Depth=1
        ldr     s2, [x13, #1024]
        subs    x11, x11, #3
        fmadd   s1, s2, s0, s1
        ldr     s2, [x13, #2048]
        str     s1, [x12, #1024]
        fmadd   s1, s2, s0, s1
        ldr     s2, [x13, #3072]
        add     x13, x13, #3072
        str     s1, [x12, #2048]
        fmadd   s1, s2, s0, s1
        str     s1, [x12, #3072]
        add     x12, x12, #3072
        b.ne    .LBB0_3
        add     x8, x8, #1
        add     x10, x10, #4
        add     x9, x9, #4
        cmp     x8, #256
        b.ne    .LBB0_2
```

vs. GCC's vector code:

```
.L4:
        add     x10, x22, x11
        sub     x9, x8, #1024
        ldr     q29, [x21, x11]
        mov     x0, 0
        ldr     q30, [x2, x11]
        ldr     q31, [x28, x11]
        fmla    v29.4s, v30.4s, v31.4s
        str     q29, [x21, x11]
.L3:
        ldr     q30, [x10, x0]
        ldr     q31, [x9, x0]
        fmla    v31.4s, v30.4s, v29.4s
        str     q31, [x8, x0]
        add     x0, x0, 1024
        cmp     x0, x19
        bne     .L3
        add     x11, x11, 16
        add     x8, x8, 16
        cmp     x11, 1024
        bne     .L4
```

See also: https://godbolt.org/z/5fG1bffqz

TODO: Root cause analysis.