[AArch64] Missed vectorisation opportunity (tsvc, s172)

llvm / llvm-project

The LLVM Project is a collection of modular and reusable compiler and toolchain technologies.

Other

28.52k stars 11.79k forks source link

We are not vectorising kernel s172 from TSVC and are 3x slower than GCC as a result. Compile this input with -O3 -ffast-math -mcpu=neoverse-v2:

/* Global data for the kernel; every array is 64-byte aligned. */
__attribute__((aligned(64))) float x[32000];

/* One-dimensional arrays, 32000 floats each. */
__attribute__((aligned(64))) float a[32000];
__attribute__((aligned(64))) float b[32000];
__attribute__((aligned(64))) float c[32000];
__attribute__((aligned(64))) float d[32000];
__attribute__((aligned(64))) float e[32000];

/* Two-dimensional arrays, 256 x 256 floats each. */
__attribute__((aligned(64))) float aa[256][256];
__attribute__((aligned(64))) float bb[256][256];
__attribute__((aligned(64))) float cc[256][256];
__attribute__((aligned(64))) float tt[256][256];

/* Declared here, defined elsewhere: takes five 1-D arrays, three 2-D arrays
 * and a scalar. */
int dummy(float va[32000], float vb[32000], float vc[32000],
          float vd[32000], float ve[32000],
          float maa[256][256], float mbb[256][256], float mcc[256][256],
          float s);

/*
 * TSVC kernel s172: strided accumulation of b into a.
 *
 * Repeats 100000 times: for i = xa - 1, xa - 1 + xb, ... while i < 32000,
 * performs a[i] += b[i]; then calls dummy() with the global arrays.
 *
 * xa: 1-based start position; the first element touched is a[xa - 1].
 * xb: stride between touched elements.
 *
 * NOTE(review): the arguments are not validated. Callers presumably pass
 * xa >= 1 and xb >= 1 (a smaller xa indexes before the arrays, and xb <= 0
 * never advances i towards the bound) -- confirm against the call sites.
 *
 * Returns 0.0f. Previously the function fell off the end without returning
 * a value, which is undefined behaviour if a caller uses the result.
 */
float s172(int xa, int xb)
{
    int n1 = xa;
    int n3 = xb;

    for (int nl = 0; nl < 100000; nl++) {
        /* Inner loop kept in its original shape: this is the loop whose
         * vectorisation the report is about. */
        for (int i = n1 - 1; i < 32000; i += n3) {
            a[i] += b[i];
        }
        /* NOTE(review): presumably an opaque call that stops the compiler
         * from collapsing the repeat loop -- confirm. */
        dummy(a, b, c, d, e, aa, bb, cc, 0.0f);
    }

    return 0.0f;
}

Clang's codegen:

.LBB0_3:                                //   Parent Loop BB0_2 Depth=1
        ldr     s0, [x19, x8, lsl #2]
        ldr     s1, [x20, x8, lsl #2]
        fadd    s0, s1, s0
        str     s0, [x20, x8, lsl #2]
        add     x8, x8, x22
        cmp     x8, x23
        b.lt    .LBB0_3

GCC's codegen:

        whilelo p7.s, wzr, w28
.L5:
        ld1w    z31.s, p7/z, [x19, x0, lsl 2]
        ld1w    z30.s, p7/z, [x27, x0, lsl 2]
        fadd    z31.s, z31.s, z30.s
        st1w    z31.s, p7, [x19, x0, lsl 2]
        add     x0, x0, x2
        whilelo p7.s, w0, w28
        b.any   .L5

TODO: root cause analysis.

@llvm/issue-subscribers-backend-aarch64

Author: Sjoerd Meijer (sjoerdmeijer)

We are not vectorising kernel s172 from TSVC and are 3x slower than GCC as a result. Compile this input with `-O3 -ffast-math -mcpu=neoverse-v2`: ``` __attribute__((aligned(64))) float x[32000]; __attribute__((aligned(64))) float a[32000],b[32000],c[32000],d[32000],e[32000], aa[256][256],bb[256][256],cc[256][256],tt[256][256]; int dummy(float[32000], float[32000], float[32000], float[32000], float[32000], float[256][256], float[256][256], float[256][256], float); float s172(int xa, int xb) { int n1 = xa; int n3 = xb; for (int nl = 0; nl < 100000; nl++) { for (int i = n1-1; i < 32000; i += n3) { a[i] += b[i]; } dummy(a, b, c, d, e, aa, bb, cc, 0.); } } ``` Clang's codegen: ``` .LBB0_3: // Parent Loop BB0_2 Depth=1 ldr s0, [x19, x8, lsl #2] ldr s1, [x20, x8, lsl #2] fadd s0, s1, s0 str s0, [x20, x8, lsl #2] add x8, x8, x22 cmp x8, x23 b.lt .LBB0_3 ``` GCC's codegen: ``` whilelo p7.s, wzr, w28 .L5: ld1w z31.s, p7/z, [x19, x0, lsl 2] ld1w z30.s, p7/z, [x27, x0, lsl 2] fadd z31.s, z31.s, z30.s st1w z31.s, p7, [x19, x0, lsl 2] add x0, x0, x2 whilelo p7.s, w0, w28 b.any .L5 ``` See also: https://godbolt.org/z/W8eEPKqET TODO: root cause analysis.

llvm / llvm-project

[AArch64] Missed vectorisation opportunity (tsvc, s172) #71517