llvm / llvm-project

The LLVM Project is a collection of modular and reusable compiler and toolchain technologies.
http://llvm.org
Other
28.52k stars 11.79k forks source link

[AArch64] Missed vectorisation opportunity (tsvc, s172) #71517

Closed sjoerdmeijer closed 8 months ago

sjoerdmeijer commented 11 months ago

We are not vectorising kernel s172 from TSVC and are 3x behind compared to GCC as a result. Compile this input with -O3 -ffast-math -mcpu=neoverse-v2:

/* Global working arrays for the reproducer; every one is 64-byte aligned. */

/* One-dimensional arrays of 32000 floats. */
__attribute__((aligned(64))) float x[32000];
__attribute__((aligned(64))) float a[32000];
__attribute__((aligned(64))) float b[32000];
__attribute__((aligned(64))) float c[32000];
__attribute__((aligned(64))) float d[32000];
__attribute__((aligned(64))) float e[32000];

/* Two-dimensional 256 x 256 float arrays. */
__attribute__((aligned(64))) float aa[256][256];
__attribute__((aligned(64))) float bb[256][256];
__attribute__((aligned(64))) float cc[256][256];
__attribute__((aligned(64))) float tt[256][256];

/*
 * Externally defined function (only declared in this snippet): takes five
 * float[32000] arrays, three float[256][256] arrays and one float, and
 * returns int.
 */
int dummy(float[32000],
          float[32000],
          float[32000],
          float[32000],
          float[32000],
          float[256][256],
          float[256][256],
          float[256][256],
          float);

/*
 * Reproducer kernel s172: repeatedly performs a[i] += b[i] over a strided
 * index range whose start and step are only known at run time.
 *
 * xa: start position; the inner loop begins at index xa - 1.
 * xb: stride added to the index after every inner-loop iteration.
 *
 * NOTE(review): the function is declared to return float but has no return
 * statement, so a caller must not use its value.
 * NOTE(review): xa and xb are not validated here. xa < 1 makes the first
 * index negative, and xb <= 0 never moves the index towards the 32000
 * bound -- presumably callers pass xa >= 1 and xb >= 1; confirm.
 */
float  s172(int xa, int xb)
{
    int n1 = xa;
    int n3 = xb;

    /* Run the same strided pass 100000 times. */
    for (int nl = 0; nl < 100000; nl++) {
        /* Visit indices n1-1, n1-1+n3, ... while below 32000. */
        for (int i = n1-1; i < 32000; i += n3) {
            a[i] += b[i];
        }
        /* dummy is only declared in this snippet; presumably an opaque call
         * that keeps the compiler from collapsing the repeated passes --
         * TODO confirm. */
        dummy(a, b, c, d, e, aa, bb, cc, 0.);
    }
}

Clang's codegen:

.LBB0_3:                                //   Parent Loop BB0_2 Depth=1
        ldr     s0, [x19, x8, lsl #2]
        ldr     s1, [x20, x8, lsl #2]
        fadd    s0, s1, s0
        str     s0, [x20, x8, lsl #2]
        add     x8, x8, x22
        cmp     x8, x23
        b.lt    .LBB0_3

GCC's codegen:

        whilelo p7.s, wzr, w28
.L5:
        ld1w    z31.s, p7/z, [x19, x0, lsl 2]
        ld1w    z30.s, p7/z, [x27, x0, lsl 2]
        fadd    z31.s, z31.s, z30.s
        st1w    z31.s, p7, [x19, x0, lsl 2]
        add     x0, x0, x2
        whilelo p7.s, w0, w28
        b.any   .L5

See also: https://godbolt.org/z/W8eEPKqET

TODO: root cause analysis.

llvmbot commented 11 months ago

@llvm/issue-subscribers-backend-aarch64

Author: Sjoerd Meijer (sjoerdmeijer)

We are not vectorising kernel s172 from TSVC and are 3x behind compared to GCC as a result. Compile this input with `-O3 -ffast-math -mcpu=neoverse-v2`:

```
__attribute__((aligned(64))) float x[32000];

__attribute__((aligned(64))) float a[32000],b[32000],c[32000],d[32000],e[32000],
                                   aa[256][256],bb[256][256],cc[256][256],tt[256][256];

int dummy(float[32000], float[32000], float[32000], float[32000], float[32000], float[256][256], float[256][256], float[256][256], float);

float  s172(int xa, int xb)
{
    int n1 = xa;
    int n3 = xb;

    for (int nl = 0; nl < 100000; nl++) {
        for (int i = n1-1; i < 32000; i += n3) {
            a[i] += b[i];
        }
        dummy(a, b, c, d, e, aa, bb, cc, 0.);
    }
}
```

Clang's codegen:

```
.LBB0_3:                                //   Parent Loop BB0_2 Depth=1
        ldr     s0, [x19, x8, lsl #2]
        ldr     s1, [x20, x8, lsl #2]
        fadd    s0, s1, s0
        str     s0, [x20, x8, lsl #2]
        add     x8, x8, x22
        cmp     x8, x23
        b.lt    .LBB0_3
```

GCC's codegen:

```
        whilelo p7.s, wzr, w28
.L5:
        ld1w    z31.s, p7/z, [x19, x0, lsl 2]
        ld1w    z30.s, p7/z, [x27, x0, lsl 2]
        fadd    z31.s, z31.s, z30.s
        st1w    z31.s, p7, [x19, x0, lsl 2]
        add     x0, x0, x2
        whilelo p7.s, w0, w28
        b.any   .L5
```

See also: https://godbolt.org/z/W8eEPKqET

TODO: root cause analysis.