llvm / llvm-project

The LLVM Project is a collection of modular and reusable compiler and toolchain technologies.
http://llvm.org
Other
28.25k stars 11.66k forks source link

[AArch64][SVE] Clang cannot vectorize TSVC s235, but GCC can #81112

Open m-saito-fj opened 7 months ago

m-saito-fj commented 7 months ago

Clang cannot vectorize TSVC s235 with SVE, but GCC 13.2.0 can.

Option: -Ofast -march=armv8.2-a+sve

/* Element count of each one-dimensional array (a, b, c, d, e). */
#define LEN 32000
/* Extent of both dimensions of the two-dimensional arrays; also the upper
 * bound of the i and j loops in s235(). */
#define LEN2 256
/* Scales the repetition count of the outermost loop in s235(). */
static int ntimes = 200000;

/* Global operand arrays. s235() reads b, c and bb, and updates a and aa;
 * d, e and cc are only passed to dummy(), and dd is not referenced in this
 * snippet. */
float a[LEN], b[LEN], c[LEN], d[LEN], e[LEN];
float aa[LEN2][LEN2], bb[LEN2][LEN2], cc[LEN2][LEN2], dd[LEN2][LEN2];

/* Declaration only; the definition is not part of this snippet.
 * NOTE(review): presumably an opaque call that keeps the compiler from
 * discarding the kernel's work -- confirm against the TSVC sources. */
int dummy(float[LEN], float[LEN], float[LEN], float[LEN], float[LEN],
          float[LEN2][LEN2], float[LEN2][LEN2], float[LEN2][LEN2], float);

/*
 * Kernel s235 as quoted in this report.
 *
 * Repeats the following 200*(ntimes/LEN2) times: for every index i in
 * [0, LEN2), add b[i]*c[i] to a[i], then for j = 1..LEN2-1 compute
 * aa[j][i] from the previous row's aa[j-1][i] plus bb[j][i]*a[i].
 * dummy() is called once after each full pass over i.
 *
 * Always returns 0.
 */
int s235()
{
        for (int nl = 0; nl < 200*(ntimes/LEN2); nl++) {
                for (int i = 0; i < LEN2; i++) {
                        a[i] += b[i] * c[i];
                        /* Each aa[j][i] reads aa[j-1][i], written by the
                         * previous j iteration, so the j loop carries a
                         * dependence. Different values of i touch different
                         * a[i] and different columns of aa. */
                        for (int j = 1; j < LEN2; j++) {
                                aa[j][i] = aa[j-1][i] + bb[j][i] * a[i];
                        }
                }
                /* 0. is a double constant; the prototype's last parameter is
                 * float, so it is converted at the call. */
                dummy(a, b, c, d, e, aa, bb, cc, 0.);
        }
        return 0;
}

See also (Clang vs GCC): https://godbolt.org/z/KeeK58oz7

GCC result:

.L4:
        add     x8, x9, 1024
        mov     x0, 0
        lsl     x11, x12, 2
        add     x13, x20, x11
        add     x15, x1, x11
        ld1w    z1.s, p0/z, [x13]
        ld1w    z0.s, p0/z, [x15]
        add     x11, x2, x11
        ld1w    z2.s, p0/z, [x11]
        fmad    z2.s, p1/m, z0.s, z1.s
        st1w    z2.s, p0, [x13]
.L3:
        ld1w    z1.s, p0/z, [x9, x0, lsl 2]
        ld1w    z0.s, p0/z, [x10, x0, lsl 2]
        fmad    z0.s, p1/m, z2.s, z1.s
        st1w    z0.s, p0, [x8, x0, lsl 2]
        add     x0, x0, 256
        cmp     x0, x19
        bne     .L3
        add     x12, x12, x16
        add     x9, x9, x21
        add     x10, x10, x21
        whilelo p0.s, w12, w14
        b.any   .L4

In this result, GCC appears to have vectorized the outer loop over `i`.

-mllvm -debug-only=loop-vectorize messages:

LV: Checking a loop in 's235' from s235.c:19:4
LV: Loop hints: force=? width=vscale x 0 interleave=0
LV: Found a loop: for.body13
LV: Not vectorizing: Found an unidentified PHI   %3 = phi float [ %.pre, %for.body4 ], [ %add25, %for.body13 ], !dbg !30
LV: Interleaving disabled by the pass manager
LV: Can't vectorize the instructions or CFG

LLVM does not appear to be able to vectorize loops of the form used in s235.c.

llvmbot commented 7 months ago

@llvm/issue-subscribers-backend-aarch64

Author: m-saito-fj (m-saito-fj)

Clang cannot SVE vectorize TSVC s235, but GCC13.2.0 can.

Option: `-Ofast -march=armv8.2-a+sve`

```c
#define LEN 32000
#define LEN2 256
static int ntimes = 200000;

float a[LEN], b[LEN], c[LEN], d[LEN], e[LEN];
float aa[LEN2][LEN2], bb[LEN2][LEN2], cc[LEN2][LEN2], dd[LEN2][LEN2];

int dummy(float[LEN], float[LEN], float[LEN], float[LEN], float[LEN],
          float[LEN2][LEN2], float[LEN2][LEN2], float[LEN2][LEN2], float);

int s235()
{
        for (int nl = 0; nl < 200*(ntimes/LEN2); nl++) {
                for (int i = 0; i < LEN2; i++) {
                        a[i] += b[i] * c[i];
                        for (int j = 1; j < LEN2; j++) {
                                aa[j][i] = aa[j-1][i] + bb[j][i] * a[i];
                        }
                }
                dummy(a, b, c, d, e, aa, bb, cc, 0.);
        }
        return 0;
}
```

See also (Clang vs GCC): https://godbolt.org/z/KeeK58oz7

GCC result:

```asm
.L4:
        add     x8, x9, 1024
        mov     x0, 0
        lsl     x11, x12, 2
        add     x13, x20, x11
        add     x15, x1, x11
        ld1w    z1.s, p0/z, [x13]
        ld1w    z0.s, p0/z, [x15]
        add     x11, x2, x11
        ld1w    z2.s, p0/z, [x11]
        fmad    z2.s, p1/m, z0.s, z1.s
        st1w    z2.s, p0, [x13]
.L3:
        ld1w    z1.s, p0/z, [x9, x0, lsl 2]
        ld1w    z0.s, p0/z, [x10, x0, lsl 2]
        fmad    z0.s, p1/m, z2.s, z1.s
        st1w    z0.s, p0, [x8, x0, lsl 2]
        add     x0, x0, 256
        cmp     x0, x19
        bne     .L3
        add     x12, x12, x16
        add     x9, x9, x21
        add     x10, x10, x21
        whilelo p0.s, w12, w14
        b.any   .L4
```

Regarding this result, it appears to me that it is vectorized for i in the outer loop.

`-mllvm -debug-only=loop-vectorize` messages:

```
LV: Checking a loop in 's235' from s235.c:19:4
LV: Loop hints: force=? width=vscale x 0 interleave=0
LV: Found a loop: for.body13
LV: Not vectorizing: Found an unidentified PHI   %3 = phi float [ %.pre, %for.body4 ], [ %add25, %for.body13 ], !dbg !30
LV: Interleaving disabled by the pass manager
LV: Can't vectorize the instructions or CFG
```

LLVM does not appear to be able to account for vectorization of loops of the form s235.c.