Open sjoerdmeijer opened 11 months ago
Clang tip of tree is 120% behind GCC for kernel s128 in TSVC. Compile this input with `-O3 -ffast-math -mcpu=neoverse-v2`:
/*
 * Reproducer for the s128 kernel: global work arrays, the prototype of the
 * external dummy() routine, and the kernel itself.
 */

/* x is declared here but is not referenced by s128 in this snippet. */
__attribute__((aligned(64))) float x[32000];
/* tt is likewise declared but not referenced by s128 in this snippet. */
__attribute__((aligned(64))) float a[32000],b[32000],c[32000],d[32000],e[32000],
    aa[256][256],bb[256][256],cc[256][256],tt[256][256];

/*
 * Only the prototype is given; the definition is not part of this snippet.
 * NOTE(review): presumably an opaque call that keeps the compiler from
 * collapsing the outer repetition loop -- confirm against the benchmark
 * harness.
 */
int dummy(float[32000], float[32000], float[32000], float[32000], float[32000],
          float[256][256], float[256][256], float[256][256], float);

/*
 * s128 kernel: for each i, a[i] = b[k] - d[i] followed by b[k] = a[i] + c[k],
 * where k advances by two per iteration.
 *
 * func_args is not used in the body.
 * NOTE(review): struct args_t is not declared anywhere in this snippet, and
 * the function is declared to return float but has no return statement;
 * a caller that uses the result would hit undefined behavior.
 */
float s128(struct args_t * func_args)
{
    int j, k;
    /* Outer loop repeats the whole kernel 2*100000 times. */
    for (int nl = 0; nl < 2*100000; nl++) {
        /* Reset so that the first k computed below is 0. */
        j = -1;
        for (int i = 0; i < 32000/2; i++) {
            /* k takes the values 0, 2, 4, ...: b and c are indexed with
               stride two, while a and d are indexed with unit stride by i. */
            k = j + 1;
            a[i] = b[k] - d[i];
            j = k + 1;
            /* Reuses the a[i] value stored two statements above. */
            b[k] = a[i] + c[k];
        }
        /* Called once per outer iteration with the global arrays. */
        dummy(a, b, c, d, e, aa, bb, cc, 1.);
    }
}
GCC's codegen:
// GCC inner loop for s128: each trip loads two structure-load (ld2) register
// pairs and one plain 128-bit vector, and finishes with four single-lane
// stores.
// NOTE(review): the mapping of x0/x7/x8/x9 to the arrays b/c/d/a below is
// inferred from the access pattern; the register setup is outside this excerpt.
.L3:
        ld2     {v28.4s - v29.4s}, [x0]         // de-interleaving load of 8 floats from [x0] (presumably b)
        mov     x6, x0                          // x6 = x0, base for the lane-0/lane-1 stores below
        add     x5, x0, 16                      // x5 = x0 + 16, address for the lane-2 store
        add     x4, x0, 24                      // x4 = x0 + 24, address for the lane-3 store
        add     x0, x0, 32                      // advance x0 by 32 bytes
        ldr     q27, [x8], 16                   // contiguous 16-byte load, post-increment (presumably d[i..i+3])
        ld2     {v30.4s - v31.4s}, [x7], 32     // de-interleaving load, post-increment by 32 (presumably c)
        fsub    v28.4s, v28.4s, v27.4s          // v28 = v28 - v27
        fadd    v30.4s, v28.4s, v30.4s          // v30 = v28 + v30
        str     q28, [x9], 16                   // contiguous 16-byte store, post-increment (presumably a[i..i+3])
        str     s30, [x6], 8                    // lane 0 of v30 to [x6], then x6 += 8
        st1     {v30.s}[1], [x6]                // lane 1 to the original x0 + 8
        st1     {v30.s}[2], [x5]                // lane 2 to the original x0 + 16
        st1     {v30.s}[3], [x4]                // lane 3 to the original x0 + 24
        cmp     x7, x23                         // loop until x7 reaches the end pointer in x23
        bne     .L3
Clang's codegen:
// Clang inner loop for s128 (SVE): the final stores are two st1w instructions
// that take a scalar base plus a vector of 64-bit indices, after the fadd
// result has been unpacked into low/high halves with 64-bit elements.
// NOTE(review): the contents of z0, z1, z6, z7, p0, p1 and the scalar base
// registers are set up outside this excerpt; comments below describe only
// what each instruction form shows.
.LBB0_2:                                // Parent Loop BB0_1 Depth=1
        mov     z2.d, z0.d                              // copy the index vector z0
        add     z2.d, z2.d, #1                          // =0x1
        adr     z3.d, [z7.d, z2.d, lsl #2]              // vector address computation: z7 + (z2 << 2)
        fmov    x9, d3                                  // move element 0 of that address vector to x9
        ld2w    { z3.s, z4.s }, p0/z, [x9]              // de-interleaving load from [x9]
        ld1w    { z5.s }, p0/z, [x21, x8, lsl #2]       // contiguous load at x21 + x8*4
        fmov    x9, d2                                  // move element 0 of the index vector z2 to x9
        fsub    z3.s, z3.s, z5.s                        // z3 = z3 - z5
        st1w    { z3.s }, p0, [x22, x8, lsl #2]         // contiguous store at x22 + x8*4
        add     x8, x8, x28                             // advance the scalar index x8 by x28
        ld2w    { z4.s, z5.s }, p0/z, [x20, x9, lsl #2] // de-interleaving load at x20 + x9*4
        add     x9, x19, #4                             // x9 = x19 + 4, base for the indexed stores
        cmp     x23, x8                                 // sets flags for the b.ne at the bottom
        fadd    z2.s, z4.s, z3.s                        // z2 = z4 + z3
        uunpklo z3.d, z2.s                              // widen low half of z2 to 64-bit elements
        uunpkhi z2.d, z2.s                              // widen high half of z2 to 64-bit elements
        st1w    { z3.d }, p1, [x9, z0.d, lsl #2]        // indexed (scatter) store: x9 + z0*4
        add     z0.d, z0.d, z6.d                        // advance index vector z0 by z6
        st1w    { z2.d }, p1, [x9, z1.d, lsl #2]        // indexed (scatter) store: x9 + z1*4
        add     z1.d, z1.d, z6.d                        // advance index vector z1 by z6
        b.ne    .LBB0_2
See also:
https://godbolt.org/z/154McGMve
Todo: root cause analysis
@llvm/issue-subscribers-backend-aarch64
Author: Sjoerd Meijer (sjoerdmeijer)
Clang tip of tree is 120% behind GCC for kernel s128 in TSVC. Compile this input with `-O3 -ffast-math -mcpu=neoverse-v2`:
GCC's codegen:
Clang's codegen:
See also:
https://godbolt.org/z/154McGMve
Todo: root cause analysis