llvm / llvm-project

The LLVM Project is a collection of modular and reusable compiler and toolchain technologies.
http://llvm.org
Other
28.25k stars 11.67k forks source link

[AArch64][SVE] Cannot be vectorized, but GCC can vectorize.(TSVC s1161) #82213

Open m-saito-fj opened 7 months ago

m-saito-fj commented 7 months ago

Clang cannot vectorize the TSVC s1161 loop for SVE, but GCC 13.2.0 can.

Option: -Ofast -march=armv8.2-a+sve

/* Element count of each one-dimensional array below. */
#define LEN 32000
/* Extent of each dimension of the two-dimensional arrays below. */
#define LEN2 256
/* Upper bound of the outer (repeat) loop in s1161(). */
static int ntimes = 200000;

/* One-dimensional arrays: s1161() reads a, c, d and e, and writes a and b. */
float a[LEN], b[LEN], c[LEN], d[LEN], e[LEN];
/* Two-dimensional arrays: s1161() only forwards aa, bb and cc to dummy();
 * dd is not referenced in this snippet. */
float aa[LEN2][LEN2], bb[LEN2][LEN2], cc[LEN2][LEN2], dd[LEN2][LEN2];

/* Declared here but defined elsewhere; s1161() calls it once per outer
 * iteration. NOTE(review): presumably an opaque call that keeps the compiler
 * from optimizing the repeated kernel away -- confirm against TSVC sources. */
int dummy(float[LEN], float[LEN], float[LEN], float[LEN], float[LEN],
          float[LEN2][LEN2], float[LEN2][LEN2], float[LEN2][LEN2], float);

/*
 * TSVC kernel s1161 (reproducer for the missed vectorization).
 *
 * For every i in [0, LEN-1), performs exactly one of two stores:
 *   - c[i] < 0  : b[i] = a[i] + d[i] * d[i]
 *   - otherwise : a[i] = c[i] + d[i] * e[i]
 * The whole sweep is repeated ntimes times, with a call to dummy() after
 * each sweep. Always returns 0.
 *
 * The goto-based control flow is intentional and must be kept as written:
 * it is the shape that is being reported.
 */
int s1161()
{
        /* Outer loop: repeat the kernel ntimes times. */
        for (int nl = 0; nl < ntimes; nl++) {
                /* Inner loop bound is LEN-1, so the last element of each
                 * array is never accessed here. */
                for (int i = 0; i < LEN-1; ++i) {
                        /* Branch on c[i] < 0: jump to the b[i] update. */
                        if (c[i] < (float)0.) {
                                goto L20;
                        }
                        /* Comparison was false: update a[i], then skip the
                         * b[i] update. */
                        a[i] = c[i] + d[i] * e[i];
                        goto L10;
L20:
                        /* c[i] < 0: update b[i] from the current a[i]. */
                        b[i] = a[i] + d[i] * d[i];
L10:
                        /* Common join point of both paths; empty statement
                         * is required after the label. */
                        ;
                }
                /* Hand the arrays to the externally defined dummy() after
                 * each sweep; its return value is ignored. */
                dummy(a, b, c, d, e, aa, bb, cc, 0.);
        }
        return 0;
}

See also (Clang vs GCC): https://godbolt.org/z/EEq8Mj5zs

GCC result:

.L2:
        ld1w    z2.s, p0/z, [x27, x5, lsl 2]
        lsl     x0, x5, 2
        fcmge   p1.s, p2/z, z2.s, #0.0
        ld1w    z1.s, p0/z, [x26, x5, lsl 2]
        add     x6, x25, x0
        add     x7, x20, x0
        ld1w    z3.s, p0/z, [x6]
        ld1w    z5.s, p0/z, [x7]
        add     x0, x24, x0
        sel     z4.s, p1, z2.s, z3.s
        ld1w    z0.s, p0/z, [x0]
        fmla    z4.s, p1/m, z1.s, z5.s
        fcmlt   p1.s, p2/z, z2.s, #0.0
        movprfx z0.s, p1/m, z3.s
        fmla    z0.s, p1/m, z1.s, z1.s
        st1w    z4.s, p0, [x6]
        st1w    z0.s, p0, [x0]
        add     x5, x5, x28
        whilelo p0.s, w5, w19
        b.any   .L2

Loop Body IR:

for.body4:                                        ; preds = %for.cond1.preheader, %for.inc
  %indvars.iv = phi i64 [ 0, %for.cond1.preheader ], [ %indvars.iv.next, %for.inc ]
  %arrayidx = getelementptr inbounds [32000 x float], ptr @c, i64 0, i64 %indvars.iv, !dbg !21
  %0 = load float, ptr %arrayidx, align 4, !dbg !21, !tbaa !22
  %cmp5 = fcmp fast olt float %0, 0.000000e+00, !dbg !26
  br i1 %cmp5, label %L20, label %if.end, !dbg !21

if.end:                                           ; preds = %for.body4
  %arrayidx9 = getelementptr inbounds [32000 x float], ptr @d, i64 0, i64 %indvars.iv, !dbg !27
  %1 = load float, ptr %arrayidx9, align 4, !dbg !27, !tbaa !22
  %arrayidx11 = getelementptr inbounds [32000 x float], ptr @e, i64 0, i64 %indvars.iv, !dbg !28
  %2 = load float, ptr %arrayidx11, align 4, !dbg !28, !tbaa !22
  %mul = fmul fast float %2, %1, !dbg !29
  %add = fadd fast float %mul, %0, !dbg !30
  br label %for.inc, !dbg !31

L20:                                              ; preds = %for.body4
  %arrayidx15 = getelementptr inbounds [32000 x float], ptr @a, i64 0, i64 %indvars.iv, !dbg !32
  %3 = load float, ptr %arrayidx15, align 4, !dbg !32, !tbaa !22
  %arrayidx17 = getelementptr inbounds [32000 x float], ptr @d, i64 0, i64 %indvars.iv, !dbg !33
  %4 = load float, ptr %arrayidx17, align 4, !dbg !33, !tbaa !22
  %mul20 = fmul fast float %4, %4, !dbg !34
  %add21 = fadd fast float %mul20, %3, !dbg !35
  br label %for.inc, !dbg !36

for.inc:                                          ; preds = %if.end, %L20
  %a.sink = phi ptr [ @a, %if.end ], [ @b, %L20 ]
  %add.sink = phi float [ %add, %if.end ], [ %add21, %L20 ]
  %arrayidx13 = getelementptr inbounds [32000 x float], ptr %a.sink, i64 0, i64 %indvars.iv, !dbg !37
  store float %add.sink, ptr %arrayidx13, align 4, !dbg !37, !tbaa !22
  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1, !dbg !38
  %exitcond.not = icmp eq i64 %indvars.iv.next, 31999, !dbg !39
  br i1 %exitcond.not, label %for.cond.cleanup3, label %for.body4, !dbg !13, !llvm.loop !40

-mllvm -debug-only=loop-accesses messages:

LAA: Can't find bounds for ptr:  %arrayidx13 = getelementptr inbounds [32000 x float], ptr %a.sink, i64 0, i64 %indvars.iv, !dbg !37
LAA: Found a runtime check ptr:  %arrayidx15 = getelementptr inbounds [32000 x float], ptr @a, i64 0, i64 %indvars.iv, !dbg !32
LAA: We need to do 0 pointer comparisons.
LAA: We can't vectorize because we can't find the array bounds.

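The direct cause is that LAA cannot determine the bounds of the store pointer "%arrayidx13". The base address of that store is selected by the phi "%a.sink" in the "for.inc" block (either @a or @b), and this phi appears to be what prevents the bounds from being computed.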

llvmbot commented 7 months ago

@llvm/issue-subscribers-backend-aarch64

Author: m-saito-fj (m-saito-fj)

Clang cannot SVE vectorize TSVC s1161, but GCC13.2.0 can. Option: `-Ofast -march=armv8.2-a+sve` ```c #define LEN 32000 #define LEN2 256 static int ntimes = 200000; float a[LEN], b[LEN], c[LEN], d[LEN], e[LEN]; float aa[LEN2][LEN2], bb[LEN2][LEN2], cc[LEN2][LEN2], dd[LEN2][LEN2]; int dummy(float[LEN], float[LEN], float[LEN], float[LEN], float[LEN], float[LEN2][LEN2], float[LEN2][LEN2], float[LEN2][LEN2], float); int s1161() { for (int nl = 0; nl < ntimes; nl++) { for (int i = 0; i < LEN-1; ++i) { if (c[i] < (float)0.) { goto L20; } a[i] = c[i] + d[i] * e[i]; goto L10; L20: b[i] = a[i] + d[i] * d[i]; L10: ; } dummy(a, b, c, d, e, aa, bb, cc, 0.); } return 0; } ``` See also (Clang vs GCC): https://godbolt.org/z/EEq8Mj5zs GCC result: ```asm .L2: ld1w z2.s, p0/z, [x27, x5, lsl 2] lsl x0, x5, 2 fcmge p1.s, p2/z, z2.s, #0.0 ld1w z1.s, p0/z, [x26, x5, lsl 2] add x6, x25, x0 add x7, x20, x0 ld1w z3.s, p0/z, [x6] ld1w z5.s, p0/z, [x7] add x0, x24, x0 sel z4.s, p1, z2.s, z3.s ld1w z0.s, p0/z, [x0] fmla z4.s, p1/m, z1.s, z5.s fcmlt p1.s, p2/z, z2.s, #0.0 movprfx z0.s, p1/m, z3.s fmla z0.s, p1/m, z1.s, z1.s st1w z4.s, p0, [x6] st1w z0.s, p0, [x0] add x5, x5, x28 whilelo p0.s, w5, w19 b.any .L2 ``` Loop Body IR: ```llvm for.body4: ; preds = %for.cond1.preheader, %for.inc %indvars.iv = phi i64 [ 0, %for.cond1.preheader ], [ %indvars.iv.next, %for.inc ] %arrayidx = getelementptr inbounds [32000 x float], ptr @c, i64 0, i64 %indvars.iv, !dbg !21 %0 = load float, ptr %arrayidx, align 4, !dbg !21, !tbaa !22 %cmp5 = fcmp fast olt float %0, 0.000000e+00, !dbg !26 br i1 %cmp5, label %L20, label %if.end, !dbg !21 if.end: ; preds = %for.body4 %arrayidx9 = getelementptr inbounds [32000 x float], ptr @d, i64 0, i64 %indvars.iv, !dbg !27 %1 = load float, ptr %arrayidx9, align 4, !dbg !27, !tbaa !22 %arrayidx11 = getelementptr inbounds [32000 x float], ptr @e, i64 0, i64 %indvars.iv, !dbg !28 %2 = load float, ptr %arrayidx11, align 4, !dbg !28, !tbaa !22 %mul = fmul fast float %2, %1, !dbg 
!29 %add = fadd fast float %mul, %0, !dbg !30 br label %for.inc, !dbg !31 L20: ; preds = %for.body4 %arrayidx15 = getelementptr inbounds [32000 x float], ptr @a, i64 0, i64 %indvars.iv, !dbg !32 %3 = load float, ptr %arrayidx15, align 4, !dbg !32, !tbaa !22 %arrayidx17 = getelementptr inbounds [32000 x float], ptr @d, i64 0, i64 %indvars.iv, !dbg !33 %4 = load float, ptr %arrayidx17, align 4, !dbg !33, !tbaa !22 %mul20 = fmul fast float %4, %4, !dbg !34 %add21 = fadd fast float %mul20, %3, !dbg !35 br label %for.inc, !dbg !36 for.inc: ; preds = %if.end, %L20 %a.sink = phi ptr [ @a, %if.end ], [ @b, %L20 ] %add.sink = phi float [ %add, %if.end ], [ %add21, %L20 ] %arrayidx13 = getelementptr inbounds [32000 x float], ptr %a.sink, i64 0, i64 %indvars.iv, !dbg !37 store float %add.sink, ptr %arrayidx13, align 4, !dbg !37, !tbaa !22 %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1, !dbg !38 %exitcond.not = icmp eq i64 %indvars.iv.next, 31999, !dbg !39 br i1 %exitcond.not, label %for.cond.cleanup3, label %for.body4, !dbg !13, !llvm.loop !40 ``` `-mllvm -debug-only=loop-accesses` messages: ``` LAA: Can't find bounds for ptr: %arrayidx13 = getelementptr inbounds [32000 x float], ptr %a.sink, i64 0, i64 %indvars.iv, !dbg !37 LAA: Found a runtime check ptr: %arrayidx15 = getelementptr inbounds [32000 x float], ptr @a, i64 0, i64 %indvars.iv, !dbg !32 LAA: We need to do 0 pointer comparisons. LAA: We can't vectorize because we can't find the array bounds. ``` The direct factor is the missing ptr boundary. Phi for selecting base address of store in "%a.sink" of "for.inc" block seems to be affected.