llvm / llvm-project

The LLVM Project is a collection of modular and reusable compiler and toolchain technologies.
http://llvm.org
Other
29.08k stars 11.99k forks source link

[AArch64][SVE] Cannot be vectorized because unsafe dependent memory operation in loop (TSVC, s115) #80980

Closed m-saito-fj closed 4 months ago

m-saito-fj commented 9 months ago

Clang cannot vectorize TSVC s115 for SVE, but GCC 13.2.0 can.

Option: -Ofast -march=armv8.2-a+sve

/* Array extents used by the reproducer: LEN for the 1-D arrays, LEN2 for each
 * dimension of the square 2-D arrays. */
#define LEN 32000
#define LEN2 256
/* Feeds the repetition count of the outermost loop in s115(). */
static int ntimes = 200000;

/* Global work arrays. In this snippet s115() modifies only a[] and reads
 * aa[][]; b..e, bb and cc are merely passed to dummy(), and dd is unused. */
float a[LEN], b[LEN], c[LEN], d[LEN], e[LEN];
float aa[LEN2][LEN2], bb[LEN2][LEN2], cc[LEN2][LEN2], dd[LEN2][LEN2];

/* Declared only; the definition is not part of this snippet.
 * NOTE(review): presumably an opaque call that keeps the compiler from
 * discarding the repeated kernel -- confirm against the TSVC sources. */
int dummy(float[LEN], float[LEN], float[LEN], float[LEN], float[LEN],
          float[LEN2][LEN2], float[LEN2][LEN2], float[LEN2][LEN2], float);

/*
 * TSVC kernel s115, the reproducer for this report.
 *
 * For each j, the inner loop subtracts aa[j][i] * a[j] from every a[i] with
 * i in [j+1, LEN2). Because i always starts above j, the inner loop never
 * stores to a[j], so a[j] keeps the same value for the whole inner loop.
 *
 * Always returns 0.
 */
int s115()
{
        /* Repeat the kernel; the trip count is derived from ntimes and LEN2. */
        for (int nl = 0; nl < 1000*(ntimes/LEN2); nl++) {
                for (int j = 0; j < LEN2; j++) {
                        /* i begins at j+1, so the load of a[j] and the stores
                         * to a[i] never touch the same element in this loop. */
                        for (int i = j+1; i < LEN2; i++) {
                                a[i] -= aa[j][i] * a[j];
                        }
                }
                /* dummy() is defined elsewhere; it receives the arrays after
                 * each full pass over j. */
                dummy(a, b, c, d, e, aa, bb, cc, 0.);
        }
        return 0;
}

See also (Clang vs GCC): https://godbolt.org/z/h8WE5raja

for.body IR:

for.body8.lr.ph:                                  ; preds = %for.body4
  %arrayidx12 = getelementptr inbounds [32000 x float], ptr @a, i64 0, i64 %indvars.iv37
  br label %for.body8, !dbg !26

for.body8:                                        ; preds = %for.body8.lr.ph, %for.body8
  %indvars.iv34 = phi i64 [ %indvars.iv, %for.body8.lr.ph ], [ %indvars.iv.next35, %for.body8 ]
  %arrayidx10 = getelementptr inbounds [256 x [256 x float]], ptr @aa, i64 0, i64 %indvars.iv37, i64 %indvars.iv34, !dbg !27
  %0 = load float, ptr %arrayidx10, align 4, !dbg !27, !tbaa !28
  %1 = load float, ptr %arrayidx12, align 4, !dbg !32, !tbaa !28
  %mul13 = fmul fast float %1, %0, !dbg !33
  %arrayidx15 = getelementptr inbounds [32000 x float], ptr @a, i64 0, i64 %indvars.iv34, !dbg !34
  %2 = load float, ptr %arrayidx15, align 4, !dbg !35, !tbaa !28
  %sub = fsub fast float %2, %mul13, !dbg !35
  store float %sub, ptr %arrayidx15, align 4, !dbg !35, !tbaa !28
  %indvars.iv.next35 = add nuw nsw i64 %indvars.iv34, 1, !dbg !36
  %exitcond.not = icmp eq i64 %indvars.iv.next35, 256, !dbg !25
  br i1 %exitcond.not, label %for.cond1.loopexit.loopexit, label %for.body8, !dbg !26, !llvm.loop !37
}

-mllvm -debug-only=loop-accesses messages:

LAA: Found a runtime check ptr:  %arrayidx15 = getelementptr inbounds [32000 x float], ptr @a, i64 0, i64 %indvars.iv34, !dbg !34
LAA: Found a runtime check ptr:  %arrayidx12 = getelementptr inbounds [32000 x float], ptr @a, i64 0, i64 %indvars.iv37
LAA: We need to do 0 pointer comparisons.
LAA: May be able to perform a memory runtime check if needed.
LAA: Checking memory dependencies
LAA: Bad stride - Not striding over innermost loop   %arrayidx12 = getelementptr inbounds [32000 x float], ptr @a, i64 0, i64 %indvars.iv37 SCEV: {@a,+,4}<nw><%for.body4>
LAA: Src Scev: {@a,+,4}<nw><%for.body4>Sink Scev: {{(4 + @a)<nuw>,+,4}<nw><%for.body4>,+,4}<nuw><%for.body8>(Induction step: 0)
LAA: Distance for   %1 = load float, ptr %arrayidx12, align 4, !dbg !32, !tbaa !28 to   store float %sub, ptr %arrayidx15, align 4, !dbg !35, !tbaa !28: {4,+,4}<nw><%for.body8>
Pointer access with non-constant stride
LAA: Src Scev: {{(4 + @a)<nuw>,+,4}<nw><%for.body4>,+,4}<nuw><%for.body8>Sink Scev: {{(4 + @a)<nuw>,+,4}<nw><%for.body4>,+,4}<nuw><%for.body8>(Induction step: 1)
LAA: Distance for   %2 = load float, ptr %arrayidx15, align 4, !dbg !35, !tbaa !28 to   store float %sub, ptr %arrayidx15, align 4, !dbg !35, !tbaa !28: 0
Total Dependences: 2
LAA: unsafe dependent memory operations in loop

Two dependences appear to be the problem: 1) the load and store of a[i], and 2) the store to a[i] and the load of a[j].

GCC 13 is able to vectorize the loop using masked (predicated) vector loads and stores.

.L4:
        ld1w    z1.s, p0/z, [x8, x0, lsl 2]
        ld1w    z0.s, p0/z, [x10, x0, lsl 2]
        fmsb    z0.s, p1/m, z2.s, z1.s
        st1w    z0.s, p0, [x8, x0, lsl 2]
        add     x0, x0, x20
        whilelo p0.s, w0, w9
        b.any   .L4
llvmbot commented 9 months ago

@llvm/issue-subscribers-backend-aarch64

Author: m-saito-fj (m-saito-fj)

Clang cannot SVE vectorize TSVC s115, but GCC13.2.0 can.

Option: `-Ofast -march=armv8.2-a+sve`

```c
#define LEN 32000
#define LEN2 256
static int ntimes = 200000;

float a[LEN], b[LEN], c[LEN], d[LEN], e[LEN];
float aa[LEN2][LEN2], bb[LEN2][LEN2], cc[LEN2][LEN2], dd[LEN2][LEN2];

int dummy(float[LEN], float[LEN], float[LEN], float[LEN], float[LEN],
          float[LEN2][LEN2], float[LEN2][LEN2], float[LEN2][LEN2], float);

int s115()
{
        for (int nl = 0; nl < 1000*(ntimes/LEN2); nl++) {
                for (int j = 0; j < LEN2; j++) {
                        for (int i = j+1; i < LEN2; i++) {
                                a[i] -= aa[j][i] * a[j];
                        }
                }
                dummy(a, b, c, d, e, aa, bb, cc, 0.);
        }
        return 0;
}
```

See also (Clang vs GCC): https://godbolt.org/z/h8WE5raja

for.body IR:

```llvm
for.body8.lr.ph:                                  ; preds = %for.body4
  %arrayidx12 = getelementptr inbounds [32000 x float], ptr @a, i64 0, i64 %indvars.iv37
  br label %for.body8, !dbg !26

for.body8:                                        ; preds = %for.body8.lr.ph, %for.body8
  %indvars.iv34 = phi i64 [ %indvars.iv, %for.body8.lr.ph ], [ %indvars.iv.next35, %for.body8 ]
  %arrayidx10 = getelementptr inbounds [256 x [256 x float]], ptr @aa, i64 0, i64 %indvars.iv37, i64 %indvars.iv34, !dbg !27
  %0 = load float, ptr %arrayidx10, align 4, !dbg !27, !tbaa !28
  %1 = load float, ptr %arrayidx12, align 4, !dbg !32, !tbaa !28
  %mul13 = fmul fast float %1, %0, !dbg !33
  %arrayidx15 = getelementptr inbounds [32000 x float], ptr @a, i64 0, i64 %indvars.iv34, !dbg !34
  %2 = load float, ptr %arrayidx15, align 4, !dbg !35, !tbaa !28
  %sub = fsub fast float %2, %mul13, !dbg !35
  store float %sub, ptr %arrayidx15, align 4, !dbg !35, !tbaa !28
  %indvars.iv.next35 = add nuw nsw i64 %indvars.iv34, 1, !dbg !36
  %exitcond.not = icmp eq i64 %indvars.iv.next35, 256, !dbg !25
  br i1 %exitcond.not, label %for.cond1.loopexit.loopexit, label %for.body8, !dbg !26, !llvm.loop !37
}
```

`-mllvm -debug-only=loop-accesses` messages:

```
LAA: Found a runtime check ptr:  %arrayidx15 = getelementptr inbounds [32000 x float], ptr @a, i64 0, i64 %indvars.iv34, !dbg !34
LAA: Found a runtime check ptr:  %arrayidx12 = getelementptr inbounds [32000 x float], ptr @a, i64 0, i64 %indvars.iv37
LAA: We need to do 0 pointer comparisons.
LAA: May be able to perform a memory runtime check if needed.
LAA: Checking memory dependencies
LAA: Bad stride - Not striding over innermost loop   %arrayidx12 = getelementptr inbounds [32000 x float], ptr @a, i64 0, i64 %indvars.iv37 SCEV: {@a,+,4}<nw><%for.body4>
LAA: Src Scev: {@a,+,4}<nw><%for.body4>Sink Scev: {{(4 + @a)<nuw>,+,4}<nw><%for.body4>,+,4}<nuw><%for.body8>(Induction step: 0)
LAA: Distance for   %1 = load float, ptr %arrayidx12, align 4, !dbg !32, !tbaa !28 to   store float %sub, ptr %arrayidx15, align 4, !dbg !35, !tbaa !28: {4,+,4}<nw><%for.body8>
Pointer access with non-constant stride
LAA: Src Scev: {{(4 + @a)<nuw>,+,4}<nw><%for.body4>,+,4}<nuw><%for.body8>Sink Scev: {{(4 + @a)<nuw>,+,4}<nw><%for.body4>,+,4}<nuw><%for.body8>(Induction step: 1)
LAA: Distance for   %2 = load float, ptr %arrayidx15, align 4, !dbg !35, !tbaa !28 to   store float %sub, ptr %arrayidx15, align 4, !dbg !35, !tbaa !28: 0
Total Dependences: 2
LAA: unsafe dependent memory operations in loop
```

There seems to be a problem with the two dependencies. 1) Load and store for a[i] 2) Store for a[i] and load for a[j]

GCC13 is able to vectorize by vector masked load/store.

```asm
.L4:
        ld1w    z1.s, p0/z, [x8, x0, lsl 2]
        ld1w    z0.s, p0/z, [x10, x0, lsl 2]
        fmsb    z0.s, p1/m, z2.s, z1.s
        st1w    z0.s, p0, [x8, x0, lsl 2]
        add     x0, x0, x20
        whilelo p0.s, w0, w9
        b.any   .L4
```
fhahn commented 4 months ago

Looks like this gets vectorized now after some LoopAccessAnalysis improvements.