llvm / llvm-project

The LLVM Project is a collection of modular and reusable compiler and toolchain technologies.
http://llvm.org

[AArch64][SVE] Cannot be vectorized because the dependent distance is not constant (TSVC, s114) #80139

Open m-saito-fj opened 9 months ago

m-saito-fj commented 9 months ago

Clang cannot vectorize TSVC s114 for SVE, but GCC 13.2.0 can.

Option: -Ofast -march=armv8.2-a+sve

#define LEN 32000
#define LEN2 256
static int ntimes = 200000;

float a[LEN], b[LEN], c[LEN], d[LEN], e[LEN];
float aa[LEN2][LEN2], bb[LEN2][LEN2], cc[LEN2][LEN2], dd[LEN2][LEN2];

int dummy(float[LEN], float[LEN], float[LEN], float[LEN], float[LEN],
          float[LEN2][LEN2], float[LEN2][LEN2], float[LEN2][LEN2], float);

int s114()
{
        for (int nl = 0; nl < 200*(ntimes/(LEN2)); nl++) {
                for (int i = 0; i < LEN2; i++) {
                        for (int j = 0; j < i; j++) {
                                aa[i][j] = aa[j][i] + bb[i][j];
                        }
                }
                dummy(a, b, c, d, e, aa, bb, cc, 0.);
        }
        return 0;
}

See also (Clang vs GCC): https://godbolt.org/z/frcoG3xz5

for.body IR:

for.body8:                                        ; preds = %for.body8.preheader, %for.body8
  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body8 ], [ 0, %for.body8.preheader ]
  %arrayidx10 = getelementptr inbounds [256 x [256 x float]], ptr @aa, i64 0, i64 %indvars.iv, i64 %indvars.iv40, !dbg !27
  %0 = load float, ptr %arrayidx10, align 4, !dbg !27, !tbaa !28
  %arrayidx14 = getelementptr inbounds [256 x [256 x float]], ptr @bb, i64 0, i64 %indvars.iv40, i64 %indvars.iv, !dbg !32
  %1 = load float, ptr %arrayidx14, align 4, !dbg !32, !tbaa !28
  %add = fadd fast float %1, %0, !dbg !33
  %arrayidx18 = getelementptr inbounds [256 x [256 x float]], ptr @aa, i64 0, i64 %indvars.iv40, i64 %indvars.iv, !dbg !34
  store float %add, ptr %arrayidx18, align 4, !dbg !35, !tbaa !28
  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1, !dbg !36
  %exitcond.not = icmp eq i64 %indvars.iv.next, %indvars.iv40, !dbg !15
  br i1 %exitcond.not, label %for.cond.cleanup7.loopexit, label %for.body8, !dbg !16, !llvm.loop !37

-mllvm -debug-only=loop-accesses messages:

LAA: Found a runtime check ptr:  %arrayidx18 = getelementptr inbounds [256 x [256 x float]], ptr @aa, i64 0, i64 %indvars.iv40, i64 %indvars.iv, !dbg !34
LAA: Found a runtime check ptr:  %arrayidx10 = getelementptr inbounds [256 x [256 x float]], ptr @aa, i64 0, i64 %indvars.iv, i64 %indvars.iv40, !dbg !27
LAA: We need to do 0 pointer comparisons.
LAA: May be able to perform a memory runtime check if needed.
LAA: Checking memory dependencies
LAA: Src Scev: {{@aa,+,4}<nw><%for.cond5.preheader>,+,1024}<nuw><%for.body8>Sink Scev: {{@aa,+,1024}<nw><%for.cond5.preheader>,+,4}<nuw><%for.body8>(Induction step: 256)
LAA: Distance for   %0 = load float, ptr %arrayidx10, align 4, !dbg !27, !tbaa !28 to   store float %add, ptr %arrayidx18, align 4, !dbg !35, !tbaa !28: {{0,+,1020}<%for.cond5.preheader>,+,-1020}<%for.body8>
Pointer access with non-constant stride
Total Dependences: 1
LAA: unsafe dependent memory operations in loop

The direct cause seems to be that the dependence distance between the load of aa[j][i] and the store to aa[i][j] is not constant.

On the other hand, GCC 13 is able to vectorize it using predicated (masked) SVE loads/stores, including a gather load:

.L4:
        ld1w    z31.s, p7/z, [x8, z29.s, uxtw]   // gather load of aa[j][i] (base x8 + offset vector z29)
        ld1w    z30.s, p7/z, [x11, x0, lsl 2]    // contiguous load of bb[i][j]
        fadd    z31.s, z31.s, z30.s
        st1w    z31.s, p7, [x10, x0, lsl 2]      // contiguous store to aa[i][j]
        add     x8, x8, x23
        add     x0, x0, x12
        whilelo p7.s, w0, w9                     // loop-control predicate: lanes active while w0 < w9
        b.any   .L4                              // loop again while any lane is active
llvmbot commented 9 months ago

@llvm/issue-subscribers-backend-aarch64

aemerson commented 9 months ago

cc @fhahn

efriedma-quic commented 9 months ago

"unsafe dependent memory operations" means it can't disambiguate that aa[i][j] and aa[j][i] can't alias. Figuring that out requires reasoning about the relationship between i and j (specifically, that i<LEN2 and j<i implies j*LEN2+i<i*LEN2).

In non-SVE mode, it looks like gcc concludes it's legal to vectorize, but refuses because it doesn't know how to generate the strided load.

If you adjust the loop to avoid the aliasing issue (change "aa" to "cc"), LLVM's vectorizer can handle the loop, whether or not the target has gather loads. (If it doesn't, it just generates the load element by element.) The cost computation is a bit weird, though: it prefers to use 64-bit NEON registers. And if you force it to use SVE, it generates the gather in an inefficient way.
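
A sketch of one reading of that adjustment (the function name is illustrative; the loads now come from cc and bb, so the store to aa cannot alias them):

#define LEN2 256
float aa[LEN2][LEN2], bb[LEN2][LEN2], cc[LEN2][LEN2];

int s114_noalias(void)
{
        for (int i = 0; i < LEN2; i++) {
                for (int j = 0; j < i; j++) {
                        /* cc[j][i] is the strided (gather) access; the store is contiguous */
                        aa[i][j] = cc[j][i] + bb[i][j];
                }
        }
        return 0;
}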

mrdaybird commented 4 months ago

I was looking into this issue. I find this test case very important because if you can vectorize this code, you can vectorize a matrix transpose, which is heavily used in ML and linear-algebra workloads. Also, I feel this may be a case that is only profitable to vectorize with scalable vectors.
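
For concreteness, an in-place transpose has exactly this triangular i/j pattern, touching both m[i][j] and m[j][i] (a minimal sketch; the function name is illustrative):

#define LEN2 256

void transpose_inplace(float m[LEN2][LEN2])
{
        for (int i = 0; i < LEN2; i++) {
                for (int j = 0; j < i; j++) {
                        /* swap the lower- and upper-triangle elements */
                        float t = m[i][j];
                        m[i][j] = m[j][i];
                        m[j][i] = t;
                }
        }
}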

mrdaybird commented 4 months ago

Simplified test case: https://godbolt.org/z/d3rf1brha
Corresponding LLVM IR: https://llvm.godbolt.org/z/x6e91hEjE
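
In C terms, the IR below corresponds to roughly the following kernel (reconstructed from the IR, so it may differ cosmetically from the source behind the link):

float aa[256][256];

void s114_simplified(void)
{
        for (int i = 0; i < 256; i++) {
                for (int j = 0; j < i; j++) {
                        /* copy the upper triangle into the lower triangle */
                        aa[i][j] = aa[j][i];
                }
        }
}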

@aa = dso_local local_unnamed_addr global [256 x [256 x float]] zeroinitializer, align 4

define dso_local void @s114_simplified() local_unnamed_addr {
entry:
  br label %for.cond1.preheader

for.cond1.preheader:
  %indvars.iv25 = phi i64 [ 0, %entry ], [ %indvars.iv.next26, %for.cond.cleanup3 ]
  %cmp221.not = icmp eq i64 %indvars.iv25, 0
  br i1 %cmp221.not, label %for.cond.cleanup3, label %for.body4

for.cond.cleanup:
  ret void

for.cond.cleanup3:
  %indvars.iv.next26 = add nuw nsw i64 %indvars.iv25, 1
  %exitcond28.not = icmp eq i64 %indvars.iv.next26, 256
  br i1 %exitcond28.not, label %for.cond.cleanup, label %for.cond1.preheader

for.body4:
  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body4 ], [ 0, %for.cond1.preheader ]
  %arrayidx6 = getelementptr inbounds [256 x [256 x float]], ptr @aa, i64 0, i64 %indvars.iv, i64 %indvars.iv25
  %0 = load float, ptr %arrayidx6, align 4
  %arrayidx10 = getelementptr inbounds [256 x [256 x float]], ptr @aa, i64 0, i64 %indvars.iv25, i64 %indvars.iv
  store float %0, ptr %arrayidx10, align 4
  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
  %exitcond.not = icmp eq i64 %indvars.iv.next, %indvars.iv25
  br i1 %exitcond.not, label %for.cond.cleanup3, label %for.body4
}
mrdaybird commented 4 months ago

@m-saito-fj Can you benchmark this with SVE (gcc vs clang)? I was interested to know whether there is any speedup at all, since strided accesses can sometimes be unprofitable to vectorize.

m-saito-fj commented 4 months ago

I measured the kernel section as follows.

start_t = clock();
for (int nl = 0; nl < 200*(ntimes/(LEN2)); nl++) {
        for (int i = 0; i < LEN2; i++) {
                for (int j = 0; j < i; j++) {
                        aa[i][j] = aa[j][i] + bb[i][j];
                }
        }
        dummy(a, b, c, d, e, aa, bb, cc, 0.);
}
end_t = clock(); clock_dif = end_t - start_t;
clock_dif_sec = (double) (clock_dif/1000000.0);
printf("S114\t %.2f \t\t", clock_dif_sec);

CPU: Grace
Compiler option: -Ofast -march=armv9-a

Result:

compiler        time (s)
gcc13.2.1       3.07
clang18.1.7     3.38

Assembly code: https://godbolt.org/z/d767P696d (the link uses compilers with slightly different minor versions, but the kernel part does not differ from the code used in the measurement).

mrdaybird commented 4 months ago

After looking into this, I think there are multiple things that need to be cleared up for this optimization to work:

  1. LAA doesn't handle cases where the distance between source and sink is not constant (e.g. an AddRec). This can be handled easily in general, but this particular case requires some tricks to make it work. I believe this will be handled better when mlir-presburger gets used in the future.

  2. LAA fails when the dependence is positive and there is no common stride between source and sink. We could make this work. GCC vectorizes this case, but for the cases I tested the benchmarks show that it is not profitable and the scalar code is much faster, though this particular test case shows some speedup.

  3. After tackling the issues above, the final problem is that the MinDistanceNeeded for vectorizing the load (1028 bytes) is greater than the minimum dependence distance (1020 bytes). But somehow GCC vectorizes this (only for SVE). I have yet to analyze this, but GCC apparently finds it legal to vectorize this case even though the max safe VF is 0. (Some arithmetic on these two numbers is sketched after this list.)

    gentle ping to @efriedma-quic @fhahn for thoughts on the last point.
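
For reference, a rough reading of where those two numbers might come from, based on the accesses themselves rather than on LAA internals: the minimum dependence distance 1020 = 4*255 bytes is one row stride minus one element, reached for adjacent rows (i - j == 1), while 1028 = 4*256 + 4 bytes is one full row stride plus one element. A self-contained check of the 1020 figure, assuming LEN2 == 256:

#include <assert.h>
#define LEN2 256

int main(void)
{
        /* Smallest byte distance between the store to aa[i][j] and the
           load of aa[j][i], over all inner-loop iterations (0 <= j < i). */
        long min_dist = -1;
        for (int i = 1; i < LEN2; i++) {
                for (int j = 0; j < i; j++) {
                        long dist = 4L * ((i * (long)LEN2 + j) - (j * (long)LEN2 + i));
                        if (min_dist < 0 || dist < min_dist)
                                min_dist = dist;
                }
        }
        assert(min_dist == 1020);  /* matches the minimum dependence distance LAA reports */
        return 0;
}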