llvm / llvm-project

The LLVM Project is a collection of modular and reusable compiler and toolchain technologies.
http://llvm.org
Other
27.97k stars 11.54k forks source link

LoopVectorizePass is not working for riscv #101456

Closed 4465464 closed 1 month ago

4465464 commented 1 month ago

In the following case, the assembly is generated for both X86 and RISC-V using clang. By using -mllvm -print-after-all to print the IR after all passes, it is observed that X86 generates vectorized IR instructions, but RISC-V does not generate vectorized instructions, and the LoopVectorizePass does not take effect.

The case:

#include <stdlib.h>

#define N 1024

int a[N*2] ; int b[N*2] ; int c[N*2] ; int d[N*2] ;

void example1 () { int i; for (i=0; i<256; i++){ a[i] = b[i] + c[i]; } }

The compilation commands are:

clang test_loop.c -O3 --target=x86_64-unknown-linux-gnu -mllvm -print-after-all

clang test_loop.c -O3 --target=riscv64-unknown-linux-gnu -mllvm -print-after-all

For X86, after the LoopVectorizePass, the LLVM IR generates vectorized instructions. However, for RISC-V, after the LoopVectorizePass, the LLVM IR remains unchanged.

The LLVM IR of x86 after the LoopVectorizePass is:

; *** IR Dump After InjectTLIMappings on example1 *** ; Function Attrs: nofree norecurse nosync nounwind memory(readwrite, argmem: none, inaccessiblemem: none) uwtable define dso_local void @example1() local_unnamed_addr #0 { entry: br label %for.body

for.body: ; preds = %entry, %for.body %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] %arrayidx = getelementptr inbounds [2048 x i32], ptr @b, i64 0, i64 %indvars.iv %0 = load i32, ptr %arrayidx, align 4, !tbaa !5 %arrayidx2 = getelementptr inbounds [2048 x i32], ptr @c, i64 0, i64 %indvars.iv %1 = load i32, ptr %arrayidx2, align 4, !tbaa !5 %add = add nsw i32 %1, %0 %arrayidx4 = getelementptr inbounds [2048 x i32], ptr @a, i64 0, i64 %indvars.iv store i32 %add, ptr %arrayidx4, align 4, !tbaa !5 %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 %exitcond.not = icmp eq i64 %indvars.iv.next, 256 br i1 %exitcond.not, label %for.end, label %for.body, !llvm.loop !9

for.end: ; preds = %for.body ret void } ; *** IR Dump After LoopVectorizePass on example1 *** ; Function Attrs: nofree norecurse nosync nounwind memory(readwrite, argmem: none, inaccessiblemem: none) uwtable define dso_local void @example1() local_unnamed_addr #0 { entry: br i1 false, label %scalar.ph, label %vector.ph

vector.ph: ; preds = %entry br label %vector.body

vector.body: ; preds = %vector.body, %vector.ph %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ] %0 = add i64 %index, 0 %1 = add i64 %index, 4 %2 = getelementptr inbounds [2048 x i32], ptr @b, i64 0, i64 %0 %3 = getelementptr inbounds [2048 x i32], ptr @b, i64 0, i64 %1 %4 = getelementptr inbounds i32, ptr %2, i32 0 %5 = getelementptr inbounds i32, ptr %2, i32 4 %wide.load = load <4 x i32>, ptr %4, align 4, !tbaa !5 %wide.load11 = load <4 x i32>, ptr %5, align 4, !tbaa !5 %6 = getelementptr inbounds [2048 x i32], ptr @c, i64 0, i64 %0 %7 = getelementptr inbounds [2048 x i32], ptr @c, i64 0, i64 %1 %8 = getelementptr inbounds i32, ptr %6, i32 0 %9 = getelementptr inbounds i32, ptr %6, i32 4 %wide.load12 = load <4 x i32>, ptr %8, align 4, !tbaa !5 %wide.load13 = load <4 x i32>, ptr %9, align 4, !tbaa !5 %10 = add nsw <4 x i32> %wide.load12, %wide.load %11 = add nsw <4 x i32> %wide.load13, %wide.load11 %12 = getelementptr inbounds [2048 x i32], ptr @a, i64 0, i64 %0 %13 = getelementptr inbounds [2048 x i32], ptr @a, i64 0, i64 %1 %14 = getelementptr inbounds i32, ptr %12, i32 0 %15 = getelementptr inbounds i32, ptr %12, i32 4 store <4 x i32> %10, ptr %14, align 4, !tbaa !5 store <4 x i32> %11, ptr %15, align 4, !tbaa !5 %index.next = add nuw i64 %index, 8 %16 = icmp eq i64 %index.next, 256 br i1 %16, label %middle.block, label %vector.body, !llvm.loop !9

middle.block: ; preds = %vector.body br i1 true, label %for.end, label %scalar.ph

scalar.ph: ; preds = %middle.block, %entry %bc.resume.val = phi i64 [ 256, %middle.block ], [ 0, %entry ] br label %for.body

for.body: ; preds = %scalar.ph, %for.body %indvars.iv = phi i64 [ %bc.resume.val, %scalar.ph ], [ %indvars.iv.next, %for.body ] %arrayidx = getelementptr inbounds [2048 x i32], ptr @b, i64 0, i64 %indvars.iv %17 = load i32, ptr %arrayidx, align 4, !tbaa !5 %arrayidx2 = getelementptr inbounds [2048 x i32], ptr @c, i64 0, i64 %indvars.iv %18 = load i32, ptr %arrayidx2, align 4, !tbaa !5 %add = add nsw i32 %18, %17 %arrayidx4 = getelementptr inbounds [2048 x i32], ptr @a, i64 0, i64 %indvars.iv store i32 %add, ptr %arrayidx4, align 4, !tbaa !5 %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 %exitcond.not = icmp eq i64 %indvars.iv.next, 256 br i1 %exitcond.not, label %for.end, label %for.body, !llvm.loop !13

for.end: ; preds = %middle.block, %for.body ret void }

The LLVM IR of RISC-V after the LoopVectorizePass is:

; *** IR Dump After InjectTLIMappings on example1 *** ; Function Attrs: nofree norecurse nosync nounwind memory(readwrite, argmem: none, inaccessiblemem: none) uwtable define dso_local void @example1() local_unnamed_addr #0 { br label %1

1: ; preds = %0, %1 %2 = phi i64 [ 0, %0 ], [ %9, %1 ] %3 = getelementptr inbounds [2048 x i32], ptr @b, i64 0, i64 %2 %4 = load i32, ptr %3, align 4, !tbaa !9 %5 = getelementptr inbounds [2048 x i32], ptr @c, i64 0, i64 %2 %6 = load i32, ptr %5, align 4, !tbaa !9 %7 = add nsw i32 %6, %4 %8 = getelementptr inbounds [2048 x i32], ptr @a, i64 0, i64 %2 store i32 %7, ptr %8, align 4, !tbaa !9 %9 = add nuw nsw i64 %2, 1 %10 = icmp eq i64 %9, 256 br i1 %10, label %11, label %1, !llvm.loop !13

11: ; preds = %1 ret void } ; *** IR Dump After LoopVectorizePass on example1 *** ; Function Attrs: nofree norecurse nosync nounwind memory(readwrite, argmem: none, inaccessiblemem: none) uwtable define dso_local void @example1() local_unnamed_addr #0 { br label %1

1: ; preds = %0, %1 %2 = phi i64 [ 0, %0 ], [ %9, %1 ] %3 = getelementptr inbounds [2048 x i32], ptr @b, i64 0, i64 %2 %4 = load i32, ptr %3, align 4, !tbaa !9 %5 = getelementptr inbounds [2048 x i32], ptr @c, i64 0, i64 %2 %6 = load i32, ptr %5, align 4, !tbaa !9 %7 = add nsw i32 %6, %4 %8 = getelementptr inbounds [2048 x i32], ptr @a, i64 0, i64 %2 store i32 %7, ptr %8, align 4, !tbaa !9 %9 = add nuw nsw i64 %2, 1 %10 = icmp eq i64 %9, 256 br i1 %10, label %11, label %1, !llvm.loop !13

11: ; preds = %1 ret void }

It can be observed that x86 modifies LLVM IR, but why is there no vectorization of LLVM IR for RISC-V?

topperc commented 1 month ago

Did you try with -march=rv64gcv? RISC-V doesn't always have vector support. X86-64 always has at least SSE2.

llvmbot commented 1 month ago

@llvm/issue-subscribers-backend-risc-v

Author: None (4465464)

In the following case, the assembly is generated for both X86 and RISC-V using clang. By using -mllvm -print-after-all to print the IR after all passes, it is observed that X86 generates vectorized IR instructions, but RISC-V does not generate vectorized instructions, and the LoopVectorizePass does not take effect. The case: #include <stdlib.h> #define N 1024 int a[N*2] ; int b[N*2] ; int c[N*2] ; int d[N*2] ; void example1 () { int i; for (i=0; i<256; i++){ a[i] = b[i] + c[i]; } } The compilation command is: clang test_loop.c -O3 --target=x86_64-unknown-linux-gnu -mllvm -print-after-all clang test_loop.c -O3 --target=riscv64-unknown-linux-gnu -mllvm -print-after-all For X86, after the LoopVectorizePass, the LLVM IR generates vectorized instructions. However, for RISC-V, after the LoopVectorizePass, the LLVM IR remains unchanged. The LLVM IR of x86 after the LoopVectorizePass is: ; *** IR Dump After InjectTLIMappings on example1 *** ; Function Attrs: nofree norecurse nosync nounwind memory(readwrite, argmem: none, inaccessiblemem: none) uwtable define dso_local void @example1() local_unnamed_addr #0 { entry: br label %for.body for.body: ; preds = %entry, %for.body %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] %arrayidx = getelementptr inbounds [2048 x i32], ptr @b, i64 0, i64 %indvars.iv %0 = load i32, ptr %arrayidx, align 4, !tbaa !5 %arrayidx2 = getelementptr inbounds [2048 x i32], ptr @c, i64 0, i64 %indvars.iv %1 = load i32, ptr %arrayidx2, align 4, !tbaa !5 %add = add nsw i32 %1, %0 %arrayidx4 = getelementptr inbounds [2048 x i32], ptr @a, i64 0, i64 %indvars.iv store i32 %add, ptr %arrayidx4, align 4, !tbaa !5 %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 %exitcond.not = icmp eq i64 %indvars.iv.next, 256 br i1 %exitcond.not, label %for.end, label %for.body, !llvm.loop !9 for.end: ; preds = %for.body ret void } ; *** IR Dump After LoopVectorizePass on example1 *** ; Function Attrs: nofree norecurse nosync nounwind memory(readwrite, 
argmem: none, inaccessiblemem: none) uwtable define dso_local void @example1() local_unnamed_addr #0 { entry: br i1 false, label %scalar.ph, label %vector.ph vector.ph: ; preds = %entry br label %vector.body vector.body: ; preds = %vector.body, %vector.ph %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ] %0 = add i64 %index, 0 %1 = add i64 %index, 4 %2 = getelementptr inbounds [2048 x i32], ptr @b, i64 0, i64 %0 %3 = getelementptr inbounds [2048 x i32], ptr @b, i64 0, i64 %1 %4 = getelementptr inbounds i32, ptr %2, i32 0 %5 = getelementptr inbounds i32, ptr %2, i32 4 %wide.load = load <4 x i32>, ptr %4, align 4, !tbaa !5 %wide.load11 = load <4 x i32>, ptr %5, align 4, !tbaa !5 %6 = getelementptr inbounds [2048 x i32], ptr @c, i64 0, i64 %0 %7 = getelementptr inbounds [2048 x i32], ptr @c, i64 0, i64 %1 %8 = getelementptr inbounds i32, ptr %6, i32 0 %9 = getelementptr inbounds i32, ptr %6, i32 4 %wide.load12 = load <4 x i32>, ptr %8, align 4, !tbaa !5 %wide.load13 = load <4 x i32>, ptr %9, align 4, !tbaa !5 %10 = add nsw <4 x i32> %wide.load12, %wide.load %11 = add nsw <4 x i32> %wide.load13, %wide.load11 %12 = getelementptr inbounds [2048 x i32], ptr @a, i64 0, i64 %0 %13 = getelementptr inbounds [2048 x i32], ptr @a, i64 0, i64 %1 %14 = getelementptr inbounds i32, ptr %12, i32 0 %15 = getelementptr inbounds i32, ptr %12, i32 4 store <4 x i32> %10, ptr %14, align 4, !tbaa !5 store <4 x i32> %11, ptr %15, align 4, !tbaa !5 %index.next = add nuw i64 %index, 8 %16 = icmp eq i64 %index.next, 256 br i1 %16, label %middle.block, label %vector.body, !llvm.loop !9 middle.block: ; preds = %vector.body br i1 true, label %for.end, label %scalar.ph scalar.ph: ; preds = %middle.block, %entry %bc.resume.val = phi i64 [ 256, %middle.block ], [ 0, %entry ] br label %for.body for.body: ; preds = %scalar.ph, %for.body %indvars.iv = phi i64 [ %bc.resume.val, %scalar.ph ], [ %indvars.iv.next, %for.body ] %arrayidx = getelementptr inbounds [2048 x i32], ptr @b, i64 0, 
i64 %indvars.iv %17 = load i32, ptr %arrayidx, align 4, !tbaa !5 %arrayidx2 = getelementptr inbounds [2048 x i32], ptr @c, i64 0, i64 %indvars.iv %18 = load i32, ptr %arrayidx2, align 4, !tbaa !5 %add = add nsw i32 %18, %17 %arrayidx4 = getelementptr inbounds [2048 x i32], ptr @a, i64 0, i64 %indvars.iv store i32 %add, ptr %arrayidx4, align 4, !tbaa !5 %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 %exitcond.not = icmp eq i64 %indvars.iv.next, 256 br i1 %exitcond.not, label %for.end, label %for.body, !llvm.loop !13 for.end: ; preds = %middle.block, %for.body ret void } The LLVM IR of Riscv after the LoopVectorizePass is: ; *** IR Dump After InjectTLIMappings on example1 *** ; Function Attrs: nofree norecurse nosync nounwind memory(readwrite, argmem: none, inaccessiblemem: none) uwtable define dso_local void @example1() local_unnamed_addr #0 { br label %1 1: ; preds = %0, %1 %2 = phi i64 [ 0, %0 ], [ %9, %1 ] %3 = getelementptr inbounds [2048 x i32], ptr @b, i64 0, i64 %2 %4 = load i32, ptr %3, align 4, !tbaa !9 %5 = getelementptr inbounds [2048 x i32], ptr @c, i64 0, i64 %2 %6 = load i32, ptr %5, align 4, !tbaa !9 %7 = add nsw i32 %6, %4 %8 = getelementptr inbounds [2048 x i32], ptr @a, i64 0, i64 %2 store i32 %7, ptr %8, align 4, !tbaa !9 %9 = add nuw nsw i64 %2, 1 %10 = icmp eq i64 %9, 256 br i1 %10, label %11, label %1, !llvm.loop !13 11: ; preds = %1 ret void } ; *** IR Dump After LoopVectorizePass on example1 *** ; Function Attrs: nofree norecurse nosync nounwind memory(readwrite, argmem: none, inaccessiblemem: none) uwtable define dso_local void @example1() local_unnamed_addr #0 { br label %1 1: ; preds = %0, %1 %2 = phi i64 [ 0, %0 ], [ %9, %1 ] %3 = getelementptr inbounds [2048 x i32], ptr @b, i64 0, i64 %2 %4 = load i32, ptr %3, align 4, !tbaa !9 %5 = getelementptr inbounds [2048 x i32], ptr @c, i64 0, i64 %2 %6 = load i32, ptr %5, align 4, !tbaa !9 %7 = add nsw i32 %6, %4 %8 = getelementptr inbounds [2048 x i32], ptr @a, i64 0, i64 %2 store i32 %7, ptr 
%8, align 4, !tbaa !9 %9 = add nuw nsw i64 %2, 1 %10 = icmp eq i64 %9, 256 br i1 %10, label %11, label %1, !llvm.loop !13 11: ; preds = %1 ret void } It can be observed that x86 modifies LLVM IR, but why is there no vectorization of LLVM IR for RISC-V?
4465464 commented 1 month ago

Yes, using -march=rv64gcv will yield the correct vectorization instructions.