llvm / llvm-project

The LLVM Project is a collection of modular and reusable compiler and toolchain technologies.
http://llvm.org

[VPlan] Report "Assertion `!State->VF.isScalable() && "VF is assumed to be non scalable."' failed" #94328

Closed · eastB233 closed this issue 1 week ago

eastB233 commented 1 month ago

The failing IR is at the end of this comment.

The compile command is:

opt -passes=loop-vectorize -prefer-predicate-over-epilogue=predicate-else-scalar-epilogue

The error is

opt: /root/llvm-project/llvm/lib/Transforms/Vectorize/VPlan.cpp:734: virtual void llvm::VPRegionBlock::execute(llvm::VPTransformState*): Assertion `!State->VF.isScalable() && "VF is assumed to be non scalable."' failed.

It can be seen at https://godbolt.org/z/s4bqzdKPP
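
For reference, a complete invocation would look something like the following (assuming the IR below is saved as test.ll; the file name is only for illustration):

opt -passes=loop-vectorize -prefer-predicate-over-epilogue=predicate-else-scalar-epilogue -S test.ll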

; ModuleID = 'test.cpp'
source_filename = "test.cpp"
target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
target triple = "aarch64-unknown-linux-gnu"

%struct.ident_t = type { i32, i32, i32, i32, ptr }

@0 = private unnamed_addr constant [23 x i8] c";unknown;unknown;0;0;;\00", align 1
@1 = private unnamed_addr constant %struct.ident_t { i32 0, i32 514, i32 0, i32 22, ptr @0 }, align 8
@2 = private unnamed_addr constant %struct.ident_t { i32 0, i32 2, i32 0, i32 22, ptr @0 }, align 8

; Function Attrs: mustprogress nounwind uwtable vscale_range(1,16)
define dso_local void @_Z4testiiPdS_(i32 noundef %nx, i32 noundef %ik, ptr noundef %out, ptr noundef %rspace) local_unnamed_addr #0 {
entry:
  %nx.addr = alloca i32, align 4
  %ik.addr = alloca i32, align 4
  %out.addr = alloca ptr, align 8
  %rspace.addr = alloca ptr, align 8
  store i32 %nx, ptr %nx.addr, align 4
  store i32 %ik, ptr %ik.addr, align 4
  store ptr %out, ptr %out.addr, align 8
  store ptr %rspace, ptr %rspace.addr, align 8
  call void (ptr, i32, ptr, ...) @__kmpc_fork_call(ptr nonnull @2, i32 4, ptr nonnull @_Z4testiiPdS_.omp_outlined, ptr nonnull %nx.addr, ptr nonnull %ik.addr, ptr nonnull %out.addr, ptr nonnull %rspace.addr)
  ret void
}

; Function Attrs: alwaysinline norecurse nounwind uwtable vscale_range(1,16)
define internal void @_Z4testiiPdS_.omp_outlined(ptr noalias nocapture noundef readonly %.global_tid., ptr noalias nocapture noundef readnone %.bound_tid., ptr noalias nocapture noundef nonnull readonly align 4 dereferenceable(4) %nx, ptr noalias nocapture noundef nonnull readonly align 4 dereferenceable(4) %ik, ptr noalias nocapture noundef nonnull readonly align 8 dereferenceable(8) %out, ptr noalias nocapture noundef nonnull readonly align 8 dereferenceable(8) %rspace) #1 {
entry:
  %.omp.lb = alloca i64, align 8
  %.omp.ub = alloca i64, align 8
  %.omp.stride = alloca i64, align 8
  %.omp.is_last = alloca i32, align 4
  %0 = load i32, ptr %nx, align 4
  %1 = load i32, ptr %ik, align 4
  %cmp = icmp sgt i32 %0, 0
  %cmp8 = icmp sgt i32 %1, 0
  %or.cond = select i1 %cmp, i1 %cmp8, i1 false
  br i1 %or.cond, label %omp.precond.then, label %omp.precond.end

omp.precond.then:                                 ; preds = %entry
  %conv = zext i32 %0 to i64
  %conv6 = zext i32 %1 to i64
  %mul = mul nuw nsw i64 %conv6, %conv
  %sub7 = add nsw i64 %mul, -1
  call void @llvm.lifetime.start.p0(i64 8, ptr nonnull %.omp.lb) #3
  store i64 0, ptr %.omp.lb, align 8
  call void @llvm.lifetime.start.p0(i64 8, ptr nonnull %.omp.ub) #3
  store i64 %sub7, ptr %.omp.ub, align 8
  call void @llvm.lifetime.start.p0(i64 8, ptr nonnull %.omp.stride) #3
  store i64 1, ptr %.omp.stride, align 8
  call void @llvm.lifetime.start.p0(i64 4, ptr nonnull %.omp.is_last) #3
  store i32 0, ptr %.omp.is_last, align 4
  %2 = load i32, ptr %.global_tid., align 4
  call void @__kmpc_for_static_init_8(ptr nonnull @1, i32 %2, i32 33, ptr nonnull %.omp.is_last, ptr nonnull %.omp.lb, ptr nonnull %.omp.ub, ptr nonnull %.omp.stride, i64 1, i64 512)
  %3 = load i64, ptr %.omp.ub, align 8
  %cond60 = call i64 @llvm.smin.i64(i64 %3, i64 %sub7)
  store i64 %cond60, ptr %.omp.ub, align 8
  %4 = load i64, ptr %.omp.lb, align 8
  %cmp12.not61 = icmp sgt i64 %4, %cond60
  br i1 %cmp12.not61, label %omp.dispatch.end, label %omp.inner.for.cond.preheader.lr.ph

omp.inner.for.cond.preheader.lr.ph:               ; preds = %omp.precond.then
  br label %omp.inner.for.cond.preheader

omp.inner.for.cond.preheader:                     ; preds = %omp.inner.for.cond.preheader.lr.ph, %omp.dispatch.inc
  %5 = phi i64 [ %4, %omp.inner.for.cond.preheader.lr.ph ], [ %add42, %omp.dispatch.inc ]
  %cond62 = phi i64 [ %cond60, %omp.inner.for.cond.preheader.lr.ph ], [ %cond, %omp.dispatch.inc ]
  %smax = call i64 @llvm.smax.i64(i64 %cond62, i64 %5)
  %6 = add i64 %smax, 1
  %7 = load ptr, ptr %rspace, align 8
  %8 = load ptr, ptr %out, align 8
  br label %omp.inner.for.body

omp.inner.for.body:                               ; preds = %omp.inner.for.cond.preheader, %omp.inner.for.body
  %.omp.iv.059 = phi i64 [ %5, %omp.inner.for.cond.preheader ], [ %add41, %omp.inner.for.body ]
  %div18 = sdiv i64 %.omp.iv.059, %conv6
  %conv20 = trunc i64 %div18 to i32
  %mul30 = mul nsw i64 %div18, %conv6
  %sub31 = sub nsw i64 %.omp.iv.059, %mul30
  %conv34 = trunc i64 %sub31 to i32
  %mul35 = mul nsw i32 %1, %conv20
  %add36 = add nsw i32 %mul35, %conv34
  %idxprom = sext i32 %add36 to i64
  %arrayidx = getelementptr inbounds double, ptr %7, i64 %idxprom
  %9 = load double, ptr %arrayidx, align 8
  %arrayidx40 = getelementptr inbounds double, ptr %8, i64 %idxprom
  store double %9, ptr %arrayidx40, align 8
  %add41 = add i64 %.omp.iv.059, 1
  %exitcond = icmp ne i64 %add41, %6
  br i1 %exitcond, label %omp.inner.for.body, label %omp.dispatch.inc

omp.dispatch.inc:                                 ; preds = %omp.inner.for.body
  %10 = load i64, ptr %.omp.stride, align 8
  %add42 = add nsw i64 %10, %5
  store i64 %add42, ptr %.omp.lb, align 8
  %add43 = add nsw i64 %10, %cond62
  %cond = call i64 @llvm.smin.i64(i64 %add43, i64 %sub7)
  store i64 %cond, ptr %.omp.ub, align 8
  %cmp12.not = icmp sgt i64 %add42, %cond
  br i1 %cmp12.not, label %omp.dispatch.cond.omp.dispatch.end_crit_edge, label %omp.inner.for.cond.preheader

omp.dispatch.cond.omp.dispatch.end_crit_edge:     ; preds = %omp.dispatch.inc
  br label %omp.dispatch.end

omp.dispatch.end:                                 ; preds = %omp.dispatch.cond.omp.dispatch.end_crit_edge, %omp.precond.then
  call void @__kmpc_for_static_fini(ptr nonnull @1, i32 %2)
  call void @llvm.lifetime.end.p0(i64 4, ptr nonnull %.omp.is_last) #3
  call void @llvm.lifetime.end.p0(i64 8, ptr nonnull %.omp.stride) #3
  call void @llvm.lifetime.end.p0(i64 8, ptr nonnull %.omp.ub) #3
  call void @llvm.lifetime.end.p0(i64 8, ptr nonnull %.omp.lb) #3
  br label %omp.precond.end

omp.precond.end:                                  ; preds = %omp.dispatch.end, %entry
  ret void
}

; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(argmem: readwrite)
declare void @llvm.lifetime.start.p0(i64 immarg, ptr nocapture) #2

; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(argmem: readwrite)
declare void @llvm.lifetime.end.p0(i64 immarg, ptr nocapture) #2

; Function Attrs: nounwind
declare void @__kmpc_for_static_init_8(ptr, i32, i32, ptr, ptr, ptr, ptr, i64, i64) local_unnamed_addr #3

; Function Attrs: nounwind
declare void @__kmpc_for_static_fini(ptr, i32) local_unnamed_addr #3

; Function Attrs: nounwind
declare void @__kmpc_fork_call(ptr, i32, ptr, ...) local_unnamed_addr #3

; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none)
declare i64 @llvm.smin.i64(i64, i64) #4

; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none)
declare i64 @llvm.smax.i64(i64, i64) #4

attributes #0 = { mustprogress nounwind uwtable vscale_range(1,16) "frame-pointer"="non-leaf" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="generic" "target-features"="+fp-armv8,+fullfp16,+neon,+sve,+v8a,-fmv" }
attributes #1 = { alwaysinline norecurse nounwind uwtable vscale_range(1,16) "frame-pointer"="non-leaf" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="generic" "target-features"="+fp-armv8,+fullfp16,+neon,+sve,+v8a,-fmv" }
attributes #2 = { mustprogress nocallback nofree nosync nounwind willreturn memory(argmem: readwrite) }
attributes #3 = { nounwind }
attributes #4 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }
eastB233 commented 1 month ago

A simpler test case is:

extern const int npy;
extern const int nx;
double* rspace;

void recip2real(double* out, const double factor)
{
#ifdef _OPENMP
#pragma omp parallel for collapse(2)
#endif
        for (int ix = 0; ix < nx; ++ix)
        {
            for (int ipy = 0; ipy < npy; ++ipy) {
                out[ix * npy + ipy] += factor * rspace[ix * npy + ipy];
            }
        }
}

The command is:

clang -O3 -march=armv9-a -fopenmp -mllvm -prefer-predicate-over-epilogue=predicate-else-scalar-epilogue -S

It can be seen at https://godbolt.org/z/6MGsecEr5

eastB233 commented 1 month ago

I don't know much about VPlan, but here is what I understand from tracing the code.

First, it fails on the following VPlan:

VPlan 'Initial VPlan for VF={vscale x 1,vscale x 2},UF>=1' {
vp<%2> = original trip-count

ph:
  EMIT vp<%2> = EXPAND SCEV (1 + (-1 * %5) + ((-1 + ((zext i32 %0 to i64) * (sext i32 %1 to i64)))<nsw> smin %4))
No successors

vector.ph:
  EMIT vp<%3> = TC > VF ? TC - VF : 0 vp<%2>
  EMIT vp<%4> = VF * Part + ir<0>
  EMIT vp<%5> = active lane mask vp<%4>, vp<%2>
Successor(s): vector loop

<x1> vector loop: {
  vector.body:
    EMIT vp<%6> = CANONICAL-INDUCTION
    ACTIVE-LANE-MASK-PHI vp<%7> = phi vp<%5>, vp<%27>
    vp<%8>    = DERIVED-IV ir<%5> + vp<%6> * ir<1>
    vp<%9>    = SCALAR-STEPS vp<%8>, ir<1>
  Successor(s): pred.sdiv

  <xVFxUF> pred.sdiv: {
    pred.sdiv.entry:
      BRANCH-ON-MASK vp<%7>
    Successor(s): pred.sdiv.if, pred.sdiv.continue

    pred.sdiv.if:
      CLONE ir<%div24> = sdiv vp<%9>, ir<%conv6>
    Successor(s): pred.sdiv.continue

    pred.sdiv.continue:
      PHI-PREDICATED-INSTRUCTION vp<%11> = ir<%div24>
    No successors
  }
  Successor(s): omp.inner.for.body.0

  omp.inner.for.body.0:
    CLONE ir<%conv26> = trunc vp<%11>
    CLONE ir<%mul36> = mul nsw vp<%11>, ir<%conv6>
    CLONE ir<%sub37> = sub nsw vp<%9>, ir<%mul36>
    CLONE ir<%conv40> = trunc ir<%sub37>
    CLONE ir<%mul41> = mul nsw ir<%1>, ir<%conv26>
    CLONE ir<%add42> = add nsw ir<%mul41>, ir<%conv40>
    CLONE ir<%idxprom> = sext ir<%add42>
    CLONE ir<%arrayidx> = getelementptr inbounds ir<%6>, ir<%idxprom>
    WIDEN ir<%8> = load ir<%arrayidx>, vp<%7>
    WIDEN ir<%mul43> = fmul contract ir<%2>, ir<%8>
    CLONE ir<%arrayidx47> = getelementptr inbounds ir<%7>, ir<%idxprom>
    WIDEN ir<%9> = load ir<%arrayidx47>, vp<%7>
    WIDEN ir<%add48> = fadd contract ir<%mul43>, ir<%9>
    WIDEN store ir<%arrayidx47>, ir<%add48>, vp<%7>
    EMIT vp<%25> = VF * UF + vp<%6>
    EMIT vp<%26> = VF * Part + vp<%6>
    EMIT vp<%27> = active lane mask vp<%26>, vp<%3>
    EMIT vp<%28> = not vp<%27>
    EMIT branch-on-cond vp<%28>
  No successors
}
Successor(s): middle.block

The VPRegion pred.sdiv fails at the assertion in VPRegionBlock::execute:

void VPRegionBlock::execute(VPTransformState *State) {
...
  if (!isReplicator()) {
    // Non-replicate regions execute once per vector iteration.
...
    return;
  }
...
  // Replicate regions are unrolled per part and per lane, so the
  // number of lanes (VF) must be a known fixed constant.
  for (...) {
    assert(!State->VF.isScalable() && "VF is assumed to be non scalable.");
  }
}

I think the pred.sdiv region should either have isReplicator() == false, or should not have been created at all.

Second, I find that the pred.sdiv region is split out in addReplicateRegions, from a recipe collected into WorkList:

static void addReplicateRegions(VPlan &Plan) {
  SmallVector<VPReplicateRecipe *> WorkList;
  for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
           vp_depth_first_deep(Plan.getEntry()))) {
    for (VPRecipeBase &R : *VPBB)
      if (auto *RepR = dyn_cast<VPReplicateRecipe>(&R)) {
        if (RepR->isPredicated())
          WorkList.push_back(RepR);
      }
  }
...
}
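
Each recipe in WorkList is then wrapped in an if-then replicate region: an entry block with BRANCH-ON-MASK, a .if block containing the cloned instruction, and a .continue block holding a predicated PHI. That is exactly the pred.sdiv region shown in the VPlan dump above.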

The recipe in question is CLONE ir<%div24> = sdiv ir<%.omp.iv.065>, ir<%conv6>, vp<%7>, corresponding to the instruction %div24 = sdiv i64 %.omp.iv.065, %conv6. I think this recipe should have isPredicated() == false here, so that it is not split into a replicate region.

Third, I find that this recipe is created here:

VPRecipeOrVPValueTy VPRecipeBuilder::handleReplication(...) {
  bool IsUniform = LoopVectorizationPlanner::getDecisionAndClampRange(
      [&](ElementCount VF) { return CM.isUniformAfterVectorization(I, VF); },
      Range);

  bool IsPredicated = CM.isPredicatedInst(I);
...
  VPValue *BlockInMask = nullptr;
  if (!IsPredicated) {
    // Finalize the recipe for Instr, first if it is not predicated.
    LLVM_DEBUG(dbgs() << "LV: Scalarizing:" << *I << "\n");
  } else {
    LLVM_DEBUG(dbgs() << "LV: Scalarizing and predicating:" << *I << "\n");
    // Instructions marked for predication are replicated and a mask operand is
    // added initially. Masked replicate recipes will later be placed under an
    // if-then construct to prevent side-effects. Generate recipes to compute
    // the block mask for this region.
    BlockInMask = createBlockInMask(I->getParent(), Plan);
  }

  auto *Recipe = new VPReplicateRecipe(I, Plan.mapToVPValues(I->operands()),
                                       IsUniform, BlockInMask);
  return toVPRecipeResult(Recipe);
}

I notice that the instruction I (%div24 = sdiv i64 %.omp.iv.065, %conv6) does not need to be vectorized: isScalarAfterVectorization returns true for it, and it is only used to compute an index. It seems reasonable that an instruction which stays scalar after vectorization does not need to be predicated.

So I tried the following modification:

diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 907b8ce002e8..76a5704a61c5 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -9819,7 +9819,9 @@ VPRecipeOrVPValueTy VPRecipeBuilder::handleReplication(Instruction *I,
       [&](ElementCount VF) { return CM.isUniformAfterVectorization(I, VF); },
       Range);

-  bool IsPredicated = CM.isPredicatedInst(I);
+  bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange(
+      [&](ElementCount VF) { return CM.isPredicatedInst(I) && !CM.isScalarAfterVectorization(I, VF); },
+      Range);

   // Even if the instruction is not marked as uniform, there are certain
   // intrinsic calls that can be effectively treated as such, so we check for

This is just my guess; I'm not sure whether it is the correct direction.
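
For context, my mental model of getDecisionAndClampRange is roughly the following paraphrase (not the exact upstream code): it returns the decision made at Range.Start and clamps Range.End to the first VF where the decision would flip, so that a single recipe serves the whole clamped range. That is why the modification above folds the VF-dependent isScalarAfterVectorization check into the callback.

static bool getDecisionAndClampRange(
    const std::function<bool(ElementCount)> &Predicate, VFRange &Range) {
  assert(!Range.isEmpty() && "Trying to test an empty VF range.");
  // Decide at the start of the range.
  bool PredicateAtRangeStart = Predicate(Range.Start);
  // Shrink the range so the same decision holds for every VF left in it.
  for (ElementCount TmpVF = Range.Start * 2;
       ElementCount::isKnownLT(TmpVF, Range.End); TmpVF *= 2)
    if (Predicate(TmpVF) != PredicateAtRangeStart) {
      Range.End = TmpVF;
      break;
    }
  return PredicateAtRangeStart;
}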

eastB233 commented 1 month ago

Ping @fhahn

eastB233 commented 1 month ago

I think I misunderstood something, and the change above may be wrong.

Here is another idea. In the instruction %div24 = sdiv i64 %.omp.iv.065, %conv6, the divisor %conv6 is loop-invariant, so it seems we do not need predication.

I tried the following patch:

diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index c7c19ef456c7..f294703e1529 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -3856,21 +3856,21 @@ bool LoopVectorizationCostModel::isPredicatedInst(Instruction *I) const {
         !Legal->blockNeedsPredication(I->getParent()))
       return false;
     return true;
   }
   case Instruction::UDiv:
   case Instruction::SDiv:
   case Instruction::SRem:
   case Instruction::URem:
     // TODO: We can use the loop-preheader as context point here and get
     // context sensitive reasoning
-    return !isSafeToSpeculativelyExecute(I);
+    return !isSafeToSpeculativelyExecute(I) && !Legal->isInvariant(I->getOperand(1));
   case Instruction::Call:
     return Legal->isMaskRequired(I);
   }
 }

 std::pair<InstructionCost, InstructionCost>
 LoopVectorizationCostModel::getDivRemSpeculationCost(Instruction *I,
                                                     ElementCount VF) const {
   assert(I->getOpcode() == Instruction::UDiv ||
          I->getOpcode() == Instruction::SDiv ||
eastB233 commented 2 weeks ago

ping @sdesmalen-arm @davemgreen @paulwalker-arm

fhahn commented 2 weeks ago

@eastB233 unfortunately I don't think this change is correct; e.g. consider https://github.com/llvm/llvm-project/blob/967eba07549d64f15e7a91e798aa46214704f62b/llvm/test/Transforms/LoopVectorize/X86/divs-with-tail-folding.ll#L251, where the sdiv/udiv is executed conditionally in the loop.
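
The problematic shape is something like the following (a hypothetical illustration, not the referenced regression test): the divisor can be loop-invariant yet still zero, and the division must only execute on iterations where the guard holds, so dropping predication can introduce a trap the original program never performs.

// Hypothetical illustration: d is loop-invariant but may be zero.
// The sdiv only runs when cond[i] is true, so executing it
// unconditionally (i.e. without predication) can divide by zero.
void guarded_div(int *out, const int *a, const int *cond, int d, int n) {
  for (int i = 0; i < n; ++i) {
    if (cond[i])
      out[i] = a[i] / d;   // must stay guarded: traps if d == 0
    else
      out[i] = a[i];
  }
}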