arith.floordivsi won't lower to llvm with vector.load'ed vectors

alankarmisra commented 3 months ago

In the following example, I apply arith.addi and arith.floordivsi both to constant vectors and to vectors loaded from a memref. The arith.addi lowers to LLVM in both cases, but arith.floordivsi fails to lower to LLVM when one of its operands is a vector loaded from a memref.

module @bug {
  func.func @main() {
    %number = arith.constant dense<10> : vector<1xi32>
    %resolved_floordivsi = arith.floordivsi %number, %number : vector<1xi32>
    %alloca = memref.alloca() : memref<1xi32>
    %idx = arith.constant 0 : index
    vector.store %number, %alloca[%idx] : memref<1xi32>, vector<1xi32>
    %loadedNum = vector.load %alloca[%idx] : memref<1xi32>, vector<1xi32>
    %unresolved_floordivsi = arith.floordivsi %loadedNum, %number: vector<1xi32>    
    vector.print %resolved_floordivsi : vector<1xi32>
    vector.print %unresolved_floordivsi : vector<1xi32>
    %resolved_add = arith.addi %number, %number : vector<1xi32>
    %resolved_add_too = arith.addi %loadedNum, %number : vector<1xi32>
    vector.print %resolved_add : vector<1xi32>
    vector.print %resolved_add_too : vector<1xi32>
    return
  }
}

Running

mlir-opt input.mlir \
 -lower-affine \
 -convert-vector-to-llvm \
 -convert-arith-to-llvm \
 -convert-func-to-llvm \
 -convert-index-to-llvm \
 --convert-to-llvm \
 -o out.mlir

gives me

 module @bug {
  llvm.func @main() {
    %0 = llvm.mlir.constant(dense<20> : vector<1xi32>) : vector<1xi32>
    %1 = llvm.mlir.constant(0 : index) : i64
    %2 = builtin.unrealized_conversion_cast %1 : i64 to index
    %3 = llvm.mlir.constant(dense<10> : vector<1xi32>) : vector<1xi32>
    %4 = llvm.mlir.constant(dense<1> : vector<1xi32>) : vector<1xi32>
    %5 = llvm.mlir.constant(1 : index) : i64
    %6 = llvm.mlir.constant(1 : index) : i64
    %7 = llvm.alloca %5 x i32 : (i64) -> !llvm.ptr
    %8 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)>
    %9 = llvm.insertvalue %7, %8[0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> 
    %10 = llvm.insertvalue %7, %9[1] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> 
    %11 = llvm.mlir.constant(0 : index) : i64
    %12 = llvm.insertvalue %11, %10[2] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> 
    %13 = llvm.insertvalue %5, %12[3, 0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> 
    %14 = llvm.insertvalue %6, %13[4, 0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> 
    %15 = llvm.mlir.constant(0 : i64) : i64
    %16 = llvm.extractelement %3[%15 : i64] : vector<1xi32>
    %17 = llvm.extractvalue %14[1] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> 
    %18 = llvm.getelementptr %17[%1] : (!llvm.ptr, i64) -> !llvm.ptr, i32
    llvm.store %16, %18 : i32, !llvm.ptr
    %19 = llvm.extractvalue %14[1] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> 
    %20 = llvm.getelementptr %19[%1] : (!llvm.ptr, i64) -> !llvm.ptr, i32
    %21 = llvm.load %20 : !llvm.ptr -> i32
    %22 = llvm.mlir.undef : vector<1xi32>
    %23 = llvm.mlir.constant(0 : i32) : i32
    %24 = llvm.insertelement %21, %22[%23 : i32] : vector<1xi32>
    %25 = llvm.shufflevector %24, %22 [0] : vector<1xi32> 
    %26 = arith.floordivsi %25, %3 : vector<1xi32>
    vector.print %4 : vector<1xi32>
    vector.print %26 : vector<1xi32>
    %27 = llvm.add %25, %3  : vector<1xi32>
    vector.print %0 : vector<1xi32>
    vector.print %27 : vector<1xi32>
    llvm.return
  }
}

- %4 is the result of the first arith.floordivsi on (10, 10); it is correctly folded to 1.
- %26 is the result of the second arith.floordivsi; it is not lowered to LLVM.
- %0 is the result of the first arith.addi on (10, 10); it is correctly folded to 20.
- %27 is the result of the second arith.addi; it is lowered to llvm.add.

mlir-opt --version
Homebrew LLVM version 18.1.8
  Optimized build.

llvmbot commented 3 months ago

@llvm/issue-subscribers-mlir

Author: Alankar Misra (alankarmisra)

In the following example, I arith.addi and arith.floordivsi with constant vectors and vectors loaded from a memref. The add will succeed in lowering to llvm in both situations but floordivsi fails to lower to llvm with vectors loaded from memrefs for some reason.

```mlir
module @bug {
  func.func @main() {
    %number = arith.constant dense<10> : vector<1xi32>
    %resolved_floordivsi = arith.floordivsi %number, %number : vector<1xi32>
    %alloca = memref.alloca() : memref<1xi32>
    %idx = arith.constant 0 : index
    vector.store %number, %alloca[%idx] : memref<1xi32>, vector<1xi32>
    %loadedNum = vector.load %alloca[%idx] : memref<1xi32>, vector<1xi32>
    %unresolved_floordivsi = arith.floordivsi %loadedNum, %number: vector<1xi32>
    vector.print %resolved_floordivsi : vector<1xi32>
    vector.print %unresolved_floordivsi : vector<1xi32>
    %resolved_add = arith.addi %number, %number : vector<1xi32>
    %resolved_add_too = arith.addi %loadedNum, %number : vector<1xi32>
    vector.print %resolved_add : vector<1xi32>
    vector.print %resolved_add_too : vector<1xi32>
    return
  }
}
```

Running

```bash
mlir-opt input.mlir \
 -lower-affine \
 -convert-vector-to-llvm \
 -convert-arith-to-llvm \
 -convert-func-to-llvm \
 -convert-index-to-llvm \
 --convert-to-llvm \
 -o out.mlir
```

gives me

```mlir
module @bug {
  llvm.func @main() {
    %0 = llvm.mlir.constant(dense<20> : vector<1xi32>) : vector<1xi32>
    %1 = llvm.mlir.constant(0 : index) : i64
    %2 = builtin.unrealized_conversion_cast %1 : i64 to index
    %3 = llvm.mlir.constant(dense<10> : vector<1xi32>) : vector<1xi32>
    %4 = llvm.mlir.constant(dense<1> : vector<1xi32>) : vector<1xi32>
    %5 = llvm.mlir.constant(1 : index) : i64
    %6 = llvm.mlir.constant(1 : index) : i64
    %7 = llvm.alloca %5 x i32 : (i64) -> !llvm.ptr
    %8 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)>
    %9 = llvm.insertvalue %7, %8[0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)>
    %10 = llvm.insertvalue %7, %9[1] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)>
    %11 = llvm.mlir.constant(0 : index) : i64
    %12 = llvm.insertvalue %11, %10[2] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)>
    %13 = llvm.insertvalue %5, %12[3, 0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)>
    %14 = llvm.insertvalue %6, %13[4, 0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)>
    %15 = llvm.mlir.constant(0 : i64) : i64
    %16 = llvm.extractelement %3[%15 : i64] : vector<1xi32>
    %17 = llvm.extractvalue %14[1] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)>
    %18 = llvm.getelementptr %17[%1] : (!llvm.ptr, i64) -> !llvm.ptr, i32
    llvm.store %16, %18 : i32, !llvm.ptr
    %19 = llvm.extractvalue %14[1] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)>
    %20 = llvm.getelementptr %19[%1] : (!llvm.ptr, i64) -> !llvm.ptr, i32
    %21 = llvm.load %20 : !llvm.ptr -> i32
    %22 = llvm.mlir.undef : vector<1xi32>
    %23 = llvm.mlir.constant(0 : i32) : i32
    %24 = llvm.insertelement %21, %22[%23 : i32] : vector<1xi32>
    %25 = llvm.shufflevector %24, %22 [0] : vector<1xi32>
    %26 = arith.floordivsi %25, %3 : vector<1xi32>
    vector.print %4 : vector<1xi32>
    vector.print %26 : vector<1xi32>
    %27 = llvm.add %25, %3 : vector<1xi32>
    vector.print %0 : vector<1xi32>
    vector.print %27 : vector<1xi32>
    llvm.return
  }
}
```

%4 is the result for the first arith.floordivsi for (10, 10) and correctly folds to 1
%26 is the result for the second arith.floordivsi and does not lower to llvm
%0 is the result for the first arith.add for (10, 10) and correctly folds to 20
%27 is the result for the second arith.add and lowers to llvm.add

```bash
mlir-opt --version
Homebrew LLVM version 18.1.8
  Optimized build.
```

chadlonso commented 1 month ago

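There is no direct lowering from arith.floordivsi to the LLVM dialect. Instead, expand it into simpler arith ops first using the FloorDivSIOpConverter pattern in mlir/lib/Dialect/Arith/Transforms/ExpandOps.cpp, for example by running the -arith-expand pass before -convert-arith-to-llvm. The constant-operand case only appears to work because that op is constant-folded away before it ever needs to be converted.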

llvm / llvm-project

arith.floordivsi won't lower to llvm with vector.load'ed vectors #100146