The test is based on the following:
// -----// IR Dump Before AMDAIEDistributeCoresAndObjectFifos (iree-amdaie-distribute-cores-and-objectfifos) //----- //
module {
func.func @f_conv_dispatch_0_depthwise_conv_2d_nhwc_hwc_1x12x12x64x3x3_i32() attributes {translation_info = #iree_codegen.translation_info<Custom>} {
%c0 = arith.constant 0 : index
%c0_i32 = arith.constant 0 : i32
%c3 = arith.constant 3 : index
%c1 = arith.constant 1 : index
%alloc = memref.alloc() : memref<4x4xi32, 2 : i32>
%alloc_0 = memref.alloc() : memref<1x1x4x1x4xi32, 2 : i32>
%alloc_1 = memref.alloc() : memref<3x3x1x4xi32, 2 : i32>
%alloc_2 = memref.alloc() : memref<1x3x6x1x4xi32, 2 : i32>
%alloc_3 = memref.alloc() : memref<1x1x4x4xi32, 1 : i32>
%0 = amdaie.logicalobjectfifo.from_memref %alloc_3, {} : memref<1x1x4x4xi32, 1 : i32> -> !amdaie.logicalobjectfifo<memref<1x1x4x4xi32, 1 : i32>>
%alloc_4 = memref.alloc() : memref<3x3x4xi32, 1 : i32>
%1 = amdaie.logicalobjectfifo.from_memref %alloc_4, {} : memref<3x3x4xi32, 1 : i32> -> !amdaie.logicalobjectfifo<memref<3x3x4xi32, 1 : i32>>
%alloc_5 = memref.alloc() : memref<1x3x6x4xi32, 1 : i32>
%2 = amdaie.logicalobjectfifo.from_memref %alloc_5, {} : memref<1x3x6x4xi32, 1 : i32> -> !amdaie.logicalobjectfifo<memref<1x3x6x4xi32, 1 : i32>>
%3 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<1x14x14x64xi32>
%4 = amdaie.logicalobjectfifo.from_memref %3, {} : memref<1x14x14x64xi32> -> !amdaie.logicalobjectfifo<memref<1x14x14x64xi32>>
memref.assume_alignment %3, 64 : memref<1x14x14x64xi32>
%5 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<3x3x64xi32>
%6 = amdaie.logicalobjectfifo.from_memref %5, {} : memref<3x3x64xi32> -> !amdaie.logicalobjectfifo<memref<3x3x64xi32>>
memref.assume_alignment %5, 64 : memref<3x3x64xi32>
%7 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags(Indirect) : memref<1x12x12x64xi32>
%8 = amdaie.logicalobjectfifo.from_memref %7, {} : memref<1x12x12x64xi32> -> !amdaie.logicalobjectfifo<memref<1x12x12x64xi32>>
memref.assume_alignment %7, 64 : memref<1x12x12x64xi32>
scf.forall (%arg0, %arg1, %arg2, %arg3) in (1, 12, 3, 16) {
%9 = affine.apply affine_map<(d0) -> (d0 * 4)>(%arg3)
%10 = affine.apply affine_map<(d0) -> (d0 * 4)>(%arg2)
%11 = affine.apply affine_map<(d0) -> (d0)>(%arg1)
%12 = affine.apply affine_map<(d0) -> (d0)>(%arg0)
%13 = amdaie.dma_cpy_nd(%2[0, 0, 0, 0] [1, 3, 6, 4] [72, 24, 4, 1], %4[%12, %11, %10, %9] [1, 3, 6, 4] [12544, 896, 64, 1]) : (!amdaie.logicalobjectfifo<memref<1x3x6x4xi32, 1 : i32>>, !amdaie.logicalobjectfifo<memref<1x14x14x64xi32>>)
%14 = amdaie.dma_cpy_nd(%1[0, 0, 0] [3, 3, 4] [12, 4, 1], %6[0, 0, %9] [3, 3, 4] [192, 64, 1]) : (!amdaie.logicalobjectfifo<memref<3x3x4xi32, 1 : i32>>, !amdaie.logicalobjectfifo<memref<3x3x64xi32>>)
%15 = amdaie.logicalobjectfifo.from_memref %alloc_0, {} : memref<1x1x4x1x4xi32, 2 : i32> -> !amdaie.logicalobjectfifo<memref<1x1x4x1x4xi32, 2 : i32>>
%16 = amdaie.logicalobjectfifo.from_memref %alloc_1, {} : memref<3x3x1x4xi32, 2 : i32> -> !amdaie.logicalobjectfifo<memref<3x3x1x4xi32, 2 : i32>>
%17 = amdaie.logicalobjectfifo.from_memref %alloc_2, {} : memref<1x3x6x1x4xi32, 2 : i32> -> !amdaie.logicalobjectfifo<memref<1x3x6x1x4xi32, 2 : i32>>
scf.forall (%arg4, %arg5, %arg6, %arg7) in (1, 1, 1, 1) {
%19 = affine.apply affine_map<(d0) -> (d0 * 4)>(%arg7)
%20 = affine.apply affine_map<(d0) -> (d0 * 4)>(%arg6)
%21 = affine.apply affine_map<(d0) -> (d0)>(%arg5)
%22 = affine.apply affine_map<(d0) -> (d0)>(%arg4)
%23 = amdaie.dma_cpy_nd(%17[0, 0, 0, 0, 0] [1, 3, 6, 1, 4] [72, 24, 4, 4, 1], %2[%22, %21, %20, 0, %19] [1, 3, 6, 1, 4] [72, 24, 4, 4, 1]) : (!amdaie.logicalobjectfifo<memref<1x3x6x1x4xi32, 2 : i32>>, !amdaie.logicalobjectfifo<memref<1x3x6x4xi32, 1 : i32>>)
%24 = amdaie.dma_cpy_nd(%16[0, 0, 0, 0] [3, 3, 1, 4] [12, 4, 4, 1], %1[0, 0, 0, %19] [3, 3, 1, 4] [12, 4, 4, 1]) : (!amdaie.logicalobjectfifo<memref<3x3x1x4xi32, 2 : i32>>, !amdaie.logicalobjectfifo<memref<3x3x4xi32, 1 : i32>>)
%25 = amdaie.dma_cpy_nd(%0[%22, %21, %20, %19] [1, 1, 4, 4] [16, 16, 4, 1], %15[0, 0, 0, 0, 0] [1, 1, 4, 1, 4] [16, 16, 4, 4, 1]) : (!amdaie.logicalobjectfifo<memref<1x1x4x4xi32, 1 : i32>>, !amdaie.logicalobjectfifo<memref<1x1x4x1x4xi32, 2 : i32>>)
%c2 = arith.constant 2 : index
%26 = arith.addi %arg4, %c2 : index
%tile = amdaie.tile(%arg5, %26)
%27 = amdaie.core(%tile, in : [%23, %24], out : [%25]) {
linalg.fill ins(%c0_i32 : i32) outs(%alloc_0 : memref<1x1x4x1x4xi32, 2 : i32>)
scf.for %arg8 = %c0 to %c3 step %c1 {
scf.for %arg9 = %c0 to %c3 step %c1 {
%subview = memref.subview %alloc_1[%arg8, %arg9, 0, 0] [1, 1, 1, 4] [1, 1, 1, 1] : memref<3x3x1x4xi32, 2 : i32> to memref<4xi32, strided<[1], offset: ?>, 2 : i32>
%subview_6 = memref.subview %alloc_0[0, 0, 0, 0, 0] [1, 1, 4, 1, 4] [1, 1, 1, 1, 1] : memref<1x1x4x1x4xi32, 2 : i32> to memref<4x4xi32, strided<[4, 1]>, 2 : i32>
%subview_7 = memref.subview %alloc_2[0, %arg8, %arg9, 0, 0] [1, 1, 4, 1, 4] [1, 1, 1, 1, 1] : memref<1x3x6x1x4xi32, 2 : i32> to memref<4x4xi32, strided<[4, 1], offset: ?>, 2 : i32>
linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%subview_7, %subview, %subview_6 : memref<4x4xi32, strided<[4, 1], offset: ?>, 2 : i32>, memref<4xi32, strided<[1], offset: ?>, 2 : i32>, memref<4x4xi32, strided<[4, 1]>, 2 : i32>) outs(%alloc : memref<4x4xi32, 2 : i32>) {
^bb0(%in: i32, %in_8: i32, %in_9: i32, %out: i32):
%28 = arith.muli %in, %in_8 : i32
%29 = arith.addi %in_9, %28 : i32
linalg.yield %29 : i32
}
linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%alloc : memref<4x4xi32, 2 : i32>) outs(%subview_6 : memref<4x4xi32, strided<[4, 1]>, 2 : i32>) {
^bb0(%in: i32, %out: i32):
linalg.yield %in : i32
}
}
}
amdaie.end
}
} {mapping = [#gpu.thread<y>, #gpu.thread<x>, #gpu.thread<z>, #gpu.thread<linear_dim_0>]}
%18 = amdaie.dma_cpy_nd(%8[%12, %11, %10, %9] [1, 1, 4, 4] [9216, 768, 64, 1], %0[0, 0, 0, 0] [1, 1, 4, 4] [16, 16, 4, 1]) : (!amdaie.logicalobjectfifo<memref<1x12x12x64xi32>>, !amdaie.logicalobjectfifo<memref<1x1x4x4xi32, 1 : i32>>)
} {mapping = [#gpu.block<linear_dim_0>, #gpu.block<y>, #gpu.block<x>, #gpu.block<z>]}
memref.dealloc %alloc_5 : memref<1x3x6x4xi32, 1 : i32>
memref.dealloc %alloc_4 : memref<3x3x4xi32, 1 : i32>
memref.dealloc %alloc_3 : memref<1x1x4x4xi32, 1 : i32>
memref.dealloc %alloc_2 : memref<1x3x6x1x4xi32, 2 : i32>
memref.dealloc %alloc_1 : memref<3x3x1x4xi32, 2 : i32>
memref.dealloc %alloc_0 : memref<1x1x4x1x4xi32, 2 : i32>
memref.dealloc %alloc : memref<4x4xi32, 2 : i32>
return
}
}
Answering the question of where this buffer comes from: it gets created by the linalg-fold-unit-extent-dims pass.
The IR before the pass runs:
// -----// IR Dump Before LinalgFoldUnitExtentDimsPass (linalg-fold-unit-extent-dims) //----- //
...
%21 = scf.for %arg10 = %c0 to %c3 step %c1 iter_args(%arg11 = %20) -> (tensor<1x1x4x1x4xi32>) {
%22 = scf.for %arg12 = %c0 to %c3 step %c1 iter_args(%arg13 = %arg11) -> (tensor<1x1x4x1x4xi32>) {
%extracted_slice_11 = tensor.extract_slice %pack[0, %arg10, %arg12, 0, 0] [1, 1, 4, 1, 4] [1, 1, 1, 1, 1] : tensor<1x3x6x1x4xi32> to tensor<1x1x4x1x4xi32>
%extracted_slice_12 = tensor.extract_slice %pack_9[%arg10, %arg12, 0, 0] [1, 1, 1, 4] [1, 1, 1, 1] : tensor<3x3x1x4xi32> to tensor<1x1x1x4xi32>
%23 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d1 + d4, d2 + d5, d3, d6)>, affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d4, d5, d3, d6)>, affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d1, d2, d3, d6)>], iterator_types = ["parallel", "parallel", "parallel", "parallel", "reduction", "reduction", "parallel"]} ins(%extracted_slice_11, %extracted_slice_12 : tensor<1x1x4x1x4xi32>, tensor<1x1x1x4xi32>) outs(%arg13 : tensor<1x1x4x1x4xi32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 1, 4, 4, 0, 0], [1, 1, 4, 4, 0, 0], [0, 0, 0, 0, 1, 1, 0]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [0, 0, 0, 4, 0, 0], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[], [], []], outerPerm = [[0, 1, 2, 3], [0, 1, 2], [0, 1, 2, 3]]}]>} {
^bb0(%in: i32, %in_13: i32, %out: i32):
%24 = arith.muli %in, %in_13 : i32
%25 = arith.addi %out, %24 : i32
linalg.yield %25 : i32
} -> tensor<1x1x4x1x4xi32>
scf.yield %23 : tensor<1x1x4x1x4xi32>
}
scf.yield %22 : tensor<1x1x4x1x4xi32>
}
%unpack = tensor.unpack %21 outer_dims_perm = [0, 1, 2, 3] inner_dims_pos = [3] inner_tiles = [4] into %extracted_slice_6 : tensor<1x1x4x1x4xi32> -> tensor<1x1x4x4xi32>
...
The IR just after the pass:
...
// -----// IR Dump Before AMDAIECleanup (iree-amdaie-cleanup) //----- //
%21 = scf.for %arg10 = %c0 to %c3 step %c1 iter_args(%arg11 = %20) -> (tensor<1x1x4x1x4xi32>) {
%22 = scf.for %arg12 = %c0 to %c3 step %c1 iter_args(%arg13 = %arg11) -> (tensor<1x1x4x1x4xi32>) {
%extracted_slice_11 = tensor.extract_slice %pack[0, %arg10, %arg12, 0, 0] [1, 1, 4, 1, 4] [1, 1, 1, 1, 1] : tensor<1x3x6x1x4xi32> to tensor<1x1x4x1x4xi32>
%extracted_slice_12 = tensor.extract_slice %pack_9[%arg10, %arg12, 0, 0] [1, 1, 1, 4] [1, 1, 1, 1] : tensor<3x3x1x4xi32> to tensor<1x1x1x4xi32>
%extracted_slice_13 = tensor.extract_slice %extracted_slice_11[0, 0, 0, 0, 0] [1, 1, 4, 1, 4] [1, 1, 1, 1, 1] : tensor<1x1x4x1x4xi32> to tensor<1x4x4xi32>
%extracted_slice_14 = tensor.extract_slice %extracted_slice_12[0, 0, 0, 0] [1, 1, 1, 4] [1, 1, 1, 1] : tensor<1x1x1x4xi32> to tensor<4xi32>
%extracted_slice_15 = tensor.extract_slice %arg13[0, 0, 0, 0, 0] [1, 1, 4, 1, 4] [1, 1, 1, 1, 1] : tensor<1x1x4x1x4xi32> to tensor<4x4xi32>
%23 = tensor.empty() : tensor<4x4xi32>
%extracted_slice_16 = tensor.extract_slice %extracted_slice_13[0, 0, 0] [1, 4, 4] [1, 1, 1] : tensor<1x4x4xi32> to tensor<4x4xi32>
%24 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%extracted_slice_16, %extracted_slice_14, %extracted_slice_15 : tensor<4x4xi32>, tensor<4xi32>, tensor<4x4xi32>) outs(%23 : tensor<4x4xi32>) {
^bb0(%in: i32, %in_17: i32, %in_18: i32, %out: i32):
%25 = arith.muli %in, %in_17 : i32
%26 = arith.addi %in_18, %25 : i32
linalg.yield %26 : i32
} -> tensor<4x4xi32>
%inserted_slice = tensor.insert_slice %24 into %arg13[0, 0, 0, 0, 0] [1, 1, 4, 1, 4] [1, 1, 1, 1, 1] : tensor<4x4xi32> into tensor<1x1x4x1x4xi32>
scf.yield %inserted_slice : tensor<1x1x4x1x4xi32>
}
...
After comprehensive bufferization, the tensor.empty() becomes the new buffer.
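As a rough illustration (a minimal sketch, not taken from the actual trace; the function name is hypothetical), this is how an empty tensor with no existing destination to reuse materializes as a fresh allocation during bufferization:

// Before bufferization: an uninitialized 4x4 destination tensor.
func.func @empty_tensor_sketch() -> tensor<4x4xi32> {
  %0 = tensor.empty() : tensor<4x4xi32>
  return %0 : tensor<4x4xi32>
}
// After bufferization, the empty tensor has no backing buffer to reuse, so it
// becomes a fresh allocation, roughly:
//   %alloc = memref.alloc() : memref<4x4xi32>
// which corresponds to the %alloc = memref.alloc() : memref<4x4xi32, 2 : i32>
// seen in the dump at the top of this comment.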
@yzhang93 I agree that we will need to optimize this, the extra buffer isn't necessary. But that's sort of orthogonal to this PR -- this PR fixes a bug which needs fixing anyway.
Well, my viewpoint is that if the IR is changed (so that the alloc is not used in the above way), there is no need to change the code inside the DistributeCoresAndObjectFifos pass. Also, even if you fix it in this pass, you may run into other errors in later passes.
Coming to the generated IR itself, I think the tensor.empty() should be possible to eliminate, but it looks like it isn't? Can you share the full IR dump?
Full trace: depthwise.txt
Well, my viewpoint is that if the IR is changed (so that the alloc is not used in the above way), there is no need to change the code inside the DistributeCoresAndObjectFifos pass.
Ok, but this PR fixes a bug, so whether or not we have this IR for depthwise convolution, this PR should land.
Thanks @yzhang93
I'd definitely like to get back to optimizing the IR so that there isn't an unnecessary allocation. But this unblocks my convolution work, and since the extra allocation only appears on the depthwise convolution path (i.e. the less important path), I'm fine with an extra alloc for now.
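A hedged sketch of the kind of IR that optimization might produce (reusing the subview types from the dump above; the function name and argument names are hypothetical): the multiply-accumulate writes directly into the subview of the output allocation, so the 4x4 scratch buffer and the copy-back linalg.generic both disappear.

func.func @no_extra_alloc_sketch(%img: memref<4x4xi32, strided<[4, 1], offset: ?>, 2 : i32>, %ker: memref<4xi32, strided<[1], offset: ?>, 2 : i32>, %acc: memref<4x4xi32, strided<[4, 1]>, 2 : i32>) {
  // Accumulate in place into %acc instead of a separate memref<4x4xi32> buffer.
  linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%img, %ker : memref<4x4xi32, strided<[4, 1], offset: ?>, 2 : i32>, memref<4xi32, strided<[1], offset: ?>, 2 : i32>) outs(%acc : memref<4x4xi32, strided<[4, 1]>, 2 : i32>) {
  ^bb0(%in: i32, %in_1: i32, %out: i32):
    %0 = arith.muli %in, %in_1 : i32
    %1 = arith.addi %out, %0 : i32
    linalg.yield %1 : i32
  }
  return
}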
Fix for a regression in depthwise convolution with packing, hit when we use the upstream pass to remove unit extent dimensions with useRankReducingSlices = true. Enabling the unit extent dimension removal results in an op appearing in the IR which is new in the sense that it is the first time we see an op with one operand coming directly from a memref.alloc and another coming from a memref.subview operation. This PR adds a fix for this case (by simplifying the logic...). Before this PR, the assumption was that either all or none of the operands came from a memref.alloc (I think).
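As I understand it, useRankReducingSlices = true makes the upstream pass drop unit dimensions with rank-reducing tensor.extract_slice / tensor.insert_slice ops instead of reassociative reshapes. A minimal sketch of that behaviour (the option name is taken from the upstream pass, the function name is hypothetical, and the types come from the dumps above):

func.func @rank_reducing_sketch(%t: tensor<3x3x1x4xi32>) -> tensor<4xi32> {
  // The three unit dimensions are dropped by the slice itself, as with
  // %extracted_slice_14 in the post-pass IR above.
  %0 = tensor.extract_slice %t[0, 0, 0, 0] [1, 1, 1, 4] [1, 1, 1, 1] : tensor<3x3x1x4xi32> to tensor<4xi32>
  return %0 : tensor<4xi32>
}

To make the new case concrete, here is a minimal sketch of the operand pattern this PR handles (reduced from the dump at the top of this thread rather than the exact lit test; the function name is hypothetical): one operand of the linalg op comes straight from a memref.alloc, while another comes from a memref.subview of a different alloc.

func.func @mixed_operands_sketch() {
  %alloc = memref.alloc() : memref<4x4xi32, 2 : i32>
  %alloc_0 = memref.alloc() : memref<1x1x4x1x4xi32, 2 : i32>
  // Rank-reducing subview of the second allocation.
  %subview = memref.subview %alloc_0[0, 0, 0, 0, 0] [1, 1, 4, 1, 4] [1, 1, 1, 1, 1] : memref<1x1x4x1x4xi32, 2 : i32> to memref<4x4xi32, strided<[4, 1]>, 2 : i32>
  // One operand comes directly from a memref.alloc, the other from a
  // memref.subview; before this PR the pass assumed all or none were allocs.
  linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%alloc : memref<4x4xi32, 2 : i32>) outs(%subview : memref<4x4xi32, strided<[4, 1]>, 2 : i32>) {
  ^bb0(%in: i32, %out: i32):
    linalg.yield %in : i32
  }
  return
}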