Closed dcaballe closed 9 months ago
I think we already have it for x86_64 and arm.
What are the compilation flags you're using? What is the target CPU?
@NatashaKnk, could you please help with this? If you use ToT, including your PR, and compile the i8 version of our internal model for pixel 8, you will see that DT is not triggering for some matmuls that should be supported, including: dispatch 45 (i8, i8, i32), dispatch 40 (f32), dispatch 38 (f32), etc., etc. There are a good bunch of them.
Please, note that the (f32, i8, f32) case is a different issue, tracked here: https://github.com/openxla/iree/issues/15760
What is the status of this?
I checked the example here with a simple function:
module {
func.func @matmul_test(%arg0: tensor<2048x256xi8>, %arg1: tensor<2048x128xi32>) -> tensor<2048x128xi32> {
%cst = arith.constant dense<3> : tensor<256x128xi8>
%0 = linalg.matmul ins(%arg0, %cst : tensor<2048x256xi8>, tensor<256x128xi8>) outs(%arg1 : tensor<2048x128xi32>) -> tensor<2048x128xi32>
return %0 : tensor<2048x128xi32>
}
}
Using the latest commit (65680c69ff1433129be76bf37b028521d30cc5ac) and compiling with:
iree-compile --iree-hal-target-backends=llvm-cpu --iree-llvmcpu-target-triple=aarch64-none-linux-android34 --iree-llvmcpu-target-cpu-features=+fp-armv8,+neon,+crc,+lse,+rdm,+ras,+rcpc,+dotprod,+v9a,+sb,+ssbs,+fullfp16,+fp16fml,+i8mm,+bf16,+flagm --mlir-disable-threading --iree-opt-data-tiling --iree-llvmcpu-enable-ukernels=all ./tmp.mlir -mlir-print-ir-after-all 2> tmp.dump
And I got
// -----// IR Dump After LLVMCPUSelectLoweringStrategy (iree-llvmcpu-select-lowering-strategy) //----- //
hal.executable.variant public @embedded_elf_arm_64 target(<"llvm-cpu", "embedded-elf-arm_64", {cpu = "generic", cpu_features = "+fp-armv8,+neon,+crc,+lse,+rdm,+ras,+rcpc,+dotprod,+v9a,+sb,+ssbs,+fullfp16,+fp16fml,+i8mm,+bf16,+flagm,+reserve-x18", data_layout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128", native_vector_size = 16 : index, target_triple = "aarch64-unknown-unknown-eabi-elf", ukernels = "all"}>) {
hal.executable.export public @matmul_test_dispatch_2_mmt4d_256x16x32x8x8x8_i8xi8xi32 ordinal(0) layout(#hal.pipeline.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer>]>]>) attributes {translation_info = #iree_codegen.translation_info<Mmt4dTilingExpert>} {
^bb0(%arg0: !hal.device):
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
hal.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @matmul_test_dispatch_2_mmt4d_256x16x32x8x8x8_i8xi8xi32() {
%cst = arith.constant dense<3> : tensor<16x32x8x8xi8>
%c0 = arith.constant 0 : index
%c524288 = arith.constant 524288 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<256x32x8x8xi8>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c524288) : !flow.dispatch.tensor<readwrite:tensor<256x16x8x8xi32>>
%2 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [256, 32, 8, 8], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<256x32x8x8xi8>> -> tensor<256x32x8x8xi8>
%3 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0, 0], sizes = [256, 16, 8, 8], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readwrite:tensor<256x16x8x8xi32>> -> tensor<256x16x8x8xi32>
%4 = linalg.mmt4d {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[32, 8, 0, 0, 0, 0], [1, 1, 0, 8, 8, 0], [0, 0, 1, 0, 0, 8]]>} ins(%2, %cst : tensor<256x32x8x8xi8>, tensor<16x32x8x8xi8>) outs(%3 : tensor<256x16x8x8xi32>) -> tensor<256x16x8x8xi32>
flow.dispatch.tensor.store %4, %1, offsets = [0, 0, 0, 0], sizes = [256, 16, 8, 8], strides = [1, 1, 1, 1] : tensor<256x16x8x8xi32> -> !flow.dispatch.tensor<readwrite:tensor<256x16x8x8xi32>>
return
}
}
}
...
// -----// IR Dump After CPULowerToUKernels (iree-codegen-cpu-lower-to-ukernels) //----- //
module {
func.func @matmul_test_dispatch_2_mmt4d_256x16x32x8x8x8_i8xi8xi32() {
%c1282_i32 = arith.constant 1282 : i32
%c8_i32 = arith.constant 8 : i32
%c8 = arith.constant 8 : index
%c32 = arith.constant 32 : index
%cst = arith.constant dense<3> : tensor<8x32x8x8xi8>
%c16 = arith.constant 16 : index
%c256 = arith.constant 256 : index
%c0 = arith.constant 0 : index
%c524288 = arith.constant 524288 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<256x32x8x8xi8>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c524288) : !flow.dispatch.tensor<readwrite:tensor<256x16x8x8xi32>>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_count_x = hal.interface.workgroup.count[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_count_y = hal.interface.workgroup.count[1] : index
%2 = affine.apply affine_map<()[s0] -> (s0 * 32)>()[%workgroup_id_y]
%3 = affine.apply affine_map<()[s0] -> (s0 * 32)>()[%workgroup_count_y]
scf.for %arg0 = %2 to %c256 step %3 {
%4 = affine.apply affine_map<()[s0] -> (s0 * 8)>()[%workgroup_id_x]
%5 = affine.apply affine_map<()[s0] -> (s0 * 8)>()[%workgroup_count_x]
scf.for %arg1 = %4 to %c16 step %5 {
%6 = flow.dispatch.tensor.load %0, offsets = [%arg0, 0, 0, 0], sizes = [32, 32, 8, 8], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<256x32x8x8xi8>> -> tensor<32x32x8x8xi8>
%7 = flow.dispatch.tensor.load %1, offsets = [%arg0, %arg1, 0, 0], sizes = [32, 8, 8, 8], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readwrite:tensor<256x16x8x8xi32>> -> tensor<32x8x8x8xi32>
%8 = iree_codegen.ukernel.generic "iree_uk_mmt4d" ins(%6, %cst : tensor<32x32x8x8xi8>, tensor<8x32x8x8xi8>) outs(%7 : tensor<32x8x8x8xi32>) (%c32, %c8, %c32, %c8_i32, %c8_i32, %c8_i32, %c1282_i32 : index, index, index, i32, i32, i32, i32) fn_def_attrs {hal.import.bitcode = true, hal.import.cconv = 1 : i32, hal.import.fields = ["processor_data"]} strided_outer_dims(1) -> tensor<32x8x8x8xi32>
flow.dispatch.tensor.store %8, %1, offsets = [%arg0, %arg1, 0, 0], sizes = [32, 8, 8, 8], strides = [1, 1, 1, 1] : tensor<32x8x8x8xi32> -> !flow.dispatch.tensor<readwrite:tensor<256x16x8x8xi32>>
}
}
return
}
}
So it looks like DT (and UK) are enabled for this case now.
Can someone comment on the status of this issue? Seems fixed now. Can we please validate and close?
Benoit mentioned that we are not data tiling some matmuls depending on their types. I'm seeing the following one not data tiled:
Could we enable DT for this type combination?