[compile][cpu]:'func.func' op exceeded stack allocation limit of 32768 bytes for function

What happened?

For the given IR

module {
  func.func @torch_jit(%arg3: !torch.vtensor<[8,12,128,128],f32>, %arg4: !torch.vtensor<[8,12,128,64],f32>,%arg5:!torch.vtensor<[768,768],f32>,%arg6: !torch.vtensor<[3],si64>,%arg7:!torch.vtensor<[768],f32>) -> !torch.vtensor<[8,128,768],f32> attributes {torch.onnx_meta.ir_version = 7 : si64, torch.onnx_meta.opset_version = 21 : si64, torch.onnx_meta.producer_name = "pytorch", torch.onnx_meta.producer_version = "1.13.0"} {
    %258 = torch.operator "onnx.MatMul"(%arg3, %arg4) : (!torch.vtensor<[8,12,128,128],f32>, !torch.vtensor<[8,12,128,64],f32>) -> !torch.vtensor<[8,12,128,64],f32> 
    %259 = torch.operator "onnx.Transpose"(%258) {torch.onnx.perm = [0 : si64, 2 : si64, 1 : si64, 3 : si64]} : (!torch.vtensor<[8,12,128,64],f32>) -> !torch.vtensor<[8,128,12,64],f32> 
    %260 = torch.operator "onnx.Constant"() {torch.onnx.value = dense_resource<__16> : tensor<3xsi64>} : () -> !torch.vtensor<[3],si64> 
    %261 = torch.operator "onnx.Reshape"(%259, %arg6) {torch.onnx.allowzero = 0 : si64} : (!torch.vtensor<[8,128,12,64],f32>, !torch.vtensor<[3],si64>) -> !torch.vtensor<[8,128,768],f32> 
    %262 = torch.operator "onnx.MatMul"(%261, %arg5) : (!torch.vtensor<[8,128,768],f32>, !torch.vtensor<[768,768],f32>) -> !torch.vtensor<[8,128,768],f32> 
    %263 = torch.operator "onnx.Add"(%arg7, %262) : (!torch.vtensor<[768],f32>, !torch.vtensor<[8,128,768],f32>) -> !torch.vtensor<[8,128,768],f32> 
    return %263: !torch.vtensor<[8,128,768],f32>
  }
}

getting error as

model.torch_onnx.mlir:7:12: error: 'func.func' op exceeded stack allocation limit of 32768 bytes for function. Got 65664 bytes %262 = torch.operator "onnx.MatMul"(%261, %arg5) : (!torch.vtensor<[8,128,768],f32>, !torch.vtensor<[768,768],f32>) -> !torch.vtensor<[8,128,768],f32> ^ model.torch_onnx.mlir:7:12: note: see current operation:

IR after failure:

// -----// IR Dump After LLVMCPUCheckIRBeforeLLVMConversionPass Failed (iree-llvmcpu-check-ir-before-llvm-conversion) //----- //
func.func @torch_jit$async_dispatch_3_unpack_transpose_8x128x12x64_f32_pack() attributes {translation_info = #iree_codegen.translation_info<CPUDataTiling>} {
  %cst = arith.constant dense<0.000000e+00> : vector<8xf32>
  %c7 = arith.constant 7 : index
  %c6 = arith.constant 6 : index
  %c5 = arith.constant 5 : index
  %c3 = arith.constant 3 : index
  %c2 = arith.constant 2 : index
  %c0 = arith.constant 0 : index
  %c9437184 = arith.constant 9437184 : index
  %c16 = arith.constant 16 : index
  %c12 = arith.constant 12 : index
  %c8 = arith.constant 8 : index
  %c1 = arith.constant 1 : index
  %c32 = arith.constant 32 : index
  %c64 = arith.constant 64 : index
  %c4 = arith.constant 4 : index
  %alloca = memref.alloca() {alignment = 64 : i64} : memref<1x1x8x4xf32>
  %alloca_0 = memref.alloca() {alignment = 64 : i64} : memref<8x1x32x64xf32>
  %0 = hal.interface.binding.subspan layout(<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(0) alignment(64) offset(%c9437184) flags("ReadOnly|Indirect") : memref<8x12x16x16x8x4xf32, strided<[98304, 8192, 512, 32, 4, 1], offset: 2359296>>
  memref.assume_alignment %0, 64 : memref<8x12x16x16x8x4xf32, strided<[98304, 8192, 512, 32, 4, 1], offset: 2359296>>
  %1 = hal.interface.binding.subspan layout(<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(1) alignment(64) offset(%c0) flags(Indirect) : memref<8x16x12x64x8x1xf32>
  memref.assume_alignment %1, 64 : memref<8x16x12x64x8x1xf32>
  %workgroup_id_x = hal.interface.workgroup.id[0] : index
  %workgroup_count_x = hal.interface.workgroup.count[0] : index
  %workgroup_id_y = hal.interface.workgroup.id[1] : index
  %workgroup_count_y = hal.interface.workgroup.count[1] : index
  %2 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%workgroup_id_y]
  %3 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%workgroup_count_y]
  scf.for %arg0 = %2 to %c16 step %3 {
    scf.for %arg1 = %workgroup_id_x to %c12 step %workgroup_count_x {
      %subview = memref.subview %1[0, %arg0, %arg1, 0, 0, 0] [8, 4, 1, 64, 8, 1] [1, 1, 1, 1, 1, 1] : memref<8x16x12x64x8x1xf32> to memref<8x4x1x64x8x1xf32, strided<[98304, 6144, 512, 8, 1, 1], offset: ?>>
      %subview_1 = memref.subview %0[0, %arg1, %arg0, 0, 0, 0] [8, 1, 4, 16, 8, 4] [1, 1, 1, 1, 1, 1] : memref<8x12x16x16x8x4xf32, strided<[98304, 8192, 512, 32, 4, 1], offset: 2359296>> to memref<8x1x4x16x8x4xf32, strided<[98304, 8192, 512, 32, 4, 1], offset: ?>>
      scf.for %arg2 = %c0 to %c8 step %c1 {
        scf.for %arg3 = %c0 to %c32 step %c1 {
          %4 = affine.apply affine_map<(d0) -> (d0 floordiv 8)>(%arg3)
          %5 = affine.apply affine_map<(d0) -> (d0 mod 8)>(%arg3)
          scf.for %arg4 = %c0 to %c64 step %c1 {
            %6 = affine.apply affine_map<(d0) -> (d0 floordiv 4)>(%arg4)
            %7 = affine.apply affine_map<(d0) -> (d0 mod 4)>(%arg4)
            %8 = vector.load %subview_1[%arg2, %c0, %4, %6, %c0, %c0] : memref<8x1x4x16x8x4xf32, strided<[98304, 8192, 512, 32, 4, 1], offset: ?>>, vector<4xf32>
            %9 = vector.load %subview_1[%arg2, %c0, %4, %6, %c1, %c0] : memref<8x1x4x16x8x4xf32, strided<[98304, 8192, 512, 32, 4, 1], offset: ?>>, vector<4xf32>
            %10 = vector.load %subview_1[%arg2, %c0, %4, %6, %c2, %c0] : memref<8x1x4x16x8x4xf32, strided<[98304, 8192, 512, 32, 4, 1], offset: ?>>, vector<4xf32>
            %11 = vector.load %subview_1[%arg2, %c0, %4, %6, %c3, %c0] : memref<8x1x4x16x8x4xf32, strided<[98304, 8192, 512, 32, 4, 1], offset: ?>>, vector<4xf32>
            %12 = vector.load %subview_1[%arg2, %c0, %4, %6, %c4, %c0] : memref<8x1x4x16x8x4xf32, strided<[98304, 8192, 512, 32, 4, 1], offset: ?>>, vector<4xf32>
            %13 = vector.load %subview_1[%arg2, %c0, %4, %6, %c5, %c0] : memref<8x1x4x16x8x4xf32, strided<[98304, 8192, 512, 32, 4, 1], offset: ?>>, vector<4xf32>
            %14 = vector.load %subview_1[%arg2, %c0, %4, %6, %c6, %c0] : memref<8x1x4x16x8x4xf32, strided<[98304, 8192, 512, 32, 4, 1], offset: ?>>, vector<4xf32>
            %15 = vector.load %subview_1[%arg2, %c0, %4, %6, %c7, %c0] : memref<8x1x4x16x8x4xf32, strided<[98304, 8192, 512, 32, 4, 1], offset: ?>>, vector<4xf32>
            %subview_2 = memref.subview %alloca[0, 0, 0, 0] [1, 1, 8, 4] [1, 1, 1, 1] : memref<1x1x8x4xf32> to memref<8x4xf32>
            vector.store %8, %subview_2[%c0, %c0] : memref<8x4xf32>, vector<4xf32>
            vector.store %9, %subview_2[%c1, %c0] : memref<8x4xf32>, vector<4xf32>
            vector.store %10, %subview_2[%c2, %c0] : memref<8x4xf32>, vector<4xf32>
            vector.store %11, %subview_2[%c3, %c0] : memref<8x4xf32>, vector<4xf32>
            vector.store %12, %subview_2[%c4, %c0] : memref<8x4xf32>, vector<4xf32>
            vector.store %13, %subview_2[%c5, %c0] : memref<8x4xf32>, vector<4xf32>
            vector.store %14, %subview_2[%c6, %c0] : memref<8x4xf32>, vector<4xf32>
            vector.store %15, %subview_2[%c7, %c0] : memref<8x4xf32>, vector<4xf32>
            %subview_3 = memref.subview %alloca[0, 0, %5, %7] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x8x4xf32> to memref<1x1x1x1xf32, strided<[32, 32, 4, 1], offset: ?>>
            %subview_4 = memref.subview %alloca_0[%arg2, 0, %arg3, %arg4] [1, 1, 1, 1] [1, 1, 1, 1] : memref<8x1x32x64xf32> to memref<1x1x1x1xf32, strided<[2048, 2048, 64, 1], offset: ?>>
            %16 = memref.load %subview_3[%c0, %c0, %c0, %c0] : memref<1x1x1x1xf32, strided<[32, 32, 4, 1], offset: ?>>
            memref.store %16, %subview_4[%c0, %c0, %c0, %c0] : memref<1x1x1x1xf32, strided<[2048, 2048, 64, 1], offset: ?>>
          }
        }
      }
      scf.for %arg2 = %c0 to %c8 step %c1 {
        scf.for %arg3 = %c0 to %c4 step %c1 {
          scf.for %arg4 = %c0 to %c64 step %c1 {
            %4 = affine.apply affine_map<(d0) -> (d0 * 8)>(%arg3)
            %5 = memref.load %alloca_0[%arg2, %c0, %4, %arg4] : memref<8x1x32x64xf32>
            %6 = vector.broadcast %5 : f32 to vector<1xf32>
            %7 = affine.apply affine_map<(d0) -> (d0 * 8 + 1)>(%arg3)
            %8 = memref.load %alloca_0[%arg2, %c0, %7, %arg4] : memref<8x1x32x64xf32>
            %9 = vector.broadcast %8 : f32 to vector<1xf32>
            %10 = affine.apply affine_map<(d0) -> (d0 * 8 + 2)>(%arg3)
            %11 = memref.load %alloca_0[%arg2, %c0, %10, %arg4] : memref<8x1x32x64xf32>
            %12 = vector.broadcast %11 : f32 to vector<1xf32>
            %13 = affine.apply affine_map<(d0) -> (d0 * 8 + 3)>(%arg3)
            %14 = memref.load %alloca_0[%arg2, %c0, %13, %arg4] : memref<8x1x32x64xf32>
            %15 = vector.broadcast %14 : f32 to vector<1xf32>
            %16 = affine.apply affine_map<(d0) -> (d0 * 8 + 4)>(%arg3)
            %17 = memref.load %alloca_0[%arg2, %c0, %16, %arg4] : memref<8x1x32x64xf32>
            %18 = vector.broadcast %17 : f32 to vector<1xf32>
            %19 = affine.apply affine_map<(d0) -> (d0 * 8 + 5)>(%arg3)
            %20 = memref.load %alloca_0[%arg2, %c0, %19, %arg4] : memref<8x1x32x64xf32>
            %21 = vector.broadcast %20 : f32 to vector<1xf32>
            %22 = affine.apply affine_map<(d0) -> (d0 * 8 + 6)>(%arg3)
            %23 = memref.load %alloca_0[%arg2, %c0, %22, %arg4] : memref<8x1x32x64xf32>
            %24 = vector.broadcast %23 : f32 to vector<1xf32>
            %25 = affine.apply affine_map<(d0) -> (d0 * 8 + 7)>(%arg3)
            %26 = memref.load %alloca_0[%arg2, %c0, %25, %arg4] : memref<8x1x32x64xf32>
            %27 = vector.broadcast %26 : f32 to vector<1xf32>
            %28 = vector.insert_strided_slice %6, %cst {offsets = [0], strides = [1]} : vector<1xf32> into vector<8xf32>
            %29 = vector.insert_strided_slice %9, %28 {offsets = [1], strides = [1]} : vector<1xf32> into vector<8xf32>
            %30 = vector.insert_strided_slice %12, %29 {offsets = [2], strides = [1]} : vector<1xf32> into vector<8xf32>
            %31 = vector.insert_strided_slice %15, %30 {offsets = [3], strides = [1]} : vector<1xf32> into vector<8xf32>
            %32 = vector.insert_strided_slice %18, %31 {offsets = [4], strides = [1]} : vector<1xf32> into vector<8xf32>
            %33 = vector.insert_strided_slice %21, %32 {offsets = [5], strides = [1]} : vector<1xf32> into vector<8xf32>
            %34 = vector.insert_strided_slice %24, %33 {offsets = [6], strides = [1]} : vector<1xf32> into vector<8xf32>
            %35 = vector.insert_strided_slice %27, %34 {offsets = [7], strides = [1]} : vector<1xf32> into vector<8xf32>
            %subview_2 = memref.subview %subview[0, 0, 0, 0, 0, 0] [8, 4, 1, 64, 8, 1] [1, 1, 1, 1, 1, 1] : memref<8x4x1x64x8x1xf32, strided<[98304, 6144, 512, 8, 1, 1], offset: ?>> to memref<8x4x1x64x8xf32, strided<[98304, 6144, 512, 8, 1], offset: ?>>
            vector.store %35, %subview_2[%arg2, %arg3, %c0, %arg4, %c0] : memref<8x4x1x64x8xf32, strided<[98304, 6144, 512, 8, 1], offset: ?>>, vector<8xf32>
          }
        }
      }
    }
  }
  return
}

// -----// IR Dump After TranslateTargetExecutableVariantsPass Failed (iree-hal-translate-target-executable-variants) //----- //
hal.executable.variant public @embedded_elf_x86_64 target(<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}>) {
  hal.executable.export public @torch_jit$async_dispatch_3_unpack_transpose_8x128x12x64_f32_pack ordinal(0) layout(#hal.pipeline.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, Indirect>], flags = Indirect>]>) attributes {hal.interface.bindings = [#hal.interface.binding<0, 0>, #hal.interface.binding<0, 1>]} {
  ^bb0(%arg0: !hal.device):
    %c12 = arith.constant 12 : index
    %c4 = arith.constant 4 : index
    %c1 = arith.constant 1 : index
    hal.return %c12, %c4, %c1 : index, index, index
  }
  builtin.module {
    func.func @torch_jit$async_dispatch_3_unpack_transpose_8x128x12x64_f32_pack() attributes {translation_info = #iree_codegen.translation_info<CPUDataTiling>} {
      %cst = arith.constant dense<0.000000e+00> : vector<8xf32>
      %c7 = arith.constant 7 : index
      %c6 = arith.constant 6 : index
      %c5 = arith.constant 5 : index
      %c3 = arith.constant 3 : index
      %c2 = arith.constant 2 : index
      %c0 = arith.constant 0 : index
      %c9437184 = arith.constant 9437184 : index
      %c16 = arith.constant 16 : index
      %c12 = arith.constant 12 : index
      %c8 = arith.constant 8 : index
      %c1 = arith.constant 1 : index
      %c32 = arith.constant 32 : index
      %c64 = arith.constant 64 : index
      %c4 = arith.constant 4 : index
      %alloca = memref.alloca() {alignment = 64 : i64} : memref<1x1x8x4xf32>
      %alloca_0 = memref.alloca() {alignment = 64 : i64} : memref<8x1x32x64xf32>
      %0 = hal.interface.binding.subspan layout(<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(0) alignment(64) offset(%c9437184) flags("ReadOnly|Indirect") : memref<8x12x16x16x8x4xf32, strided<[98304, 8192, 512, 32, 4, 1], offset: 2359296>>
      memref.assume_alignment %0, 64 : memref<8x12x16x16x8x4xf32, strided<[98304, 8192, 512, 32, 4, 1], offset: 2359296>>
      %1 = hal.interface.binding.subspan layout(<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(1) alignment(64) offset(%c0) flags(Indirect) : memref<8x16x12x64x8x1xf32>
      memref.assume_alignment %1, 64 : memref<8x16x12x64x8x1xf32>
      %workgroup_id_x = hal.interface.workgroup.id[0] : index
      %workgroup_count_x = hal.interface.workgroup.count[0] : index
      %workgroup_id_y = hal.interface.workgroup.id[1] : index
      %workgroup_count_y = hal.interface.workgroup.count[1] : index
      %2 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%workgroup_id_y]
      %3 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%workgroup_count_y]
      scf.for %arg0 = %2 to %c16 step %3 {
        scf.for %arg1 = %workgroup_id_x to %c12 step %workgroup_count_x {
          %subview = memref.subview %1[0, %arg0, %arg1, 0, 0, 0] [8, 4, 1, 64, 8, 1] [1, 1, 1, 1, 1, 1] : memref<8x16x12x64x8x1xf32> to memref<8x4x1x64x8x1xf32, strided<[98304, 6144, 512, 8, 1, 1], offset: ?>>
          %subview_1 = memref.subview %0[0, %arg1, %arg0, 0, 0, 0] [8, 1, 4, 16, 8, 4] [1, 1, 1, 1, 1, 1] : memref<8x12x16x16x8x4xf32, strided<[98304, 8192, 512, 32, 4, 1], offset: 2359296>> to memref<8x1x4x16x8x4xf32, strided<[98304, 8192, 512, 32, 4, 1], offset: ?>>
          scf.for %arg2 = %c0 to %c8 step %c1 {
            scf.for %arg3 = %c0 to %c32 step %c1 {
              %4 = affine.apply affine_map<(d0) -> (d0 floordiv 8)>(%arg3)
              %5 = affine.apply affine_map<(d0) -> (d0 mod 8)>(%arg3)
              scf.for %arg4 = %c0 to %c64 step %c1 {
                %6 = affine.apply affine_map<(d0) -> (d0 floordiv 4)>(%arg4)
                %7 = affine.apply affine_map<(d0) -> (d0 mod 4)>(%arg4)
                %8 = vector.load %subview_1[%arg2, %c0, %4, %6, %c0, %c0] : memref<8x1x4x16x8x4xf32, strided<[98304, 8192, 512, 32, 4, 1], offset: ?>>, vector<4xf32>
                %9 = vector.load %subview_1[%arg2, %c0, %4, %6, %c1, %c0] : memref<8x1x4x16x8x4xf32, strided<[98304, 8192, 512, 32, 4, 1], offset: ?>>, vector<4xf32>
                %10 = vector.load %subview_1[%arg2, %c0, %4, %6, %c2, %c0] : memref<8x1x4x16x8x4xf32, strided<[98304, 8192, 512, 32, 4, 1], offset: ?>>, vector<4xf32>
                %11 = vector.load %subview_1[%arg2, %c0, %4, %6, %c3, %c0] : memref<8x1x4x16x8x4xf32, strided<[98304, 8192, 512, 32, 4, 1], offset: ?>>, vector<4xf32>
                %12 = vector.load %subview_1[%arg2, %c0, %4, %6, %c4, %c0] : memref<8x1x4x16x8x4xf32, strided<[98304, 8192, 512, 32, 4, 1], offset: ?>>, vector<4xf32>
                %13 = vector.load %subview_1[%arg2, %c0, %4, %6, %c5, %c0] : memref<8x1x4x16x8x4xf32, strided<[98304, 8192, 512, 32, 4, 1], offset: ?>>, vector<4xf32>
                %14 = vector.load %subview_1[%arg2, %c0, %4, %6, %c6, %c0] : memref<8x1x4x16x8x4xf32, strided<[98304, 8192, 512, 32, 4, 1], offset: ?>>, vector<4xf32>
                %15 = vector.load %subview_1[%arg2, %c0, %4, %6, %c7, %c0] : memref<8x1x4x16x8x4xf32, strided<[98304, 8192, 512, 32, 4, 1], offset: ?>>, vector<4xf32>
                %subview_2 = memref.subview %alloca[0, 0, 0, 0] [1, 1, 8, 4] [1, 1, 1, 1] : memref<1x1x8x4xf32> to memref<8x4xf32>
                vector.store %8, %subview_2[%c0, %c0] : memref<8x4xf32>, vector<4xf32>
                vector.store %9, %subview_2[%c1, %c0] : memref<8x4xf32>, vector<4xf32>
                vector.store %10, %subview_2[%c2, %c0] : memref<8x4xf32>, vector<4xf32>
                vector.store %11, %subview_2[%c3, %c0] : memref<8x4xf32>, vector<4xf32>
                vector.store %12, %subview_2[%c4, %c0] : memref<8x4xf32>, vector<4xf32>
                vector.store %13, %subview_2[%c5, %c0] : memref<8x4xf32>, vector<4xf32>
                vector.store %14, %subview_2[%c6, %c0] : memref<8x4xf32>, vector<4xf32>
                vector.store %15, %subview_2[%c7, %c0] : memref<8x4xf32>, vector<4xf32>
                %subview_3 = memref.subview %alloca[0, 0, %5, %7] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x8x4xf32> to memref<1x1x1x1xf32, strided<[32, 32, 4, 1], offset: ?>>
                %subview_4 = memref.subview %alloca_0[%arg2, 0, %arg3, %arg4] [1, 1, 1, 1] [1, 1, 1, 1] : memref<8x1x32x64xf32> to memref<1x1x1x1xf32, strided<[2048, 2048, 64, 1], offset: ?>>
                %16 = memref.load %subview_3[%c0, %c0, %c0, %c0] : memref<1x1x1x1xf32, strided<[32, 32, 4, 1], offset: ?>>
                memref.store %16, %subview_4[%c0, %c0, %c0, %c0] : memref<1x1x1x1xf32, strided<[2048, 2048, 64, 1], offset: ?>>
              }
            }
          }
          scf.for %arg2 = %c0 to %c8 step %c1 {
            scf.for %arg3 = %c0 to %c4 step %c1 {
              scf.for %arg4 = %c0 to %c64 step %c1 {
                %4 = affine.apply affine_map<(d0) -> (d0 * 8)>(%arg3)
                %5 = memref.load %alloca_0[%arg2, %c0, %4, %arg4] : memref<8x1x32x64xf32>
                %6 = vector.broadcast %5 : f32 to vector<1xf32>
                %7 = affine.apply affine_map<(d0) -> (d0 * 8 + 1)>(%arg3)
                %8 = memref.load %alloca_0[%arg2, %c0, %7, %arg4] : memref<8x1x32x64xf32>
                %9 = vector.broadcast %8 : f32 to vector<1xf32>
                %10 = affine.apply affine_map<(d0) -> (d0 * 8 + 2)>(%arg3)
                %11 = memref.load %alloca_0[%arg2, %c0, %10, %arg4] : memref<8x1x32x64xf32>
                %12 = vector.broadcast %11 : f32 to vector<1xf32>
                %13 = affine.apply affine_map<(d0) -> (d0 * 8 + 3)>(%arg3)
                %14 = memref.load %alloca_0[%arg2, %c0, %13, %arg4] : memref<8x1x32x64xf32>
                %15 = vector.broadcast %14 : f32 to vector<1xf32>
                %16 = affine.apply affine_map<(d0) -> (d0 * 8 + 4)>(%arg3)
                %17 = memref.load %alloca_0[%arg2, %c0, %16, %arg4] : memref<8x1x32x64xf32>
                %18 = vector.broadcast %17 : f32 to vector<1xf32>
                %19 = affine.apply affine_map<(d0) -> (d0 * 8 + 5)>(%arg3)
                %20 = memref.load %alloca_0[%arg2, %c0, %19, %arg4] : memref<8x1x32x64xf32>
                %21 = vector.broadcast %20 : f32 to vector<1xf32>
                %22 = affine.apply affine_map<(d0) -> (d0 * 8 + 6)>(%arg3)
                %23 = memref.load %alloca_0[%arg2, %c0, %22, %arg4] : memref<8x1x32x64xf32>
                %24 = vector.broadcast %23 : f32 to vector<1xf32>
                %25 = affine.apply affine_map<(d0) -> (d0 * 8 + 7)>(%arg3)
                %26 = memref.load %alloca_0[%arg2, %c0, %25, %arg4] : memref<8x1x32x64xf32>
                %27 = vector.broadcast %26 : f32 to vector<1xf32>
                %28 = vector.insert_strided_slice %6, %cst {offsets = [0], strides = [1]} : vector<1xf32> into vector<8xf32>
                %29 = vector.insert_strided_slice %9, %28 {offsets = [1], strides = [1]} : vector<1xf32> into vector<8xf32>
                %30 = vector.insert_strided_slice %12, %29 {offsets = [2], strides = [1]} : vector<1xf32> into vector<8xf32>
                %31 = vector.insert_strided_slice %15, %30 {offsets = [3], strides = [1]} : vector<1xf32> into vector<8xf32>
                %32 = vector.insert_strided_slice %18, %31 {offsets = [4], strides = [1]} : vector<1xf32> into vector<8xf32>
                %33 = vector.insert_strided_slice %21, %32 {offsets = [5], strides = [1]} : vector<1xf32> into vector<8xf32>
                %34 = vector.insert_strided_slice %24, %33 {offsets = [6], strides = [1]} : vector<1xf32> into vector<8xf32>
                %35 = vector.insert_strided_slice %27, %34 {offsets = [7], strides = [1]} : vector<1xf32> into vector<8xf32>
                %subview_2 = memref.subview %subview[0, 0, 0, 0, 0, 0] [8, 4, 1, 64, 8, 1] [1, 1, 1, 1, 1, 1] : memref<8x4x1x64x8x1xf32, strided<[98304, 6144, 512, 8, 1, 1], offset: ?>> to memref<8x4x1x64x8xf32, strided<[98304, 6144, 512, 8, 1], offset: ?>>
                vector.store %35, %subview_2[%arg2, %arg3, %c0, %arg4, %c0] : memref<8x4x1x64x8xf32, strided<[98304, 6144, 512, 8, 1], offset: ?>>, vector<8xf32>
              }
            }
          }
        }
      }
      return
    }
  }
}

failed to translate executables
// -----// IR Dump After TranslateExecutablesPass Failed (iree-hal-translate-executables) //----- //
hal.executable private @torch_jit$async_dispatch_3 {
  hal.executable.variant public @embedded_elf_x86_64 target(<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}>) {
    hal.executable.export public @torch_jit$async_dispatch_3_unpack_transpose_8x128x12x64_f32_pack ordinal(0) layout(#hal.pipeline.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, Indirect>], flags = Indirect>]>) attributes {hal.interface.bindings = [#hal.interface.binding<0, 0>, #hal.interface.binding<0, 1>]} {
    ^bb0(%arg0: !hal.device):
      %c12 = arith.constant 12 : index
      %c4 = arith.constant 4 : index
      %c1 = arith.constant 1 : index
      hal.return %c12, %c4, %c1 : index, index, index
    }
    builtin.module {
      func.func @torch_jit$async_dispatch_3_unpack_transpose_8x128x12x64_f32_pack() attributes {translation_info = #iree_codegen.translation_info<CPUDataTiling>} {
        %cst = arith.constant dense<0.000000e+00> : vector<8xf32>
        %c7 = arith.constant 7 : index
        %c6 = arith.constant 6 : index
        %c5 = arith.constant 5 : index
        %c3 = arith.constant 3 : index
        %c2 = arith.constant 2 : index
        %c0 = arith.constant 0 : index
        %c9437184 = arith.constant 9437184 : index
        %c16 = arith.constant 16 : index
        %c12 = arith.constant 12 : index
        %c8 = arith.constant 8 : index
        %c1 = arith.constant 1 : index
        %c32 = arith.constant 32 : index
        %c64 = arith.constant 64 : index
        %c4 = arith.constant 4 : index
        %alloca = memref.alloca() {alignment = 64 : i64} : memref<1x1x8x4xf32>
        %alloca_0 = memref.alloca() {alignment = 64 : i64} : memref<8x1x32x64xf32>
        %0 = hal.interface.binding.subspan layout(<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(0) alignment(64) offset(%c9437184) flags("ReadOnly|Indirect") : memref<8x12x16x16x8x4xf32, strided<[98304, 8192, 512, 32, 4, 1], offset: 2359296>>
        memref.assume_alignment %0, 64 : memref<8x12x16x16x8x4xf32, strided<[98304, 8192, 512, 32, 4, 1], offset: 2359296>>
        %1 = hal.interface.binding.subspan layout(<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(1) alignment(64) offset(%c0) flags(Indirect) : memref<8x16x12x64x8x1xf32>
        memref.assume_alignment %1, 64 : memref<8x16x12x64x8x1xf32>
        %workgroup_id_x = hal.interface.workgroup.id[0] : index
        %workgroup_count_x = hal.interface.workgroup.count[0] : index
        %workgroup_id_y = hal.interface.workgroup.id[1] : index
        %workgroup_count_y = hal.interface.workgroup.count[1] : index
        %2 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%workgroup_id_y]
        %3 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%workgroup_count_y]
        scf.for %arg0 = %2 to %c16 step %3 {
          scf.for %arg1 = %workgroup_id_x to %c12 step %workgroup_count_x {
            %subview = memref.subview %1[0, %arg0, %arg1, 0, 0, 0] [8, 4, 1, 64, 8, 1] [1, 1, 1, 1, 1, 1] : memref<8x16x12x64x8x1xf32> to memref<8x4x1x64x8x1xf32, strided<[98304, 6144, 512, 8, 1, 1], offset: ?>>
            %subview_1 = memref.subview %0[0, %arg1, %arg0, 0, 0, 0] [8, 1, 4, 16, 8, 4] [1, 1, 1, 1, 1, 1] : memref<8x12x16x16x8x4xf32, strided<[98304, 8192, 512, 32, 4, 1], offset: 2359296>> to memref<8x1x4x16x8x4xf32, strided<[98304, 8192, 512, 32, 4, 1], offset: ?>>
            scf.for %arg2 = %c0 to %c8 step %c1 {
              scf.for %arg3 = %c0 to %c32 step %c1 {
                %4 = affine.apply affine_map<(d0) -> (d0 floordiv 8)>(%arg3)
                %5 = affine.apply affine_map<(d0) -> (d0 mod 8)>(%arg3)
                scf.for %arg4 = %c0 to %c64 step %c1 {
                  %6 = affine.apply affine_map<(d0) -> (d0 floordiv 4)>(%arg4)
                  %7 = affine.apply affine_map<(d0) -> (d0 mod 4)>(%arg4)
                  %8 = vector.load %subview_1[%arg2, %c0, %4, %6, %c0, %c0] : memref<8x1x4x16x8x4xf32, strided<[98304, 8192, 512, 32, 4, 1], offset: ?>>, vector<4xf32>
                  %9 = vector.load %subview_1[%arg2, %c0, %4, %6, %c1, %c0] : memref<8x1x4x16x8x4xf32, strided<[98304, 8192, 512, 32, 4, 1], offset: ?>>, vector<4xf32>
                  %10 = vector.load %subview_1[%arg2, %c0, %4, %6, %c2, %c0] : memref<8x1x4x16x8x4xf32, strided<[98304, 8192, 512, 32, 4, 1], offset: ?>>, vector<4xf32>
                  %11 = vector.load %subview_1[%arg2, %c0, %4, %6, %c3, %c0] : memref<8x1x4x16x8x4xf32, strided<[98304, 8192, 512, 32, 4, 1], offset: ?>>, vector<4xf32>
                  %12 = vector.load %subview_1[%arg2, %c0, %4, %6, %c4, %c0] : memref<8x1x4x16x8x4xf32, strided<[98304, 8192, 512, 32, 4, 1], offset: ?>>, vector<4xf32>
                  %13 = vector.load %subview_1[%arg2, %c0, %4, %6, %c5, %c0] : memref<8x1x4x16x8x4xf32, strided<[98304, 8192, 512, 32, 4, 1], offset: ?>>, vector<4xf32>
                  %14 = vector.load %subview_1[%arg2, %c0, %4, %6, %c6, %c0] : memref<8x1x4x16x8x4xf32, strided<[98304, 8192, 512, 32, 4, 1], offset: ?>>, vector<4xf32>
                  %15 = vector.load %subview_1[%arg2, %c0, %4, %6, %c7, %c0] : memref<8x1x4x16x8x4xf32, strided<[98304, 8192, 512, 32, 4, 1], offset: ?>>, vector<4xf32>
                  %subview_2 = memref.subview %alloca[0, 0, 0, 0] [1, 1, 8, 4] [1, 1, 1, 1] : memref<1x1x8x4xf32> to memref<8x4xf32>
                  vector.store %8, %subview_2[%c0, %c0] : memref<8x4xf32>, vector<4xf32>
                  vector.store %9, %subview_2[%c1, %c0] : memref<8x4xf32>, vector<4xf32>
                  vector.store %10, %subview_2[%c2, %c0] : memref<8x4xf32>, vector<4xf32>
                  vector.store %11, %subview_2[%c3, %c0] : memref<8x4xf32>, vector<4xf32>
                  vector.store %12, %subview_2[%c4, %c0] : memref<8x4xf32>, vector<4xf32>
                  vector.store %13, %subview_2[%c5, %c0] : memref<8x4xf32>, vector<4xf32>
                  vector.store %14, %subview_2[%c6, %c0] : memref<8x4xf32>, vector<4xf32>
                  vector.store %15, %subview_2[%c7, %c0] : memref<8x4xf32>, vector<4xf32>
                  %subview_3 = memref.subview %alloca[0, 0, %5, %7] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x8x4xf32> to memref<1x1x1x1xf32, strided<[32, 32, 4, 1], offset: ?>>
                  %subview_4 = memref.subview %alloca_0[%arg2, 0, %arg3, %arg4] [1, 1, 1, 1] [1, 1, 1, 1] : memref<8x1x32x64xf32> to memref<1x1x1x1xf32, strided<[2048, 2048, 64, 1], offset: ?>>
                  %16 = memref.load %subview_3[%c0, %c0, %c0, %c0] : memref<1x1x1x1xf32, strided<[32, 32, 4, 1], offset: ?>>
                  memref.store %16, %subview_4[%c0, %c0, %c0, %c0] : memref<1x1x1x1xf32, strided<[2048, 2048, 64, 1], offset: ?>>
                }
              }
            }
            scf.for %arg2 = %c0 to %c8 step %c1 {
              scf.for %arg3 = %c0 to %c4 step %c1 {
                scf.for %arg4 = %c0 to %c64 step %c1 {
                  %4 = affine.apply affine_map<(d0) -> (d0 * 8)>(%arg3)
                  %5 = memref.load %alloca_0[%arg2, %c0, %4, %arg4] : memref<8x1x32x64xf32>
                  %6 = vector.broadcast %5 : f32 to vector<1xf32>
                  %7 = affine.apply affine_map<(d0) -> (d0 * 8 + 1)>(%arg3)
                  %8 = memref.load %alloca_0[%arg2, %c0, %7, %arg4] : memref<8x1x32x64xf32>
                  %9 = vector.broadcast %8 : f32 to vector<1xf32>
                  %10 = affine.apply affine_map<(d0) -> (d0 * 8 + 2)>(%arg3)
                  %11 = memref.load %alloca_0[%arg2, %c0, %10, %arg4] : memref<8x1x32x64xf32>
                  %12 = vector.broadcast %11 : f32 to vector<1xf32>
                  %13 = affine.apply affine_map<(d0) -> (d0 * 8 + 3)>(%arg3)
                  %14 = memref.load %alloca_0[%arg2, %c0, %13, %arg4] : memref<8x1x32x64xf32>
                  %15 = vector.broadcast %14 : f32 to vector<1xf32>
                  %16 = affine.apply affine_map<(d0) -> (d0 * 8 + 4)>(%arg3)
                  %17 = memref.load %alloca_0[%arg2, %c0, %16, %arg4] : memref<8x1x32x64xf32>
                  %18 = vector.broadcast %17 : f32 to vector<1xf32>
                  %19 = affine.apply affine_map<(d0) -> (d0 * 8 + 5)>(%arg3)
                  %20 = memref.load %alloca_0[%arg2, %c0, %19, %arg4] : memref<8x1x32x64xf32>
                  %21 = vector.broadcast %20 : f32 to vector<1xf32>
                  %22 = affine.apply affine_map<(d0) -> (d0 * 8 + 6)>(%arg3)
                  %23 = memref.load %alloca_0[%arg2, %c0, %22, %arg4] : memref<8x1x32x64xf32>
                  %24 = vector.broadcast %23 : f32 to vector<1xf32>
                  %25 = affine.apply affine_map<(d0) -> (d0 * 8 + 7)>(%arg3)
                  %26 = memref.load %alloca_0[%arg2, %c0, %25, %arg4] : memref<8x1x32x64xf32>
                  %27 = vector.broadcast %26 : f32 to vector<1xf32>
                  %28 = vector.insert_strided_slice %6, %cst {offsets = [0], strides = [1]} : vector<1xf32> into vector<8xf32>
                  %29 = vector.insert_strided_slice %9, %28 {offsets = [1], strides = [1]} : vector<1xf32> into vector<8xf32>
                  %30 = vector.insert_strided_slice %12, %29 {offsets = [2], strides = [1]} : vector<1xf32> into vector<8xf32>
                  %31 = vector.insert_strided_slice %15, %30 {offsets = [3], strides = [1]} : vector<1xf32> into vector<8xf32>
                  %32 = vector.insert_strided_slice %18, %31 {offsets = [4], strides = [1]} : vector<1xf32> into vector<8xf32>
                  %33 = vector.insert_strided_slice %21, %32 {offsets = [5], strides = [1]} : vector<1xf32> into vector<8xf32>
                  %34 = vector.insert_strided_slice %24, %33 {offsets = [6], strides = [1]} : vector<1xf32> into vector<8xf32>
                  %35 = vector.insert_strided_slice %27, %34 {offsets = [7], strides = [1]} : vector<1xf32> into vector<8xf32>
                  %subview_2 = memref.subview %subview[0, 0, 0, 0, 0, 0] [8, 4, 1, 64, 8, 1] [1, 1, 1, 1, 1, 1] : memref<8x4x1x64x8x1xf32, strided<[98304, 6144, 512, 8, 1, 1], offset: ?>> to memref<8x4x1x64x8xf32, strided<[98304, 6144, 512, 8, 1], offset: ?>>
                  vector.store %35, %subview_2[%arg2, %arg3, %c0, %arg4, %c0] : memref<8x4x1x64x8xf32, strided<[98304, 6144, 512, 8, 1], offset: ?>>, vector<8xf32>
                }
              }
            }
          }
        }
        return
      }
    }
  }
}

model.torch_onnx.mlir:7:12: error: 'func.func' op exceeded stack allocation limit of 32768 bytes for function. Got 65664 bytes
    %262 = torch.operator "onnx.MatMul"(%261, %arg5) : (!torch.vtensor<[8,128,768],f32>, !torch.vtensor<[768,768],f32>) -> !torch.vtensor<[8,128,768],f32> 
           ^
model.torch_onnx.mlir:7:12: note: see current operation: 
"func.func"() <{function_type = () -> (), sym_name = "torch_jit$async_dispatch_3_unpack_transpose_8x128x12x64_f32_pack"}> ({
  %0 = "arith.constant"() <{value = dense<0.000000e+00> : vector<8xf32>}> : () -> vector<8xf32>
  %1 = "arith.constant"() <{value = 7 : index}> : () -> index
  %2 = "arith.constant"() <{value = 6 : index}> : () -> index
  %3 = "arith.constant"() <{value = 5 : index}> : () -> index
  %4 = "arith.constant"() <{value = 3 : index}> : () -> index
  %5 = "arith.constant"() <{value = 2 : index}> : () -> index
  %6 = "arith.constant"() <{value = 0 : index}> : () -> index
  %7 = "arith.constant"() <{value = 9437184 : index}> : () -> index
  %8 = "arith.constant"() <{value = 16 : index}> : () -> index
  %9 = "arith.constant"() <{value = 12 : index}> : () -> index
  %10 = "arith.constant"() <{value = 8 : index}> : () -> index
  %11 = "arith.constant"() <{value = 1 : index}> : () -> index
  %12 = "arith.constant"() <{value = 32 : index}> : () -> index
  %13 = "arith.constant"() <{value = 64 : index}> : () -> index
  %14 = "arith.constant"() <{value = 4 : index}> : () -> index
  %15 = "memref.alloca"() <{alignment = 64 : i64, operandSegmentSizes = array<i32: 0, 0>}> : () -> memref<1x1x8x4xf32>
  %16 = "memref.alloca"() <{alignment = 64 : i64, operandSegmentSizes = array<i32: 0, 0>}> : () -> memref<8x1x32x64xf32>
  %17 = "hal.interface.binding.subspan"(%7) {alignment = 64 : index, binding = 0 : index, descriptor_flags = 3 : i32, layout = #hal.pipeline.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, Indirect>], flags = Indirect>]>, operandSegmentSizes = array<i32: 1, 0>, set = 0 : index} : (index) -> memref<8x12x16x16x8x4xf32, strided<[98304, 8192, 512, 32, 4, 1], offset: 2359296>>
  "memref.assume_alignment"(%17) <{alignment = 64 : i32}> : (memref<8x12x16x16x8x4xf32, strided<[98304, 8192, 512, 32, 4, 1], offset: 2359296>>) -> ()
  %18 = "hal.interface.binding.subspan"(%6) {alignment = 64 : index, binding = 1 : index, descriptor_flags = 2 : i32, layout = #hal.pipeline.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, Indirect>], flags = Indirect>]>, operandSegmentSizes = array<i32: 1, 0>, set = 0 : index} : (index) -> memref<8x16x12x64x8x1xf32>
  "memref.assume_alignment"(%18) <{alignment = 64 : i32}> : (memref<8x16x12x64x8x1xf32>) -> ()
  %19 = "hal.interface.workgroup.id"() {dimension = 0 : index} : () -> index
  %20 = "hal.interface.workgroup.count"() {dimension = 0 : index} : () -> index
  %21 = "hal.interface.workgroup.id"() {dimension = 1 : index} : () -> index
  %22 = "hal.interface.workgroup.count"() {dimension = 1 : index} : () -> index
  %23 = "affine.apply"(%21) <{map = affine_map<()[s0] -> (s0 * 4)>}> : (index) -> index
  %24 = "affine.apply"(%22) <{map = affine_map<()[s0] -> (s0 * 4)>}> : (index) -> index
  "scf.for"(%23, %8, %24) ({
  ^bb0(%arg0: index):
    "scf.for"(%19, %9, %20) ({
    ^bb0(%arg1: index):
      %25 = "memref.subview"(%18, %arg0, %arg1) <{operandSegmentSizes = array<i32: 1, 2, 0, 0>, static_offsets = array<i64: 0, -9223372036854775808, -9223372036854775808, 0, 0, 0>, static_sizes = array<i64: 8, 4, 1, 64, 8, 1>, static_strides = array<i64: 1, 1, 1, 1, 1, 1>}> : (memref<8x16x12x64x8x1xf32>, index, index) -> memref<8x4x1x64x8x1xf32, strided<[98304, 6144, 512, 8, 1, 1], offset: ?>>
      %26 = "memref.subview"(%17, %arg1, %arg0) <{operandSegmentSizes = array<i32: 1, 2, 0, 0>, static_offsets = array<i64: 0, -9223372036854775808, -9223372036854775808, 0, 0, 0>, static_sizes = array<i64: 8, 1, 4, 16, 8, 4>, static_strides = array<i64: 1, 1, 1, 1, 1, 1>}> : (memref<8x12x16x16x8x4xf32, strided<[98304, 8192, 512, 32, 4, 1], offset: 2359296>>, index, index) -> memref<8x1x4x16x8x4xf32, strided<[98304, 8192, 512, 32, 4, 1], offset: ?>>
      "scf.for"(%6, %10, %11) ({
      ^bb0(%arg5: index):
        "scf.for"(%6, %12, %11) ({
        ^bb0(%arg6: index):
          %60 = "affine.apply"(%arg6) <{map = affine_map<(d0) -> (d0 floordiv 8)>}> : (index) -> index
          %61 = "affine.apply"(%arg6) <{map = affine_map<(d0) -> (d0 mod 8)>}> : (index) -> index
          "scf.for"(%6, %13, %11) ({
          ^bb0(%arg7: index):
            %62 = "affine.apply"(%arg7) <{map = affine_map<(d0) -> (d0 floordiv 4)>}> : (index) -> index
            %63 = "affine.apply"(%arg7) <{map = affine_map<(d0) -> (d0 mod 4)>}> : (index) -> index
            %64 = "vector.load"(%26, %arg5, %6, %60, %62, %6, %6) <{nontemporal = false}> : (memref<8x1x4x16x8x4xf32, strided<[98304, 8192, 512, 32, 4, 1], offset: ?>>, index, index, index, index, index, index) -> vector<4xf32>
            %65 = "vector.load"(%26, %arg5, %6, %60, %62, %11, %6) <{nontemporal = false}> : (memref<8x1x4x16x8x4xf32, strided<[98304, 8192, 512, 32, 4, 1], offset: ?>>, index, index, index, index, index, index) -> vector<4xf32>
            %66 = "vector.load"(%26, %arg5, %6, %60, %62, %5, %6) <{nontemporal = false}> : (memref<8x1x4x16x8x4xf32, strided<[98304, 8192, 512, 32, 4, 1], offset: ?>>, index, index, index, index, index, index) -> vector<4xf32>
            %67 = "vector.load"(%26, %arg5, %6, %60, %62, %4, %6) <{nontemporal = false}> : (memref<8x1x4x16x8x4xf32, strided<[98304, 8192, 512, 32, 4, 1], offset: ?>>, index, index, index, index, index, index) -> vector<4xf32>
            %68 = "vector.load"(%26, %arg5, %6, %60, %62, %14, %6) <{nontemporal = false}> : (memref<8x1x4x16x8x4xf32, strided<[98304, 8192, 512, 32, 4, 1], offset: ?>>, index, index, index, index, index, index) -> vector<4xf32>
            %69 = "vector.load"(%26, %arg5, %6, %60, %62, %3, %6) <{nontemporal = false}> : (memref<8x1x4x16x8x4xf32, strided<[98304, 8192, 512, 32, 4, 1], offset: ?>>, index, index, index, index, index, index) -> vector<4xf32>
            %70 = "vector.load"(%26, %arg5, %6, %60, %62, %2, %6) <{nontemporal = false}> : (memref<8x1x4x16x8x4xf32, strided<[98304, 8192, 512, 32, 4, 1], offset: ?>>, index, index, index, index, index, index) -> vector<4xf32>
            %71 = "vector.load"(%26, %arg5, %6, %60, %62, %1, %6) <{nontemporal = false}> : (memref<8x1x4x16x8x4xf32, strided<[98304, 8192, 512, 32, 4, 1], offset: ?>>, index, index, index, index, index, index) -> vector<4xf32>
            %72 = "memref.subview"(%15) <{operandSegmentSizes = array<i32: 1, 0, 0, 0>, static_offsets = array<i64: 0, 0, 0, 0>, static_sizes = array<i64: 1, 1, 8, 4>, static_strides = array<i64: 1, 1, 1, 1>}> : (memref<1x1x8x4xf32>) -> memref<8x4xf32>
            "vector.store"(%64, %72, %6, %6) <{nontemporal = false}> : (vector<4xf32>, memref<8x4xf32>, index, index) -> ()
            "vector.store"(%65, %72, %11, %6) <{nontemporal = false}> : (vector<4xf32>, memref<8x4xf32>, index, index) -> ()
            "vector.store"(%66, %72, %5, %6) <{nontemporal = false}> : (vector<4xf32>, memref<8x4xf32>, index, index) -> ()
            "vector.store"(%67, %72, %4, %6) <{nontemporal = false}> : (vector<4xf32>, memref<8x4xf32>, index, index) -> ()
            "vector.store"(%68, %72, %14, %6) <{nontemporal = false}> : (vector<4xf32>, memref<8x4xf32>, index, index) -> ()
            "vector.store"(%69, %72, %3, %6) <{nontemporal = false}> : (vector<4xf32>, memref<8x4xf32>, index, index) -> ()
            "vector.store"(%70, %72, %2, %6) <{nontemporal = false}> : (vector<4xf32>, memref<8x4xf32>, index, index) -> ()
            "vector.store"(%71, %72, %1, %6) <{nontemporal = false}> : (vector<4xf32>, memref<8x4xf32>, index, index) -> ()
            %73 = "memref.subview"(%15, %61, %63) <{operandSegmentSizes = array<i32: 1, 2, 0, 0>, static_offsets = array<i64: 0, 0, -9223372036854775808, -9223372036854775808>, static_sizes = array<i64: 1, 1, 1, 1>, static_strides = array<i64: 1, 1, 1, 1>}> : (memref<1x1x8x4xf32>, index, index) -> memref<1x1x1x1xf32, strided<[32, 32, 4, 1], offset: ?>>
            %74 = "memref.subview"(%16, %arg5, %arg6, %arg7) <{operandSegmentSizes = array<i32: 1, 3, 0, 0>, static_offsets = array<i64: -9223372036854775808, 0, -9223372036854775808, -9223372036854775808>, static_sizes = array<i64: 1, 1, 1, 1>, static_strides = array<i64: 1, 1, 1, 1>}> : (memref<8x1x32x64xf32>, index, index, index) -> memref<1x1x1x1xf32, strided<[2048, 2048, 64, 1], offset: ?>>
            %75 = "memref.load"(%73, %6, %6, %6, %6) <{nontemporal = false}> : (memref<1x1x1x1xf32, strided<[32, 32, 4, 1], offset: ?>>, index, index, index, index) -> f32
            "memref.store"(%75, %74, %6, %6, %6, %6) <{nontemporal = false}> : (f32, memref<1x1x1x1xf32, strided<[2048, 2048, 64, 1], offset: ?>>, index, index, index, index) -> ()
            "scf.yield"() : () -> ()
          }) : (index, index, index) -> ()
          "scf.yield"() : () -> ()
        }) : (index, index, index) -> ()
        "scf.yield"() : () -> ()
      }) : (index, index, index) -> ()
      "scf.for"(%6, %10, %11) ({
      ^bb0(%arg2: index):
        "scf.for"(%6, %14, %11) ({
        ^bb0(%arg3: index):
          "scf.for"(%6, %13, %11) ({
          ^bb0(%arg4: index):
            %27 = "affine.apply"(%arg3) <{map = affine_map<(d0) -> (d0 * 8)>}> : (index) -> index
            %28 = "memref.load"(%16, %arg2, %6, %27, %arg4) <{nontemporal = false}> : (memref<8x1x32x64xf32>, index, index, index, index) -> f32
            %29 = "vector.broadcast"(%28) : (f32) -> vector<1xf32>
            %30 = "affine.apply"(%arg3) <{map = affine_map<(d0) -> (d0 * 8 + 1)>}> : (index) -> index
            %31 = "memref.load"(%16, %arg2, %6, %30, %arg4) <{nontemporal = false}> : (memref<8x1x32x64xf32>, index, index, index, index) -> f32
            %32 = "vector.broadcast"(%31) : (f32) -> vector<1xf32>
            %33 = "affine.apply"(%arg3) <{map = affine_map<(d0) -> (d0 * 8 + 2)>}> : (index) -> index
            %34 = "memref.load"(%16, %arg2, %6, %33, %arg4) <{nontemporal = false}> : (memref<8x1x32x64xf32>, index, index, index, index) -> f32
            %35 = "vector.broadcast"(%34) : (f32) -> vector<1xf32>
            %36 = "affine.apply"(%arg3) <{map = affine_map<(d0) -> (d0 * 8 + 3)>}> : (index) -> index
            %37 = "memref.load"(%16, %arg2, %6, %36, %arg4) <{nontemporal = false}> : (memref<8x1x32x64xf32>, index, index, index, index) -> f32
            %38 = "vector.broadcast"(%37) : (f32) -> vector<1xf32>
            %39 = "affine.apply"(%arg3) <{map = affine_map<(d0) -> (d0 * 8 + 4)>}> : (index) -> index
            %40 = "memref.load"(%16, %arg2, %6, %39, %arg4) <{nontemporal = false}> : (memref<8x1x32x64xf32>, index, index, index, index) -> f32
            %41 = "vector.broadcast"(%40) : (f32) -> vector<1xf32>
            %42 = "affine.apply"(%arg3) <{map = affine_map<(d0) -> (d0 * 8 + 5)>}> : (index) -> index
            %43 = "memref.load"(%16, %arg2, %6, %42, %arg4) <{nontemporal = false}> : (memref<8x1x32x64xf32>, index, index, index, index) -> f32
            %44 = "vector.broadcast"(%43) : (f32) -> vector<1xf32>
            %45 = "affine.apply"(%arg3) <{map = affine_map<(d0) -> (d0 * 8 + 6)>}> : (index) -> index
            %46 = "memref.load"(%16, %arg2, %6, %45, %arg4) <{nontemporal = false}> : (memref<8x1x32x64xf32>, index, index, index, index) -> f32
            %47 = "vector.broadcast"(%46) : (f32) -> vector<1xf32>
            %48 = "affine.apply"(%arg3) <{map = affine_map<(d0) -> (d0 * 8 + 7)>}> : (index) -> index
            %49 = "memref.load"(%16, %arg2, %6, %48, %arg4) <{nontemporal = false}> : (memref<8x1x32x64xf32>, index, index, index, index) -> f32
            %50 = "vector.broadcast"(%49) : (f32) -> vector<1xf32>
            %51 = "vector.insert_strided_slice"(%29, %0) <{offsets = [0], strides = [1]}> : (vector<1xf32>, vector<8xf32>) -> vector<8xf32>
            %52 = "vector.insert_strided_slice"(%32, %51) <{offsets = [1], strides = [1]}> : (vector<1xf32>, vector<8xf32>) -> vector<8xf32>
            %53 = "vector.insert_strided_slice"(%35, %52) <{offsets = [2], strides = [1]}> : (vector<1xf32>, vector<8xf32>) -> vector<8xf32>
            %54 = "vector.insert_strided_slice"(%38, %53) <{offsets = [3], strides = [1]}> : (vector<1xf32>, vector<8xf32>) -> vector<8xf32>
            %55 = "vector.insert_strided_slice"(%41, %54) <{offsets = [4], strides = [1]}> : (vector<1xf32>, vector<8xf32>) -> vector<8xf32>
            %56 = "vector.insert_strided_slice"(%44, %55) <{offsets = [5], strides = [1]}> : (vector<1xf32>, vector<8xf32>) -> vector<8xf32>
            %57 = "vector.insert_strided_slice"(%47, %56) <{offsets = [6], strides = [1]}> : (vector<1xf32>, vector<8xf32>) -> vector<8xf32>
            %58 = "vector.insert_strided_slice"(%50, %57) <{offsets = [7], strides = [1]}> : (vector<1xf32>, vector<8xf32>) -> vector<8xf32>
            %59 = "memref.subview"(%25) <{operandSegmentSizes = array<i32: 1, 0, 0, 0>, static_offsets = array<i64: 0, 0, 0, 0, 0, 0>, static_sizes = array<i64: 8, 4, 1, 64, 8, 1>, static_strides = array<i64: 1, 1, 1, 1, 1, 1>}> : (memref<8x4x1x64x8x1xf32, strided<[98304, 6144, 512, 8, 1, 1], offset: ?>>) -> memref<8x4x1x64x8xf32, strided<[98304, 6144, 512, 8, 1], offset: ?>>
            "vector.store"(%58, %59, %arg2, %arg3, %6, %arg4, %6) <{nontemporal = false}> : (vector<8xf32>, memref<8x4x1x64x8xf32, strided<[98304, 6144, 512, 8, 1], offset: ?>>, index, index, index, index, index) -> ()
            "scf.yield"() : () -> ()
          }) : (index, index, index) -> ()
          "scf.yield"() : () -> ()
        }) : (index, index, index) -> ()
        "scf.yield"() : () -> ()
      }) : (index, index, index) -> ()
      "scf.yield"() : () -> ()
    }) : (index, index, index) -> ()
    "scf.yield"() : () -> ()
  }) : (index, index, index) -> ()
  "func.return"() : () -> ()
}) {translation_info = #iree_codegen.translation_info<CPUDataTiling>} : () -> ()
model.torch_onnx.mlir:7:12: error: failed to run translation of source executable to target executable for backend #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
    %262 = torch.operator "onnx.MatMul"(%261, %arg5) : (!torch.vtensor<[8,128,768],f32>, !torch.vtensor<[768,768],f32>) -> !torch.vtensor<[8,128,768],f32> 
           ^
model.torch_onnx.mlir:7:12: note: see current operation: 
"hal.executable.variant"() ({
  "hal.executable.export"() ({
  ^bb0(%arg8: !hal.device):
    %76 = "arith.constant"() <{value = 12 : index}> : () -> index
    %77 = "arith.constant"() <{value = 4 : index}> : () -> index
    %78 = "arith.constant"() <{value = 1 : index}> : () -> index
    "hal.return"(%76, %77, %78) : (index, index, index) -> ()
  }) {hal.interface.bindings = [#hal.interface.binding<0, 0>, #hal.interface.binding<0, 1>], layout = #hal.pipeline.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, Indirect>], flags = Indirect>]>, ordinal = 0 : index, sym_name = "torch_jit$async_dispatch_3_unpack_transpose_8x128x12x64_f32_pack"} : () -> ()
  "builtin.module"() ({
    "func.func"() <{function_type = () -> (), sym_name = "torch_jit$async_dispatch_3_unpack_transpose_8x128x12x64_f32_pack"}> ({
      %0 = "arith.constant"() <{value = dense<0.000000e+00> : vector<8xf32>}> : () -> vector<8xf32>
      %1 = "arith.constant"() <{value = 7 : index}> : () -> index
      %2 = "arith.constant"() <{value = 6 : index}> : () -> index
      %3 = "arith.constant"() <{value = 5 : index}> : () -> index
      %4 = "arith.constant"() <{value = 3 : index}> : () -> index
      %5 = "arith.constant"() <{value = 2 : index}> : () -> index
      %6 = "arith.constant"() <{value = 0 : index}> : () -> index
      %7 = "arith.constant"() <{value = 9437184 : index}> : () -> index
      %8 = "arith.constant"() <{value = 16 : index}> : () -> index
      %9 = "arith.constant"() <{value = 12 : index}> : () -> index
      %10 = "arith.constant"() <{value = 8 : index}> : () -> index
      %11 = "arith.constant"() <{value = 1 : index}> : () -> index
      %12 = "arith.constant"() <{value = 32 : index}> : () -> index
      %13 = "arith.constant"() <{value = 64 : index}> : () -> index
      %14 = "arith.constant"() <{value = 4 : index}> : () -> index
      %15 = "memref.alloca"() <{alignment = 64 : i64, operandSegmentSizes = array<i32: 0, 0>}> : () -> memref<1x1x8x4xf32>
      %16 = "memref.alloca"() <{alignment = 64 : i64, operandSegmentSizes = array<i32: 0, 0>}> : () -> memref<8x1x32x64xf32>
      %17 = "hal.interface.binding.subspan"(%7) {alignment = 64 : index, binding = 0 : index, descriptor_flags = 3 : i32, layout = #hal.pipeline.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, Indirect>], flags = Indirect>]>, operandSegmentSizes = array<i32: 1, 0>, set = 0 : index} : (index) -> memref<8x12x16x16x8x4xf32, strided<[98304, 8192, 512, 32, 4, 1], offset: 2359296>>
      "memref.assume_alignment"(%17) <{alignment = 64 : i32}> : (memref<8x12x16x16x8x4xf32, strided<[98304, 8192, 512, 32, 4, 1], offset: 2359296>>) -> ()
      %18 = "hal.interface.binding.subspan"(%6) {alignment = 64 : index, binding = 1 : index, descriptor_flags = 2 : i32, layout = #hal.pipeline.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, Indirect>], flags = Indirect>]>, operandSegmentSizes = array<i32: 1, 0>, set = 0 : index} : (index) -> memref<8x16x12x64x8x1xf32>
      "memref.assume_alignment"(%18) <{alignment = 64 : i32}> : (memref<8x16x12x64x8x1xf32>) -> ()
      %19 = "hal.interface.workgroup.id"() {dimension = 0 : index} : () -> index
      %20 = "hal.interface.workgroup.count"() {dimension = 0 : index} : () -> index
      %21 = "hal.interface.workgroup.id"() {dimension = 1 : index} : () -> index
      %22 = "hal.interface.workgroup.count"() {dimension = 1 : index} : () -> index
      %23 = "affine.apply"(%21) <{map = affine_map<()[s0] -> (s0 * 4)>}> : (index) -> index
      %24 = "affine.apply"(%22) <{map = affine_map<()[s0] -> (s0 * 4)>}> : (index) -> index
      "scf.for"(%23, %8, %24) ({
      ^bb0(%arg0: index):
        "scf.for"(%19, %9, %20) ({
        ^bb0(%arg1: index):
          %25 = "memref.subview"(%18, %arg0, %arg1) <{operandSegmentSizes = array<i32: 1, 2, 0, 0>, static_offsets = array<i64: 0, -9223372036854775808, -9223372036854775808, 0, 0, 0>, static_sizes = array<i64: 8, 4, 1, 64, 8, 1>, static_strides = array<i64: 1, 1, 1, 1, 1, 1>}> : (memref<8x16x12x64x8x1xf32>, index, index) -> memref<8x4x1x64x8x1xf32, strided<[98304, 6144, 512, 8, 1, 1], offset: ?>>
          %26 = "memref.subview"(%17, %arg1, %arg0) <{operandSegmentSizes = array<i32: 1, 2, 0, 0>, static_offsets = array<i64: 0, -9223372036854775808, -9223372036854775808, 0, 0, 0>, static_sizes = array<i64: 8, 1, 4, 16, 8, 4>, static_strides = array<i64: 1, 1, 1, 1, 1, 1>}> : (memref<8x12x16x16x8x4xf32, strided<[98304, 8192, 512, 32, 4, 1], offset: 2359296>>, index, index) -> memref<8x1x4x16x8x4xf32, strided<[98304, 8192, 512, 32, 4, 1], offset: ?>>
          "scf.for"(%6, %10, %11) ({
          ^bb0(%arg5: index):
            "scf.for"(%6, %12, %11) ({
            ^bb0(%arg6: index):
              %60 = "affine.apply"(%arg6) <{map = affine_map<(d0) -> (d0 floordiv 8)>}> : (index) -> index
              %61 = "affine.apply"(%arg6) <{map = affine_map<(d0) -> (d0 mod 8)>}> : (index) -> index
              "scf.for"(%6, %13, %11) ({
              ^bb0(%arg7: index):
                %62 = "affine.apply"(%arg7) <{map = affine_map<(d0) -> (d0 floordiv 4)>}> : (index) -> index
                %63 = "affine.apply"(%arg7) <{map = affine_map<(d0) -> (d0 mod 4)>}> : (index) -> index
                %64 = "vector.load"(%26, %arg5, %6, %60, %62, %6, %6) <{nontemporal = false}> : (memref<8x1x4x16x8x4xf32, strided<[98304, 8192, 512, 32, 4, 1], offset: ?>>, index, index, index, index, index, index) -> vector<4xf32>
                %65 = "vector.load"(%26, %arg5, %6, %60, %62, %11, %6) <{nontemporal = false}> : (memref<8x1x4x16x8x4xf32, strided<[98304, 8192, 512, 32, 4, 1], offset: ?>>, index, index, index, index, index, index) -> vector<4xf32>
                %66 = "vector.load"(%26, %arg5, %6, %60, %62, %5, %6) <{nontemporal = false}> : (memref<8x1x4x16x8x4xf32, strided<[98304, 8192, 512, 32, 4, 1], offset: ?>>, index, index, index, index, index, index) -> vector<4xf32>
                %67 = "vector.load"(%26, %arg5, %6, %60, %62, %4, %6) <{nontemporal = false}> : (memref<8x1x4x16x8x4xf32, strided<[98304, 8192, 512, 32, 4, 1], offset: ?>>, index, index, index, index, index, index) -> vector<4xf32>
                %68 = "vector.load"(%26, %arg5, %6, %60, %62, %14, %6) <{nontemporal = false}> : (memref<8x1x4x16x8x4xf32, strided<[98304, 8192, 512, 32, 4, 1], offset: ?>>, index, index, index, index, index, index) -> vector<4xf32>
                %69 = "vector.load"(%26, %arg5, %6, %60, %62, %3, %6) <{nontemporal = false}> : (memref<8x1x4x16x8x4xf32, strided<[98304, 8192, 512, 32, 4, 1], offset: ?>>, index, index, index, index, index, index) -> vector<4xf32>
                %70 = "vector.load"(%26, %arg5, %6, %60, %62, %2, %6) <{nontemporal = false}> : (memref<8x1x4x16x8x4xf32, strided<[98304, 8192, 512, 32, 4, 1], offset: ?>>, index, index, index, index, index, index) -> vector<4xf32>
                %71 = "vector.load"(%26, %arg5, %6, %60, %62, %1, %6) <{nontemporal = false}> : (memref<8x1x4x16x8x4xf32, strided<[98304, 8192, 512, 32, 4, 1], offset: ?>>, index, index, index, index, index, index) -> vector<4xf32>
                %72 = "memref.subview"(%15) <{operandSegmentSizes = array<i32: 1, 0, 0, 0>, static_offsets = array<i64: 0, 0, 0, 0>, static_sizes = array<i64: 1, 1, 8, 4>, static_strides = array<i64: 1, 1, 1, 1>}> : (memref<1x1x8x4xf32>) -> memref<8x4xf32>
                "vector.store"(%64, %72, %6, %6) <{nontemporal = false}> : (vector<4xf32>, memref<8x4xf32>, index, index) -> ()
                "vector.store"(%65, %72, %11, %6) <{nontemporal = false}> : (vector<4xf32>, memref<8x4xf32>, index, index) -> ()
                "vector.store"(%66, %72, %5, %6) <{nontemporal = false}> : (vector<4xf32>, memref<8x4xf32>, index, index) -> ()
                "vector.store"(%67, %72, %4, %6) <{nontemporal = false}> : (vector<4xf32>, memref<8x4xf32>, index, index) -> ()
                "vector.store"(%68, %72, %14, %6) <{nontemporal = false}> : (vector<4xf32>, memref<8x4xf32>, index, index) -> ()
                "vector.store"(%69, %72, %3, %6) <{nontemporal = false}> : (vector<4xf32>, memref<8x4xf32>, index, index) -> ()
                "vector.store"(%70, %72, %2, %6) <{nontemporal = false}> : (vector<4xf32>, memref<8x4xf32>, index, index) -> ()
                "vector.store"(%71, %72, %1, %6) <{nontemporal = false}> : (vector<4xf32>, memref<8x4xf32>, index, index) -> ()
                %73 = "memref.subview"(%15, %61, %63) <{operandSegmentSizes = array<i32: 1, 2, 0, 0>, static_offsets = array<i64: 0, 0, -9223372036854775808, -9223372036854775808>, static_sizes = array<i64: 1, 1, 1, 1>, static_strides = array<i64: 1, 1, 1, 1>}> : (memref<1x1x8x4xf32>, index, index) -> memref<1x1x1x1xf32, strided<[32, 32, 4, 1], offset: ?>>
                %74 = "memref.subview"(%16, %arg5, %arg6, %arg7) <{operandSegmentSizes = array<i32: 1, 3, 0, 0>, static_offsets = array<i64: -9223372036854775808, 0, -9223372036854775808, -9223372036854775808>, static_sizes = array<i64: 1, 1, 1, 1>, static_strides = array<i64: 1, 1, 1, 1>}> : (memref<8x1x32x64xf32>, index, index, index) -> memref<1x1x1x1xf32, strided<[2048, 2048, 64, 1], offset: ?>>
                %75 = "memref.load"(%73, %6, %6, %6, %6) <{nontemporal = false}> : (memref<1x1x1x1xf32, strided<[32, 32, 4, 1], offset: ?>>, index, index, index, index) -> f32
                "memref.store"(%75, %74, %6, %6, %6, %6) <{nontemporal = false}> : (f32, memref<1x1x1x1xf32, strided<[2048, 2048, 64, 1], offset: ?>>, index, index, index, index) -> ()
                "scf.yield"() : () -> ()
              }) : (index, index, index) -> ()
              "scf.yield"() : () -> ()
            }) : (index, index, index) -> ()
            "scf.yield"() : () -> ()
          }) : (index, index, index) -> ()
          "scf.for"(%6, %10, %11) ({
          ^bb0(%arg2: index):
            "scf.for"(%6, %14, %11) ({
            ^bb0(%arg3: index):
              "scf.for"(%6, %13, %11) ({
              ^bb0(%arg4: index):
                %27 = "affine.apply"(%arg3) <{map = affine_map<(d0) -> (d0 * 8)>}> : (index) -> index
                %28 = "memref.load"(%16, %arg2, %6, %27, %arg4) <{nontemporal = false}> : (memref<8x1x32x64xf32>, index, index, index, index) -> f32
                %29 = "vector.broadcast"(%28) : (f32) -> vector<1xf32>
                %30 = "affine.apply"(%arg3) <{map = affine_map<(d0) -> (d0 * 8 + 1)>}> : (index) -> index
                %31 = "memref.load"(%16, %arg2, %6, %30, %arg4) <{nontemporal = false}> : (memref<8x1x32x64xf32>, index, index, index, index) -> f32
                %32 = "vector.broadcast"(%31) : (f32) -> vector<1xf32>
                %33 = "affine.apply"(%arg3) <{map = affine_map<(d0) -> (d0 * 8 + 2)>}> : (index) -> index
                %34 = "memref.load"(%16, %arg2, %6, %33, %arg4) <{nontemporal = false}> : (memref<8x1x32x64xf32>, index, index, index, index) -> f32
                %35 = "vector.broadcast"(%34) : (f32) -> vector<1xf32>
                %36 = "affine.apply"(%arg3) <{map = affine_map<(d0) -> (d0 * 8 + 3)>}> : (index) -> index
                %37 = "memref.load"(%16, %arg2, %6, %36, %arg4) <{nontemporal = false}> : (memref<8x1x32x64xf32>, index, index, index, index) -> f32
                %38 = "vector.broadcast"(%37) : (f32) -> vector<1xf32>
                %39 = "affine.apply"(%arg3) <{map = affine_map<(d0) -> (d0 * 8 + 4)>}> : (index) -> index
                %40 = "memref.load"(%16, %arg2, %6, %39, %arg4) <{nontemporal = false}> : (memref<8x1x32x64xf32>, index, index, index, index) -> f32
                %41 = "vector.broadcast"(%40) : (f32) -> vector<1xf32>
                %42 = "affine.apply"(%arg3) <{map = affine_map<(d0) -> (d0 * 8 + 5)>}> : (index) -> index
                %43 = "memref.load"(%16, %arg2, %6, %42, %arg4) <{nontemporal = false}> : (memref<8x1x32x64xf32>, index, index, index, index) -> f32
                %44 = "vector.broadcast"(%43) : (f32) -> vector<1xf32>
                %45 = "affine.apply"(%arg3) <{map = affine_map<(d0) -> (d0 * 8 + 6)>}> : (index) -> index
                %46 = "memref.load"(%16, %arg2, %6, %45, %arg4) <{nontemporal = false}> : (memref<8x1x32x64xf32>, index, index, index, index) -> f32
                %47 = "vector.broadcast"(%46) : (f32) -> vector<1xf32>
                %48 = "affine.apply"(%arg3) <{map = affine_map<(d0) -> (d0 * 8 + 7)>}> : (index) -> index
                %49 = "memref.load"(%16, %arg2, %6, %48, %arg4) <{nontemporal = false}> : (memref<8x1x32x64xf32>, index, index, index, index) -> f32
                %50 = "vector.broadcast"(%49) : (f32) -> vector<1xf32>
                %51 = "vector.insert_strided_slice"(%29, %0) <{offsets = [0], strides = [1]}> : (vector<1xf32>, vector<8xf32>) -> vector<8xf32>
                %52 = "vector.insert_strided_slice"(%32, %51) <{offsets = [1], strides = [1]}> : (vector<1xf32>, vector<8xf32>) -> vector<8xf32>
                %53 = "vector.insert_strided_slice"(%35, %52) <{offsets = [2], strides = [1]}> : (vector<1xf32>, vector<8xf32>) -> vector<8xf32>
                %54 = "vector.insert_strided_slice"(%38, %53) <{offsets = [3], strides = [1]}> : (vector<1xf32>, vector<8xf32>) -> vector<8xf32>
                %55 = "vector.insert_strided_slice"(%41, %54) <{offsets = [4], strides = [1]}> : (vector<1xf32>, vector<8xf32>) -> vector<8xf32>
                %56 = "vector.insert_strided_slice"(%44, %55) <{offsets = [5], strides = [1]}> : (vector<1xf32>, vector<8xf32>) -> vector<8xf32>
                %57 = "vector.insert_strided_slice"(%47, %56) <{offsets = [6], strides = [1]}> : (vector<1xf32>, vector<8xf32>) -> vector<8xf32>
                %58 = "vector.insert_strided_slice"(%50, %57) <{offsets = [7], strides = [1]}> : (vector<1xf32>, vector<8xf32>) -> vector<8xf32>
                %59 = "memref.subview"(%25) <{operandSegmentSizes = array<i32: 1, 0, 0, 0>, static_offsets = array<i64: 0, 0, 0, 0, 0, 0>, static_sizes = array<i64: 8, 4, 1, 64, 8, 1>, static_strides = array<i64: 1, 1, 1, 1, 1, 1>}> : (memref<8x4x1x64x8x1xf32, strided<[98304, 6144, 512, 8, 1, 1], offset: ?>>) -> memref<8x4x1x64x8xf32, strided<[98304, 6144, 512, 8, 1], offset: ?>>
                "vector.store"(%58, %59, %arg2, %arg3, %6, %arg4, %6) <{nontemporal = false}> : (vector<8xf32>, memref<8x4x1x64x8xf32, strided<[98304, 6144, 512, 8, 1], offset: ?>>, index, index, index, index, index) -> ()
                "scf.yield"() : () -> ()
              }) : (index, index, index) -> ()
              "scf.yield"() : () -> ()
            }) : (index, index, index) -> ()
            "scf.yield"() : () -> ()
          }) : (index, index, index) -> ()
          "scf.yield"() : () -> ()
        }) : (index, index, index) -> ()
        "scf.yield"() : () -> ()
      }) : (index, index, index) -> ()
      "func.return"() : () -> ()
    }) {translation_info = #iree_codegen.translation_info<CPUDataTiling>} : () -> ()
  }) : () -> ()
  "hal.executable.variant_end"() : () -> ()
}) {sym_name = "embedded_elf_x87_64", target = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}>} : () -> ()

Steps to reproduce your issue

Command to reproduce:

iree-compile model.torch_onnx.mlir --iree-hal-target-backends=llvm-cpu --iree-input-demote-i64-to-i32

IREE version: IREE compiler version 20240819.990 @ aeda14995f16ed1302db616adf0c03acf80f27ee LLVM version 20.0.0git

What component(s) does this issue relate to?

Compiler

Version information

No response

Additional context

No response

iree-org / iree