iree-org / iree

A retargetable MLIR-based machine learning compiler and runtime toolkit.
http://iree.dev/
Apache License 2.0

(HIP) Lowering issues with SDPA when compiling for gfx1100. #17032

Open monorimet opened 4 months ago

monorimet commented 4 months ago

What happened?

I am hitting lowering failures for the linalg_ext.attention op on the gfx1100 target (Radeon 7900 XTX), notably in the SDXL UNet.

The failing attention dispatch is at index 78 (save as dispatch_78.mlir for repro):

hal.executable public @main$async_dispatch_78 {
  hal.executable.variant public @rocm_hsaco_fb target(<"rocm", "rocm-hsaco-fb", {mma_intrinsics = [#iree_gpu.mma_layout<WMMA_F16_16x16x16_F32>], target_arch = "gfx1100", ukernels = "none"}>) {
    hal.executable.export public @main$async_dispatch_78_attention_20x4096x64xf16 ordinal(0) layout(#hal.pipeline.layout<push_constants = 4, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer>]>]>) attributes {hal.interface.bindings = [#hal.interface.binding<0, 0>, #hal.interface.binding<0, 1>]} {
    ^bb0(%arg0: !hal.device):
      %x, %y, %z = flow.dispatch.workgroup_count_from_slice 
      hal.return %x, %y, %z : index, index, index
    }
    builtin.module {
      func.func @main$async_dispatch_78_attention_20x4096x64xf16() {
        %cst = arith.constant 1.250000e-01 : f16
        %0 = hal.interface.constant.load[0] : i32
        %1 = hal.interface.constant.load[1] : i32
        %2 = hal.interface.constant.load[2] : i32
        %3 = hal.interface.constant.load[3] : i32
        %4 = arith.index_castui %0 : i32 to index
        %5 = arith.index_castui %1 : i32 to index
        %6 = arith.index_castui %2 : i32 to index
        %7 = arith.index_castui %3 : i32 to index
        %8 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%4) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>>
        %9 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%5) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>>
        %10 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%6) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>>
        %11 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%7) : !flow.dispatch.tensor<writeonly:tensor<20x4096x64xf16>>
        %12 = flow.dispatch.tensor.load %8, offsets = [0, 0, 0], sizes = [20, 4096, 64], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> -> tensor<20x4096x64xf16>
        %13 = flow.dispatch.tensor.load %9, offsets = [0, 0, 0], sizes = [20, 4096, 64], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> -> tensor<20x4096x64xf16>
        %14 = flow.dispatch.tensor.load %10, offsets = [0, 0, 0], sizes = [20, 4096, 64], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> -> tensor<20x4096x64xf16>
        %15 = tensor.empty() : tensor<20x4096x64xf16>
        %16 = iree_linalg_ext.attention ins(%12, %13, %14, %cst : tensor<20x4096x64xf16>, tensor<20x4096x64xf16>, tensor<20x4096x64xf16>, f16) outs(%15 : tensor<20x4096x64xf16>) -> tensor<20x4096x64xf16>
        flow.dispatch.tensor.store %16, %11, offsets = [0, 0, 0], sizes = [20, 4096, 64], strides = [1, 1, 1] : tensor<20x4096x64xf16> -> !flow.dispatch.tensor<writeonly:tensor<20x4096x64xf16>>
        return
      }
    }
  }
}

The error:

iree.compiler.tools.binaries.CompilerToolError: Error invoking IREE compiler tool iree-compile.exe
Error code: 1
Diagnostics:
failed to translate executables
./unetdps\configured_compiled_unet_main$async_dispatch_78.mlir:9:6: error: Failures have been detected while processing an MLIR pass pipeline
      func.func @main$async_dispatch_78_attention_20x4096x64xf16() {
     ^
./unetdps\configured_compiled_unet_main$async_dispatch_78.mlir:9:6: note: Pipeline failed while executing [`TranslateExecutablesPass` on 'hal.executable' operation: @main$async_dispatch_50, `TranslateTargetExecutableVariantsPass` on 'hal.executable.variant' operation: @rocm_hsaco_fb, `TranslateExecutablesPass` on 'hal.executable' operation: @main$async_dispatch_56, `TranslateTargetExecutableVariantsPass` on 'hal.executable.variant' operation: @rocm_hsaco_fb, `TranslateExecutablesPass` on 'hal.executable' operation: @main$async_dispatch_63, `TranslateTargetExecutableVariantsPass` on 'hal.executable.variant' operation: @rocm_hsaco_fb, `TranslateExecutablesPass` on 'hal.executable' operation: @main$async_dispatch_64, `TranslateTargetExecutableVariantsPass` on 'hal.executable.variant' operation: @rocm_hsaco_fb, `TranslateExecutablesPass` on 'hal.executable' operation: @main$async_dispatch_68, `TranslateTargetExecutableVariantsPass` on 'hal.executable.variant' operation: @rocm_hsaco_fb, `TranslateExecutablesPass` on 'hal.executable' operation: @main$async_dispatch_69, `TranslateTargetExecutableVariantsPass` on 'hal.executable.variant' operation: @rocm_hsaco_fb, `TranslateExecutablesPass` on 'hal.executable' operation: @main$async_dispatch_70, `TranslateTargetExecutableVariantsPass` on 'hal.executable.variant' operation: @rocm_hsaco_fb, `TranslateExecutablesPass` on 'hal.executable' operation: @main$async_dispatch_71, `TranslateTargetExecutableVariantsPass` on 'hal.executable.variant' operation: @rocm_hsaco_fb, `TranslateExecutablesPass` on 'hal.executable' operation: @main$async_dispatch_72, `TranslateTargetExecutableVariantsPass` on 'hal.executable.variant' operation: @rocm_hsaco_fb, `LLVMGPULowerExecutableTarget` on 'hal.executable.variant' operation: @rocm_hsaco_fb, `TranslateExecutablesPass` on 'hal.executable' operation: @main$async_dispatch_75, `TranslateTargetExecutableVariantsPass` on 'hal.executable.variant' operation: @rocm_hsaco_fb, `TranslateExecutablesPass` 
on 'hal.executable' operation: @main$async_dispatch_78, `TranslateTargetExecutableVariantsPass` on 'hal.executable.variant' operation: @rocm_hsaco_fb, `ConvertToROCDL` on 'builtin.module' operation, `LLVMGPUVectorLowering` on 'func.func' operation: @main$async_dispatch_64_conv_2d_nchw_fchw_2x640x64x64x320x1x1_f16xf16xf32, `TranslateExecutablesPass` on 'hal.executable' operation: @main$async_dispatch_79, `TranslateTargetExecutableVariantsPass` on 'hal.executable.variant' operation: @rocm_hsaco_fb, `EmulateNarrowType` on 'builtin.module' operation, `GPUDistributeSharedMemoryCopy` on 'func.func' operation: @main$async_dispatch_72_matmul_transpose_b_8192x640x640_f16xf16xf32, `LLVMGPUVectorLowering` on 'func.func' operation: @main$async_dispatch_70_generic_8192x640_f32, `ConvertBf16ArithToF32` on 'builtin.module' operation, `LinalgExtToLoops` on 'func.func' operation: @main$async_dispatch_78_attention_20x4096x64xf16, `Canonicalizer` on 'builtin.module' operation, `PolynomialApproximationPass` on 'func.func' operation: @main$async_dispatch_71_generic_8192x640_f16xf32xf32xf16xf16xf16, `FoldTensorExtractOp` on 'builtin.module' operation]: reproducer generated at `./shark_tmp/core-reproducer.mlir`
./unetdps\configured_compiled_unet_main$async_dispatch_78.mlir:2:2: error: failed to run translation of source executable to target executable for backend #hal.executable.target<"rocm", "rocm-hsaco-fb", {mma_intrinsics = [#iree_gpu.mma_layout<WMMA_F16_16x16x16_F32>], target_arch = "gfx1100", ukernels = "none"}>
  hal.executable.variant public @rocm_hsaco_fb target(<"rocm", "rocm-hsaco-fb", {mma_intrinsics = [#iree_gpu.mma_layout<WMMA_F16_16x16x16_F32>], target_arch = "gfx1100", ukernels = "none"}>) {

Steps to reproduce your issue

iree-compile.exe dispatch_78.mlir --iree-input-type=torch --iree-vm-bytecode-module-output-format=flatbuffer-binary --mlir-print-debuginfo --iree-hal-target-backends=rocm --iree-rocm-target-chip=gfx1100 --iree-opt-const-eval=false --iree-opt-data-tiling=false

What component(s) does this issue relate to?

No response

Version information

IREE (https://iree.dev): openxla/iree@5d8907e82f LLVM version 19.0.0git Optimized build with assertions

Additional context

No response

monorimet commented 4 months ago

Here is an IR dump obtained with --mlir-print-ir-after-all: unet_dump.txt

Groverkss commented 4 months ago

Are you using the transform dialect spec for attention? Currently, attention can only lower using the transform dialect spec. The C++ pipeline is being worked out.

monorimet commented 4 months ago

> Are you using the transform dialect spec for attention? Currently, attention can only lower using the transform dialect spec. The C++ pipeline is being worked out.

Without the transform spec, the attention op should be tiled and decomposed in IREE. This path is verified to work on gfx90a and, IIUC, should be the default path for ROCm targets where attention is not yet supported.
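For reference, the tile-and-decompose path mentioned above materializes attention as a loop over key/value tiles with an online softmax. A minimal NumPy sketch of that math (illustrative only, not the IREE implementation; tile size and shapes are arbitrary):

```python
import numpy as np

def reference_attention(q, k, v, scale):
    # Naive SDPA: softmax(scale * Q @ K^T) @ V, for one head.
    s = scale * (q @ k.T)
    p = np.exp(s - s.max(axis=-1, keepdims=True))
    p /= p.sum(axis=-1, keepdims=True)
    return p @ v

def tiled_attention(q, k, v, scale, tile=64):
    # Online-softmax accumulation over K/V tiles: the same recurrence a
    # tile-and-decompose lowering unrolls into loops on the GPU.
    n, d = q.shape
    acc = np.zeros((n, d))
    m = np.full((n, 1), -np.inf)   # running row-wise max
    l = np.zeros((n, 1))           # running row-wise sum of exp
    for j in range(0, k.shape[0], tile):
        s = scale * (q @ k[j:j + tile].T)
        m_new = np.maximum(m, s.max(axis=-1, keepdims=True))
        p = np.exp(s - m_new)
        corr = np.exp(m - m_new)   # rescale previous partial results
        l = corr * l + p.sum(axis=-1, keepdims=True)
        acc = corr * acc + p @ v[j:j + tile]
        m = m_new
    return acc / l
```

Both functions agree to numerical precision; the tiled form never materializes the full 4096x4096 score matrix, which is why the decomposed path is viable even without a WMMA attention kernel.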

monorimet commented 4 months ago

Here's updated IR for the config A UNet, as well as the attention dispatch that fails to lower. It seems we have two options:

  1. Implement attention with WMMA, either via a transform dialect (TD) script or the C++ pipeline.
  2. Fix tiling and decomposition of attention ops for this target.
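As a rough sanity check on option 1 (illustrative arithmetic only, not IREE code): the WMMA_F16_16x16x16_F32 intrinsic advertised in the target attrs computes 16x16x16 (MxNxK) tiles, and the 20x4096x64 attention shape here divides evenly by 16 along its non-batch dims, so no padding would be needed for this dispatch.

```python
# Assumed intrinsic tile shape from the target attr: 16x16x16 (MxNxK).
M = N = K = 16
batch, seq_len, head_dim = 20, 4096, 64  # the failing dispatch's shape
for dim in (seq_len, head_dim):
    # Both matmuls in attention (Q@K^T and P@V) tile evenly.
    assert dim % M == 0 and dim % K == 0
print("20x4096x64 divides the 16x16x16 WMMA tile evenly")
```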

The attention dispatch:

module attributes {hal.device.targets = [#hal.device.target<"rocm", {legacy_sync}, [#hal.executable.target<"rocm", "rocm-hsaco-fb", {mma_intrinsics = [#iree_gpu.mma_layout<WMMA_F16_16x16x16_F32>], target_arch = "gfx1100", ukernels = "none"}>]>]} {
  hal.executable private @run_forward$async_dispatch_73 {
    hal.executable.variant public @rocm_hsaco_fb target(<"rocm", "rocm-hsaco-fb", {mma_intrinsics = [#iree_gpu.mma_layout<WMMA_F16_16x16x16_F32>], target_arch = "gfx1100", ukernels = "none"}>) {
      hal.executable.export public @run_forward$async_dispatch_73_attention_20x4096x64xf16 ordinal(0) layout(#hal.pipeline.layout<push_constants = 4, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer>]>]>) attributes {hal.interface.bindings = [#hal.interface.binding<0, 0>, #hal.interface.binding<0, 1>]} {
      ^bb0(%arg0: !hal.device):
        %x, %y, %z = flow.dispatch.workgroup_count_from_slice 
        hal.return %x, %y, %z : index, index, index
      }
      builtin.module {
        func.func @run_forward$async_dispatch_73_attention_20x4096x64xf16() attributes {translation_info = #iree_codegen.translation_info<LLVMGPUDistribute workgroup_size = [64, 1, 1] subgroup_size = 32>} {
          %cst = arith.constant 1.250000e-01 : f16
          %0 = hal.interface.constant.load[0] : i32
          %1 = hal.interface.constant.load[1] : i32
          %2 = hal.interface.constant.load[2] : i32
          %3 = hal.interface.constant.load[3] : i32
          %4 = arith.index_castui %0 : i32 to index
          %5 = arith.index_castui %1 : i32 to index
          %6 = arith.index_castui %2 : i32 to index
          %7 = arith.index_castui %3 : i32 to index
          %8 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%4) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>>
          %9 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%5) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>>
          %10 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%6) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>>
          %11 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%7) : !flow.dispatch.tensor<writeonly:tensor<20x4096x64xf16>>
          %12 = flow.dispatch.tensor.load %8, offsets = [0, 0, 0], sizes = [20, 4096, 64], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> -> tensor<20x4096x64xf16>
          %13 = flow.dispatch.tensor.load %9, offsets = [0, 0, 0], sizes = [20, 4096, 64], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> -> tensor<20x4096x64xf16>
          %14 = flow.dispatch.tensor.load %10, offsets = [0, 0, 0], sizes = [20, 4096, 64], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> -> tensor<20x4096x64xf16>
          %15 = tensor.empty() : tensor<20x4096x64xf16>
          %16 = iree_linalg_ext.attention {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 64]]>} ins(%12, %13, %14, %cst : tensor<20x4096x64xf16>, tensor<20x4096x64xf16>, tensor<20x4096x64xf16>, f16) outs(%15 : tensor<20x4096x64xf16>) -> tensor<20x4096x64xf16>
          flow.dispatch.tensor.store %16, %11, offsets = [0, 0, 0], sizes = [20, 4096, 64], strides = [1, 1, 1] : tensor<20x4096x64xf16> -> !flow.dispatch.tensor<writeonly:tensor<20x4096x64xf16>>
          return
        }
      }
    }
  }
  util.global private mutable @run_forward$async_dispatch_73_rocm_hsaco_fb_run_forward$async_dispatch_73_attention_20x4096x64xf16_buffer : !hal.buffer
  util.initializer {
    %c846622208 = arith.constant 846622208 : index
    %c-1_i64 = arith.constant -1 : i64
    %c0 = arith.constant 0 : index
    %device_0 = hal.devices.get %c0 : !hal.device
    %allocator = hal.device.allocator<%device_0 : !hal.device> : !hal.allocator
    %buffer = hal.allocator.allocate<%allocator : !hal.allocator> affinity(%c-1_i64) type("DeviceVisible|DeviceLocal") usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") : !hal.buffer{%c846622208}
    util.global.store %buffer, @run_forward$async_dispatch_73_rocm_hsaco_fb_run_forward$async_dispatch_73_attention_20x4096x64xf16_buffer : !hal.buffer
    util.return
  }
  util.func public @run_forward$async_dispatch_73_rocm_hsaco_fb_run_forward$async_dispatch_73_attention_20x4096x64xf16(%arg0: i32) attributes {iree.abi.stub, iree.reflection = {iree.benchmark = "dispatch"}} {
    %c-1_i32 = arith.constant -1 : i32
    %c-1_i64 = arith.constant -1 : i64
    %c423311104 = arith.constant 423311104 : index
    %c1 = arith.constant 1 : index
    %c423310976 = arith.constant 423310976 : index
    %c106756736_i32 = arith.constant 106756736 : i32
    %c180157056_i32 = arith.constant 180157056 : i32
    %c169671296_i32 = arith.constant 169671296 : i32
    %c10490944_i32 = arith.constant 10490944 : i32
    %c0 = arith.constant 0 : index
    %0 = arith.index_cast %arg0 : i32 to index
    %device_0 = hal.devices.get %c0 : !hal.device
    %cmd = hal.command_buffer.create device(%device_0 : !hal.device) mode("OneShot|AllowInlineExecution") categories(Dispatch) : !hal.command_buffer
    %pipeline_layout = hal.pipeline_layout.lookup device(%device_0 : !hal.device) layout(<push_constants = 4, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer>]>]>) : !hal.pipeline_layout
    hal.command_buffer.push_constants<%cmd : !hal.command_buffer> layout(%pipeline_layout : !hal.pipeline_layout) offset(0) values([%c10490944_i32, %c169671296_i32, %c180157056_i32, %c106756736_i32]) : i32, i32, i32, i32
    %run_forward$async_dispatch_73_rocm_hsaco_fb_run_forward$async_dispatch_73_attention_20x4096x64xf16_buffer = util.global.load @run_forward$async_dispatch_73_rocm_hsaco_fb_run_forward$async_dispatch_73_attention_20x4096x64xf16_buffer : !hal.buffer
    hal.command_buffer.push_descriptor_set<%cmd : !hal.command_buffer> layout(%pipeline_layout : !hal.pipeline_layout)[%c0] bindings([
      %c0 = (%run_forward$async_dispatch_73_rocm_hsaco_fb_run_forward$async_dispatch_73_attention_20x4096x64xf16_buffer : !hal.buffer)[%c0, %c423310976], 
      %c1 = (%run_forward$async_dispatch_73_rocm_hsaco_fb_run_forward$async_dispatch_73_attention_20x4096x64xf16_buffer : !hal.buffer)[%c423311104, %c423310976]
    ])
    %workgroup_x, %workgroup_y, %workgroup_z = hal.executable.calculate_workgroups device(%device_0 : !hal.device) target(@run_forward$async_dispatch_73::@rocm_hsaco_fb::@run_forward$async_dispatch_73_attention_20x4096x64xf16) : index, index, index
    %exe = hal.executable.lookup device(%device_0 : !hal.device) executable(@run_forward$async_dispatch_73) : !hal.executable
    %ordinal = hal.executable.export.ordinal target(@run_forward$async_dispatch_73::@rocm_hsaco_fb::@run_forward$async_dispatch_73_attention_20x4096x64xf16) : index
    scf.for %arg1 = %c0 to %0 step %c1 {
      hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%exe : !hal.executable)[%ordinal] workgroups([%workgroup_x, %workgroup_y, %workgroup_z])
      hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|CommandRetire") target("CommandIssue|Dispatch") flags("None")
    }
    hal.command_buffer.finalize<%cmd : !hal.command_buffer>
    %1 = util.null : !hal.fence
    %fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence
    hal.device.queue.execute<%device_0 : !hal.device> affinity(%c-1_i64) wait(%1) signal(%fence) commands([%cmd])
    %status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32
    util.status.check_ok %status, "failed to wait on timepoint"
    util.return
  }
}

Full model in torch dialect IR: stable_diffusion_xl_base_1_0_PNDM_64_1024x1024_fp16_unet_30.mlir.txt

Error output:

iree.compiler.tools.binaries.CompilerToolError: Error invoking IREE compiler tool iree-compile.exe
Error code: 1
Diagnostics:
failed to translate executables
./unet_gfx1100_a_dps\configured_compiled_scheduled_unet_run_forward$async_dispatch_73.mlir:9:6: error: Failures have been detected while processing an MLIR pass pipeline
      func.func @run_forward$async_dispatch_73_attention_20x4096x64xf16() attributes {translation_info = #iree_codegen.translation_info<LLVMGPUDistribute workgroup_size = [64, 1, 1] subgroup_size = 32>} {
     ^
./unet_gfx1100_a_dps\configured_compiled_scheduled_unet_run_forward$async_dispatch_73.mlir:9:6: note: Pipeline failed while executing [`TranslateExecutablesPass` on 'hal.executable' operation: @run_forward$async_dispatch_15, `TranslateTargetExecutableVariantsPass` on 'hal.executable.variant' operation: @rocm_hsaco_fb, `TranslateExecutablesPass` on 'hal.executable' operation: @run_forward$async_dispatch_21, `TranslateTargetExecutableVariantsPass` on 'hal.executable.variant' operation: @rocm_hsaco_fb, `TranslateExecutablesPass` on 'hal.executable' operation: @run_forward$async_dispatch_51, `TranslateTargetExecutableVariantsPass` on 'hal.executable.variant' operation: @rocm_hsaco_fb, `TranslateExecutablesPass` on 'hal.executable' operation: @run_forward$async_dispatch_58, `TranslateTargetExecutableVariantsPass` on 'hal.executable.variant' operation: @rocm_hsaco_fb, `TranslateExecutablesPass` on 'hal.executable' operation: @run_forward$async_dispatch_59, `TranslateTargetExecutableVariantsPass` on 'hal.executable.variant' operation: @rocm_hsaco_fb, `TranslateExecutablesPass` on 'hal.executable' operation: @run_forward$async_dispatch_60, `TranslateTargetExecutableVariantsPass` on 'hal.executable.variant' operation: @rocm_hsaco_fb, `TranslateExecutablesPass` on 'hal.executable' operation: @run_forward$async_dispatch_64, `TranslateTargetExecutableVariantsPass` on 'hal.executable.variant' operation: @rocm_hsaco_fb, `LLVMGPULowerExecutableTarget` on 'func.func' operation: @run_forward$async_dispatch_64_matmul_transpose_b_8192x640x640_f16xf16xf32, `TranslateExecutablesPass` on 'hal.executable' operation: @run_forward$async_dispatch_65, `TranslateTargetExecutableVariantsPass` on 'hal.executable.variant' operation: @rocm_hsaco_fb, `LowerExecutableUsingTransformDialect` on 'builtin.module' operation, `TransformDialectInterpreter` on 'builtin.module' operation, `TranslateExecutablesPass` on 'hal.executable' operation: @run_forward$async_dispatch_66, 
`TranslateTargetExecutableVariantsPass` on 'hal.executable.variant' operation: @rocm_hsaco_fb, `TranslateExecutablesPass` on 'hal.executable' operation: @run_forward$async_dispatch_67, `TranslateTargetExecutableVariantsPass` on 'hal.executable.variant' operation: @rocm_hsaco_fb, `LLVMGPULowerExecutableTarget` on 'func.func' operation: @run_forward$async_dispatch_67_matmul_transpose_b_8192x640x640_f16xf16xf32, `LLVMGPUVectorLowering` on 'func.func' operation: @run_forward$async_dispatch_51_conv_2d_nhwc_hwcf_2x64x64x640x3x3x320_f16xf16xf32, `TranslateExecutablesPass` on 'hal.executable' operation: @run_forward$async_dispatch_70, `TranslateTargetExecutableVariantsPass` on 'hal.executable.variant' operation: @rocm_hsaco_fb, `TranslateExecutablesPass` on 'hal.executable' operation: @run_forward$async_dispatch_73, `TranslateTargetExecutableVariantsPass` on 'hal.executable.variant' operation: @rocm_hsaco_fb, `LLVMGPULowerExecutableTarget` on 'func.func' operation: @run_forward$async_dispatch_70_generic_2x10x4096x64_f32xf16, `FoldMemRefAliasOps` on 'func.func' operation: @run_forward$async_dispatch_64_matmul_transpose_b_8192x640x640_f16xf16xf32, `LLVMGPUVectorDistribute` on 'func.func' operation: @run_forward$async_dispatch_67_matmul_transpose_b_8192x640x640_f16xf16xf32, `ConvertToROCDL` on 'builtin.module' operation, `ConvertToROCDL` on 'builtin.module' operation, `ExpandStridedMetadata` on 'func.func' operation: @run_forward$async_dispatch_21_conv_2d_nhwc_hwcf_2x128x128x320x3x3x320_f16xf16xf32, `LinalgExtToLoops` on 'func.func' operation: @run_forward$async_dispatch_73_attention_20x4096x64xf16, `LinalgExtToLoops` on 'func.func' operation: @run_forward$async_dispatch_59_conv_2d_nhwc_hwcf_2x64x64x640x1x1x320_f16xf16xf32, `Canonicalizer` on 'func.func' operation: @run_forward$async_dispatch_58_conv_2d_nhwc_hwcf_2x64x64x640x3x3x640_f16xf16xf32, `Canonicalizer` on 'func.func' operation: @run_forward$async_dispatch_70_generic_2x10x4096x64_f32xf16, `ConvertAffineToStandard` on 
'func.func' operation: @run_forward$async_dispatch_66_generic_8192x640_f16xf32xf32xf16xf16xf16]: reproducer generated at `./shark_tmp/core-reproducer.mlir`
./unet_gfx1100_a_dps\configured_compiled_scheduled_unet_run_forward$async_dispatch_73.mlir:2:2: error: failed to run translation of source executable to target executable for backend #hal.executable.target<"rocm", "rocm-hsaco-fb", {mma_intrinsics = [#iree_gpu.mma_layout<WMMA_F16_16x16x16_F32>], target_arch = "gfx1100", ukernels = "none"}>
  hal.executable.variant public @rocm_hsaco_fb target(<"rocm", "rocm-hsaco-fb", {mma_intrinsics = [#iree_gpu.mma_layout<WMMA_F16_16x16x16_F32>], target_arch = "gfx1100", ukernels = "none"}>) {
 ^

Invoked with:
 iree-compile.exe C:\V\iree-build\compiler\bindings\python\iree\compiler\tools\..\_mlir_libs\iree-compile.exe stable_diffusion_xl_base_1_0_PNDM_64_1024x1024_fp16_unet_30.mlir --iree-input-type=torch --iree-vm-bytecode-module-output-format=flatbuffer-binary --iree-hal-target-backends=rocm --mlir-print-debuginfo --mlir-print-op-on-diagnostic=false --mlir-pass-pipeline-crash-reproducer=./shark_tmp/core-reproducer.mlir --iree-hal-target-backends=rocm --iree-rocm-target-chip=gfx1100 --iree-opt-const-eval=false --iree-opt-data-tiling=False --iree-hal-dump-executable-files-to=./unet_gfx1100_a_dps --mlir-print-debuginfo=false --iree-codegen-gpu-native-math-precision=true --iree-codegen-llvmgpu-use-vector-distribution --iree-global-opt-propagate-transposes=true --iree-opt-outer-dim-concat=true --iree-vm-target-truncate-unsupported-floats --iree-llvmgpu-enable-prefetch=true --verify=false --iree-opt-data-tiling=false --iree-preprocessing-pass-pipeline=builtin.module(iree-preprocessing-transpose-convolution-pipeline, iree-preprocessing-pad-to-intrinsics)

Need more information? Set IREE_SAVE_TEMPS=/some/dir in your environment to save all artifacts and reproducers.