iree-org / iree

A retargetable MLIR-based machine learning compiler and runtime toolkit.
http://iree.dev/

[Compiler] 'func.func' op exceeded stack allocation limit #18187

Closed: AmosLewis closed this issue 1 month ago

AmosLewis commented 2 months ago

What happened?

When compiling the public quantized model onnx/models/ConvNeXt_vaiq_int8 from onnxstorage, compilation failed with the error ./dispatch/module_torch_jit_dispatch_13.mlir:9:6: error: 'func.func' op exceeded stack allocation limit of 32768 bytes for function. Got 401408 bytes.
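Note: 401408 bytes matches one full input operand of this dispatch, 56 × 56 × 128 × 1 byte = 401408 for tensor<56x56x128xi8>, which suggests an entire operand is being allocated on the stack, far above the 32768-byte limit.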

I dumped the failing dispatches with iree-compile --iree-input-demote-i64-to-i32 --iree-hal-target-backends=llvm-cpu --iree-hal-dump-executable-sources-to=./dispatch ConvNeXt_vaiq_int8.default.onnx.linalg.mlir > ConvNeXt_vaiq_int8.default.vmfb:

module_torch_jit_dispatch_13.mlir:

hal.executable public @torch_jit_dispatch_13 {
  hal.executable.variant public @embedded_elf_x86_64 target(<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}>) {
    hal.executable.export public @torch_jit_dispatch_13_quantized_batch_matmul_56x56x512x128_i8xi8xi32xi32xi32 ordinal(0) layout(#hal.pipeline.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer, ReadOnly>, <2, storage_buffer>], flags = Indirect>]>) attributes {hal.interface.bindings = [#hal.interface.binding<0, 0>, #hal.interface.binding<0, 1>, #hal.interface.binding<0, 2>]} {
    ^bb0(%arg0: !hal.device):
      %x, %y, %z = flow.dispatch.workgroup_count_from_slice 
      hal.return %x, %y, %z : index, index, index
    }
    builtin.module {
      func.func @torch_jit_dispatch_13_quantized_batch_matmul_56x56x512x128_i8xi8xi32xi32xi32() {
        %c0_i32 = arith.constant 0 : i32
        %cst = arith.constant 9.765625E-4 : f32
        %cst_0 = arith.constant 1.250000e-01 : f32
        %cst_1 = arith.constant 0.000000e+00 : f32
        %cst_2 = arith.constant -1.280000e+02 : f32
        %cst_3 = arith.constant 1.270000e+02 : f32
        %cst_4 = arith.constant 1.41421354 : f32
        %cst_5 = arith.constant 1.000000e+00 : f32
        %cst_6 = arith.constant 5.000000e-01 : f32
        %c0 = arith.constant 0 : index
        %c3625216 = arith.constant 3625216 : index
        %c99112512 = arith.constant 99112512 : index
        %c2019584 = arith.constant 2019584 : index
        %0 = hal.interface.binding.subspan layout(<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer, ReadOnly>, <2, storage_buffer>], flags = Indirect>]>) set(0) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<56x56x128xi8>>
        %1 = hal.interface.binding.subspan layout(<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer, ReadOnly>, <2, storage_buffer>], flags = Indirect>]>) set(0) binding(0) alignment(64) offset(%c3625216) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<56x128x512xi8>>
        %2 = hal.interface.binding.subspan layout(<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer, ReadOnly>, <2, storage_buffer>], flags = Indirect>]>) set(0) binding(1) alignment(64) offset(%c99112512) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<512xf32>>
        %3 = hal.interface.binding.subspan layout(<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer, ReadOnly>, <2, storage_buffer>], flags = Indirect>]>) set(0) binding(2) alignment(64) offset(%c2019584) : !flow.dispatch.tensor<writeonly:tensor<56x56x512xi8>>
        %4 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [56, 56, 128], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<56x56x128xi8>> -> tensor<56x56x128xi8>
        %5 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0], sizes = [56, 128, 512], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<56x128x512xi8>> -> tensor<56x128x512xi8>
        %6 = flow.dispatch.tensor.load %2, offsets = [0], sizes = [512], strides = [1] : !flow.dispatch.tensor<readonly:tensor<512xf32>> -> tensor<512xf32>
        %7 = tensor.empty() : tensor<56x56x512xi8>
        %8 = tensor.empty() : tensor<56x56x512xi32>
        %9 = linalg.fill ins(%c0_i32 : i32) outs(%8 : tensor<56x56x512xi32>) -> tensor<56x56x512xi32>
        %10 = linalg.quantized_batch_matmul ins(%4, %5, %c0_i32, %c0_i32 : tensor<56x56x128xi8>, tensor<56x128x512xi8>, i32, i32) outs(%9 : tensor<56x56x512xi32>) -> tensor<56x56x512xi32>
        %11 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%6, %10 : tensor<512xf32>, tensor<56x56x512xi32>) outs(%7 : tensor<56x56x512xi8>) {
        ^bb0(%in: f32, %in_7: i32, %out: i8):
          %12 = arith.sitofp %in_7 : i32 to f32
          %13 = arith.mulf %12, %cst : f32
          %14 = arith.addf %in, %13 : f32
          %15 = arith.divf %14, %cst_0 : f32
          %16 = math.roundeven %15 : f32
          %17 = arith.addf %16, %cst_1 : f32
          %18 = arith.maximumf %17, %cst_2 : f32
          %19 = arith.minimumf %18, %cst_3 : f32
          %20 = arith.fptosi %19 : f32 to i8
          %21 = arith.extsi %20 : i8 to i32
          %22 = arith.sitofp %21 : i32 to f32
          %23 = arith.mulf %22, %cst_0 : f32
          %24 = arith.divf %23, %cst_4 : f32
          %25 = math.erf %24 : f32
          %26 = arith.addf %25, %cst_5 : f32
          %27 = arith.mulf %23, %26 : f32
          %28 = arith.mulf %27, %cst_6 : f32
          %29 = arith.divf %28, %cst_0 : f32
          %30 = math.roundeven %29 : f32
          %31 = arith.addf %30, %cst_1 : f32
          %32 = arith.maximumf %31, %cst_2 : f32
          %33 = arith.minimumf %32, %cst_3 : f32
          %34 = arith.fptosi %33 : f32 to i8
          linalg.yield %34 : i8
        } -> tensor<56x56x512xi8>
        flow.dispatch.tensor.store %11, %3, offsets = [0, 0, 0], sizes = [56, 56, 512], strides = [1, 1, 1] : tensor<56x56x512xi8> -> !flow.dispatch.tensor<writeonly:tensor<56x56x512xi8>>
        return
      }
    }
  }
}

The same error occurs in more dispatches: module_torch_jit_dispatch_15.mlir, module_torch_jit_dispatch_23.mlir, module_torch_jit_dispatch_25.mlir, and module_torch_jit_dispatch_33.mlir.

Steps to reproduce your issue

iree-compile --iree-input-demote-i64-to-i32 --iree-hal-target-backends=llvm-cpu ./dispatch/module_torch_jit_dispatch_13.mlir > o.vmfb
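(Note: the 32768-byte threshold is the default of the LLVMCPU stack-allocation check. If your build exposes the flag --iree-llvmcpu-stack-allocation-limit (verify with iree-compile --help), appending e.g. --iree-llvmcpu-stack-allocation-limit=524288 to the command above can help confirm whether the oversized allocation is the only blocker.)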

What component(s) does this issue relate to?

Compiler

Version information

candidate-20240809.980

Additional context

No response

hanhanW commented 2 months ago

I thought linalg.quantized_batch_matmul was supposed to be decomposed into a sequence of linalg ops. @pashu123 could you take a look?

https://github.com/iree-org/iree/blob/main/compiler/src/iree/compiler/InputConversion/Common/QuantizedMatmulToMatmul.cpp
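
For context, that pass rewrites the quantized op using the identity (with K the reduction size, here 128, and zA/zB the zero points):

C[b,i,j] = Σ_k (A[b,i,k] − zA)(B[b,k,j] − zB)
         = Σ_k A[b,i,k]·B[b,k,j] − zB·Σ_k A[b,i,k] − zA·Σ_k B[b,k,j] + K·zA·zB

In this dispatch both zero points are the constant 0, so the correction terms vanish and the expected result is just a plain batch matmul. A minimal sketch of that decomposed form (hypothetical IR, not the pass's exact output; %lhs and %rhs stand for the loaded i8 operands):

// Zero zero-points: quantized_batch_matmul reduces to a plain
// batch_matmul that sign-extends the i8 operands and accumulates in i32.
%init = tensor.empty() : tensor<56x56x512xi32>
%acc = linalg.fill ins(%c0_i32 : i32) outs(%init : tensor<56x56x512xi32>) -> tensor<56x56x512xi32>
%mm = linalg.batch_matmul ins(%lhs, %rhs : tensor<56x56x128xi8>, tensor<56x128x512xi8>)
                          outs(%acc : tensor<56x56x512xi32>) -> tensor<56x56x512xi32>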

pashu123 commented 1 month ago

@hanhanW @pdhirajkumarprasad I don't see this issue on the latest main branch.

pashu123 commented 1 month ago

@AmosLewis Could you double check on the main branch?

AmosLewis commented 1 month ago

@pashu123 The issue is gone with the latest branch.