iree-org / iree

A retargetable MLIR-based machine learning compiler and runtime toolkit.
http://iree.dev/

`iree-compile` generates unwanted `memref.alloca` instruction for models with dynamic dims #18881

vinayakdsci commented 2 days ago

For the following dispatch:

```mlir
hal.executable public @main_graph$async_dispatch_9 {
  hal.executable.variant public @embedded_elf_x86_64 target(<"llvm-cpu", "embedded-elf-x86_64", {cpu = "icelake-server", cpu_features = "+prfchw,-cldemote,+avx,+aes,+sahf,+pclmul,-xop,+crc32,+xsaves,-avx512fp16,-usermsr,-sm4,-egpr,+sse4.1,+avx512ifma,+xsave,+sse4.2,-tsxldtrk,-sm3,-ptwrite,-widekl,+invpcid,+64bit,+xsavec,-avx10.1-512,+avx512vpopcntdq,+cmov,-avx512vp2intersect,+avx512cd,+movbe,-avxvnniint8,-ccmp,-amx-int8,-kl,-avx10.1-256,+evex512,-avxvnni,+rtm,+adx,+avx2,-hreset,-movdiri,-serialize,-sha512,+vpclmulqdq,+avx512vl,-uintr,-cf,+clflushopt,-raoint,-cmpccxadd,+bmi,-amx-tile,+sse,-avx10.2-256,+gfni,-avxvnniint16,-amx-fp16,-zu,-ndd,+xsaveopt,+rdrnd,+avx512f,-amx-bf16,-avx512bf16,+avx512vnni,-push2pop2,+cx8,+avx512bw,+sse3,-pku,-nf,+fsgsbase,-clzero,-mwaitx,-lwp,+lzcnt,+sha,-movdir64b,-ppx,-wbnoinvd,-enqcmd,-avx10.2-512,-avxneconvert,-tbm,-pconfig,-amx-complex,+ssse3,+cx16,+bmi2,+fma,+popcnt,-avxifma,+f16c,+avx512bitalg,-rdpru,+clwb,+mmx,+sse2,+rdseed,+avx512vbmi2,-prefetchi,+rdpid,-fma4,+avx512vbmi,-shstk,+vaes,-waitpkg,-sgx,+fxsr,+avx512dq,-sse4a", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 64 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}>) {
    hal.executable.export public @main_graph$async_dispatch_9_elementwise_DxDx768_f32_pack ordinal(0) layout(#hal.pipeline.layout<constants = 12, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, ReadOnly>, #hal.pipeline.binding<storage_buffer, ReadOnly>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) {
    ^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index, %arg4: index, %arg5: index, %arg6: index):
      %x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg1, %arg2, %arg3, %arg4, %arg5, %arg6
      hal.return %x, %y, %z : index, index, index
    }
    builtin.module {
      func.func @main_graph$async_dispatch_9_elementwise_DxDx768_f32_pack() {
        %c32_i64 = arith.constant 32 : i64
        %cst = arith.constant 0.000000e+00 : f32
        %c1_i64 = arith.constant 1 : i64
        %c0_i64 = arith.constant 0 : i64
        %cst_0 = arith.constant 9.99999974E-6 : f32
        %cst_1 = arith.constant 7.680000e+02 : f32
        %cst_2 = arith.constant 2.000000e+00 : f32
        %c0 = arith.constant 0 : index
        %0 = hal.interface.constant.load layout(<constants = 12, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, ReadOnly>, #hal.pipeline.binding<storage_buffer, ReadOnly>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32
        %1 = hal.interface.constant.load layout(<constants = 12, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, ReadOnly>, #hal.pipeline.binding<storage_buffer, ReadOnly>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(1) : i32
        %2 = hal.interface.constant.load layout(<constants = 12, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, ReadOnly>, #hal.pipeline.binding<storage_buffer, ReadOnly>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(2) : i32
        %3 = hal.interface.constant.load layout(<constants = 12, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, ReadOnly>, #hal.pipeline.binding<storage_buffer, ReadOnly>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(3) : i32
        %4 = hal.interface.constant.load layout(<constants = 12, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, ReadOnly>, #hal.pipeline.binding<storage_buffer, ReadOnly>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(4) : i32
        %5 = hal.interface.constant.load layout(<constants = 12, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, ReadOnly>, #hal.pipeline.binding<storage_buffer, ReadOnly>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(5) : i32
        %6 = hal.interface.constant.load layout(<constants = 12, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, ReadOnly>, #hal.pipeline.binding<storage_buffer, ReadOnly>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(6) : i32
        %7 = hal.interface.constant.load layout(<constants = 12, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, ReadOnly>, #hal.pipeline.binding<storage_buffer, ReadOnly>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(7) : i32
        %8 = hal.interface.constant.load layout(<constants = 12, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, ReadOnly>, #hal.pipeline.binding<storage_buffer, ReadOnly>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(8) : i32
        %9 = hal.interface.constant.load layout(<constants = 12, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, ReadOnly>, #hal.pipeline.binding<storage_buffer, ReadOnly>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(9) : i32
        %10 = hal.interface.constant.load layout(<constants = 12, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, ReadOnly>, #hal.pipeline.binding<storage_buffer, ReadOnly>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(10) : i32
        %11 = hal.interface.constant.load layout(<constants = 12, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, ReadOnly>, #hal.pipeline.binding<storage_buffer, ReadOnly>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(11) : i32
        %12 = arith.extui %0 : i32 to i64
        %13 = arith.extui %1 : i32 to i64
        %14 = arith.shli %13, %c32_i64 : i64
        %15 = arith.ori %12, %14 : i64
        %16 = arith.index_castui %15 {stream.alignment = 64 : index} : i64 to index
        %17 = arith.extui %2 : i32 to i64
        %18 = arith.extui %3 : i32 to i64
        %19 = arith.shli %18, %c32_i64 : i64
        %20 = arith.ori %17, %19 : i64
        %21 = arith.index_castui %20 : i64 to index
        %22 = arith.extui %4 : i32 to i64
        %23 = arith.extui %5 : i32 to i64
        %24 = arith.shli %23, %c32_i64 : i64
        %25 = arith.ori %22, %24 : i64
        %26 = arith.index_castui %25 : i64 to index
        %27 = arith.extui %6 : i32 to i64
        %28 = arith.extui %7 : i32 to i64
        %29 = arith.shli %28, %c32_i64 : i64
        %30 = arith.ori %27, %29 : i64
        %31 = arith.index_castui %30 : i64 to index
        %32 = arith.extui %8 : i32 to i64
        %33 = arith.extui %9 : i32 to i64
        %34 = arith.shli %33, %c32_i64 : i64
        %35 = arith.ori %32, %34 : i64
        %36 = arith.index_castui %35 : i64 to index
        %37 = arith.extui %10 : i32 to i64
        %38 = arith.extui %11 : i32 to i64
        %39 = arith.shli %38, %c32_i64 : i64
        %40 = arith.ori %37, %39 : i64
        %41 = arith.index_castui %40 : i64 to index
        %42:5 = util.assume.int 
            %c0<umin = 0, umax = 0>, 
            %c0<umin = 0, umax = 0>, 
            %c0<umin = 0, umax = 0>, 
            %c0<umin = 0, umax = 0>, 
            %c0<umin = 0, umax = 0>
          : index, index, index, index, index
        %43 = hal.interface.binding.subspan layout(<constants = 12, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, ReadOnly>, #hal.pipeline.binding<storage_buffer, ReadOnly>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%42#1) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<768xf32>>
        %44 = hal.interface.binding.subspan layout(<constants = 12, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, ReadOnly>, #hal.pipeline.binding<storage_buffer, ReadOnly>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%42#2) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<768xf32>>
        %45 = flow.dispatch.workload.ordinal %31, 2 : index
        %46 = flow.dispatch.workload.ordinal %36, 3 : index
        %47 = flow.dispatch.workload.ordinal %42#4, 4 : index
        %48 = flow.dispatch.workload.ordinal %41, 5 : index
        %49 = hal.interface.binding.subspan layout(<constants = 12, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, ReadOnly>, #hal.pipeline.binding<storage_buffer, ReadOnly>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%16) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<?x?x768xf32>>{%45, %46}
        %50 = hal.interface.binding.subspan layout(<constants = 12, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, ReadOnly>, #hal.pipeline.binding<storage_buffer, ReadOnly>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%42#0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<?x?x768x16x1xf32>>{%47, %48}
        %51 = hal.interface.binding.subspan layout(<constants = 12, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, ReadOnly>, #hal.pipeline.binding<storage_buffer, ReadOnly>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(3) alignment(64) offset(%42#3) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<?x?x768x16x1xf32>>{%47, %48}
        %52 = flow.dispatch.workload.ordinal %21, 0 : index
        %53 = flow.dispatch.workload.ordinal %26, 1 : index
        %54 = flow.dispatch.tensor.load %49, offsets = [0, 0, 0], sizes = [%45, %46, 768], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x768xf32>>{%45, %46} -> tensor<?x?x768xf32>
        %55 = flow.dispatch.tensor.load %43, offsets = [0], sizes = [768], strides = [1] : !flow.dispatch.tensor<readonly:tensor<768xf32>> -> tensor<768xf32>
        %56 = flow.dispatch.tensor.load %44, offsets = [0], sizes = [768], strides = [1] : !flow.dispatch.tensor<readonly:tensor<768xf32>> -> tensor<768xf32>
        %57 = flow.dispatch.tensor.load %50, offsets = [0, 0, 0, 0, 0], sizes = [%47, %48, 768, 16, 1], strides = [1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x768x16x1xf32>>{%47, %48} -> tensor<?x?x768x16x1xf32>
        %58 = arith.index_cast %52 : index to i64
        %59 = arith.index_cast %53 : index to i64
        %60 = arith.cmpi slt, %58, %c0_i64 : i64
        %61 = arith.muli %59, %58 : i64
        %62 = arith.select %60, %c1_i64, %58 : i64
        %63 = arith.divsi %61, %62 : i64
        %64 = arith.select %60, %63, %58 : i64
        %65 = arith.index_cast %64 : i64 to index
        %66 = arith.index_cast %63 : i64 to index
        %67 = tensor.empty(%66, %65) : tensor<?x?xf32>
        %68 = tensor.empty(%66, %65) : tensor<?x?x768xf32>
        %69 = linalg.fill ins(%cst : f32) outs(%67 : tensor<?x?xf32>) -> tensor<?x?xf32>
        %70 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%54 : tensor<?x?x768xf32>) outs(%69 : tensor<?x?xf32>) {
        ^bb0(%in: f32, %out: f32):
          %72 = math.powf %in, %cst_2 : f32
          %73 = arith.addf %72, %out : f32
          linalg.yield %73 : f32
        } -> tensor<?x?xf32>
        %71 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>, affine_map<(d0, d1, d2) -> (d2)>, affine_map<(d0, d1, d2) -> (d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%54, %70, %55, %56 : tensor<?x?x768xf32>, tensor<?x?xf32>, tensor<768xf32>, tensor<768xf32>) outs(%68 : tensor<?x?x768xf32>) {
        ^bb0(%in: f32, %in_3: f32, %in_4: f32, %in_5: f32, %out: f32):
          %72 = arith.divf %in_3, %cst_1 : f32
          %73 = arith.addf %72, %cst_0 : f32
          %74 = math.sqrt %73 : f32
          %75 = arith.divf %in, %74 : f32
          %76 = arith.mulf %75, %in_4 : f32
          %77 = arith.addf %76, %in_5 : f32
          linalg.yield %77 : f32
        } -> tensor<?x?x768xf32>
        %pack = tensor.pack %71 padding_value(%cst : f32) outer_dims_perm = [0, 1, 2] inner_dims_pos = [1, 2] inner_tiles = [16, 1] into %57 : tensor<?x?x768xf32> -> tensor<?x?x768x16x1xf32>
        flow.dispatch.tensor.store %pack, %51, offsets = [0, 0, 0, 0, 0], sizes = [%47, %48, 768, 16, 1], strides = [1, 1, 1, 1, 1] : tensor<?x?x768x16x1xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x?x768x16x1xf32>>{%47, %48}
        return
      }
    }
  }
}
```
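
After bufferization, the pattern the title refers to looks roughly like the snippet below. This is a hand-written illustration, not an excerpt from the dump; the function name, value names, and shape are made up:

```mlir
func.func @illustration(%d0: index, %d1: index) {
  // Because the extents %d0 and %d1 are only known at runtime,
  // bufferization cannot size the intermediate at compile time and
  // falls back to a runtime-sized stack allocation.
  %alloca = memref.alloca(%d0, %d1) {alignment = 64 : i64} : memref<?x?x768xf32>
  return
}
```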

We have this error dump: https://gist.github.com/vinayakdsci/18b0ffb91ce40bbc026a1cfe2f2b6e9d.

The original IR used to generate the dispatch is here: https://gist.github.com/vinayakdsci/2e1f2dd1e6c00aed33a9c04dc4f37ee7.

The issue originates from the use of the util dialect to externalize model parameters, and the crash is observable only for models with dynamic shapes.
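
For reference, externalizing a parameter typically turns a module-level constant into a `util.global` whose value is resolved from a parameter archive at load time. A minimal sketch, assuming the usual `#stream.parameter.named` form; the "model" scope and "weight" key are hypothetical names, not taken from the gists:

```mlir
// Hypothetical externalized parameter: the actual tensor data lives
// outside the compiled module and is bound in at load time.
util.global private @weight = #stream.parameter.named<"model"::"weight"> : tensor<768xf32>
```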

Models with static shapes, like ResNet152_vaiq_int8, compile successfully.
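
The static/dynamic distinction comes down to whether `tensor.empty` (and, after bufferization, the buffer behind it) needs runtime operands. A minimal sketch with made-up shapes:

```mlir
func.func @static_vs_dynamic(%d0: index, %d1: index) {
  // Static case: the extents are compile-time constants, so the backing
  // buffer can be sized (or reused) without a runtime alloca.
  %static = tensor.empty() : tensor<4x128x768xf32>
  // Dynamic case, mirroring %67/%68 in the dispatch above: the extents
  // arrive as runtime index values, which is what triggers the issue.
  %dynamic = tensor.empty(%d0, %d1) : tensor<?x?x768xf32>
  return
}
```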

vinayakdsci commented 2 days ago

cc @pashu123 @MaheshRavishankar.

pashu123 commented 2 days ago

It's not generating a wrong `memref.alloca` -- it's just generating an unwanted `memref.alloca` that causes memory and performance issues.
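
Schematically, "unwanted but not wrong" means something like the following hand-written sketch (not taken from the dump): the result is still correct, but the computation routes through an extra stack buffer and a copy instead of writing into the bound output directly.

```mlir
func.func @sketch(%out: memref<?x768xf32>, %d0: index) {
  %cst = arith.constant 0.0 : f32
  // Wasteful but correct: fill a runtime-sized stack buffer...
  %tmp = memref.alloca(%d0) : memref<?x768xf32>
  linalg.fill ins(%cst : f32) outs(%tmp : memref<?x768xf32>)
  // ...then copy it into the destination, rather than filling %out directly.
  memref.copy %tmp, %out : memref<?x768xf32> to memref<?x768xf32>
  return
}
```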

vinayakdsci commented 2 days ago

> It's not generating a wrong `memref.alloca` -- it's just generating an unwanted `memref.alloca` that causes memory and performance issues.

Ah, my bad. Fixed the title.

pashu123 commented 2 days ago

Thanks for filing the issue. I'll get back to this.