iree-org / iree

A retargetable MLIR-based machine learning compiler and runtime toolkit.
http://iree.dev/
Apache License 2.0
2.56k stars 572 forks source link

reshape-like pack/unpack ops are not folded into reshapes #16181

Closed hanhanW closed 7 months ago

hanhanW commented 7 months ago

I observed that some pack ops were not folded into reshapes while I was working on https://github.com/openxla/iree/pull/16160. Below are some examples from the EfficientNetV2STF model. There are unit dims, and the inner tile sizes are all ones, so they can be simplified to a reshape op.

hal.executable public @forward_dispatch_158 {
  hal.executable.variant public @embedded_elf_x86_64 target(<"llvm-cpu", "embedded-elf-x86_64", {cpu = "cascadelake", cpu_features = "+cmov,+mmx,+popcnt,+sse,+sse2,+sse3,+ssse3,+sse4.1,+sse4.2,+avx,+avx2,+fma,+avx512f,+bmi,+bmi2,+aes,+pclmul,+avx512vl,+avx512bw,+avx512dq,+avx512cd,+avx512vnni,+adx,+clflushopt,+clwb,+cx16,+cx8,+crc32,+f16c,+fsgsbase,+fxsr,+invpcid,+lzcnt,+movbe,+pku,+prfchw,+rdrnd,+rdseed,+sahf,+x87,+xsave,+xsavec,+xsaveopt,+xsaves,+evex512", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 64 : index, target_triple = "x86_64-unknown-unknown-eabi-elf", ukernels = "all"}>) {
    hal.executable.export public @forward_dispatch_158_pack_f32 ordinal(0) layout(#hal.pipeline.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer>]>]>) {
    ^bb0(%arg0: !hal.device):
      %x, %y, %z = flow.dispatch.workgroup_count_from_slice 
      hal.return %x, %y, %z : index, index, index
    }
    builtin.module {
      func.func @forward_dispatch_158_pack_f32() {
        %c1769472 = arith.constant 1769472 : index
        %c1772544 = arith.constant 1772544 : index
        %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c1769472) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<1x768xf32>>
        %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c1772544) : !flow.dispatch.tensor<writeonly:tensor<1x768x1x1xf32>>
        %2 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [1, 768], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1x768xf32>> -> tensor<1x768xf32>
        %3 = tensor.empty() : tensor<1x768x1x1xf32>
        %pack = tensor.pack %2 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [1, 1] into %3 : tensor<1x768xf32> -> tensor<1x768x1x1xf32>
        flow.dispatch.tensor.store %pack, %1, offsets = [0, 0, 0, 0], sizes = [1, 768, 1, 1], strides = [1, 1, 1, 1] : tensor<1x768x1x1xf32> -> !flow.dispatch.tensor<writeonly:tensor<1x768x1x1xf32>>
        return
      }
    }
  }
}

and

hal.executable public @forward_dispatch_112 {
  hal.executable.variant public @embedded_elf_x86_64 target(<"llvm-cpu", "embedded-elf-x86_64", {cpu = "cascadelake", cpu_features = "+cmov,+mmx,+popcnt,+sse,+sse2,+sse3,+ssse3,+sse4.1,+sse4.2,+avx,+avx2,+fma,+avx512f,+bmi,+bmi2,+aes,+pclmul,+avx512vl,+avx512bw,+avx512dq,+avx512cd,+avx512vnni,+adx,+clflushopt,+clwb,+cx16,+cx8,+crc32,+f16c,+fsgsbase,+fxsr,+invpcid,+lzcnt,+movbe,+pku,+prfchw,+rdrnd,+rdseed,+sahf,+x87,+xsave,+xsavec,+xsaveopt,+xsaves,+evex512", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 64 : index, target_triple = "x86_64-unknown-unknown-eabi-elf", ukernels = "all"}>) {
    hal.executable.export public @forward_dispatch_112_unpack_generic_1x32_f32_pack ordinal(0) layout(#hal.pipeline.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer>]>]>) {
    ^bb0(%arg0: !hal.device):
      %x, %y, %z = flow.dispatch.workgroup_count_from_slice 
      hal.return %x, %y, %z : index, index, index
    }
    builtin.module {
      func.func @forward_dispatch_112_unpack_generic_1x32_f32_pack() {
        %cst = arith.constant 1.000000e+00 : f32
        %cst_0 = arith.constant dense<[[0.0471783765, 0.0356840268, 0.0537908673, 0.0335887484, 0.0172067452, 0.0232766438, 0.0657389387, 0.0271849073, 0.0399738103, 0.0495807044, 0.0403049253, 0.0480787233, 0.0293508619, 0.0442568138, 0.0142606916, 0.0472004376, 0.0503993146, 0.0452328138, 0.041096665, 0.0567244478, 0.00972147285, 0.0320480205, 0.0363559797, 5.690680e-02, 0.0240052529, 0.0445989519, 0.0272407513, 0.026660515, 0.0290578324, -0.00343443151, 0.0346870795, 0.0460179821]]> : tensor<1x32xf32>
        %c0 = arith.constant 0 : index
        %c128 = arith.constant 128 : index
        %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<1x2x1x16xf32>>
        %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c128) : !flow.dispatch.tensor<writeonly:tensor<1x32x1x1xf32>>
        %2 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [1, 2, 1, 16], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<1x2x1x16xf32>> -> tensor<1x2x1x16xf32>
        %3 = tensor.empty() : tensor<1x32x1x1xf32>
        %4 = tensor.empty() : tensor<1x32xf32>
        %unpack = tensor.unpack %2 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [1, 16] into %4 : tensor<1x2x1x16xf32> -> tensor<1x32xf32>
        %5 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%unpack, %cst_0 : tensor<1x32xf32>, tensor<1x32xf32>) outs(%4 : tensor<1x32xf32>) {
        ^bb0(%in: f32, %in_1: f32, %out: f32):
          %6 = arith.addf %in, %in_1 : f32
          %7 = arith.negf %6 : f32
          %8 = math.exp %7 : f32
          %9 = arith.addf %8, %cst : f32
          %10 = arith.divf %cst, %9 : f32
          %11 = arith.mulf %6, %10 : f32
          linalg.yield %11 : f32
        } -> tensor<1x32xf32>
        %pack = tensor.pack %5 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [1, 1] into %3 : tensor<1x32xf32> -> tensor<1x32x1x1xf32>
        flow.dispatch.tensor.store %pack, %1, offsets = [0, 0, 0, 0], sizes = [1, 32, 1, 1], strides = [1, 1, 1, 1] : tensor<1x32x1x1xf32> -> !flow.dispatch.tensor<writeonly:tensor<1x32x1x1xf32>>
        return
      }
    }
  }
}
hanhanW commented 7 months ago

This is fixed by https://github.com/llvm/llvm-project/commit/f59eef6515433577d757cf64d2d2f402d95a689e and https://github.com/llvm/llvm-project/commit/ad3cda7a04d4858816cbf38df63dc86d370c2587