iree-org / iree

A retargetable MLIR-based machine learning compiler and runtime toolkit.
http://iree.dev/
Apache License 2.0
2.85k stars 617 forks source link

Hoist into global pass is producing different IRs on different runs of the same IR. #17500

Open pashu123 opened 5 months ago

pashu123 commented 5 months ago

What happened?

Output IR 1:

#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]>
module attributes {hal.device.targets = [#device_target_local]} {
  util.global private @__hoisted_tensor_1x8xi32 : tensor<1x8xi32>
  util.initializer {
    %c0_i32 = arith.constant 0 : i32
    %0 = tensor.empty() : tensor<1x8xi32>
    %1 = linalg.fill ins(%c0_i32 : i32) outs(%0 : tensor<1x8xi32>) -> tensor<1x8xi32>
    util.global.store %1, @__hoisted_tensor_1x8xi32 : tensor<1x8xi32>
    util.return
  }
  util.global private @__hoisted_tensor_1xi32 : tensor<1xi32>
  util.initializer {
    %c0_i32 = arith.constant 0 : i32
    %0 = tensor.empty() : tensor<1xi32>
    %1 = linalg.fill ins(%c0_i32 : i32) outs(%0 : tensor<1xi32>) -> tensor<1xi32>
    util.global.store %1, @__hoisted_tensor_1xi32 : tensor<1xi32>
    util.return
  }
  util.global private @__hoisted_tensor_1x8xi32_0 : tensor<1x8xi32>
  util.global private @__hoisted_tensor_1xi32_1 : tensor<1xi32>
  util.initializer {
    %cst = arith.constant dense<1> : tensor<1x8xi32>
    %__hoisted_tensor_1x8xi32 = util.global.load @__hoisted_tensor_1x8xi32 : tensor<1x8xi32>
    %__hoisted_tensor_1xi32 = util.global.load @__hoisted_tensor_1xi32 : tensor<1xi32>
    %0:2 = iree_linalg_ext.scan dimension(1) inclusive(true) ins(%cst : tensor<1x8xi32>) outs(%__hoisted_tensor_1x8xi32, %__hoisted_tensor_1xi32 : tensor<1x8xi32>, tensor<1xi32>) {
    ^bb0(%arg0: i32, %arg1: i32):
      %1 = arith.addi %arg0, %arg1 : i32
      iree_linalg_ext.yield %1 : i32
    } -> tensor<1x8xi32>, tensor<1xi32>
    util.global.store %0#0, @__hoisted_tensor_1x8xi32_0 : tensor<1x8xi32>
    util.global.store %0#1, @__hoisted_tensor_1xi32_1 : tensor<1xi32>
    util.return
  }
  util.func public @tm_tensor_scan(%arg0: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @tm_tensor_scan(%input0: tensor<1x8xi32>) -> (%output0: tensor<1xi32>)"}} {
    %__hoisted_tensor_1x8xi32 = util.global.load immutable @__hoisted_tensor_1x8xi32 : tensor<1x8xi32>
    %__hoisted_tensor_1xi32 = util.global.load immutable @__hoisted_tensor_1xi32 : tensor<1xi32>
    %__hoisted_tensor_1x8xi32_0 = util.global.load immutable @__hoisted_tensor_1x8xi32_0 : tensor<1x8xi32>
    %__hoisted_tensor_1xi32_1 = util.global.load immutable @__hoisted_tensor_1xi32_1 : tensor<1xi32>
    %0 = hal.tensor.export %__hoisted_tensor_1xi32_1 "output0" : tensor<1xi32> -> !hal.buffer_view
    util.return %0 : !hal.buffer_view
  }
}

Output IR 2:

#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]>
module attributes {hal.device.targets = [#device_target_local]} {
  util.global private @__hoisted_tensor_1x8xi32 : tensor<1x8xi32>
  util.global private @__hoisted_tensor_1xi32 : tensor<1xi32>
  util.initializer {
    %cst = arith.constant dense<1> : tensor<1x8xi32>
    %c0_i32 = arith.constant 0 : i32
    %0 = tensor.empty() : tensor<1x8xi32>
    %1 = linalg.fill ins(%c0_i32 : i32) outs(%0 : tensor<1x8xi32>) -> tensor<1x8xi32>
    %2 = tensor.empty() : tensor<1xi32>
    %3 = linalg.fill ins(%c0_i32 : i32) outs(%2 : tensor<1xi32>) -> tensor<1xi32>
    %4:2 = iree_linalg_ext.scan dimension(1) inclusive(true) ins(%cst : tensor<1x8xi32>) outs(%1, %3 : tensor<1x8xi32>, tensor<1xi32>) {
    ^bb0(%arg0: i32, %arg1: i32):
      %5 = arith.addi %arg0, %arg1 : i32
      iree_linalg_ext.yield %5 : i32
    } -> tensor<1x8xi32>, tensor<1xi32>
    util.global.store %4#0, @__hoisted_tensor_1x8xi32 : tensor<1x8xi32>
    util.global.store %4#1, @__hoisted_tensor_1xi32 : tensor<1xi32>
    util.return
  }
  util.func public @tm_tensor_scan(%arg0: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @tm_tensor_scan(%input0: tensor<1x8xi32>) -> (%output0: tensor<1xi32>)"}} {
    %__hoisted_tensor_1x8xi32 = util.global.load immutable @__hoisted_tensor_1x8xi32 : tensor<1x8xi32>
    %__hoisted_tensor_1xi32 = util.global.load immutable @__hoisted_tensor_1xi32 : tensor<1xi32>
    %0 = hal.tensor.export %__hoisted_tensor_1xi32 "output0" : tensor<1xi32> -> !hal.buffer_view
    util.return %0 : !hal.buffer_view
  }
}

Output IR 1 successfully goes through the iree-compile passes and outputs a valid .vmfb, whereas Output IR 2 doesn't.

Steps to reproduce your issue

Example MLIR.

#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]>
module attributes {hal.device.targets = [#device_target_local]} {
  util.func public @tm_tensor_scan(%arg0: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @tm_tensor_scan(%input0: tensor<1x8xi32>) -> (%output0: tensor<1xi32>)"}} {
    %cst = arith.constant dense<1> : tensor<1x8xi32>
    %c0_i32 = arith.constant 0 : i32
    %0 = tensor.empty() : tensor<1x8xi32>
    %1 = linalg.fill ins(%c0_i32 : i32) outs(%0 : tensor<1x8xi32>) -> tensor<1x8xi32>
    %2 = tensor.empty() : tensor<1xi32>
    %3 = linalg.fill ins(%c0_i32 : i32) outs(%2 : tensor<1xi32>) -> tensor<1xi32>
    %4:2 = iree_linalg_ext.scan dimension(1) inclusive(true) ins(%cst : tensor<1x8xi32>) outs(%1, %3 : tensor<1x8xi32>, tensor<1xi32>) {
    ^bb0(%arg1: i32, %arg2: i32):
      %6 = arith.addi %arg1, %arg2 : i32
      iree_linalg_ext.yield %6 : i32
    } -> tensor<1x8xi32>, tensor<1xi32>
    %5 = hal.tensor.export %4#1 "output0" : tensor<1xi32> -> !hal.buffer_view
    util.return %5 : !hal.buffer_view
  }
} 

Run : iree-opt -iree-util-hoist-into-globals above.mlir -mlir-disable-threading

Try 3-4 times to see different output IRs. The problem persists with -mlir-disable-threading.

What component(s) does this issue relate to?

Compiler

Version information

No response

Additional context

No response

pashu123 commented 5 months ago

Both IRs are correct; the difference comes from the implementation of the topological sort.

pashu123 commented 5 months ago

@stellaraccident Any thoughts?

AmosLewis commented 5 months ago

https://github.com/iree-org/iree/issues/17441

ScottTodd commented 5 months ago

In general we want the compiler to be deterministic (especially with threading disabled). Were you able to find where in the code the nondeterminism is coming from?

benvanik commented 5 months ago

yeah and we definitely want output 2 - output 1 is silly

pashu123 commented 5 months ago

In general we want the compiler to be deterministic (especially with threading disabled). Were you able to find where in the code the nondeterminism is coming from?

Happening around this loop https://github.com/iree-org/iree/blob/26e4c6b225a30466f93935e9d6d72b7a6d3f8155/compiler/src/iree/compiler/Dialect/Util/Transforms/HoistIntoGlobals.cpp#L103