iree-org / iree

A retargetable MLIR-based machine learning compiler and runtime toolkit.
http://iree.dev/
Apache License 2.0
2.59k stars 580 forks source link

Avoid creating allocation during bufferization #11412

Open MaheshRavishankar opened 1 year ago

MaheshRavishankar commented 1 year ago

IREE at ToM currently can fuse softmax into a single dispatch under the --iree-flow-enable-aggresive-fusion. (https://github.com/iree-org/iree/blob/5de1a38f6eb8b8e4bfbba05d8b29246e968747a3/tests/e2e/regression/BUILD#L148). While this works, it does mean that there is a stack allocation created for this code. Here is the IR before vectorization

module {
  func.func @_softmax_dispatch_0_generic_12x128x128() {
    %cst = arith.constant dense<-3.40282347E+38> : vector<1x4xf32>
    %cst_0 = arith.constant dense<0.000000e+00> : vector<1x4xf32>
    %cst_1 = arith.constant dense<1.000000e+00> : vector<4x1x4xf32>
    %c1 = arith.constant 1 : index
    %c4 = arith.constant 4 : index
    %c32 = arith.constant 32 : index
    %c12 = arith.constant 12 : index
    %c128 = arith.constant 128 : index
    %c0 = arith.constant 0 : index
    %cst_2 = arith.constant 0.000000e+00 : f32
    %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:tensor<12x128x128xf32>>
    %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<writeonly:tensor<12x128x128xf32>>
    %workgroup_id_x = hal.interface.workgroup.id[0] : index
    %workgroup_count_x = hal.interface.workgroup.count[0] : index
    %workgroup_id_y = hal.interface.workgroup.id[1] : index
    %workgroup_count_y = hal.interface.workgroup.count[1] : index
    %2 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%workgroup_id_y]
    %3 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%workgroup_count_y]
    %4 = affine.apply affine_map<()[s0] -> (s0 * 32)>()[%workgroup_id_x]
    %5 = affine.apply affine_map<()[s0] -> (s0 * 32)>()[%workgroup_count_x]
    scf.for %arg0 = %2 to %c12 step %3 {
      scf.for %arg1 = %4 to %c128 step %5 {
        %6 = flow.dispatch.tensor.load %1, offsets = [%arg0, %arg1, 0], sizes = [4, 32, 128], strides = [1, 1, 1] : !flow.dispatch.tensor<writeonly:tensor<12x128x128xf32>> -> tensor<4x32x128xf32>
        %7 = flow.dispatch.tensor.load %0, offsets = [%arg0, %arg1, 0], sizes = [4, 32, 128], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<12x128x128xf32>> -> tensor<4x32x128xf32>
        %8 = scf.for %arg2 = %c0 to %c4 step %c1 iter_args(%arg3 = %6) -> (tensor<4x32x128xf32>) {
          %9 = scf.for %arg4 = %c0 to %c32 step %c4 iter_args(%arg5 = %arg3) -> (tensor<4x32x128xf32>) {
            %10 = scf.for %arg6 = %c0 to %c128 step %c4 iter_args(%arg7 = %cst) -> (vector<1x4xf32>) {
              %18 = vector.transfer_read %7[%arg2, %arg4, %arg6], %cst_2 {in_bounds = [true, true, true]} : tensor<4x32x128xf32>, vector<1x4x4xf32>
              %19 = vector.multi_reduction <maxf>, %18, %arg7 [2] : vector<1x4x4xf32> to vector<1x4xf32>
              scf.yield %19 : vector<1x4xf32>
            }
            %extracted_slice = tensor.extract_slice %6[%arg2, %arg4, 0] [1, 4, 128] [1, 1, 1] : tensor<4x32x128xf32> to tensor<1x4x128xf32>
            %11 = vector.broadcast %10 : vector<1x4xf32> to vector<4x1x4xf32>
            %12 = vector.transpose %11, [1, 2, 0] : vector<4x1x4xf32> to vector<1x4x4xf32>
            %13:2 = scf.for %arg6 = %c0 to %c128 step %c4 iter_args(%arg7 = %extracted_slice, %arg8 = %cst_0) -> (tensor<1x4x128xf32>, vector<1x4xf32>) {
              %18 = vector.transfer_read %7[%arg2, %arg4, %arg6], %cst_2 {in_bounds = [true, true, true]} : tensor<4x32x128xf32>, vector<1x4x4xf32>
              %19 = arith.subf %18, %12 : vector<1x4x4xf32>
              %20 = math.exp %19 : vector<1x4x4xf32>
              %21 = vector.multi_reduction <add>, %20, %arg8 [2] : vector<1x4x4xf32> to vector<1x4xf32>
              %22 = vector.transfer_write %20, %arg7[%c0, %c0, %arg6] {in_bounds = [true, true, true]} : vector<1x4x4xf32>, tensor<1x4x128xf32>
              scf.yield %22, %21 : tensor<1x4x128xf32>, vector<1x4xf32>
            }
            %extracted_slice_3 = tensor.extract_slice %arg5[%arg2, %arg4, 0] [1, 4, 128] [1, 1, 1] : tensor<4x32x128xf32> to tensor<1x4x128xf32>
            %14 = vector.broadcast %13#1 : vector<1x4xf32> to vector<4x1x4xf32>
            %15 = arith.divf %cst_1, %14 : vector<4x1x4xf32>
            %16 = vector.transpose %15, [1, 2, 0] : vector<4x1x4xf32> to vector<1x4x4xf32>
            %17 = scf.for %arg6 = %c0 to %c128 step %c4 iter_args(%arg7 = %extracted_slice_3) -> (tensor<1x4x128xf32>) {
              %18 = vector.transfer_read %13#0[%c0, %c0, %arg6], %cst_2 {in_bounds = [true, true, true]} : tensor<1x4x128xf32>, vector<1x4x4xf32>
              %19 = arith.mulf %18, %16 : vector<1x4x4xf32>
              %20 = vector.transfer_write %19, %arg7[%c0, %c0, %arg6] {in_bounds = [true, true, true]} : vector<1x4x4xf32>, tensor<1x4x128xf32>
              scf.yield %20 : tensor<1x4x128xf32>
            }
            %inserted_slice = tensor.insert_slice %17 into %arg5[%arg2, %arg4, 0] [1, 4, 128] [1, 1, 1] : tensor<1x4x128xf32> into tensor<4x32x128xf32>
            scf.yield %inserted_slice : tensor<4x32x128xf32>
          }
          scf.yield %9 : tensor<4x32x128xf32>
        }
        flow.dispatch.tensor.store %8, %1, offsets = [%arg0, %arg1, 0], sizes = [4, 32, 128], strides = [1, 1, 1] : tensor<4x32x128xf32> -> !flow.dispatch.tensor<writeonly:tensor<12x128x128xf32>>
      }
    }
    return
  }
}

It is easier to explain the issue if we go a bit earlier in the compilation

      %9 = scf.for %arg2 = %c0 to %c4 step %c1 iter_args(%arg3 = %7) -> (tensor<4x32x128xf32>) {
        %10 = scf.for %arg4 = %c0 to %c32 step %c4 iter_args(%arg5 = %arg3) -> (tensor<4x32x128xf32>) {
          %extracted_slice = tensor.extract_slice %8[%arg2, %arg4, 0] [1, 4, 128] [1, 1, 1] : tensor<4x32x128xf32> to tensor<1x4x128xf32>
          %11 = linalg.fill ins(%cst : f32) outs(%6 : tensor<1x4xf32>) -> tensor<1x4xf32>
          %12 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%extracted_slice : tensor<1x4x128xf32>) outs(%11 : tensor<1x4xf32>) {
          ^bb0(%in: f32, %out: f32):
            %16 = arith.maxf %in, %out : f32
            linalg.yield %16 : f32
          } -> tensor<1x4xf32>
          %extracted_slice_2 = tensor.extract_slice %7[%arg2, %arg4, 0] [1, 4, 128] [1, 1, 1] : tensor<4x32x128xf32> to tensor<1x4x128xf32>
          %13 = linalg.fill ins(%cst_0 : f32) outs(%6 : tensor<1x4xf32>) -> tensor<1x4xf32>
          %14:2 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%extracted_slice, %12 : tensor<1x4x128xf32>, tensor<1x4xf32>) outs(%extracted_slice_2, %13 : tensor<1x4x128xf32>, tensor<1x4xf32>) {
          ^bb0(%in: f32, %in_4: f32, %out: f32, %out_5: f32):
            %16 = arith.subf %in, %in_4 : f32
            %17 = math.exp %16 : f32
            %18 = arith.addf %17, %out_5 : f32
            linalg.yield %17, %18 : f32, f32
          } -> (tensor<1x4x128xf32>, tensor<1x4xf32>)
          %extracted_slice_3 = tensor.extract_slice %arg5[%arg2, %arg4, 0] [1, 4, 128] [1, 1, 1] : tensor<4x32x128xf32> to tensor<1x4x128xf32>
          %15 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%14#0, %14#1 : tensor<1x4x128xf32>, tensor<1x4xf32>) outs(%extracted_slice_3 : tensor<1x4x128xf32>) {
          ^bb0(%in: f32, %in_4: f32, %out: f32):
            %16 = arith.divf %cst_1, %in_4 : f32
            %17 = arith.mulf %in, %16 : f32
            linalg.yield %17 : f32
          } -> tensor<1x4x128xf32>
          %inserted_slice = tensor.insert_slice %15 into %arg5[%arg2, %arg4, 0] [1, 4, 128] [1, 1, 1] : tensor<1x4x128xf32> into tensor<4x32x128xf32>
          scf.yield %inserted_slice : tensor<4x32x128xf32>
        }
        scf.yield %10 : tensor<4x32x128xf32>
      }

The stack allocation comes from %14#0. This is not in destination-passing style, and really there is no destination that can be used to convert it into that.... I was wondering though if there is a way we can make bufferization smarter to handle this case (either during bufferization, or as a preprocessing step; I know there is an EliminateInitTensors pass that does some things, maybe there).

In this case, %14#0 is dead after the use in %15. The last operation is all parallel, so you should be able to reuse the operand for outs for the ins as well.... I know we said we can do this in IREE, but currently the conversion to destination-passing style happens very early in IREE, and it muddles up the specification of the computation in a way that is hard to disentangle and affects the way other intervening passes work.... Hoping for a better solution that is more targeted, or runs just before bufferization.

matthias-springer commented 1 year ago

This doesn't fix the issue yet but could this line:

            %extracted_slice = tensor.extract_slice %6[%arg2, %arg4, 0] [1, 4, 128] [1, 1, 1] : tensor<4x32x128xf32> to tensor<1x4x128xf32>

be changed to this line?

            %extracted_slice = tensor.extract_slice %arg5[%arg2, %arg4, 0] [1, 4, 128] [1, 1, 1] : tensor<4x32x128xf32> to tensor<1x4x128xf32>

The difficult part is for the bufferization to understand that this loop is not reading from %arg7:

            %17 = scf.for %arg6 = %c0 to %c128 step %c4 iter_args(%arg7 = %extracted_slice_3) -> (tensor<1x4x128xf32>) {
              %18 = vector.transfer_read %13#0[%c0, %c0, %arg6], %cst_2 {in_bounds = [true, true, true]} : tensor<1x4x128xf32>, vector<1x4x4xf32>
              %19 = arith.mulf %18, %16 : vector<1x4x4xf32>
              %20 = vector.transfer_write %19, %arg7[%c0, %c0, %arg6] {in_bounds = [true, true, true]} : vector<1x4x4xf32>, tensor<1x4x128xf32>
              scf.yield %20 : tensor<1x4x128xf32>
            }

By just looking at this line:

              %20 = vector.transfer_write %19, %arg7[%c0, %c0, %arg6] {in_bounds = [true, true, true]} : vector<1x4x4xf32>, tensor<1x4x128xf32>

We are reading from %arg7. (Reading everything apart from [%arg6; %arg6+4).) Only when we see the loop structure, it becomes obvious that %arg7 is completely overwritten overall.

This is not a problem of destination-passing style. We need a range analysis. I have to think about it a bit to see if we can get away with a simpler analysis (because we only care whether %arg7 is written entirely or not, we don't care about the exact ranges). But the general problem here is a tricky one and has been a long-standing TODO for the bufferization.

matthias-springer commented 1 year ago

If you can bring the IR into this form before bufferization, it would bufferize without a copy. Then we don't have to solve the range analysis issue yet.

    %8 = scf.for %arg2 = %c0 to %c4 step %c1 iter_args(%arg3 = %6) -> (tensor<4x32x128xf32>) {
      %9 = scf.for %arg4 = %c0 to %c32 step %c4 iter_args(%arg5 = %arg3) -> (tensor<4x32x128xf32>) {
        %10 = scf.for %arg6 = %c0 to %c128 step %c4 iter_args(%arg7 = %cst) -> (vector<1x4xf32>) {
          %18 = vector.transfer_read %7[%arg2, %arg4, %arg6], %cst_2 {in_bounds = [true, true, true]} : tensor<4x32x128xf32>, vector<1x4x4xf32>
          %19 = vector.multi_reduction <maxf>, %18, %arg7 [2] : vector<1x4x4xf32> to vector<1x4xf32>
          scf.yield %19 : vector<1x4xf32>
        }

        // THIS IS NOW USING %arg5 instead of %6
        %extracted_slice = tensor.extract_slice %arg5[%arg2, %arg4, 0] [1, 4, 128] [1, 1, 1] : tensor<4x32x128xf32> to tensor<1x4x128xf32>

        %11 = vector.broadcast %10 : vector<1x4xf32> to vector<4x1x4xf32>
        %12 = vector.transpose %11, [1, 2, 0] : vector<4x1x4xf32> to vector<1x4x4xf32>
        %13:2 = scf.for %arg6 = %c0 to %c128 step %c4 iter_args(%arg7 = %extracted_slice, %arg8 = %cst_0) -> (tensor<1x4x128xf32>, vector<1x4xf32>) {
          %18 = vector.transfer_read %7[%arg2, %arg4, %arg6], %cst_2 {in_bounds = [true, true, true]} : tensor<4x32x128xf32>, vector<1x4x4xf32>
          %19 = arith.subf %18, %12 : vector<1x4x4xf32>
          %20 = math.exp %19 : vector<1x4x4xf32>
          %21 = vector.multi_reduction <add>, %20, %arg8 [2] : vector<1x4x4xf32> to vector<1x4xf32>
          %22 = vector.transfer_write %20, %arg7[%c0, %c0, %arg6] {in_bounds = [true, true, true]} : vector<1x4x4xf32>, tensor<1x4x128xf32>
          scf.yield %22, %21 : tensor<1x4x128xf32>, vector<1x4xf32>
        }

        // DELETED THE OTHER EXTRACT_SLICE HERE

        %14 = vector.broadcast %13#1 : vector<1x4xf32> to vector<4x1x4xf32>
        %15 = arith.divf %cst_1, %14 : vector<4x1x4xf32>
        %16 = vector.transpose %15, [1, 2, 0] : vector<4x1x4xf32> to vector<1x4x4xf32>

        // USE %13#0 AS ITER_ARG HERE
        %17 = scf.for %arg6 = %c0 to %c128 step %c4 iter_args(%arg7 = %13#0) -> (tensor<1x4x128xf32>) {
          // READ FROM %arg7
          %18 = vector.transfer_read %arg7[%c0, %c0, %arg6], %cst_2 {in_bounds = [true, true, true]} : tensor<1x4x128xf32>, vector<1x4x4xf32>
          %19 = arith.mulf %18, %16 : vector<1x4x4xf32>
          %20 = vector.transfer_write %19, %arg7[%c0, %c0, %arg6] {in_bounds = [true, true, true]} : vector<1x4x4xf32>, tensor<1x4x128xf32>
          scf.yield %20 : tensor<1x4x128xf32>
        }
        %inserted_slice = tensor.insert_slice %17 into %arg5[%arg2, %arg4, 0] [1, 4, 128] [1, 1, 1] : tensor<1x4x128xf32> into tensor<4x32x128xf32>
        scf.yield %inserted_slice : tensor<4x32x128xf32>
      }
      scf.yield %9 : tensor<4x32x128xf32>
    }
MaheshRavishankar commented 1 year ago

Thanks Matthias (and sorry for the delay, I was out the whole month and wanted to take some time to respond to this). So to summarize, you are saying

          %extracted_slice_2 = tensor.extract_slice %7[%arg2, %arg4, 0] [1, 4, 128] [1, 1, 1] : tensor<4x32x128xf32> to tensor<1x4x128xf32>
          %13 = linalg.fill ins(%cst_0 : f32) outs(%6 : tensor<1x4xf32>) -> tensor<1x4xf32>
          %14:2 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%extracted_slice, %12 : tensor<1x4x128xf32>, tensor<1x4xf32>) outs(%extracted_slice_2, %13 : tensor<1x4x128xf32>, tensor<1x4xf32>) {
          ^bb0(%in: f32, %in_4: f32, %out: f32, %out_5: f32):
            %16 = arith.subf %in, %in_4 : f32
            %17 = math.exp %16 : f32
            %18 = arith.addf %17, %out_5 : f32
            linalg.yield %17, %18 : f32, f32
          } -> (tensor<1x4x128xf32>, tensor<1x4xf32>)
          %extracted_slice_3 = tensor.extract_slice %arg5[%arg2, %arg4, 0] [1, 4, 128] [1, 1, 1] : tensor<4x32x128xf32> to tensor<1x4x128xf32>
          %15 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%14#0, %14#1 : tensor<1x4x128xf32>, tensor<1x4xf32>) outs(%extracted_slice_3 : tensor<1x4x128xf32>) {
          ^bb0(%in: f32, %in_4: f32, %out: f32):
            %16 = arith.divf %cst_1, %in_4 : f32
            %17 = arith.mulf %in, %16 : f32
            linalg.yield %17 : f32
          } -> tensor<1x4x128xf32>

should be modified to

          %extracted_slice_2 = tensor.extract_slice %arg5[%arg2, %arg4, 0] [1, 4, 128] [1, 1, 1] : tensor<4x32x128xf32> to tensor<1x4x128xf32>
          %13 = linalg.fill ins(%cst_0 : f32) outs(%6 : tensor<1x4xf32>) -> tensor<1x4xf32>
          %14:2 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%extracted_slice, %12 : tensor<1x4x128xf32>, tensor<1x4xf32>) outs(%extracted_slice_2, %13 : tensor<1x4x128xf32>, tensor<1x4xf32>) {
          ^bb0(%in: f32, %in_4: f32, %out: f32, %out_5: f32):
            %16 = arith.subf %in, %in_4 : f32
            %17 = math.exp %16 : f32
            %18 = arith.addf %17, %out_5 : f32
            linalg.yield %17, %18 : f32, f32
          } -> (tensor<1x4x128xf32>, tensor<1x4xf32>)
          %extracted_slice_3 = tensor.extract_slice %arg5[%arg2, %arg4, 0] [1, 4, 128] [1, 1, 1] : tensor<4x32x128xf32> to tensor<1x4x128xf32>
          %15 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%14#0, %14#1 : tensor<1x4x128xf32>, tensor<1x4xf32>) outs(%extracted_slice_3 : tensor<1x4x128xf32>) {
          ^bb0(%in: f32, %in_4: f32, %out: f32):
            %16 = arith.divf %cst_1, %in_4 : f32
            %17 = arith.mulf %in, %16 : f32
            linalg.yield %17 : f32
          } -> tensor<1x4x128xf32>

That basically gets it into destination-passing style. But to get it into that form I need to do range analysis (previously this was being done during tile-and-fuse, but that is actually semantically challenging to do during tile-and-fuse while maintaining destination-passing style correctly). This might be a simple instance of range analysis that could be incorporated into bufferization (or pre-bufferization), because the analysis required would be similar to what bufferization already does....

allieculp commented 1 year ago

@matthias-springer Any update on this issue?