iree-org / iree

A retargetable MLIR-based machine learning compiler and runtime toolkit.
http://iree.dev/

[CPU] Fuse DT pack/unpack through `tensor.reshape` #15242

Open dcaballe opened 1 year ago

dcaballe commented 1 year ago

It looks like some of the performance issues with DT are due to the pack/unpack ops not being fused with their respective producers/consumers. In the example below we can see that there is a flow.tensor.reshape between the two, so that is probably what we have to fix. Given that the producer op is a fully parallel elementwise op, it should be easy to move the reshape before the producer for these cases?

  %9 = flow.dispatch.workgroups(%cst, %7, %8) : (tensor<71x4672xf16>, tensor<71x64x4672xi8>, tensor<71x4672xi8>) -> tensor<71x64x4672xf32> =
      (%arg2: !flow.dispatch.tensor<readonly:tensor<71x4672xf16>>, %arg3: !flow.dispatch.tensor<readonly:tensor<71x64x4672xi8>>, %arg4: !flow.dispatch.tensor<readonly:tensor<71x4672xi8>>, %arg5: !flow.dispatch.tensor<writeonly:tensor<71x64x4672xf32>>) {
    %2135 = flow.dispatch.tensor.load %arg2, offsets = [0, 0], sizes = [71, 4672], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<71x4672xf16>> -> tensor<71x4672xf16>
    %2136 = flow.dispatch.tensor.load %arg3, offsets = [0, 0, 0], sizes = [71, 64, 4672], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<71x64x4672xi8>> -> tensor<71x64x4672xi8>
    %2137 = flow.dispatch.tensor.load %arg4, offsets = [0, 0], sizes = [71, 4672], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<71x4672xi8>> -> tensor<71x4672xi8>
    %2138 = tensor.empty() : tensor<71x64x4672xf32>
    %2139 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%2135, %2136, %2137 : tensor<71x4672xf16>, tensor<71x64x4672xi8>, tensor<71x4672xi8>) outs(%2138 : tensor<71x64x4672xf32>) {
    ^bb0(%in: f16, %in_322: i8, %in_323: i8, %out: f32):
      %2140 = arith.subi %in_322, %in_323 : i8
      %2141 = arith.sitofp %2140 : i8 to f16
      %2142 = arith.mulf %in, %2141 : f16
      %2143 = arith.extf %2142 : f16 to f32
      linalg.yield %2143 : f32
    } -> tensor<71x64x4672xf32>
    flow.dispatch.tensor.store %2139, %arg5, offsets = [0, 0, 0], sizes = [71, 64, 4672], strides = [1, 1, 1] : tensor<71x64x4672xf32> -> !flow.dispatch.tensor<writeonly:tensor<71x64x4672xf32>>
    flow.return
  } count() -> (index, index, index) {
    %x, %y, %z = flow.dispatch.workgroup_count_from_slice 
    flow.return %x, %y, %z : index, index, index
  }

  %10 = flow.tensor.reshape %9 : tensor<71x64x4672xf32> -> tensor<4544x4672xf32>

  %12 = flow.dispatch.workgroups(%10) : (tensor<4544x4672xf32>) -> tensor<292x4544x16x1xf32> =
      (%arg2: !flow.dispatch.tensor<readonly:tensor<4544x4672xf32>>, %arg3: !flow.dispatch.tensor<writeonly:tensor<292x4544x16x1xf32>>) {
    %cst_322 = arith.constant 0.000000e+00 : f32
    %2135 = flow.dispatch.tensor.load %arg2, offsets = [0, 0], sizes = [4544, 4672], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<4544x4672xf32>> -> tensor<4544x4672xf32>
    %2136 = tensor.empty() : tensor<292x4544x16x1xf32>
    %pack = tensor.pack %2135 padding_value(%cst_322 : f32) outer_dims_perm = [1, 0] inner_dims_pos = [1, 0] inner_tiles = [16, 1] into %2136 : tensor<4544x4672xf32> -> tensor<292x4544x16x1xf32>
    flow.dispatch.tensor.store %pack, %arg3, offsets = [0, 0, 0, 0], sizes = [292, 4544, 16, 1], strides = [1, 1, 1, 1] : tensor<292x4544x16x1xf32> -> !flow.dispatch.tensor<writeonly:tensor<292x4544x16x1xf32>>
    flow.return
  } count() -> (index, index, index) {
    %x, %y, %z = flow.dispatch.workgroup_count_from_slice 
    flow.return %x, %y, %z : index, index, index
  }

This is the unpack side:

  %14 = flow.dispatch.workgroups(%13) : (tensor<7x292x16x16xf32>) -> tensor<100x4672xf32> =
      (%arg2: !flow.dispatch.tensor<readonly:tensor<7x292x16x16xf32>>, %arg3: !flow.dispatch.tensor<writeonly:tensor<100x4672xf32>>) {
    %2135 = flow.dispatch.tensor.load %arg2, offsets = [0, 0, 0, 0], sizes = [7, 292, 16, 16], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<7x292x16x16xf32>> -> tensor<7x292x16x16xf32>
    %2136 = tensor.empty() : tensor<100x4672xf32>
    %unpack = tensor.unpack %2135 inner_dims_pos = [0, 1] inner_tiles = [16, 16] into %2136 : tensor<7x292x16x16xf32> -> tensor<100x4672xf32>
    flow.dispatch.tensor.store %unpack, %arg3, offsets = [0, 0], sizes = [100, 4672], strides = [1, 1] : tensor<100x4672xf32> -> !flow.dispatch.tensor<writeonly:tensor<100x4672xf32>>
    flow.return
  } count() -> (index, index, index) {
    %x, %y, %z = flow.dispatch.workgroup_count_from_slice 
    flow.return %x, %y, %z : index, index, index
  }
  %15 = flow.tensor.reshape %14 : tensor<100x4672xf32> -> tensor<467200xf32>
  %16 = flow.dispatch.workgroups(%15) : (tensor<467200xf32>) -> tensor<467200xf16> =
      (%arg2: !flow.dispatch.tensor<readonly:tensor<467200xf32>>, %arg3: !flow.dispatch.tensor<writeonly:tensor<467200xf16>>) {
    %cst_322 = arith.constant 0.000000e+00 : f16
    %2135 = flow.dispatch.tensor.load %arg2, offsets = [0], sizes = [467200], strides = [1] : !flow.dispatch.tensor<readonly:tensor<467200xf32>> -> tensor<467200xf32>
    %2136 = tensor.empty() : tensor<467200xf16>
    %2137 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%2135 : tensor<467200xf32>) outs(%2136 : tensor<467200xf16>) {
    ^bb0(%in: f32, %out: f16):
      %2138 = arith.truncf %in : f32 to f16
      %2139 = arith.addf %2138, %cst_322 : f16
      linalg.yield %2139 : f16
    } -> tensor<467200xf16>
    flow.dispatch.tensor.store %2137, %arg3, offsets = [0], sizes = [467200], strides = [1] : tensor<467200xf16> -> !flow.dispatch.tensor<writeonly:tensor<467200xf16>>
    flow.return
  } count() -> (index, index, index) {
    %x, %y, %z = flow.dispatch.workgroup_count_from_slice 
    flow.return %x, %y, %z : index, index, index
  }
MaheshRavishankar commented 1 year ago

This requires a little bit more of the input to really see what's happening. We need to track down where these reshapes got introduced to really fix the issue. Do you have the linalg input (or a snippet that shows similar behavior) to triage?

dcaballe commented 1 year ago

Does the IR dump help? This one is OSS so we can share: https://gist.github.com/dcaballe/a4d4f5081391e9399cb56e015ae28bd5

dcaballe commented 1 year ago

This is a similar case from the gist, where the op in the middle is a tensor.collapse_shape:

  %12 = flow.dispatch.region -> (tensor<71x64x584x8xf32>) {
    %1308 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%cst, %expanded, %hoisted_0 : tensor<71x584x8xf16>, tensor<71x64x584x8xi8>, tensor<71x584x8xi8>) outs(%11 : tensor<71x64x584x8xf32>) {
    ^bb0(%in: f16, %in_1260: i8, %in_1261: i8, %out: f32):
      %1309 = arith.subi %in_1260, %in_1261 : i8
      %1310 = arith.sitofp %1309 : i8 to f16
      %1311 = arith.mulf %in, %1310 : f16
      %1312 = arith.extf %1311 : f16 to f32
      linalg.yield %1312 : f32
    } -> tensor<71x64x584x8xf32>
    flow.return %1308 : tensor<71x64x584x8xf32>
  }

  %collapsed_333 = tensor.collapse_shape %12 [[0, 1], [2, 3]] : tensor<71x64x584x8xf32> into tensor<4544x4672xf32>

  %16 = flow.dispatch.region -> (tensor<292x4544x16x1xf32>) {                                                                                                                        
    %pack = tensor.pack %collapsed_333 padding_value(%cst_128 : f32) outer_dims_perm = [1, 0] inner_dims_pos = [1, 0] inner_tiles = [16, 1] into %15 : tensor<4544x4672xf32> -> tensor<292x4544x16x1xf32>                                                                                                                                                                 
    flow.return %pack : tensor<292x4544x16x1xf32>                                                                                                                                    
  }

There should be a symmetrical case with tensor.expand_shape, I guess.

hanhanW commented 1 year ago

Does the IR dump help? This one is OSS so we can share: https://gist.github.com/dcaballe/a4d4f5081391e9399cb56e015ae28bd5

Hey @dcaballe, the gist does not include the dump.

My guess is that the reshape ops are pushed down because of elementwise fusion.

dcaballe commented 1 year ago

Oops, not sure what happened: https://gist.github.com/dcaballe/e196b70a4dbce30c8ef951bd96b84987. I had to trim part of the dump because it's too large for a gist, so I'm only including the dump after the const-eval jitting. Let me know if that's not enough. Thanks!

hanhanW commented 1 year ago

Packing on reshape ops is part of the input; we will need to teach IREE to either (1) fuse them into a single dispatch, or (2) bubble up the reshape ops across element-wise generic ops.

We cannot simply go with (1) because reshape ops would stop TileAndFuse. We will still need (2), which can then fold the reshape ops into flow.dispatch.tensor.load ops.
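
To make (2) concrete, a minimal sketch of what the fold would look like, using the shapes from the first dispatch above and hypothetical bindings (%arg0, %arg0_2d); a load followed by a collapse_shape on the loaded value would become a direct load of the collapsed type, assuming the binding type is adjusted accordingly:

  // Load followed by a collapse_shape on the loaded value ...
  %t = flow.dispatch.tensor.load %arg0, offsets = [0, 0, 0], sizes = [71, 64, 4672], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<71x64x4672xf32>> -> tensor<71x64x4672xf32>
  %c = tensor.collapse_shape %t [[0, 1], [2]] : tensor<71x64x4672xf32> into tensor<4544x4672xf32>

  // ... folds into a load of the collapsed shape.
  %c_folded = flow.dispatch.tensor.load %arg0_2d, offsets = [0, 0], sizes = [4544, 4672], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<4544x4672xf32>> -> tensor<4544x4672xf32>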

In any case, we need to support swapping an element-wise generic op and a reshape op when

  1. The producer of the pack op is a reshape op.
  2. The producer of the reshape op is an element-wise operation.
  3. The result of the element-wise operation is only used by reshape ops.

If all the conditions are met, we can bubble up the reshape across the generic op and things will get fused together. This can also help with the number-of-dispatches issue in data tiling.
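
For illustration, a minimal sketch of that swap using the shapes from the tensor.collapse_shape example above (hypothetical SSA names, a single-input elementwise op for brevity, and %dest/%init standing in for the real init tensors):

  // Before: the collapse_shape sits between the elementwise generic and the pack.
  %elem = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%in : tensor<71x64x584x8xf16>) outs(%init : tensor<71x64x584x8xf32>) {
  ^bb0(%a: f16, %b: f32):
    %e = arith.extf %a : f16 to f32
    linalg.yield %e : f32
  } -> tensor<71x64x584x8xf32>
  %collapsed = tensor.collapse_shape %elem [[0, 1], [2, 3]] : tensor<71x64x584x8xf32> into tensor<4544x4672xf32>
  %pack = tensor.pack %collapsed outer_dims_perm = [1, 0] inner_dims_pos = [1, 0] inner_tiles = [16, 1] into %dest : tensor<4544x4672xf32> -> tensor<292x4544x16x1xf32>

  // After the swap: the input is collapsed instead, the generic runs on the collapsed
  // shape, and the pack now consumes the generic directly, so the two can be fused.
  %collapsed_in = tensor.collapse_shape %in [[0, 1], [2, 3]] : tensor<71x64x584x8xf16> into tensor<4544x4672xf16>
  %elem_2d = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%collapsed_in : tensor<4544x4672xf16>) outs(%init_2d : tensor<4544x4672xf32>) {
  ^bb0(%a: f16, %b: f32):
    %e = arith.extf %a : f16 to f32
    linalg.yield %e : f32
  } -> tensor<4544x4672xf32>
  %pack_2 = tensor.pack %elem_2d outer_dims_perm = [1, 0] inner_dims_pos = [1, 0] inner_tiles = [16, 1] into %dest : tensor<4544x4672xf32> -> tensor<292x4544x16x1xf32>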

For the unpack op, we will need something similar, though the consumer op (which is a reshape) is generated by FusionOfTensorOps.

This is a valid pattern that we want for the data-tiling path; we should have someone implement the pass and use it in IREE. I'm happy to go through more details if someone is interested.

MaheshRavishankar commented 1 year ago

Packing on reshape ops is part of the input; we will need to teach IREE to either (1) fuse them into a single dispatch, or (2) bubble up the reshape ops across element-wise generic ops.

We cannot simply go with (1) because reshape ops would stop TileAndFuse. We will still need (2), which can then fold the reshape ops into flow.dispatch.tensor.load ops.

In any case, we need to support swapping an element-wise generic op and a reshape op when

  1. The producer of the pack op is a reshape op.
  2. The producer of the reshape op is an element-wise operation.
  3. The result of the element-wise operation is only used by reshape ops.

If all the conditions are met, we can bubble up the reshape across the generic op and things will get fused together. This can also help with the number-of-dispatches issue in data tiling.

For the unpack op, we will need something similar, though the consumer op (which is a reshape) is generated by FusionOfTensorOps.

This is a valid pattern that we want for the data-tiling path; we should have someone implement the pass and use it in IREE. I'm happy to go through more details if someone is interested.

Let's not over-index on the reshape ops... The reshape ops already get bubbled up/pushed down to enhance fusion... I think there might be a "small" issue where reshapes on "inputs" get pushed down too. We should probably just prevent that. This is too much in the details, though. Nothing I see here seems to account for a 10x slowdown...

hanhanW commented 1 year ago

Do we have a priority feature for issues today? I'd suggest de-prioritizing this to P2. Others are more critical to me.

MaheshRavishankar commented 1 year ago

Do we have a priority feature for issues today? I'd suggest de-prioritizing this to P2. Others are more critical to me.

I agree... the reshape here is not the issue. We need profiles to see why there is a 10X slowdown here.

NatashaKnk commented 1 year ago

Is there a chance that #15273 is causing it for a model that has a lot of vector operations? It might not apply to this particular IR, but the collapsing/expanding would block fusion in those cases as well.

If that's the case, we cannot bubble the collapse/expand up or down the stack, since the shape is crucial to the packing/unpacking, as far as I understand. I'm not an expert in this area, so I'm not sure how else we can approach this.

hanhanW commented 1 year ago

https://github.com/openxla/iree/pull/15273 can cause the issue, but that's not the case in this model. The pack ops in the dump are not packing a narrow matrix.

dcaballe commented 1 year ago

This issue is one of the main ones causing the performance regression, so I think it should still be P1. Hanhan and I identified two cases related to this issue: 1) reshape ops coming from the ExpandVector pass, 2) reshape ops coming from the framework itself (the test provided above).

For 1), we know that this is coming from ExpandVector and that the collapse/expand ops generated are trivial (e.g., 1xK -> K, K -> 1xK). I think we should look at how to support these trivial cases in fusion, if possible. We can open a separate issue if you think it's more appropriate. Also, perhaps Natasha can provide a small repro for this case?
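
For reference, these trivial reshapes are just unit-dimension expand/collapse ops in the IR; a small sketch with a made-up K = 4672 and hypothetical values %v and %m2:

  // K -> 1xK: insert a unit dimension so the vector can be treated as a 1xK matrix.
  %m = tensor.expand_shape %v [[0, 1]] : tensor<4672xf32> into tensor<1x4672xf32>
  // 1xK -> K: drop the unit dimension again after the computation.
  %v2 = tensor.collapse_shape %m2 [[0, 1]] : tensor<1x4672xf32> into tensor<4672xf32>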

For 2), we are trying to understand where these reshapes are coming from to see what can be done.

When I reported this issue, we only had case 2) and the regression was up to 2x.

NatashaKnk commented 1 year ago

Adding @bjacob since he might have some input here, and @pzread since he has been helping with this internally as well. I'm suspecting the 10x slowdown is caused by a specific layout of the internal model, so this might be higher priority internally. Either way, I'm happy to pick it up if I can get some guidance.

Here's a StableHLO code snippet followed by the dispatch regions. %5 (in the output IR) is the problematic reshape. Just to summarize the approaches discussed here and internally for us:

The simple approaches:

  1. The "bubbling up" method for trivial reshapes - since pack/unpack are not elementwise ops, they serve as "blockers" for the bubbling of reshapes to the top and bottom. However, is there a way to enforce this for trivial reshapes that just add a dimension of size 1?

  2. Fusing the reshape into the flow.dispatch.tensor.load op - Is this feasible to do? I'm not familiar with this area but happy to do it if I can get a quick summary of the approach and some pointers.

The more involved approaches:

  1. Modifying the pack/unpack ops to be able to take in a 1D vector and pack it into 4 dimensions. For example, with a tile size [2, 4] and a vector of size 12 that's going to be vecmat-ed, we would pack it directly into 1x3x1x4 instead of first expanding it into 1x12 and then packing (see the sketch after this list). This approach would require some adjustments to how pack/unpack work, and it seems complicated enough as-is, but it seems possible in theory.

  2. Introduce an alternative to mmt4d that only does tiling across one dimension as opposed to both. This solution seems to have a lot of overhead, but seems preferable to No. 3 if we can't get No. 1 or No. 2 to work.
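
To make the first of these concrete: today the 1-D operand has to be expanded before it can be packed. A minimal sketch with the sizes from the item above, where the unit dimension effectively takes a tile size of 1 and %vec/%dest are hypothetical values:

  // Current form: expand the vector to 1x12, then pack with inner tiles [1, 4].
  %expanded = tensor.expand_shape %vec [[0, 1]] : tensor<12xf32> into tensor<1x12xf32>
  %packed = tensor.pack %expanded inner_dims_pos = [0, 1] inner_tiles = [1, 4] into %dest : tensor<1x12xf32> -> tensor<1x3x1x4xf32>
  // The proposal would let tensor.pack consume the tensor<12xf32> directly and produce
  // the same 1x3x1x4 layout without the intermediate expand_shape.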

If there's an agreement on the best path forward, I'm happy to implement it. Also happy to set up a time to meet if we need to.

hanhanW commented 1 year ago

I think we should fix https://github.com/openxla/iree/issues/15349 first. You will get back to that one even if you can fuse elementwise-reshape-pack or elementwise-pack. I don't have access to the model, so I can only advise on a few things.

I'm suspecting the 10x slowdown is caused by a specific layout of the internal model, so this might be higher priority internally. Either way, I'm happy to pick it up if I can get some guidance.

Here's a StableHLO code snippet followed by the dispatch regions. %5 (in the output IR) is the problematic reshape. Just to summarize the approaches discussed here and internally for us:

The dump looks like the wrong one to me; it does not match what's described in the issue. %5 is not blocking the fusion. There are two pack ops: one is already fused with its producer, the other is applied on inputs.

The simple approaches:

  1. The "bubbling up" method for trivial reshapes - since pack/unpack are not elementwise ops, they serve as "blockers" for the bubbling of reshapes to the top and bottom. However, is there a way to enforce this for trivial reshapes that just add a dimension of size 1?

You need to implement the pattern and see how it helps your case.

  1. Fusing the reshape into the flow.dispatch.tensor.load op - Is this feasible to do? I'm not familiar with this area but happy to do it if I can get a quick summary of the approach and some pointers.

If the reshape is applied on inputs, we don't pull it into the same dispatch; it is not needed. We can just leave it at the flow level and replace it with a flow.tensor.reshape op. It is needed only if:

  1. We form elementwise + reshape + pack into the same dispatch.
  2. We bubble up the reshape to the inputs; then we want to fold it into flow.dispatch.tensor.load.

We don't want this for many reasons...

The more involved approaches:

  1. Modifying the pack/unpack ops to be able to take in a 1D vector and pack it into 4 dimensions. For example, with a tile size [2, 4] and a vector of size 12 that's going to be vecmat-ed, we would pack it directly into 1x3x1x4 instead of first expanding it into 1x12 and then packing. This approach would require some adjustments to how pack/unpack work, and it seems complicated enough as-is, but it seems possible in theory.

Yes, this is something we can try. You need to implement the pattern and see how it helps your case.

  1. Introduce an alternative to mmt4d that only does tiling across one dimension as opposed to both. This solution seems to have a lot of overhead, but seems preferable to No. 3 if we can't get No. 1 or No. 2 to work.

If there's an agreement on the best path forward, I'm happy to implement it. Also happy to set up a time to meet if we need to.

Moving reshape ops across generic ops is tricky because the fusion logic is involved. Moving pack ops around is easier to integrate. If you think this can address your issue, feel free to implement it and we can try to integrate it into IREE.

MaheshRavishankar commented 1 year ago

@NatashaKnk I have only seen huge IR diffs of the issue.... Hanhan provided some guidance... but if you can give a smaller description of the problem (small IR repro), I can provide some more feedback.

dcaballe commented 1 year ago

Let me try to provide a bit of context as I think we have been discussing the details of this issue across different related issues. There are two sub-problems we have identified:

  1. The simple one: collapse/expand shape ops that add/remove unit dims. This problem shows up in particular when applying DT to vecmat/matvec ops and is largely contributing to the regression. IMO, this should be the first one to be solved.
  2. The tricky one: collapse/expand shape ops that reshape the tensor arbitrarily. This problem was already there when the regression was only 2x.

We can create separate issues if that would help, now that we know that the fix for 1. and 2. may have a different solution.

In terms of priorities, I would go with sub-problem 1 first, then #15349, and then sub-problem 2.

Fusing the reshape into the flow.dispatch.tensor.load op - Is this feasible to do? I'm not familiar with this area but happy to do it if I can get a quick summary of the approach and some pointers.

I think the solution for sub-problem 1 should be along these lines. These unit-extending/collapsing cases should be relatively easy to support in fusion, so we could extend the dispatch formation logic to include the reshaping op within the dispatch if the pack op can be fused with its producer across the reshape op. Do you think this is feasible?

The more involved approaches:

Modifying the pack/unpack ops to be able to take in a 1D vector and pack it into 4 dimensions. For example, with a tile size [2, 4] and a vector of size 12 that's going to be vecmat-ed, we would pack it directly into 1x3x1x4 instead of first expanding it into 1x12 and then packing. This approach would require some adjustments to how pack/unpack work, and it seems complicated enough as-is, but it seems possible in theory.

Yes, this is something we can try. You need to implement the pattern and see how it helps your case.

Would this tackle sub-problem 2? If it only addresses sub-problem 1, I think the added complexity in pack/unpack (which is already high) may not be worth it. It wouldn't solve the problem for all cases, either: for example, a unit-dim reshape between two ops where neither of them is a pack/unpack op. Those cases, however, would be covered by the fusion approach.

MaheshRavishankar commented 1 year ago

Let me try to provide a bit of context as I think we have been discussing the details of this issue across different related issues. There are two sub-problems we have identified:

  1. The simple one: collapse/expand shape ops that add/remove unit dims. This problem shows up in particular when applying DT to vecmat/matvec ops and is largely contributing to the regression. IMO, this should be the first one to be solved.
  2. The tricky one: collapse/expand shape ops that reshape the tensor arbitrarily. This problem was already there when the regression was only 2x.

We can create separate issues if that would help, now that we know that the fix for 1. and 2. may have a different solution.

In terms of priorities, I would go with sub-problem 1 first, then #15349, and then sub-problem 2.

Fusing the reshape into the flow.dispatch.tensor.load op - Is this feasible to do? I'm not familiar with this area but happy to do it if I can get a quick summary of the approach and some pointers.

I think the solution for sub-problem 1 should be along these lines. These unit-extending/collapsing cases should be relatively easy to support in fusion, so we could extend the dispatch formation logic to include the reshaping op within the dispatch if the pack op can be fused with its producer across the reshape op. Do you think this is feasible?

Having reshapes in the middle of the dispatch is a bit of a footgun... You have to be able to propagate the reshapes to the edges; otherwise bufferization is going to choke on that. So the fusion-on-tensors pass is supposed to move the reshapes around so that we do not have to fuse across dispatches... To triage, the focus should be on why the fusion pass is not able to move these around.

The more involved approaches: Modifying the pack/unpack ops to be able to take in a 1D vector and pack it into 4 dimensions. For example, with a tile size [2, 4] and a vector of size 12 that's going to be vecmat-ed, we would pack it directly into 1x3x1x4 instead of first expanding it into 1x12 and then packing. This approach would require some adjustments to how pack/unpack work, and it seems complicated enough as-is, but it seems possible in theory.

Yes, this is something we can try. You need to implement the pattern and see how it helps your case.

Would this tackle sub-problem 2? If it only addresses sub-problem 1, I think the added complexity in pack/unpack (which is already high) may not be worth it. It wouldn't solve the problem for all cases, either: for example, a unit-dim reshape between two ops where neither of them is a pack/unpack op. Those cases, however, would be covered by the fusion approach.

bjacob commented 12 months ago

Can you retry the above benchmark that showed DT to be a regression, but with DT+UK? I would like to rule out the possibility that some of the performance difference observed here is related to mmt4d codegen. @dcaballe @NatashaKnk @MaheshRavishankar @hanhanW. Note: one way to benchmark with UK is to apply https://github.com/openxla/iree/pull/15586.