Closed stellaraccident closed 2 years ago
Looks like some stuff around select handling; unrealized_conversion_cast sneaks in and doesn't go away:
%338 = arith.cmpi eq, %337, %c0_i32 : i32
%339 = select %338, %323, %331 : !stream.resource<*>
%340 = select %338, %321, %330 : index
%341 = builtin.unrealized_conversion_cast %339, %340 : !stream.resource<*>, index to !stream.resource<*>
Should be pretty easy to fix up.
Note that whatever this model is doing is quite extreme in highlighting our current linalg fusion deficiencies. If we're expecting programs like this we really can't keep kicking the can on that.
%93 = "mhlo.compare"(%arg159, %3) {compare_type = "FLOAT", comparison_direction = "EQ"} : (tensor<1x1x1x3xf32>, tensor<1x1x1x3xf32>) -> tensor<1x1x1x3xi1>
%94 = mhlo.reduce %93, %83 ( {
^bb0(%arg322: tensor<i1>, %arg323: tensor<i1>): // no predecessors
%2359 = mhlo.and %arg322, %arg323 : tensor<i1>
"mhlo.return"(%2359) : (tensor<i1>) -> ()
}) {dimensions = dense<[0, 1, 2, 3]> : tensor<4xi64>} : (tensor<1x1x1x3xi1>, tensor<i1>) -> tensor<i1>
%95 = "mhlo.not"(%94) : (tensor<i1>) -> tensor<i1>
%96 = "mhlo.convert"(%95) : (tensor<i1>) -> tensor<i32>
%97 = tensor.extract %96[] : tensor<i32>
%98 = arith.cmpi eq, %97, %c0_i32 : i32
%99 = select %98, %arg321, %92 : tensor<1x224x224x3xf32>
->
%5 = flow.dispatch.workgroups[%c3, %c1, %c1](%1) : (tensor<3xf32>) -> tensor<3xi1> =
(%arg1: !flow.dispatch.tensor<readonly:3xf32>, %arg2: !flow.dispatch.tensor<writeonly:3xi1>) {
%cst_1 = arith.constant -1.000000e+00 : f32
%c3_2 = arith.constant 3 : index
%781 = linalg.init_tensor [3] : tensor<3xi1>
%workgroup_size_0 = flow.dispatch.workgroup.size[0] : index
%workgroup_id_0 = flow.dispatch.workgroup.id[0] : index
%workgroup_count_0 = flow.dispatch.workgroup.count[0] : index
%782 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_0, %workgroup_size_0]
%783 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_0, %workgroup_size_0]
scf.for %arg3 = %782 to %c3_2 step %783 {
%784 = affine.min affine_map<(d0, d1) -> (d0, -d1 + 3)>(%workgroup_size_0, %arg3)
%785 = flow.dispatch.tensor.load %arg1, offsets = [%arg3], sizes = [%784], strides = [1] : !flow.dispatch.tensor<readonly:3xf32> -> tensor<?xf32>
%786 = affine.min affine_map<(d0, d1) -> (d0, -d1 + 3)>(%workgroup_size_0, %arg3)
%787 = tensor.extract_slice %781[%arg3] [%786] [1] : tensor<3xi1> to tensor<?xi1>
%788 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%785 : tensor<?xf32>) outs(%787 : tensor<?xi1>) {
^bb0(%arg4: f32, %arg5: i1): // no predecessors
%789 = arith.cmpf oeq, %arg4, %cst_1 : f32
linalg.yield %789 : i1
} -> tensor<?xi1>
flow.dispatch.tensor.store %788, %arg2, offsets = [%arg3], sizes = [%786], strides = [1] : tensor<?xi1> -> !flow.dispatch.tensor<writeonly:3xi1>
}
flow.return
}
%6 = flow.dispatch.workgroups[%c1, %c1, %c1](%5) : (tensor<3xi1>) -> tensor<i32> =
(%arg1: !flow.dispatch.tensor<readonly:3xi1>, %arg2: !flow.dispatch.tensor<writeonly:i32>) {
%true = arith.constant true
%781 = flow.dispatch.tensor.load %arg1, offsets = [], sizes = [], strides = [] : !flow.dispatch.tensor<readonly:3xi1> -> tensor<3xi1>
%782 = linalg.init_tensor [] : tensor<i32>
%783 = linalg.init_tensor [] : tensor<i1>
%784 = linalg.fill(%true, %783) : i1, tensor<i1> -> tensor<i1>
%785 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> ()>], iterator_types = ["reduction"]} ins(%781 : tensor<3xi1>) outs(%784 : tensor<i1>) {
^bb0(%arg3: i1, %arg4: i1): // no predecessors
%787 = arith.andi %arg3, %arg4 : i1
linalg.yield %787 : i1
} -> tensor<i1>
%786 = linalg.generic {indexing_maps = [affine_map<() -> ()>, affine_map<() -> ()>], iterator_types = []} ins(%785 : tensor<i1>) outs(%782 : tensor<i32>) {
^bb0(%arg3: i1, %arg4: i32): // no predecessors
%787 = arith.xori %arg3, %true : i1
%788 = arith.extui %787 : i1 to i32
linalg.yield %788 : i32
} -> tensor<i32>
flow.dispatch.tensor.store %786, %arg2, offsets = [], sizes = [], strides = [] : tensor<i32> -> !flow.dispatch.tensor<writeonly:i32>
flow.return
}
%7 = tensor.extract %6[] : tensor<i32>
%8 = arith.cmpi eq, %7, %c0_i32 : i32
%9 = select %8, %0, %4 : tensor<1x224x224x3xf32>
->
%5 = flow.dispatch @predict_dispatch_1::@predict_dispatch_1[%c3, %c1, %c1](%1) : (tensor<3xf32>) -> tensor<3xi1>
%6 = flow.dispatch @predict_dispatch_2::@predict_dispatch_2[%c1, %c1, %c1](%5) : (tensor<3xi1>) -> tensor<i32>
%7 = flow.tensor.load %6 : tensor<i32>
%8 = arith.cmpi eq, %7, %c0_i32 : i32
%9 = select %8, %0, %4 : tensor<1x224x224x3xf32>
There are 54 of these in the model, meaning 54 command buffers and 54 device->host synchronizations. Detensoring won't help here as the result of the select (which tensor to use) is then consumed by another dispatch region - instead we need to be moving this kind of logic into the dispatch regions so that we keep only pure device->device dependencies. Is this the kind of thing you're doing in the model yourself, or something that comes from JAX?
device->host readbacks look like this:
An inlining step on flow.dispatch.workgroups would be able to pull in the select on inputs, and then a simple inlining of anything acting on primitives up to a flow.tensor.load could pull in the rest. So we may be able to do this as a cleanup step in this particular use pattern with the goal being to never have a device->host->device flow unless there are ops in that chain that have side effects.
It's a good case to keep in view for generality -- this kind of program (without additional folding) will happen for training flows, but not inference. Kind of a blast from the past... in an entirely different context about the same topic, I remember working with the XLA team some years ago because this same arithmetic was defeating fusion in XLA and causing excessively large programs (I don't actually know if it ever got fixed -- I suspect not).
Oh for sure - less "don't do that" and more "if you do that we need to handle it better" - so mainly curious if this is a quirk of JAX that we should expect to see in other models (sounds like it) and if so then getting the inlining/readback elision in will be important to do sooner rather than later. I think with that we'll be able to handle this particular thing pretty well though there still will be fusion work to do - looking at what we have in here there should be far fewer dispatches (I feel like I had an issue for this somewhere - basically, we need more than one loop nest per dispatch).
Also, this pattern tends to show up a lot in piecewise training programs (i.e. those that take a different branch on some numerics after a certain point in training, conditioned on anything from a simple condition to some analysis of threshold values). There may be some initial high level simplifications to kick in that break it apart further: most programs I see on this kind of thing basically have a big switch that is constant for an entire step that says "go left vs right". But ML being unrolled/traced so much, that often gets buried and repeated in various forms throughout the program.
Almost all that "inference optimizers" do is fix the conditions driving these switches and apply pretty basic constant folding.
unrealized_conversion_cast cleanup is in https://github.com/google/iree/pull/7814 - it makes this model get further but it fails on layout of the partitions due to some reordering nastiness. I'll take a look on Monday.
Smaller repro here:
#map0 = affine_map<()[s0, s1] -> (s0 * s1)>
#map1 = affine_map<(d0)[s0] -> (s0, -d0 + 224)>
#map2 = affine_map<(d0)[s0] -> (s0, -d0 + 3)>
#map3 = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
#map4 = affine_map<(d0, d1, d2) -> (d2)>
#map5 = affine_map<(d0) -> (d0)>
#map6 = affine_map<(d0) -> ()>
#map7 = affine_map<() -> ()>
#map8 = affine_map<(d0)[s0] -> (s0, -d0 + 7)>
#map9 = affine_map<(d0)[s0] -> (s0, -d0 + 64)>
#map10 = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>
#map11 = affine_map<(d0)[s0] -> (-d0 + 64, s0)>
#map12 = affine_map<(d0, d1, d2, d3) -> (d1, d2, d3, d0)>
#map13 = affine_map<(d0, d1, d2, d3) -> (d0)>
#map14 = affine_map<(d0, d1, d2, d3) -> (d3)>
#map15 = affine_map<()[s0, s1] -> (s1 * s0)>
#map16 = affine_map<(d0) -> (d0 + 3)>
#map17 = affine_map<(d0)[s0] -> (s0, -d0 + 112)>
#map18 = affine_map<(d0) -> (d0 * 2)>
#map19 = affine_map<(d0, d1) -> (d0 * 2 + 5, d1 * -2 + 229)>
#map20 = affine_map<(d0)[s0] -> (-d0 + 112, s0)>
#map21 = affine_map<(d0)[s0] -> (s0 * 2 + 1, d0 * -2 + 113)>
#map22 = affine_map<(d0)[s0] -> (s0, -d0 + 56)>
#map23 = affine_map<(d0)[s0] -> (-d0 + 56, s0)>
#map24 = affine_map<(d0)[s0] -> (s0, -d0 + 256)>
#map25 = affine_map<(d0, d1) -> (d0, d1)>
#map26 = affine_map<(d0)[s0] -> (-d0 + 256, s0)>
#map27 = affine_map<(d0, d1) -> (d1, d0)>
#map28 = affine_map<(d0, d1) -> (d0)>
#map29 = affine_map<(d0, d1) -> (d1)>
#map30 = affine_map<(d0)[s0] -> (s0, -d0 + 3136)>
#map31 = affine_map<(d0)[s0] -> (-d0 + 3136, s0)>
#map32 = affine_map<(d0) -> (d0 + 1)>
#map33 = affine_map<(d0, d1) -> (d1 + 2, -d0 + 58)>
module @resnet_inference_model {
util.global private @_variables$0 : !stream.resource<constant>
util.global private @_variables$0__size : index
util.global private @_variables$1 : !stream.resource<constant>
util.global private @_variables$1__size : index
util.global private @_variables$2 : !stream.resource<constant>
util.global private @_variables$2__size : index
util.global private @_variables$3 : !stream.resource<constant>
util.global private @_variables$3__size : index
util.global private @_variables$4 : !stream.resource<constant>
util.global private @_variables$4__size : index
util.global private @_variables$5 : !stream.resource<constant>
util.global private @_variables$5__size : index
util.global private @_variables$6 : !stream.resource<constant>
util.global private @_variables$6__size : index
util.global private @_variables$7 : !stream.resource<constant>
util.global private @_variables$7__size : index
util.global private @_variables$104 : !stream.resource<constant>
util.global private @_variables$104__size : index
util.global private @_variables$105 : !stream.resource<constant>
util.global private @_variables$105__size : index
util.global private @_variables$112 : !stream.resource<constant>
util.global private @_variables$118 : !stream.resource<constant>
util.global private @_variables$124 : !stream.resource<constant>
util.global private @_variables$130 : !stream.resource<constant>
util.global private @_variables$136 : !stream.resource<constant>
util.global private @_variables$424 : !stream.resource<constant>
util.global private @_variables$432 : !stream.resource<constant>
util.global private @_variables$432__size : index
util.global private @_variables$433 : !stream.resource<constant>
util.global private @_variables$433__size : index
util.global private @_variables$434 : !stream.resource<constant>
util.global private @_variables$434__size : index
util.global private @_variables$435 : !stream.resource<constant>
util.global private @_variables$435__size : index
util.global private @_variables$436 : !stream.resource<constant>
util.global private @_variables$436__size : index
util.global private @_variables$437 : !stream.resource<constant>
util.global private @_variables$437__size : index
util.global private @_variables$438 : !stream.resource<constant>
util.global private @_variables$439 : !stream.resource<constant>
util.global private @_variables$439__size : index
util.global private @_variables$440 : !stream.resource<constant>
util.global private @_variables$441 : !stream.resource<constant>
util.global private @_variables$441__size : index
util.global private @_variables$442 : !stream.resource<constant>
util.global private @_variables$442__size : index
util.global private @_variables$443 : !stream.resource<constant>
util.global private @_variables$588 : !stream.resource<constant>
util.global private @_variables$588__size : index
util.global private @_variables$589 : !stream.resource<constant>
util.global private @_variables$589__size : index
util.global private @_variables$590 : !stream.resource<constant>
util.global private @_variables$590__size : index
stream.executable private @predict_dispatch_0 {
stream.executable.export public @predict_dispatch_0
builtin.module {
func @predict_dispatch_0(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding) {
%cst = arith.constant 5.000000e-01 : f32
%cst_0 = arith.constant -1.270000e+02 : f32
%cst_1 = arith.constant 1.270000e+02 : f32
%cst_2 = arith.constant 1.1920929E-7 : f32
%c224 = arith.constant 224 : index
%c3 = arith.constant 3 : index
%c0 = arith.constant 0 : index
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:224x224x3xf32>
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:3xf32>
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:224x224x3xf32>
%workgroup_size_0 = flow.dispatch.workgroup.size[0] : index
%workgroup_size_1 = flow.dispatch.workgroup.size[1] : index
%workgroup_size_2 = flow.dispatch.workgroup.size[2] : index
%workgroup_id_0 = flow.dispatch.workgroup.id[0] : index
%workgroup_count_0 = flow.dispatch.workgroup.count[0] : index
%workgroup_id_1 = flow.dispatch.workgroup.id[1] : index
%workgroup_count_1 = flow.dispatch.workgroup.count[1] : index
%workgroup_id_2 = flow.dispatch.workgroup.id[2] : index
%workgroup_count_2 = flow.dispatch.workgroup.count[2] : index
%3 = affine.apply #map0()[%workgroup_id_2, %workgroup_size_2]
%4 = affine.apply #map0()[%workgroup_count_2, %workgroup_size_2]
scf.for %arg3 = %3 to %c224 step %4 {
%5 = affine.apply #map0()[%workgroup_id_1, %workgroup_size_1]
%6 = affine.apply #map0()[%workgroup_count_1, %workgroup_size_1]
scf.for %arg4 = %5 to %c224 step %6 {
%7 = affine.apply #map0()[%workgroup_id_0, %workgroup_size_0]
%8 = affine.apply #map0()[%workgroup_count_0, %workgroup_size_0]
scf.for %arg5 = %7 to %c3 step %8 {
%9 = affine.min #map1(%arg3)[%workgroup_size_2]
%10 = affine.min #map1(%arg4)[%workgroup_size_1]
%11 = affine.min #map2(%arg5)[%workgroup_size_0]
%12 = flow.dispatch.tensor.load %0, offsets = [%arg3, %arg4, %arg5], sizes = [%9, %10, %11], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:224x224x3xf32> -> tensor<?x?x?xf32>
%13 = flow.dispatch.tensor.load %1, offsets = [%arg5], sizes = [%11], strides = [1] : !flow.dispatch.tensor<readonly:3xf32> -> tensor<?xf32>
%14 = linalg.init_tensor [%9, %10, %11] : tensor<?x?x?xf32>
%15 = linalg.generic {indexing_maps = [#map3, #map4, #map3], iterator_types = ["parallel", "parallel", "parallel"]} ins(%12, %13 : tensor<?x?x?xf32>, tensor<?xf32>) outs(%14 : tensor<?x?x?xf32>) {
^bb0(%arg6: f32, %arg7: f32, %arg8: f32): // no predecessors
%16 = arith.addf %arg7, %cst_2 : f32
%17 = arith.divf %cst_1, %16 : f32
%18 = arith.mulf %arg6, %17 : f32
%19 = arith.maxf %18, %cst_0 : f32
%20 = arith.minf %19, %cst_1 : f32
%21 = arith.addf %20, %cst : f32
%22 = math.floor %21 : f32
%23 = arith.divf %22, %17 : f32
linalg.yield %23 : f32
} -> tensor<?x?x?xf32>
flow.dispatch.tensor.store %15, %2, offsets = [%arg3, %arg4, %arg5], sizes = [%9, %10, %11], strides = [1, 1, 1] : tensor<?x?x?xf32> -> !flow.dispatch.tensor<writeonly:224x224x3xf32>
}
}
}
return
}
}
}
stream.executable private @predict_dispatch_1 {
stream.executable.export public @predict_dispatch_1
builtin.module {
func @predict_dispatch_1(%arg0: !stream.binding, %arg1: !stream.binding) {
%cst = arith.constant -1.000000e+00 : f32
%c3 = arith.constant 3 : index
%c0 = arith.constant 0 : index
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:3xf32>
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:3xi1>
%workgroup_size_0 = flow.dispatch.workgroup.size[0] : index
%workgroup_id_0 = flow.dispatch.workgroup.id[0] : index
%workgroup_count_0 = flow.dispatch.workgroup.count[0] : index
%2 = affine.apply #map0()[%workgroup_id_0, %workgroup_size_0]
%3 = affine.apply #map0()[%workgroup_count_0, %workgroup_size_0]
scf.for %arg2 = %2 to %c3 step %3 {
%4 = affine.min #map2(%arg2)[%workgroup_size_0]
%5 = flow.dispatch.tensor.load %0, offsets = [%arg2], sizes = [%4], strides = [1] : !flow.dispatch.tensor<readonly:3xf32> -> tensor<?xf32>
%6 = linalg.init_tensor [%4] : tensor<?xi1>
%7 = linalg.generic {indexing_maps = [#map5, #map5], iterator_types = ["parallel"]} ins(%5 : tensor<?xf32>) outs(%6 : tensor<?xi1>) {
^bb0(%arg3: f32, %arg4: i1): // no predecessors
%8 = arith.cmpf oeq, %arg3, %cst : f32
linalg.yield %8 : i1
} -> tensor<?xi1>
flow.dispatch.tensor.store %7, %1, offsets = [%arg2], sizes = [%4], strides = [1] : tensor<?xi1> -> !flow.dispatch.tensor<writeonly:3xi1>
}
return
}
}
}
stream.executable private @predict_dispatch_2 {
stream.executable.export public @predict_dispatch_2
builtin.module {
func @predict_dispatch_2(%arg0: !stream.binding, %arg1: !stream.binding) {
%true = arith.constant true
%c0 = arith.constant 0 : index
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:3xi1>
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:i32>
%2 = flow.dispatch.tensor.load %0, offsets = [], sizes = [], strides = [] : !flow.dispatch.tensor<readonly:3xi1> -> tensor<3xi1>
%3 = linalg.init_tensor [] : tensor<i32>
%4 = linalg.init_tensor [] : tensor<i1>
%5 = linalg.fill(%true, %4) : i1, tensor<i1> -> tensor<i1>
%6 = linalg.generic {indexing_maps = [#map5, #map6], iterator_types = ["reduction"]} ins(%2 : tensor<3xi1>) outs(%5 : tensor<i1>) {
^bb0(%arg2: i1, %arg3: i1): // no predecessors
%8 = arith.andi %arg2, %arg3 : i1
linalg.yield %8 : i1
} -> tensor<i1>
%7 = linalg.generic {indexing_maps = [#map7, #map7], iterator_types = []} ins(%6 : tensor<i1>) outs(%3 : tensor<i32>) {
^bb0(%arg2: i1, %arg3: i32): // no predecessors
%8 = arith.xori %arg2, %true : i1
%9 = arith.extui %8 : i1 to i32
linalg.yield %9 : i32
} -> tensor<i32>
flow.dispatch.tensor.store %7, %1, offsets = [], sizes = [], strides = [] : tensor<i32> -> !flow.dispatch.tensor<writeonly:i32>
return
}
}
}
stream.executable private @predict_dispatch_3 {
stream.executable.export public @predict_dispatch_3
builtin.module {
func @predict_dispatch_3(%arg0: !stream.binding, %arg1: !stream.binding) {
%c7 = arith.constant 7 : index
%c3 = arith.constant 3 : index
%c64 = arith.constant 64 : index
%c0 = arith.constant 0 : index
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:7x7x3x64xf32>
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:7x7x3x64xf32>
%workgroup_size_0 = flow.dispatch.workgroup.size[0] : index
%workgroup_size_1 = flow.dispatch.workgroup.size[1] : index
%workgroup_size_2 = flow.dispatch.workgroup.size[2] : index
%workgroup_id_0 = flow.dispatch.workgroup.id[0] : index
%workgroup_count_0 = flow.dispatch.workgroup.count[0] : index
%workgroup_id_1 = flow.dispatch.workgroup.id[1] : index
%workgroup_count_1 = flow.dispatch.workgroup.count[1] : index
%workgroup_id_2 = flow.dispatch.workgroup.id[2] : index
%workgroup_count_2 = flow.dispatch.workgroup.count[2] : index
%2 = affine.apply #map0()[%workgroup_id_2, %workgroup_size_2]
%3 = affine.apply #map0()[%workgroup_count_2, %workgroup_size_2]
scf.for %arg2 = %2 to %c7 step %3 {
%4 = affine.apply #map0()[%workgroup_id_1, %workgroup_size_1]
%5 = affine.apply #map0()[%workgroup_count_1, %workgroup_size_1]
scf.for %arg3 = %4 to %c3 step %5 {
%6 = affine.apply #map0()[%workgroup_id_0, %workgroup_size_0]
%7 = affine.apply #map0()[%workgroup_count_0, %workgroup_size_0]
scf.for %arg4 = %6 to %c64 step %7 {
%8 = affine.min #map8(%arg2)[%workgroup_size_2]
%9 = affine.min #map2(%arg3)[%workgroup_size_1]
%10 = affine.min #map9(%arg4)[%workgroup_size_0]
%11 = flow.dispatch.tensor.load %0, offsets = [0, %arg2, %arg3, %arg4], sizes = [7, %8, %9, %10], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:7x7x3x64xf32> -> tensor<7x?x?x?xf32>
%12 = linalg.init_tensor [7, %8, %9, %10] : tensor<7x?x?x?xf32>
%13 = linalg.generic {indexing_maps = [#map10, #map10], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%11 : tensor<7x?x?x?xf32>) outs(%12 : tensor<7x?x?x?xf32>) {
^bb0(%arg5: f32, %arg6: f32): // no predecessors
%14 = math.abs %arg5 : f32
linalg.yield %14 : f32
} -> tensor<7x?x?x?xf32>
flow.dispatch.tensor.store %13, %1, offsets = [0, %arg2, %arg3, %arg4], sizes = [7, %8, %9, %10], strides = [1, 1, 1, 1] : tensor<7x?x?x?xf32> -> !flow.dispatch.tensor<writeonly:7x7x3x64xf32>
}
}
}
return
}
}
}
stream.executable private @predict_dispatch_4 {
stream.executable.export public @predict_dispatch_4
builtin.module {
func @predict_dispatch_4(%arg0: !stream.binding, %arg1: !stream.binding) {
%cst = arith.constant 0xFF800000 : f32
%c64 = arith.constant 64 : index
%c0 = arith.constant 0 : index
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:7x7x3x64xf32>
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:64xf32>
%workgroup_size_0 = flow.dispatch.workgroup.size[0] : index
%workgroup_id_0 = flow.dispatch.workgroup.id[0] : index
%workgroup_count_0 = flow.dispatch.workgroup.count[0] : index
%2 = affine.apply #map0()[%workgroup_id_0, %workgroup_size_0]
%3 = affine.apply #map0()[%workgroup_count_0, %workgroup_size_0]
scf.for %arg2 = %2 to %c64 step %3 {
%4 = affine.min #map9(%arg2)[%workgroup_size_0]
%5 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, %arg2], sizes = [7, 7, 3, %4], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:7x7x3x64xf32> -> tensor<7x7x3x?xf32>
%6 = affine.min #map11(%arg2)[%workgroup_size_0]
%7 = linalg.init_tensor [%6] : tensor<?xf32>
%8 = linalg.fill(%cst, %7) : f32, tensor<?xf32> -> tensor<?xf32>
%9 = linalg.generic {indexing_maps = [#map12, #map13], iterator_types = ["parallel", "reduction", "reduction", "reduction"]} ins(%5 : tensor<7x7x3x?xf32>) outs(%8 : tensor<?xf32>) {
^bb0(%arg3: f32, %arg4: f32): // no predecessors
%10 = arith.maxf %arg3, %arg4 : f32
linalg.yield %10 : f32
} -> tensor<?xf32>
flow.dispatch.tensor.store %9, %1, offsets = [%arg2], sizes = [%4], strides = [1] : tensor<?xf32> -> !flow.dispatch.tensor<writeonly:64xf32>
}
return
}
}
}
stream.executable private @predict_dispatch_5 {
stream.executable.export public @predict_dispatch_5
builtin.module {
func @predict_dispatch_5(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding) {
%cst = arith.constant 5.000000e-01 : f32
%cst_0 = arith.constant -1.270000e+02 : f32
%cst_1 = arith.constant 1.270000e+02 : f32
%cst_2 = arith.constant 1.1920929E-7 : f32
%c7 = arith.constant 7 : index
%c3 = arith.constant 3 : index
%c64 = arith.constant 64 : index
%c0 = arith.constant 0 : index
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:7x7x3x64xf32>
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:64xf32>
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:7x7x3x64xf32>
%workgroup_size_0 = flow.dispatch.workgroup.size[0] : index
%workgroup_size_1 = flow.dispatch.workgroup.size[1] : index
%workgroup_size_2 = flow.dispatch.workgroup.size[2] : index
%workgroup_id_0 = flow.dispatch.workgroup.id[0] : index
%workgroup_count_0 = flow.dispatch.workgroup.count[0] : index
%workgroup_id_1 = flow.dispatch.workgroup.id[1] : index
%workgroup_count_1 = flow.dispatch.workgroup.count[1] : index
%workgroup_id_2 = flow.dispatch.workgroup.id[2] : index
%workgroup_count_2 = flow.dispatch.workgroup.count[2] : index
%3 = affine.apply #map0()[%workgroup_id_2, %workgroup_size_2]
%4 = affine.apply #map0()[%workgroup_count_2, %workgroup_size_2]
scf.for %arg3 = %3 to %c7 step %4 {
%5 = affine.apply #map0()[%workgroup_id_1, %workgroup_size_1]
%6 = affine.apply #map0()[%workgroup_count_1, %workgroup_size_1]
scf.for %arg4 = %5 to %c3 step %6 {
%7 = affine.apply #map0()[%workgroup_id_0, %workgroup_size_0]
%8 = affine.apply #map0()[%workgroup_count_0, %workgroup_size_0]
scf.for %arg5 = %7 to %c64 step %8 {
%9 = affine.min #map8(%arg3)[%workgroup_size_2]
%10 = affine.min #map2(%arg4)[%workgroup_size_1]
%11 = affine.min #map9(%arg5)[%workgroup_size_0]
%12 = flow.dispatch.tensor.load %0, offsets = [0, %arg3, %arg4, %arg5], sizes = [7, %9, %10, %11], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:7x7x3x64xf32> -> tensor<7x?x?x?xf32>
%13 = flow.dispatch.tensor.load %1, offsets = [%arg5], sizes = [%11], strides = [1] : !flow.dispatch.tensor<readonly:64xf32> -> tensor<?xf32>
%14 = linalg.init_tensor [7, %9, %10, %11] : tensor<7x?x?x?xf32>
%15 = linalg.generic {indexing_maps = [#map10, #map14, #map10], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%12, %13 : tensor<7x?x?x?xf32>, tensor<?xf32>) outs(%14 : tensor<7x?x?x?xf32>) {
^bb0(%arg6: f32, %arg7: f32, %arg8: f32): // no predecessors
%16 = arith.addf %arg7, %cst_2 : f32
%17 = arith.divf %cst_1, %16 : f32
%18 = arith.mulf %arg6, %17 : f32
%19 = arith.maxf %18, %cst_0 : f32
%20 = arith.minf %19, %cst_1 : f32
%21 = arith.addf %20, %cst : f32
%22 = math.floor %21 : f32
%23 = arith.divf %22, %17 : f32
linalg.yield %23 : f32
} -> tensor<7x?x?x?xf32>
flow.dispatch.tensor.store %15, %2, offsets = [0, %arg3, %arg4, %arg5], sizes = [7, %9, %10, %11], strides = [1, 1, 1, 1] : tensor<7x?x?x?xf32> -> !flow.dispatch.tensor<writeonly:7x7x3x64xf32>
}
}
}
return
}
}
}
stream.executable private @predict_dispatch_6 {
stream.executable.export public @predict_dispatch_6
builtin.module {
func @predict_dispatch_6(%arg0: !stream.binding, %arg1: !stream.binding) {
%c224 = arith.constant 224 : index
%c3 = arith.constant 3 : index
%c0 = arith.constant 0 : index
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readwrite:1x230x230x3xf32>
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:1x224x224x3xf32>
%workgroup_size_0 = flow.dispatch.workgroup.size[0] : index
%workgroup_size_1 = flow.dispatch.workgroup.size[1] : index
%workgroup_size_2 = flow.dispatch.workgroup.size[2] : index
%workgroup_id_0 = flow.dispatch.workgroup.id[0] : index
%workgroup_count_0 = flow.dispatch.workgroup.count[0] : index
%workgroup_id_1 = flow.dispatch.workgroup.id[1] : index
%workgroup_count_1 = flow.dispatch.workgroup.count[1] : index
%workgroup_id_2 = flow.dispatch.workgroup.id[2] : index
%workgroup_count_2 = flow.dispatch.workgroup.count[2] : index
%2 = affine.apply #map15()[%workgroup_size_2, %workgroup_id_2]
%3 = affine.apply #map15()[%workgroup_size_2, %workgroup_count_2]
scf.for %arg2 = %2 to %c224 step %3 {
%4 = affine.min #map1(%arg2)[%workgroup_size_2]
%5 = affine.apply #map15()[%workgroup_size_1, %workgroup_id_1]
%6 = affine.apply #map15()[%workgroup_size_1, %workgroup_count_1]
scf.for %arg3 = %5 to %c224 step %6 {
%7 = affine.min #map1(%arg3)[%workgroup_size_1]
%8 = affine.apply #map15()[%workgroup_size_0, %workgroup_id_0]
%9 = affine.apply #map15()[%workgroup_size_0, %workgroup_count_0]
scf.for %arg4 = %8 to %c3 step %9 {
%10 = affine.min #map2(%arg4)[%workgroup_size_0]
%11 = flow.dispatch.tensor.load %1, offsets = [0, %arg2, %arg3, %arg4], sizes = [1, %4, %7, %10], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:1x224x224x3xf32> -> tensor<1x?x?x?xf32>
%12 = affine.apply #map16(%arg2)
%13 = affine.apply #map16(%arg3)
flow.dispatch.tensor.store %11, %0, offsets = [0, %12, %13, %arg4], sizes = [1, %4, %7, %10], strides = [1, 1, 1, 1] : tensor<1x?x?x?xf32> -> !flow.dispatch.tensor<readwrite:1x230x230x3xf32>
}
}
}
return
}
}
}
stream.executable private @predict_dispatch_7 {
stream.executable.export public @predict_dispatch_7
builtin.module {
func @predict_dispatch_7(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding, %arg3: !stream.binding, %arg4: !stream.binding, %arg5: !stream.binding, %arg6: !stream.binding) {
%cst = arith.constant 9.99999974E-6 : f32
%cst_0 = arith.constant 0.000000e+00 : f32
%c112 = arith.constant 112 : index
%c64 = arith.constant 64 : index
%c0 = arith.constant 0 : index
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:64xf32>
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:64xf32>
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:64xf32>
%3 = stream.binding.subspan %arg3[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:64xf32>
%4 = stream.binding.subspan %arg4[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:1x230x230x3xf32>
%5 = stream.binding.subspan %arg5[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:7x7x3x64xf32>
%6 = stream.binding.subspan %arg6[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:1x112x112x64xf32>
%workgroup_size_0 = flow.dispatch.workgroup.size[0] : index
%workgroup_size_1 = flow.dispatch.workgroup.size[1] : index
%workgroup_size_2 = flow.dispatch.workgroup.size[2] : index
%workgroup_id_0 = flow.dispatch.workgroup.id[0] : index
%workgroup_count_0 = flow.dispatch.workgroup.count[0] : index
%workgroup_id_1 = flow.dispatch.workgroup.id[1] : index
%workgroup_count_1 = flow.dispatch.workgroup.count[1] : index
%workgroup_id_2 = flow.dispatch.workgroup.id[2] : index
%workgroup_count_2 = flow.dispatch.workgroup.count[2] : index
%7 = affine.apply #map0()[%workgroup_id_2, %workgroup_size_2]
%8 = affine.apply #map0()[%workgroup_count_2, %workgroup_size_2]
scf.for %arg7 = %7 to %c112 step %8 {
%9 = affine.apply #map0()[%workgroup_id_1, %workgroup_size_1]
%10 = affine.apply #map0()[%workgroup_count_1, %workgroup_size_1]
scf.for %arg8 = %9 to %c112 step %10 {
%11 = affine.apply #map0()[%workgroup_id_0, %workgroup_size_0]
%12 = affine.apply #map0()[%workgroup_count_0, %workgroup_size_0]
scf.for %arg9 = %11 to %c64 step %12 {
%13 = affine.min #map17(%arg7)[%workgroup_size_2]
%14 = affine.min #map17(%arg8)[%workgroup_size_1]
%15 = affine.min #map9(%arg9)[%workgroup_size_0]
%16 = flow.dispatch.tensor.load %0, offsets = [%arg9], sizes = [%15], strides = [1] : !flow.dispatch.tensor<readonly:64xf32> -> tensor<?xf32>
%17 = flow.dispatch.tensor.load %1, offsets = [%arg9], sizes = [%15], strides = [1] : !flow.dispatch.tensor<readonly:64xf32> -> tensor<?xf32>
%18 = flow.dispatch.tensor.load %2, offsets = [%arg9], sizes = [%15], strides = [1] : !flow.dispatch.tensor<readonly:64xf32> -> tensor<?xf32>
%19 = flow.dispatch.tensor.load %3, offsets = [%arg9], sizes = [%15], strides = [1] : !flow.dispatch.tensor<readonly:64xf32> -> tensor<?xf32>
%20 = linalg.init_tensor [1, %13, %14, %15] : tensor<1x?x?x?xf32>
%21 = affine.apply #map18(%arg7)
%22 = affine.min #map19(%13, %arg7)
%23 = affine.apply #map18(%arg8)
%24 = affine.min #map19(%14, %arg8)
%25 = flow.dispatch.tensor.load %4, offsets = [0, %21, %23, 0], sizes = [1, %22, %24, 3], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:1x230x230x3xf32> -> tensor<1x?x?x3xf32>
%26 = affine.min #map11(%arg9)[%workgroup_size_0]
%27 = flow.dispatch.tensor.load %5, offsets = [0, 0, 0, %arg9], sizes = [7, 7, 3, %26], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:7x7x3x64xf32> -> tensor<7x7x3x?xf32>
%28 = affine.min #map20(%arg7)[%workgroup_size_2]
%29 = affine.min #map20(%arg8)[%workgroup_size_1]
%30 = linalg.init_tensor [1, %28, %29, %26] : tensor<1x?x?x?xf32>
%31 = linalg.fill(%cst_0, %30) : f32, tensor<1x?x?x?xf32> -> tensor<1x?x?x?xf32>
%32 = linalg.conv_2d_nhwc_hwcf {dilations = dense<1> : tensor<2xi64>, strides = dense<2> : tensor<2xi64>} ins(%25, %27 : tensor<1x?x?x3xf32>, tensor<7x7x3x?xf32>) outs(%31 : tensor<1x?x?x?xf32>) -> tensor<1x?x?x?xf32>
%33 = linalg.generic {indexing_maps = [#map10, #map14, #map14, #map14, #map14, #map10], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%32, %16, %17, %18, %19 : tensor<1x?x?x?xf32>, tensor<?xf32>, tensor<?xf32>, tensor<?xf32>, tensor<?xf32>) outs(%20 : tensor<1x?x?x?xf32>) {
^bb0(%arg10: f32, %arg11: f32, %arg12: f32, %arg13: f32, %arg14: f32, %arg15: f32): // no predecessors
%34 = arith.addf %arg12, %cst : f32
%35 = math.rsqrt %34 : f32
%36 = arith.mulf %35, %arg13 : f32
%37 = arith.subf %arg10, %arg11 : f32
%38 = arith.mulf %37, %36 : f32
%39 = arith.addf %38, %arg14 : f32
%40 = arith.maxf %39, %cst_0 : f32
linalg.yield %40 : f32
} -> tensor<1x?x?x?xf32>
flow.dispatch.tensor.store %33, %6, offsets = [0, %arg7, %arg8, %arg9], sizes = [1, %13, %14, %15], strides = [1, 1, 1, 1] : tensor<1x?x?x?xf32> -> !flow.dispatch.tensor<writeonly:1x112x112x64xf32>
}
}
}
return
}
}
}
stream.executable private @predict_dispatch_8 {
stream.executable.export public @predict_dispatch_8
builtin.module {
func @predict_dispatch_8(%arg0: !stream.binding, %arg1: !stream.binding) {
%c112 = arith.constant 112 : index
%c64 = arith.constant 64 : index
%c0 = arith.constant 0 : index
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readwrite:1x113x113x64xf32>
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:1x112x112x64xf32>
%workgroup_size_0 = flow.dispatch.workgroup.size[0] : index
%workgroup_size_1 = flow.dispatch.workgroup.size[1] : index
%workgroup_size_2 = flow.dispatch.workgroup.size[2] : index
%workgroup_id_0 = flow.dispatch.workgroup.id[0] : index
%workgroup_count_0 = flow.dispatch.workgroup.count[0] : index
%workgroup_id_1 = flow.dispatch.workgroup.id[1] : index
%workgroup_count_1 = flow.dispatch.workgroup.count[1] : index
%workgroup_id_2 = flow.dispatch.workgroup.id[2] : index
%workgroup_count_2 = flow.dispatch.workgroup.count[2] : index
%2 = affine.apply #map15()[%workgroup_size_2, %workgroup_id_2]
%3 = affine.apply #map15()[%workgroup_size_2, %workgroup_count_2]
scf.for %arg2 = %2 to %c112 step %3 {
%4 = affine.min #map17(%arg2)[%workgroup_size_2]
%5 = affine.apply #map15()[%workgroup_size_1, %workgroup_id_1]
%6 = affine.apply #map15()[%workgroup_size_1, %workgroup_count_1]
scf.for %arg3 = %5 to %c112 step %6 {
%7 = affine.min #map17(%arg3)[%workgroup_size_1]
%8 = affine.apply #map15()[%workgroup_size_0, %workgroup_id_0]
%9 = affine.apply #map15()[%workgroup_size_0, %workgroup_count_0]
scf.for %arg4 = %8 to %c64 step %9 {
%10 = affine.min #map9(%arg4)[%workgroup_size_0]
%11 = flow.dispatch.tensor.load %1, offsets = [0, %arg2, %arg3, %arg4], sizes = [1, %4, %7, %10], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:1x112x112x64xf32> -> tensor<1x?x?x?xf32>
flow.dispatch.tensor.store %11, %0, offsets = [0, %arg2, %arg3, %arg4], sizes = [1, %4, %7, %10], strides = [1, 1, 1, 1] : tensor<1x?x?x?xf32> -> !flow.dispatch.tensor<readwrite:1x113x113x64xf32>
}
}
}
return
}
}
}
stream.executable private @predict_dispatch_9 {
stream.executable.export public @predict_dispatch_9
builtin.module {
func @predict_dispatch_9(%arg0: !stream.binding, %arg1: !stream.binding) {
%cst = arith.constant 0xFF800000 : f32
%c56 = arith.constant 56 : index
%c64 = arith.constant 64 : index
%c0 = arith.constant 0 : index
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:1x113x113x64xf32>
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:1x56x56x64xf32>
%2 = linalg.init_tensor [3, 3] : tensor<3x3xf32>
%workgroup_size_0 = flow.dispatch.workgroup.size[0] : index
%workgroup_size_1 = flow.dispatch.workgroup.size[1] : index
%workgroup_size_2 = flow.dispatch.workgroup.size[2] : index
%workgroup_id_0 = flow.dispatch.workgroup.id[0] : index
%workgroup_count_0 = flow.dispatch.workgroup.count[0] : index
%workgroup_id_1 = flow.dispatch.workgroup.id[1] : index
%workgroup_count_1 = flow.dispatch.workgroup.count[1] : index
%workgroup_id_2 = flow.dispatch.workgroup.id[2] : index
%workgroup_count_2 = flow.dispatch.workgroup.count[2] : index
%3 = affine.apply #map0()[%workgroup_id_2, %workgroup_size_2]
%4 = affine.apply #map0()[%workgroup_count_2, %workgroup_size_2]
scf.for %arg2 = %3 to %c56 step %4 {
%5 = affine.apply #map0()[%workgroup_id_1, %workgroup_size_1]
%6 = affine.apply #map0()[%workgroup_count_1, %workgroup_size_1]
scf.for %arg3 = %5 to %c56 step %6 {
%7 = affine.apply #map0()[%workgroup_id_0, %workgroup_size_0]
%8 = affine.apply #map0()[%workgroup_count_0, %workgroup_size_0]
scf.for %arg4 = %7 to %c64 step %8 {
%9 = affine.apply #map18(%arg2)
%10 = affine.min #map21(%arg2)[%workgroup_size_2]
%11 = affine.apply #map18(%arg3)
%12 = affine.min #map21(%arg3)[%workgroup_size_1]
%13 = affine.min #map9(%arg4)[%workgroup_size_0]
%14 = flow.dispatch.tensor.load %0, offsets = [0, %9, %11, %arg4], sizes = [1, %10, %12, %13], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:1x113x113x64xf32> -> tensor<1x?x?x?xf32>
%15 = affine.min #map22(%arg2)[%workgroup_size_2]
%16 = affine.min #map22(%arg3)[%workgroup_size_1]
%17 = affine.min #map23(%arg2)[%workgroup_size_2]
%18 = affine.min #map23(%arg3)[%workgroup_size_1]
%19 = affine.min #map11(%arg4)[%workgroup_size_0]
%20 = linalg.init_tensor [1, %17, %18, %19] : tensor<1x?x?x?xf32>
%21 = linalg.fill(%cst, %20) : f32, tensor<1x?x?x?xf32> -> tensor<1x?x?x?xf32>
%22 = linalg.pooling_nhwc_max {dilations = dense<1> : vector<2xi64>, strides = dense<2> : vector<2xi64>} ins(%14, %2 : tensor<1x?x?x?xf32>, tensor<3x3xf32>) outs(%21 : tensor<1x?x?x?xf32>) -> tensor<1x?x?x?xf32>
flow.dispatch.tensor.store %22, %1, offsets = [0, %arg2, %arg3, %arg4], sizes = [1, %15, %16, %13], strides = [1, 1, 1, 1] : tensor<1x?x?x?xf32> -> !flow.dispatch.tensor<writeonly:1x56x56x64xf32>
}
}
}
return
}
}
}
stream.executable private @predict_dispatch_10 {
stream.executable.export public @predict_dispatch_10
builtin.module {
func @predict_dispatch_10(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding) {
%cst = arith.constant 2.550000e+02 : f32
%cst_0 = arith.constant 0.000000e+00 : f32
%cst_1 = arith.constant 2.560000e+02 : f32
%cst_2 = arith.constant 1.1920929E-7 : f32
%c56 = arith.constant 56 : index
%c64 = arith.constant 64 : index
%c0 = arith.constant 0 : index
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:1x56x56x64xf32>
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:64xf32>
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:1x56x56x64xf32>
%workgroup_size_0 = flow.dispatch.workgroup.size[0] : index
%workgroup_size_1 = flow.dispatch.workgroup.size[1] : index
%workgroup_size_2 = flow.dispatch.workgroup.size[2] : index
%workgroup_id_0 = flow.dispatch.workgroup.id[0] : index
%workgroup_count_0 = flow.dispatch.workgroup.count[0] : index
%workgroup_id_1 = flow.dispatch.workgroup.id[1] : index
%workgroup_count_1 = flow.dispatch.workgroup.count[1] : index
%workgroup_id_2 = flow.dispatch.workgroup.id[2] : index
%workgroup_count_2 = flow.dispatch.workgroup.count[2] : index
%3 = affine.apply #map0()[%workgroup_id_2, %workgroup_size_2]
%4 = affine.apply #map0()[%workgroup_count_2, %workgroup_size_2]
scf.for %arg3 = %3 to %c56 step %4 {
%5 = affine.apply #map0()[%workgroup_id_1, %workgroup_size_1]
%6 = affine.apply #map0()[%workgroup_count_1, %workgroup_size_1]
scf.for %arg4 = %5 to %c56 step %6 {
%7 = affine.apply #map0()[%workgroup_id_0, %workgroup_size_0]
%8 = affine.apply #map0()[%workgroup_count_0, %workgroup_size_0]
scf.for %arg5 = %7 to %c64 step %8 {
%9 = affine.min #map22(%arg3)[%workgroup_size_2]
%10 = affine.min #map22(%arg4)[%workgroup_size_1]
%11 = affine.min #map9(%arg5)[%workgroup_size_0]
%12 = flow.dispatch.tensor.load %0, offsets = [0, %arg3, %arg4, %arg5], sizes = [1, %9, %10, %11], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:1x56x56x64xf32> -> tensor<1x?x?x?xf32>
%13 = flow.dispatch.tensor.load %1, offsets = [%arg5], sizes = [%11], strides = [1] : !flow.dispatch.tensor<readonly:64xf32> -> tensor<?xf32>
%14 = linalg.init_tensor [1, %9, %10, %11] : tensor<1x?x?x?xf32>
%15 = linalg.generic {indexing_maps = [#map10, #map14, #map10], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%12, %13 : tensor<1x?x?x?xf32>, tensor<?xf32>) outs(%14 : tensor<1x?x?x?xf32>) {
^bb0(%arg6: f32, %arg7: f32, %arg8: f32): // no predecessors
%16 = arith.addf %arg7, %cst_2 : f32
%17 = arith.divf %cst_1, %16 : f32
%18 = arith.mulf %arg6, %17 : f32
%19 = math.floor %18 : f32
%20 = arith.maxf %19, %cst_0 : f32
%21 = arith.minf %20, %cst : f32
%22 = arith.divf %21, %17 : f32
linalg.yield %22 : f32
} -> tensor<1x?x?x?xf32>
flow.dispatch.tensor.store %15, %2, offsets = [0, %arg3, %arg4, %arg5], sizes = [1, %9, %10, %11], strides = [1, 1, 1, 1] : tensor<1x?x?x?xf32> -> !flow.dispatch.tensor<writeonly:1x56x56x64xf32>
}
}
}
return
}
}
}
stream.executable private @predict_dispatch_11 {
stream.executable.export public @predict_dispatch_11
builtin.module {
func @predict_dispatch_11(%arg0: !stream.binding, %arg1: !stream.binding) {
%cst = arith.constant -1.000000e+00 : f32
%c64 = arith.constant 64 : index
%c0 = arith.constant 0 : index
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:64xf32>
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:64xi1>
%workgroup_size_0 = flow.dispatch.workgroup.size[0] : index
%workgroup_id_0 = flow.dispatch.workgroup.id[0] : index
%workgroup_count_0 = flow.dispatch.workgroup.count[0] : index
%2 = affine.apply #map0()[%workgroup_id_0, %workgroup_size_0]
%3 = affine.apply #map0()[%workgroup_count_0, %workgroup_size_0]
scf.for %arg2 = %2 to %c64 step %3 {
%4 = affine.min #map9(%arg2)[%workgroup_size_0]
%5 = flow.dispatch.tensor.load %0, offsets = [%arg2], sizes = [%4], strides = [1] : !flow.dispatch.tensor<readonly:64xf32> -> tensor<?xf32>
%6 = linalg.init_tensor [%4] : tensor<?xi1>
%7 = linalg.generic {indexing_maps = [#map5, #map5], iterator_types = ["parallel"]} ins(%5 : tensor<?xf32>) outs(%6 : tensor<?xi1>) {
^bb0(%arg3: f32, %arg4: i1): // no predecessors
%8 = arith.cmpf oeq, %arg3, %cst : f32
linalg.yield %8 : i1
} -> tensor<?xi1>
flow.dispatch.tensor.store %7, %1, offsets = [%arg2], sizes = [%4], strides = [1] : tensor<?xi1> -> !flow.dispatch.tensor<writeonly:64xi1>
}
return
}
}
}
stream.executable private @predict_dispatch_12 {
stream.executable.export public @predict_dispatch_12
builtin.module {
func @predict_dispatch_12(%arg0: !stream.binding, %arg1: !stream.binding) {
%true = arith.constant true
%c0 = arith.constant 0 : index
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:64xi1>
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:i32>
%2 = flow.dispatch.tensor.load %0, offsets = [], sizes = [], strides = [] : !flow.dispatch.tensor<readonly:64xi1> -> tensor<64xi1>
%3 = linalg.init_tensor [] : tensor<i32>
%4 = linalg.init_tensor [] : tensor<i1>
%5 = linalg.fill(%true, %4) : i1, tensor<i1> -> tensor<i1>
%6 = linalg.generic {indexing_maps = [#map5, #map6], iterator_types = ["reduction"]} ins(%2 : tensor<64xi1>) outs(%5 : tensor<i1>) {
^bb0(%arg2: i1, %arg3: i1): // no predecessors
%8 = arith.andi %arg2, %arg3 : i1
linalg.yield %8 : i1
} -> tensor<i1>
%7 = linalg.generic {indexing_maps = [#map7, #map7], iterator_types = []} ins(%6 : tensor<i1>) outs(%3 : tensor<i32>) {
^bb0(%arg2: i1, %arg3: i32): // no predecessors
%8 = arith.xori %arg2, %true : i1
%9 = arith.extui %8 : i1 to i32
linalg.yield %9 : i32
} -> tensor<i32>
flow.dispatch.tensor.store %7, %1, offsets = [], sizes = [], strides = [] : tensor<i32> -> !flow.dispatch.tensor<writeonly:i32>
return
}
}
}
stream.executable private @predict_dispatch_13 {
stream.executable.export public @predict_dispatch_13
builtin.module {
func @predict_dispatch_13(%arg0: !stream.binding, %arg1: !stream.binding) {
%c64 = arith.constant 64 : index
%c256 = arith.constant 256 : index
%c0 = arith.constant 0 : index
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:64x256xf32>
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:64x256xf32>
%workgroup_size_0 = flow.dispatch.workgroup.size[0] : index
%workgroup_size_1 = flow.dispatch.workgroup.size[1] : index
%workgroup_id_0 = flow.dispatch.workgroup.id[0] : index
%workgroup_count_0 = flow.dispatch.workgroup.count[0] : index
%workgroup_id_1 = flow.dispatch.workgroup.id[1] : index
%workgroup_count_1 = flow.dispatch.workgroup.count[1] : index
%2 = affine.apply #map0()[%workgroup_id_1, %workgroup_size_1]
%3 = affine.apply #map0()[%workgroup_count_1, %workgroup_size_1]
scf.for %arg2 = %2 to %c64 step %3 {
%4 = affine.apply #map0()[%workgroup_id_0, %workgroup_size_0]
%5 = affine.apply #map0()[%workgroup_count_0, %workgroup_size_0]
scf.for %arg3 = %4 to %c256 step %5 {
%6 = affine.min #map9(%arg2)[%workgroup_size_1]
%7 = affine.min #map24(%arg3)[%workgroup_size_0]
%8 = flow.dispatch.tensor.load %0, offsets = [%arg2, %arg3], sizes = [%6, %7], strides = [1, 1] : !flow.dispatch.tensor<readonly:64x256xf32> -> tensor<?x?xf32>
%9 = linalg.init_tensor [%6, %7] : tensor<?x?xf32>
%10 = linalg.generic {indexing_maps = [#map25, #map25], iterator_types = ["parallel", "parallel"]} ins(%8 : tensor<?x?xf32>) outs(%9 : tensor<?x?xf32>) {
^bb0(%arg4: f32, %arg5: f32): // no predecessors
%11 = math.abs %arg4 : f32
linalg.yield %11 : f32
} -> tensor<?x?xf32>
flow.dispatch.tensor.store %10, %1, offsets = [%arg2, %arg3], sizes = [%6, %7], strides = [1, 1] : tensor<?x?xf32> -> !flow.dispatch.tensor<writeonly:64x256xf32>
}
}
return
}
}
}
stream.executable private @predict_dispatch_14 {
stream.executable.export public @predict_dispatch_14
builtin.module {
func @predict_dispatch_14(%arg0: !stream.binding, %arg1: !stream.binding) {
%cst = arith.constant 0xFF800000 : f32
%c256 = arith.constant 256 : index
%c0 = arith.constant 0 : index
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:64x256xf32>
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:256xf32>
%workgroup_size_0 = flow.dispatch.workgroup.size[0] : index
%workgroup_id_0 = flow.dispatch.workgroup.id[0] : index
%workgroup_count_0 = flow.dispatch.workgroup.count[0] : index
%2 = affine.apply #map0()[%workgroup_id_0, %workgroup_size_0]
%3 = affine.apply #map0()[%workgroup_count_0, %workgroup_size_0]
scf.for %arg2 = %2 to %c256 step %3 {
%4 = affine.min #map24(%arg2)[%workgroup_size_0]
%5 = flow.dispatch.tensor.load %0, offsets = [0, %arg2], sizes = [64, %4], strides = [1, 1] : !flow.dispatch.tensor<readonly:64x256xf32> -> tensor<64x?xf32>
%6 = affine.min #map26(%arg2)[%workgroup_size_0]
%7 = linalg.init_tensor [%6] : tensor<?xf32>
%8 = linalg.fill(%cst, %7) : f32, tensor<?xf32> -> tensor<?xf32>
%9 = linalg.generic {indexing_maps = [#map27, #map28], iterator_types = ["parallel", "reduction"]} ins(%5 : tensor<64x?xf32>) outs(%8 : tensor<?xf32>) {
^bb0(%arg3: f32, %arg4: f32): // no predecessors
%10 = arith.maxf %arg3, %arg4 : f32
linalg.yield %10 : f32
} -> tensor<?xf32>
flow.dispatch.tensor.store %9, %1, offsets = [%arg2], sizes = [%4], strides = [1] : tensor<?xf32> -> !flow.dispatch.tensor<writeonly:256xf32>
}
return
}
}
}
stream.executable private @predict_dispatch_15 {
stream.executable.export public @predict_dispatch_15
builtin.module {
func @predict_dispatch_15(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding) {
%cst = arith.constant 5.000000e-01 : f32
%cst_0 = arith.constant -1.270000e+02 : f32
%cst_1 = arith.constant 1.270000e+02 : f32
%cst_2 = arith.constant 1.1920929E-7 : f32
%c64 = arith.constant 64 : index
%c256 = arith.constant 256 : index
%c0 = arith.constant 0 : index
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:64x256xf32>
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:256xf32>
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:64x256xf32>
%workgroup_size_0 = flow.dispatch.workgroup.size[0] : index
%workgroup_size_1 = flow.dispatch.workgroup.size[1] : index
%workgroup_id_0 = flow.dispatch.workgroup.id[0] : index
%workgroup_count_0 = flow.dispatch.workgroup.count[0] : index
%workgroup_id_1 = flow.dispatch.workgroup.id[1] : index
%workgroup_count_1 = flow.dispatch.workgroup.count[1] : index
%3 = affine.apply #map0()[%workgroup_id_1, %workgroup_size_1]
%4 = affine.apply #map0()[%workgroup_count_1, %workgroup_size_1]
scf.for %arg3 = %3 to %c64 step %4 {
%5 = affine.apply #map0()[%workgroup_id_0, %workgroup_size_0]
%6 = affine.apply #map0()[%workgroup_count_0, %workgroup_size_0]
scf.for %arg4 = %5 to %c256 step %6 {
%7 = affine.min #map9(%arg3)[%workgroup_size_1]
%8 = affine.min #map24(%arg4)[%workgroup_size_0]
%9 = flow.dispatch.tensor.load %0, offsets = [%arg3, %arg4], sizes = [%7, %8], strides = [1, 1] : !flow.dispatch.tensor<readonly:64x256xf32> -> tensor<?x?xf32>
%10 = flow.dispatch.tensor.load %1, offsets = [%arg4], sizes = [%8], strides = [1] : !flow.dispatch.tensor<readonly:256xf32> -> tensor<?xf32>
%11 = linalg.init_tensor [%7, %8] : tensor<?x?xf32>
%12 = linalg.generic {indexing_maps = [#map25, #map29, #map25], iterator_types = ["parallel", "parallel"]} ins(%9, %10 : tensor<?x?xf32>, tensor<?xf32>) outs(%11 : tensor<?x?xf32>) {
^bb0(%arg5: f32, %arg6: f32, %arg7: f32): // no predecessors
%13 = arith.addf %arg6, %cst_2 : f32
%14 = arith.divf %cst_1, %13 : f32
%15 = arith.mulf %arg5, %14 : f32
%16 = arith.maxf %15, %cst_0 : f32
%17 = arith.minf %16, %cst_1 : f32
%18 = arith.addf %17, %cst : f32
%19 = math.floor %18 : f32
%20 = arith.divf %19, %14 : f32
linalg.yield %20 : f32
} -> tensor<?x?xf32>
flow.dispatch.tensor.store %12, %2, offsets = [%arg3, %arg4], sizes = [%7, %8], strides = [1, 1] : tensor<?x?xf32> -> !flow.dispatch.tensor<writeonly:64x256xf32>
}
}
return
}
}
}
stream.executable private @predict_dispatch_16 {
stream.executable.export public @predict_dispatch_16
builtin.module {
func @predict_dispatch_16(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding) {
%cst = arith.constant 5.000000e-01 : f32
%cst_0 = arith.constant -1.270000e+02 : f32
%cst_1 = arith.constant 1.270000e+02 : f32
%cst_2 = arith.constant 1.1920929E-7 : f32
%c56 = arith.constant 56 : index
%c64 = arith.constant 64 : index
%c0 = arith.constant 0 : index
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:1x56x56x64xf32>
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:64xf32>
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:1x56x56x64xf32>
%workgroup_size_0 = flow.dispatch.workgroup.size[0] : index
%workgroup_size_1 = flow.dispatch.workgroup.size[1] : index
%workgroup_size_2 = flow.dispatch.workgroup.size[2] : index
%workgroup_id_0 = flow.dispatch.workgroup.id[0] : index
%workgroup_count_0 = flow.dispatch.workgroup.count[0] : index
%workgroup_id_1 = flow.dispatch.workgroup.id[1] : index
%workgroup_count_1 = flow.dispatch.workgroup.count[1] : index
%workgroup_id_2 = flow.dispatch.workgroup.id[2] : index
%workgroup_count_2 = flow.dispatch.workgroup.count[2] : index
%3 = affine.apply #map0()[%workgroup_id_2, %workgroup_size_2]
%4 = affine.apply #map0()[%workgroup_count_2, %workgroup_size_2]
scf.for %arg3 = %3 to %c56 step %4 {
%5 = affine.apply #map0()[%workgroup_id_1, %workgroup_size_1]
%6 = affine.apply #map0()[%workgroup_count_1, %workgroup_size_1]
scf.for %arg4 = %5 to %c56 step %6 {
%7 = affine.apply #map0()[%workgroup_id_0, %workgroup_size_0]
%8 = affine.apply #map0()[%workgroup_count_0, %workgroup_size_0]
scf.for %arg5 = %7 to %c64 step %8 {
%9 = affine.min #map22(%arg3)[%workgroup_size_2]
%10 = affine.min #map22(%arg4)[%workgroup_size_1]
%11 = affine.min #map9(%arg5)[%workgroup_size_0]
%12 = flow.dispatch.tensor.load %0, offsets = [0, %arg3, %arg4, %arg5], sizes = [1, %9, %10, %11], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:1x56x56x64xf32> -> tensor<1x?x?x?xf32>
%13 = flow.dispatch.tensor.load %1, offsets = [%arg5], sizes = [%11], strides = [1] : !flow.dispatch.tensor<readonly:64xf32> -> tensor<?xf32>
%14 = linalg.init_tensor [1, %9, %10, %11] : tensor<1x?x?x?xf32>
%15 = linalg.generic {indexing_maps = [#map10, #map14, #map10], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%12, %13 : tensor<1x?x?x?xf32>, tensor<?xf32>) outs(%14 : tensor<1x?x?x?xf32>) {
^bb0(%arg6: f32, %arg7: f32, %arg8: f32): // no predecessors
%16 = arith.addf %arg7, %cst_2 : f32
%17 = arith.divf %cst_1, %16 : f32
%18 = arith.mulf %arg6, %17 : f32
%19 = arith.maxf %18, %cst_0 : f32
%20 = arith.minf %19, %cst_1 : f32
%21 = arith.addf %20, %cst : f32
%22 = math.floor %21 : f32
%23 = arith.divf %22, %17 : f32
linalg.yield %23 : f32
} -> tensor<1x?x?x?xf32>
flow.dispatch.tensor.store %15, %2, offsets = [0, %arg3, %arg4, %arg5], sizes = [1, %9, %10, %11], strides = [1, 1, 1, 1] : tensor<1x?x?x?xf32> -> !flow.dispatch.tensor<writeonly:1x56x56x64xf32>
}
}
}
return
}
}
}
stream.executable private @predict_dispatch_19 {
stream.executable.export public @predict_dispatch_19
builtin.module {
func @predict_dispatch_19(%arg0: !stream.binding, %arg1: !stream.binding) {
%c64 = arith.constant 64 : index
%c0 = arith.constant 0 : index
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:64x64xf32>
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:64x64xf32>
%workgroup_size_0 = flow.dispatch.workgroup.size[0] : index
%workgroup_size_1 = flow.dispatch.workgroup.size[1] : index
%workgroup_id_0 = flow.dispatch.workgroup.id[0] : index
%workgroup_count_0 = flow.dispatch.workgroup.count[0] : index
%workgroup_id_1 = flow.dispatch.workgroup.id[1] : index
%workgroup_count_1 = flow.dispatch.workgroup.count[1] : index
%2 = affine.apply #map0()[%workgroup_id_1, %workgroup_size_1]
%3 = affine.apply #map0()[%workgroup_count_1, %workgroup_size_1]
scf.for %arg2 = %2 to %c64 step %3 {
%4 = affine.apply #map0()[%workgroup_id_0, %workgroup_size_0]
%5 = affine.apply #map0()[%workgroup_count_0, %workgroup_size_0]
scf.for %arg3 = %4 to %c64 step %5 {
%6 = affine.min #map9(%arg2)[%workgroup_size_1]
%7 = affine.min #map9(%arg3)[%workgroup_size_0]
%8 = flow.dispatch.tensor.load %0, offsets = [%arg2, %arg3], sizes = [%6, %7], strides = [1, 1] : !flow.dispatch.tensor<readonly:64x64xf32> -> tensor<?x?xf32>
%9 = linalg.init_tensor [%6, %7] : tensor<?x?xf32>
%10 = linalg.generic {indexing_maps = [#map25, #map25], iterator_types = ["parallel", "parallel"]} ins(%8 : tensor<?x?xf32>) outs(%9 : tensor<?x?xf32>) {
^bb0(%arg4: f32, %arg5: f32): // no predecessors
%11 = math.abs %arg4 : f32
linalg.yield %11 : f32
} -> tensor<?x?xf32>
flow.dispatch.tensor.store %10, %1, offsets = [%arg2, %arg3], sizes = [%6, %7], strides = [1, 1] : tensor<?x?xf32> -> !flow.dispatch.tensor<writeonly:64x64xf32>
}
}
return
}
}
}
stream.executable private @predict_dispatch_20 {
stream.executable.export public @predict_dispatch_20
builtin.module {
func @predict_dispatch_20(%arg0: !stream.binding, %arg1: !stream.binding) {
%cst = arith.constant 0xFF800000 : f32
%c64 = arith.constant 64 : index
%c0 = arith.constant 0 : index
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:64x64xf32>
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:64xf32>
%workgroup_size_0 = flow.dispatch.workgroup.size[0] : index
%workgroup_id_0 = flow.dispatch.workgroup.id[0] : index
%workgroup_count_0 = flow.dispatch.workgroup.count[0] : index
%2 = affine.apply #map0()[%workgroup_id_0, %workgroup_size_0]
%3 = affine.apply #map0()[%workgroup_count_0, %workgroup_size_0]
scf.for %arg2 = %2 to %c64 step %3 {
%4 = affine.min #map9(%arg2)[%workgroup_size_0]
%5 = flow.dispatch.tensor.load %0, offsets = [0, %arg2], sizes = [64, %4], strides = [1, 1] : !flow.dispatch.tensor<readonly:64x64xf32> -> tensor<64x?xf32>
%6 = affine.min #map11(%arg2)[%workgroup_size_0]
%7 = linalg.init_tensor [%6] : tensor<?xf32>
%8 = linalg.fill(%cst, %7) : f32, tensor<?xf32> -> tensor<?xf32>
%9 = linalg.generic {indexing_maps = [#map27, #map28], iterator_types = ["parallel", "reduction"]} ins(%5 : tensor<64x?xf32>) outs(%8 : tensor<?xf32>) {
^bb0(%arg3: f32, %arg4: f32): // no predecessors
%10 = arith.maxf %arg3, %arg4 : f32
linalg.yield %10 : f32
} -> tensor<?xf32>
flow.dispatch.tensor.store %9, %1, offsets = [%arg2], sizes = [%4], strides = [1] : tensor<?xf32> -> !flow.dispatch.tensor<writeonly:64xf32>
}
return
}
}
}
stream.executable private @predict_dispatch_21 {
stream.executable.export public @predict_dispatch_21
builtin.module {
func @predict_dispatch_21(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding) {
%cst = arith.constant 5.000000e-01 : f32
%cst_0 = arith.constant -1.270000e+02 : f32
%cst_1 = arith.constant 1.270000e+02 : f32
%cst_2 = arith.constant 1.1920929E-7 : f32
%c64 = arith.constant 64 : index
%c0 = arith.constant 0 : index
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:64x64xf32>
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:64xf32>
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:64x64xf32>
%workgroup_size_0 = flow.dispatch.workgroup.size[0] : index
%workgroup_size_1 = flow.dispatch.workgroup.size[1] : index
%workgroup_id_0 = flow.dispatch.workgroup.id[0] : index
%workgroup_count_0 = flow.dispatch.workgroup.count[0] : index
%workgroup_id_1 = flow.dispatch.workgroup.id[1] : index
%workgroup_count_1 = flow.dispatch.workgroup.count[1] : index
%3 = affine.apply #map0()[%workgroup_id_1, %workgroup_size_1]
%4 = affine.apply #map0()[%workgroup_count_1, %workgroup_size_1]
scf.for %arg3 = %3 to %c64 step %4 {
%5 = affine.apply #map0()[%workgroup_id_0, %workgroup_size_0]
%6 = affine.apply #map0()[%workgroup_count_0, %workgroup_size_0]
scf.for %arg4 = %5 to %c64 step %6 {
%7 = affine.min #map9(%arg3)[%workgroup_size_1]
%8 = affine.min #map9(%arg4)[%workgroup_size_0]
%9 = flow.dispatch.tensor.load %0, offsets = [%arg3, %arg4], sizes = [%7, %8], strides = [1, 1] : !flow.dispatch.tensor<readonly:64x64xf32> -> tensor<?x?xf32>
%10 = flow.dispatch.tensor.load %1, offsets = [%arg4], sizes = [%8], strides = [1] : !flow.dispatch.tensor<readonly:64xf32> -> tensor<?xf32>
%11 = linalg.init_tensor [%7, %8] : tensor<?x?xf32>
%12 = linalg.generic {indexing_maps = [#map25, #map29, #map25], iterator_types = ["parallel", "parallel"]} ins(%9, %10 : tensor<?x?xf32>, tensor<?xf32>) outs(%11 : tensor<?x?xf32>) {
^bb0(%arg5: f32, %arg6: f32, %arg7: f32): // no predecessors
%13 = arith.addf %arg6, %cst_2 : f32
%14 = arith.divf %cst_1, %13 : f32
%15 = arith.mulf %arg5, %14 : f32
%16 = arith.maxf %15, %cst_0 : f32
%17 = arith.minf %16, %cst_1 : f32
%18 = arith.addf %17, %cst : f32
%19 = math.floor %18 : f32
%20 = arith.divf %19, %14 : f32
linalg.yield %20 : f32
} -> tensor<?x?xf32>
flow.dispatch.tensor.store %12, %2, offsets = [%arg3, %arg4], sizes = [%7, %8], strides = [1, 1] : tensor<?x?xf32> -> !flow.dispatch.tensor<writeonly:64x64xf32>
}
}
return
}
}
}
stream.executable private @predict_dispatch_22 {
stream.executable.export public @predict_dispatch_22
builtin.module {
func @predict_dispatch_22(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding, %arg3: !stream.binding, %arg4: !stream.binding, %arg5: !stream.binding, %arg6: !stream.binding) {
%cst = arith.constant 9.99999974E-6 : f32
%cst_0 = arith.constant 0.000000e+00 : f32
%c3136 = arith.constant 3136 : index
%c64 = arith.constant 64 : index
%c0 = arith.constant 0 : index
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:64xf32>
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:64xf32>
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:64xf32>
%3 = stream.binding.subspan %arg3[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:64xf32>
%4 = stream.binding.subspan %arg4[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:3136x64xf32>
%5 = stream.binding.subspan %arg5[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:64x64xf32>
%6 = stream.binding.subspan %arg6[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:3136x64xf32>
%workgroup_size_0 = flow.dispatch.workgroup.size[0] : index
%workgroup_size_1 = flow.dispatch.workgroup.size[1] : index
%workgroup_id_0 = flow.dispatch.workgroup.id[0] : index
%workgroup_count_0 = flow.dispatch.workgroup.count[0] : index
%workgroup_id_1 = flow.dispatch.workgroup.id[1] : index
%workgroup_count_1 = flow.dispatch.workgroup.count[1] : index
%7 = affine.apply #map0()[%workgroup_id_1, %workgroup_size_1]
%8 = affine.apply #map0()[%workgroup_count_1, %workgroup_size_1]
scf.for %arg7 = %7 to %c3136 step %8 {
%9 = affine.apply #map0()[%workgroup_id_0, %workgroup_size_0]
%10 = affine.apply #map0()[%workgroup_count_0, %workgroup_size_0]
scf.for %arg8 = %9 to %c64 step %10 {
%11 = affine.min #map9(%arg8)[%workgroup_size_0]
%12 = flow.dispatch.tensor.load %0, offsets = [%arg8], sizes = [%11], strides = [1] : !flow.dispatch.tensor<readonly:64xf32> -> tensor<?xf32>
%13 = flow.dispatch.tensor.load %1, offsets = [%arg8], sizes = [%11], strides = [1] : !flow.dispatch.tensor<readonly:64xf32> -> tensor<?xf32>
%14 = flow.dispatch.tensor.load %2, offsets = [%arg8], sizes = [%11], strides = [1] : !flow.dispatch.tensor<readonly:64xf32> -> tensor<?xf32>
%15 = flow.dispatch.tensor.load %3, offsets = [%arg8], sizes = [%11], strides = [1] : !flow.dispatch.tensor<readonly:64xf32> -> tensor<?xf32>
%16 = affine.min #map30(%arg7)[%workgroup_size_1]
%17 = linalg.init_tensor [%16, %11] : tensor<?x?xf32>
%18 = affine.min #map31(%arg7)[%workgroup_size_1]
%19 = flow.dispatch.tensor.load %4, offsets = [%arg7, 0], sizes = [%18, 64], strides = [1, 1] : !flow.dispatch.tensor<readonly:3136x64xf32> -> tensor<?x64xf32>
%20 = affine.min #map11(%arg8)[%workgroup_size_0]
%21 = flow.dispatch.tensor.load %5, offsets = [0, %arg8], sizes = [64, %20], strides = [1, 1] : !flow.dispatch.tensor<readonly:64x64xf32> -> tensor<64x?xf32>
%22 = linalg.init_tensor [%18, %20] : tensor<?x?xf32>
%23 = linalg.fill(%cst_0, %22) : f32, tensor<?x?xf32> -> tensor<?x?xf32>
%24 = linalg.matmul ins(%19, %21 : tensor<?x64xf32>, tensor<64x?xf32>) outs(%23 : tensor<?x?xf32>) -> tensor<?x?xf32>
%25 = linalg.generic {indexing_maps = [#map25, #map29, #map29, #map29, #map29, #map25], iterator_types = ["parallel", "parallel"]} ins(%24, %12, %13, %14, %15 : tensor<?x?xf32>, tensor<?xf32>, tensor<?xf32>, tensor<?xf32>, tensor<?xf32>) outs(%17 : tensor<?x?xf32>) {
^bb0(%arg9: f32, %arg10: f32, %arg11: f32, %arg12: f32, %arg13: f32, %arg14: f32): // no predecessors
%26 = arith.addf %arg11, %cst : f32
%27 = math.rsqrt %26 : f32
%28 = arith.mulf %27, %arg12 : f32
%29 = arith.subf %arg9, %arg10 : f32
%30 = arith.mulf %29, %28 : f32
%31 = arith.addf %30, %arg13 : f32
%32 = arith.maxf %31, %cst_0 : f32
linalg.yield %32 : f32
} -> tensor<?x?xf32>
flow.dispatch.tensor.store %25, %6, offsets = [%arg7, %arg8], sizes = [%16, %11], strides = [1, 1] : tensor<?x?xf32> -> !flow.dispatch.tensor<writeonly:3136x64xf32>
}
}
return
}
}
}
stream.executable private @predict_dispatch_23 {
stream.executable.export public @predict_dispatch_23
builtin.module {
func @predict_dispatch_23(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding) {
%cst = arith.constant 2.550000e+02 : f32
%cst_0 = arith.constant 0.000000e+00 : f32
%cst_1 = arith.constant 2.560000e+02 : f32
%cst_2 = arith.constant 1.1920929E-7 : f32
%c3136 = arith.constant 3136 : index
%c64 = arith.constant 64 : index
%c0 = arith.constant 0 : index
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:3136x64xf32>
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:64xf32>
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:3136x64xf32>
%workgroup_size_0 = flow.dispatch.workgroup.size[0] : index
%workgroup_size_1 = flow.dispatch.workgroup.size[1] : index
%workgroup_id_0 = flow.dispatch.workgroup.id[0] : index
%workgroup_count_0 = flow.dispatch.workgroup.count[0] : index
%workgroup_id_1 = flow.dispatch.workgroup.id[1] : index
%workgroup_count_1 = flow.dispatch.workgroup.count[1] : index
%3 = affine.apply #map0()[%workgroup_id_1, %workgroup_size_1]
%4 = affine.apply #map0()[%workgroup_count_1, %workgroup_size_1]
scf.for %arg3 = %3 to %c3136 step %4 {
%5 = affine.apply #map0()[%workgroup_id_0, %workgroup_size_0]
%6 = affine.apply #map0()[%workgroup_count_0, %workgroup_size_0]
scf.for %arg4 = %5 to %c64 step %6 {
%7 = affine.min #map30(%arg3)[%workgroup_size_1]
%8 = affine.min #map9(%arg4)[%workgroup_size_0]
%9 = flow.dispatch.tensor.load %0, offsets = [%arg3, %arg4], sizes = [%7, %8], strides = [1, 1] : !flow.dispatch.tensor<readonly:3136x64xf32> -> tensor<?x?xf32>
%10 = flow.dispatch.tensor.load %1, offsets = [%arg4], sizes = [%8], strides = [1] : !flow.dispatch.tensor<readonly:64xf32> -> tensor<?xf32>
%11 = linalg.init_tensor [%7, %8] : tensor<?x?xf32>
%12 = linalg.generic {indexing_maps = [#map25, #map29, #map25], iterator_types = ["parallel", "parallel"]} ins(%9, %10 : tensor<?x?xf32>, tensor<?xf32>) outs(%11 : tensor<?x?xf32>) {
^bb0(%arg5: f32, %arg6: f32, %arg7: f32): // no predecessors
%13 = arith.addf %arg6, %cst_2 : f32
%14 = arith.divf %cst_1, %13 : f32
%15 = arith.mulf %arg5, %14 : f32
%16 = math.floor %15 : f32
%17 = arith.maxf %16, %cst_0 : f32
%18 = arith.minf %17, %cst : f32
%19 = arith.divf %18, %14 : f32
linalg.yield %19 : f32
} -> tensor<?x?xf32>
flow.dispatch.tensor.store %12, %2, offsets = [%arg3, %arg4], sizes = [%7, %8], strides = [1, 1] : tensor<?x?xf32> -> !flow.dispatch.tensor<writeonly:3136x64xf32>
}
}
return
}
}
}
stream.executable private @predict_dispatch_26 {
stream.executable.export public @predict_dispatch_26
builtin.module {
func @predict_dispatch_26(%arg0: !stream.binding, %arg1: !stream.binding) {
%c3 = arith.constant 3 : index
%c64 = arith.constant 64 : index
%c0 = arith.constant 0 : index
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:3x3x64x64xf32>
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:3x3x64x64xf32>
%workgroup_size_0 = flow.dispatch.workgroup.size[0] : index
%workgroup_size_1 = flow.dispatch.workgroup.size[1] : index
%workgroup_size_2 = flow.dispatch.workgroup.size[2] : index
%workgroup_id_0 = flow.dispatch.workgroup.id[0] : index
%workgroup_count_0 = flow.dispatch.workgroup.count[0] : index
%workgroup_id_1 = flow.dispatch.workgroup.id[1] : index
%workgroup_count_1 = flow.dispatch.workgroup.count[1] : index
%workgroup_id_2 = flow.dispatch.workgroup.id[2] : index
%workgroup_count_2 = flow.dispatch.workgroup.count[2] : index
%2 = affine.apply #map0()[%workgroup_id_2, %workgroup_size_2]
%3 = affine.apply #map0()[%workgroup_count_2, %workgroup_size_2]
scf.for %arg2 = %2 to %c3 step %3 {
%4 = affine.apply #map0()[%workgroup_id_1, %workgroup_size_1]
%5 = affine.apply #map0()[%workgroup_count_1, %workgroup_size_1]
scf.for %arg3 = %4 to %c64 step %5 {
%6 = affine.apply #map0()[%workgroup_id_0, %workgroup_size_0]
%7 = affine.apply #map0()[%workgroup_count_0, %workgroup_size_0]
scf.for %arg4 = %6 to %c64 step %7 {
%8 = affine.min #map2(%arg2)[%workgroup_size_2]
%9 = affine.min #map9(%arg3)[%workgroup_size_1]
%10 = affine.min #map9(%arg4)[%workgroup_size_0]
%11 = flow.dispatch.tensor.load %0, offsets = [0, %arg2, %arg3, %arg4], sizes = [3, %8, %9, %10], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:3x3x64x64xf32> -> tensor<3x?x?x?xf32>
%12 = linalg.init_tensor [3, %8, %9, %10] : tensor<3x?x?x?xf32>
%13 = linalg.generic {indexing_maps = [#map10, #map10], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%11 : tensor<3x?x?x?xf32>) outs(%12 : tensor<3x?x?x?xf32>) {
^bb0(%arg5: f32, %arg6: f32): // no predecessors
%14 = math.abs %arg5 : f32
linalg.yield %14 : f32
} -> tensor<3x?x?x?xf32>
flow.dispatch.tensor.store %13, %1, offsets = [0, %arg2, %arg3, %arg4], sizes = [3, %8, %9, %10], strides = [1, 1, 1, 1] : tensor<3x?x?x?xf32> -> !flow.dispatch.tensor<writeonly:3x3x64x64xf32>
}
}
}
return
}
}
}
stream.executable private @predict_dispatch_27 {
stream.executable.export public @predict_dispatch_27
builtin.module {
func @predict_dispatch_27(%arg0: !stream.binding, %arg1: !stream.binding) {
%cst = arith.constant 0xFF800000 : f32
%c64 = arith.constant 64 : index
%c0 = arith.constant 0 : index
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:3x3x64x64xf32>
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:64xf32>
%workgroup_size_0 = flow.dispatch.workgroup.size[0] : index
%workgroup_id_0 = flow.dispatch.workgroup.id[0] : index
%workgroup_count_0 = flow.dispatch.workgroup.count[0] : index
%2 = affine.apply #map0()[%workgroup_id_0, %workgroup_size_0]
%3 = affine.apply #map0()[%workgroup_count_0, %workgroup_size_0]
scf.for %arg2 = %2 to %c64 step %3 {
%4 = affine.min #map9(%arg2)[%workgroup_size_0]
%5 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, %arg2], sizes = [3, 3, 64, %4], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:3x3x64x64xf32> -> tensor<3x3x64x?xf32>
%6 = affine.min #map11(%arg2)[%workgroup_size_0]
%7 = linalg.init_tensor [%6] : tensor<?xf32>
%8 = linalg.fill(%cst, %7) : f32, tensor<?xf32> -> tensor<?xf32>
%9 = linalg.generic {indexing_maps = [#map12, #map13], iterator_types = ["parallel", "reduction", "reduction", "reduction"]} ins(%5 : tensor<3x3x64x?xf32>) outs(%8 : tensor<?xf32>) {
^bb0(%arg3: f32, %arg4: f32): // no predecessors
%10 = arith.maxf %arg3, %arg4 : f32
linalg.yield %10 : f32
} -> tensor<?xf32>
flow.dispatch.tensor.store %9, %1, offsets = [%arg2], sizes = [%4], strides = [1] : tensor<?xf32> -> !flow.dispatch.tensor<writeonly:64xf32>
}
return
}
}
}
stream.executable private @predict_dispatch_28 {
stream.executable.export public @predict_dispatch_28
builtin.module {
func @predict_dispatch_28(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding) {
%cst = arith.constant 5.000000e-01 : f32
%cst_0 = arith.constant -1.270000e+02 : f32
%cst_1 = arith.constant 1.270000e+02 : f32
%cst_2 = arith.constant 1.1920929E-7 : f32
%c3 = arith.constant 3 : index
%c64 = arith.constant 64 : index
%c0 = arith.constant 0 : index
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:3x3x64x64xf32>
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:64xf32>
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:3x3x64x64xf32>
%workgroup_size_0 = flow.dispatch.workgroup.size[0] : index
%workgroup_size_1 = flow.dispatch.workgroup.size[1] : index
%workgroup_size_2 = flow.dispatch.workgroup.size[2] : index
%workgroup_id_0 = flow.dispatch.workgroup.id[0] : index
%workgroup_count_0 = flow.dispatch.workgroup.count[0] : index
%workgroup_id_1 = flow.dispatch.workgroup.id[1] : index
%workgroup_count_1 = flow.dispatch.workgroup.count[1] : index
%workgroup_id_2 = flow.dispatch.workgroup.id[2] : index
%workgroup_count_2 = flow.dispatch.workgroup.count[2] : index
%3 = affine.apply #map0()[%workgroup_id_2, %workgroup_size_2]
%4 = affine.apply #map0()[%workgroup_count_2, %workgroup_size_2]
scf.for %arg3 = %3 to %c3 step %4 {
%5 = affine.apply #map0()[%workgroup_id_1, %workgroup_size_1]
%6 = affine.apply #map0()[%workgroup_count_1, %workgroup_size_1]
scf.for %arg4 = %5 to %c64 step %6 {
%7 = affine.apply #map0()[%workgroup_id_0, %workgroup_size_0]
%8 = affine.apply #map0()[%workgroup_count_0, %workgroup_size_0]
scf.for %arg5 = %7 to %c64 step %8 {
%9 = affine.min #map2(%arg3)[%workgroup_size_2]
%10 = affine.min #map9(%arg4)[%workgroup_size_1]
%11 = affine.min #map9(%arg5)[%workgroup_size_0]
%12 = flow.dispatch.tensor.load %0, offsets = [0, %arg3, %arg4, %arg5], sizes = [3, %9, %10, %11], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:3x3x64x64xf32> -> tensor<3x?x?x?xf32>
%13 = flow.dispatch.tensor.load %1, offsets = [%arg5], sizes = [%11], strides = [1] : !flow.dispatch.tensor<readonly:64xf32> -> tensor<?xf32>
%14 = linalg.init_tensor [3, %9, %10, %11] : tensor<3x?x?x?xf32>
%15 = linalg.generic {indexing_maps = [#map10, #map14, #map10], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%12, %13 : tensor<3x?x?x?xf32>, tensor<?xf32>) outs(%14 : tensor<3x?x?x?xf32>) {
^bb0(%arg6: f32, %arg7: f32, %arg8: f32): // no predecessors
%16 = arith.addf %arg7, %cst_2 : f32
%17 = arith.divf %cst_1, %16 : f32
%18 = arith.mulf %arg6, %17 : f32
%19 = arith.maxf %18, %cst_0 : f32
%20 = arith.minf %19, %cst_1 : f32
%21 = arith.addf %20, %cst : f32
%22 = math.floor %21 : f32
%23 = arith.divf %22, %17 : f32
linalg.yield %23 : f32
} -> tensor<3x?x?x?xf32>
flow.dispatch.tensor.store %15, %2, offsets = [0, %arg3, %arg4, %arg5], sizes = [3, %9, %10, %11], strides = [1, 1, 1, 1] : tensor<3x?x?x?xf32> -> !flow.dispatch.tensor<writeonly:3x3x64x64xf32>
}
}
}
return
}
}
}
stream.executable private @predict_dispatch_29 {
stream.executable.export public @predict_dispatch_29
builtin.module {
func @predict_dispatch_29(%arg0: !stream.binding, %arg1: !stream.binding) {
%c56 = arith.constant 56 : index
%c64 = arith.constant 64 : index
%c0 = arith.constant 0 : index
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readwrite:1x58x58x64xf32>
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:1x56x56x64xf32>
%workgroup_size_0 = flow.dispatch.workgroup.size[0] : index
%workgroup_size_1 = flow.dispatch.workgroup.size[1] : index
%workgroup_size_2 = flow.dispatch.workgroup.size[2] : index
%workgroup_id_0 = flow.dispatch.workgroup.id[0] : index
%workgroup_count_0 = flow.dispatch.workgroup.count[0] : index
%workgroup_id_1 = flow.dispatch.workgroup.id[1] : index
%workgroup_count_1 = flow.dispatch.workgroup.count[1] : index
%workgroup_id_2 = flow.dispatch.workgroup.id[2] : index
%workgroup_count_2 = flow.dispatch.workgroup.count[2] : index
%2 = affine.apply #map15()[%workgroup_size_2, %workgroup_id_2]
%3 = affine.apply #map15()[%workgroup_size_2, %workgroup_count_2]
scf.for %arg2 = %2 to %c56 step %3 {
%4 = affine.min #map22(%arg2)[%workgroup_size_2]
%5 = affine.apply #map15()[%workgroup_size_1, %workgroup_id_1]
%6 = affine.apply #map15()[%workgroup_size_1, %workgroup_count_1]
scf.for %arg3 = %5 to %c56 step %6 {
%7 = affine.min #map22(%arg3)[%workgroup_size_1]
%8 = affine.apply #map15()[%workgroup_size_0, %workgroup_id_0]
%9 = affine.apply #map15()[%workgroup_size_0, %workgroup_count_0]
scf.for %arg4 = %8 to %c64 step %9 {
%10 = affine.min #map9(%arg4)[%workgroup_size_0]
%11 = flow.dispatch.tensor.load %1, offsets = [0, %arg2, %arg3, %arg4], sizes = [1, %4, %7, %10], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:1x56x56x64xf32> -> tensor<1x?x?x?xf32>
%12 = affine.apply #map32(%arg2)
%13 = affine.apply #map32(%arg3)
flow.dispatch.tensor.store %11, %0, offsets = [0, %12, %13, %arg4], sizes = [1, %4, %7, %10], strides = [1, 1, 1, 1] : tensor<1x?x?x?xf32> -> !flow.dispatch.tensor<readwrite:1x58x58x64xf32>
}
}
}
return
}
}
}
stream.executable private @predict_dispatch_30 {
stream.executable.export public @predict_dispatch_30
builtin.module {
func @predict_dispatch_30(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding, %arg3: !stream.binding, %arg4: !stream.binding, %arg5: !stream.binding, %arg6: !stream.binding) {
%cst = arith.constant 9.99999974E-6 : f32
%cst_0 = arith.constant 0.000000e+00 : f32
%c56 = arith.constant 56 : index
%c64 = arith.constant 64 : index
%c0 = arith.constant 0 : index
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:64xf32>
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:64xf32>
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:64xf32>
%3 = stream.binding.subspan %arg3[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:64xf32>
%4 = stream.binding.subspan %arg4[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:1x58x58x64xf32>
%5 = stream.binding.subspan %arg5[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:3x3x64x64xf32>
%6 = stream.binding.subspan %arg6[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:1x56x56x64xf32>
%workgroup_size_0 = flow.dispatch.workgroup.size[0] : index
%workgroup_size_1 = flow.dispatch.workgroup.size[1] : index
%workgroup_size_2 = flow.dispatch.workgroup.size[2] : index
%workgroup_id_0 = flow.dispatch.workgroup.id[0] : index
%workgroup_count_0 = flow.dispatch.workgroup.count[0] : index
%workgroup_id_1 = flow.dispatch.workgroup.id[1] : index
%workgroup_count_1 = flow.dispatch.workgroup.count[1] : index
%workgroup_id_2 = flow.dispatch.workgroup.id[2] : index
%workgroup_count_2 = flow.dispatch.workgroup.count[2] : index
%7 = affine.apply #map0()[%workgroup_id_2, %workgroup_size_2]
%8 = affine.apply #map0()[%workgroup_count_2, %workgroup_size_2]
scf.for %arg7 = %7 to %c56 step %8 {
%9 = affine.apply #map0()[%workgroup_id_1, %workgroup_size_1]
%10 = affine.apply #map0()[%workgroup_count_1, %workgroup_size_1]
scf.for %arg8 = %9 to %c56 step %10 {
%11 = affine.apply #map0()[%workgroup_id_0, %workgroup_size_0]
%12 = affine.apply #map0()[%workgroup_count_0, %workgroup_size_0]
scf.for %arg9 = %11 to %c64 step %12 {
%13 = affine.min #map22(%arg7)[%workgroup_size_2]
%14 = affine.min #map22(%arg8)[%workgroup_size_1]
%15 = affine.min #map9(%arg9)[%workgroup_size_0]
%16 = flow.dispatch.tensor.load %0, offsets = [%arg9], sizes = [%15], strides = [1] : !flow.dispatch.tensor<readonly:64xf32> -> tensor<?xf32>
%17 = flow.dispatch.tensor.load %1, offsets = [%arg9], sizes = [%15], strides = [1] : !flow.dispatch.tensor<readonly:64xf32> -> tensor<?xf32>
%18 = flow.dispatch.tensor.load %2, offsets = [%arg9], sizes = [%15], strides = [1] : !flow.dispatch.tensor<readonly:64xf32> -> tensor<?xf32>
%19 = flow.dispatch.tensor.load %3, offsets = [%arg9], sizes = [%15], strides = [1] : !flow.dispatch.tensor<readonly:64xf32> -> tensor<?xf32>
%20 = linalg.init_tensor [1, %13, %14, %15] : tensor<1x?x?x?xf32>
%21 = affine.min #map33(%arg7, %13)
%22 = affine.min #map33(%arg8, %14)
%23 = flow.dispatch.tensor.load %4, offsets = [0, %arg7, %arg8, 0], sizes = [1, %21, %22, 64], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:1x58x58x64xf32> -> tensor<1x?x?x64xf32>
%24 = affine.min #map11(%arg9)[%workgroup_size_0]
%25 = flow.dispatch.tensor.load %5, offsets = [0, 0, 0, %arg9], sizes = [3, 3, 64, %24], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:3x3x64x64xf32> -> tensor<3x3x64x?xf32>
%26 = affine.min #map23(%arg7)[%workgroup_size_2]
%27 = affine.min #map23(%arg8)[%workgroup_size_1]
%28 = linalg.init_tensor [1, %26, %27, %24] : tensor<1x?x?x?xf32>
%29 = linalg.fill(%cst_0, %28) : f32, tensor<1x?x?x?xf32> -> tensor<1x?x?x?xf32>
%30 = linalg.conv_2d_nhwc_hwcf {dilations = dense<1> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>} ins(%23, %25 : tensor<1x?x?x64xf32>, tensor<3x3x64x?xf32>) outs(%29 : tensor<1x?x?x?xf32>) -> tensor<1x?x?x?xf32>
%31 = linalg.generic {indexing_maps = [#map10, #map14, #map14, #map14, #map14, #map10], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%30, %16, %17, %18, %19 : tensor<1x?x?x?xf32>, tensor<?xf32>, tensor<?xf32>, tensor<?xf32>, tensor<?xf32>) outs(%20 : tensor<1x?x?x?xf32>) {
^bb0(%arg10: f32, %arg11: f32, %arg12: f32, %arg13: f32, %arg14: f32, %arg15: f32): // no predecessors
%32 = arith.addf %arg12, %cst : f32
%33 = math.rsqrt %32 : f32
%34 = arith.mulf %33, %arg13 : f32
%35 = arith.subf %arg10, %arg11 : f32
%36 = arith.mulf %35, %34 : f32
%37 = arith.addf %36, %arg14 : f32
%38 = arith.maxf %37, %cst_0 : f32
linalg.yield %38 : f32
} -> tensor<1x?x?x?xf32>
flow.dispatch.tensor.store %31, %6, offsets = [0, %arg7, %arg8, %arg9], sizes = [1, %13, %14, %15], strides = [1, 1, 1, 1] : tensor<1x?x?x?xf32> -> !flow.dispatch.tensor<writeonly:1x56x56x64xf32>
}
}
}
return
}
}
}
stream.executable private @predict_dispatch_37 {
stream.executable.export public @predict_dispatch_37
builtin.module {
func @predict_dispatch_37(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding) {
%cst = arith.constant 0.000000e+00 : f32
%c3136 = arith.constant 3136 : index
%c256 = arith.constant 256 : index
%c0 = arith.constant 0 : index
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:3136x64xf32>
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:64x256xf32>
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:3136x256xf32>
%workgroup_size_0 = flow.dispatch.workgroup.size[0] : index
%workgroup_size_1 = flow.dispatch.workgroup.size[1] : index
%workgroup_id_0 = flow.dispatch.workgroup.id[0] : index
%workgroup_count_0 = flow.dispatch.workgroup.count[0] : index
%workgroup_id_1 = flow.dispatch.workgroup.id[1] : index
%workgroup_count_1 = flow.dispatch.workgroup.count[1] : index
%3 = affine.apply #map0()[%workgroup_id_1, %workgroup_size_1]
%4 = affine.apply #map0()[%workgroup_count_1, %workgroup_size_1]
scf.for %arg3 = %3 to %c3136 step %4 {
%5 = affine.apply #map0()[%workgroup_id_0, %workgroup_size_0]
%6 = affine.apply #map0()[%workgroup_count_0, %workgroup_size_0]
scf.for %arg4 = %5 to %c256 step %6 {
%7 = affine.min #map30(%arg3)[%workgroup_size_1]
%8 = flow.dispatch.tensor.load %0, offsets = [%arg3, 0], sizes = [%7, 64], strides = [1, 1] : !flow.dispatch.tensor<readonly:3136x64xf32> -> tensor<?x64xf32>
%9 = affine.min #map24(%arg4)[%workgroup_size_0]
%10 = flow.dispatch.tensor.load %1, offsets = [0, %arg4], sizes = [64, %9], strides = [1, 1] : !flow.dispatch.tensor<readonly:64x256xf32> -> tensor<64x?xf32>
%11 = affine.min #map31(%arg3)[%workgroup_size_1]
%12 = affine.min #map26(%arg4)[%workgroup_size_0]
%13 = linalg.init_tensor [%11, %12] : tensor<?x?xf32>
%14 = linalg.fill(%cst, %13) : f32, tensor<?x?xf32> -> tensor<?x?xf32>
%15 = linalg.matmul ins(%8, %10 : tensor<?x64xf32>, tensor<64x?xf32>) outs(%14 : tensor<?x?xf32>) -> tensor<?x?xf32>
flow.dispatch.tensor.store %15, %2, offsets = [%arg3, %arg4], sizes = [%7, %9], strides = [1, 1] : tensor<?x?xf32> -> !flow.dispatch.tensor<writeonly:3136x256xf32>
}
}
return
}
}
}
stream.executable private @predict_dispatch_38 {
stream.executable.export public @predict_dispatch_38
builtin.module {
func @predict_dispatch_38(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding, %arg3: !stream.binding, %arg4: !stream.binding, %arg5: !stream.binding, %arg6: !stream.binding, %arg7: !stream.binding, %arg8: !stream.binding, %arg9: !stream.binding, %arg10: !stream.binding, %arg11: !stream.binding) {
%cst = arith.constant 9.99999974E-6 : f32
%cst_0 = arith.constant 0.000000e+00 : f32
%c3136 = arith.constant 3136 : index
%c256 = arith.constant 256 : index
%c0 = arith.constant 0 : index
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:256xf32>
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:256xf32>
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:256xf32>
%3 = stream.binding.subspan %arg3[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:256xf32>
%4 = stream.binding.subspan %arg4[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:3136x256xf32>
%5 = stream.binding.subspan %arg5[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:256xf32>
%6 = stream.binding.subspan %arg6[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:256xf32>
%7 = stream.binding.subspan %arg7[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:256xf32>
%8 = stream.binding.subspan %arg8[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:256xf32>
%9 = stream.binding.subspan %arg9[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:3136x64xf32>
%10 = stream.binding.subspan %arg10[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:64x256xf32>
%11 = stream.binding.subspan %arg11[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:3136x256xf32>
%workgroup_size_0 = flow.dispatch.workgroup.size[0] : index
%workgroup_size_1 = flow.dispatch.workgroup.size[1] : index
%workgroup_id_0 = flow.dispatch.workgroup.id[0] : index
%workgroup_count_0 = flow.dispatch.workgroup.count[0] : index
%workgroup_id_1 = flow.dispatch.workgroup.id[1] : index
%workgroup_count_1 = flow.dispatch.workgroup.count[1] : index
%12 = affine.apply #map0()[%workgroup_id_1, %workgroup_size_1]
%13 = affine.apply #map0()[%workgroup_count_1, %workgroup_size_1]
scf.for %arg12 = %12 to %c3136 step %13 {
%14 = affine.apply #map0()[%workgroup_id_0, %workgroup_size_0]
%15 = affine.apply #map0()[%workgroup_count_0, %workgroup_size_0]
scf.for %arg13 = %14 to %c256 step %15 {
%16 = affine.min #map24(%arg13)[%workgroup_size_0]
%17 = flow.dispatch.tensor.load %0, offsets = [%arg13], sizes = [%16], strides = [1] : !flow.dispatch.tensor<readonly:256xf32> -> tensor<?xf32>
%18 = flow.dispatch.tensor.load %1, offsets = [%arg13], sizes = [%16], strides = [1] : !flow.dispatch.tensor<readonly:256xf32> -> tensor<?xf32>
%19 = flow.dispatch.tensor.load %2, offsets = [%arg13], sizes = [%16], strides = [1] : !flow.dispatch.tensor<readonly:256xf32> -> tensor<?xf32>
%20 = flow.dispatch.tensor.load %3, offsets = [%arg13], sizes = [%16], strides = [1] : !flow.dispatch.tensor<readonly:256xf32> -> tensor<?xf32>
%21 = affine.min #map30(%arg12)[%workgroup_size_1]
%22 = flow.dispatch.tensor.load %4, offsets = [%arg12, %arg13], sizes = [%21, %16], strides = [1, 1] : !flow.dispatch.tensor<readonly:3136x256xf32> -> tensor<?x?xf32>
%23 = flow.dispatch.tensor.load %5, offsets = [%arg13], sizes = [%16], strides = [1] : !flow.dispatch.tensor<readonly:256xf32> -> tensor<?xf32>
%24 = flow.dispatch.tensor.load %6, offsets = [%arg13], sizes = [%16], strides = [1] : !flow.dispatch.tensor<readonly:256xf32> -> tensor<?xf32>
%25 = flow.dispatch.tensor.load %7, offsets = [%arg13], sizes = [%16], strides = [1] : !flow.dispatch.tensor<readonly:256xf32> -> tensor<?xf32>
%26 = flow.dispatch.tensor.load %8, offsets = [%arg13], sizes = [%16], strides = [1] : !flow.dispatch.tensor<readonly:256xf32> -> tensor<?xf32>
%27 = linalg.init_tensor [%21, %16] : tensor<?x?xf32>
%28 = affine.min #map31(%arg12)[%workgroup_size_1]
%29 = flow.dispatch.tensor.load %9, offsets = [%arg12, 0], sizes = [%28, 64], strides = [1, 1] : !flow.dispatch.tensor<readonly:3136x64xf32> -> tensor<?x64xf32>
%30 = affine.min #map26(%arg13)[%workgroup_size_0]
%31 = flow.dispatch.tensor.load %10, offsets = [0, %arg13], sizes = [64, %30], strides = [1, 1] : !flow.dispatch.tensor<readonly:64x256xf32> -> tensor<64x?xf32>
%32 = linalg.init_tensor [%28, %30] : tensor<?x?xf32>
%33 = linalg.fill(%cst_0, %32) : f32, tensor<?x?xf32> -> tensor<?x?xf32>
%34 = linalg.matmul ins(%29, %31 : tensor<?x64xf32>, tensor<64x?xf32>) outs(%33 : tensor<?x?xf32>) -> tensor<?x?xf32>
%35 = linalg.generic {indexing_maps = [#map25, #map29, #map29, #map29, #map29, #map25, #map29, #map29, #map29, #map29, #map25], iterator_types = ["parallel", "parallel"]} ins(%34, %17, %18, %19, %20, %22, %23, %24, %25, %26 : tensor<?x?xf32>, tensor<?xf32>, tensor<?xf32>, tensor<?xf32>, tensor<?xf32>, tensor<?x?xf32>, tensor<?xf32>, tensor<?xf32>, tensor<?xf32>, tensor<?xf32>) outs(%27 : tensor<?x?xf32>) {
^bb0(%arg14: f32, %arg15: f32, %arg16: f32, %arg17: f32, %arg18: f32, %arg19: f32, %arg20: f32, %arg21: f32, %arg22: f32, %arg23: f32, %arg24: f32): // no predecessors
%36 = arith.addf %arg16, %cst : f32
%37 = math.rsqrt %36 : f32
%38 = arith.mulf %37, %arg17 : f32
%39 = arith.subf %arg14, %arg15 : f32
%40 = arith.mulf %39, %38 : f32
%41 = arith.addf %arg21, %cst : f32
%42 = math.rsqrt %41 : f32
%43 = arith.mulf %42, %arg22 : f32
%44 = arith.subf %arg19, %arg20 : f32
%45 = arith.mulf %44, %43 : f32
%46 = arith.addf %45, %arg23 : f32
%47 = arith.addf %40, %arg18 : f32
%48 = arith.addf %47, %46 : f32
%49 = arith.maxf %48, %cst_0 : f32
linalg.yield %49 : f32
} -> tensor<?x?xf32>
flow.dispatch.tensor.store %35, %11, offsets = [%arg12, %arg13], sizes = [%21, %16], strides = [1, 1] : tensor<?x?xf32> -> !flow.dispatch.tensor<writeonly:3136x256xf32>
}
}
return
}
}
}
stream.executable private @predict_dispatch_39 {
stream.executable.export public @predict_dispatch_39
builtin.module {
func @predict_dispatch_39(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding) {
%cst = arith.constant 2.550000e+02 : f32
%cst_0 = arith.constant 0.000000e+00 : f32
%cst_1 = arith.constant 2.560000e+02 : f32
%cst_2 = arith.constant 1.1920929E-7 : f32
%c3136 = arith.constant 3136 : index
%c256 = arith.constant 256 : index
%c0 = arith.constant 0 : index
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:3136x256xf32>
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:256xf32>
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:3136x256xf32>
%workgroup_size_0 = flow.dispatch.workgroup.size[0] : index
%workgroup_size_1 = flow.dispatch.workgroup.size[1] : index
%workgroup_id_0 = flow.dispatch.workgroup.id[0] : index
%workgroup_count_0 = flow.dispatch.workgroup.count[0] : index
%workgroup_id_1 = flow.dispatch.workgroup.id[1] : index
%workgroup_count_1 = flow.dispatch.workgroup.count[1] : index
%3 = affine.apply #map0()[%workgroup_id_1, %workgroup_size_1]
%4 = affine.apply #map0()[%workgroup_count_1, %workgroup_size_1]
scf.for %arg3 = %3 to %c3136 step %4 {
%5 = affine.apply #map0()[%workgroup_id_0, %workgroup_size_0]
%6 = affine.apply #map0()[%workgroup_count_0, %workgroup_size_0]
scf.for %arg4 = %5 to %c256 step %6 {
%7 = affine.min #map30(%arg3)[%workgroup_size_1]
%8 = affine.min #map24(%arg4)[%workgroup_size_0]
%9 = flow.dispatch.tensor.load %0, offsets = [%arg3, %arg4], sizes = [%7, %8], strides = [1, 1] : !flow.dispatch.tensor<readonly:3136x256xf32> -> tensor<?x?xf32>
%10 = flow.dispatch.tensor.load %1, offsets = [%arg4], sizes = [%8], strides = [1] : !flow.dispatch.tensor<readonly:256xf32> -> tensor<?xf32>
%11 = linalg.init_tensor [%7, %8] : tensor<?x?xf32>
%12 = linalg.generic {indexing_maps = [#map25, #map29, #map25], iterator_types = ["parallel", "parallel"]} ins(%9, %10 : tensor<?x?xf32>, tensor<?xf32>) outs(%11 : tensor<?x?xf32>) {
^bb0(%arg5: f32, %arg6: f32, %arg7: f32): // no predecessors
%13 = arith.addf %arg6, %cst_2 : f32
%14 = arith.divf %cst_1, %13 : f32
%15 = arith.mulf %arg5, %14 : f32
%16 = math.floor %15 : f32
%17 = arith.maxf %16, %cst_0 : f32
%18 = arith.minf %17, %cst : f32
%19 = arith.divf %18, %14 : f32
linalg.yield %19 : f32
} -> tensor<?x?xf32>
flow.dispatch.tensor.store %12, %2, offsets = [%arg3, %arg4], sizes = [%7, %8], strides = [1, 1] : tensor<?x?xf32> -> !flow.dispatch.tensor<writeonly:3136x256xf32>
}
}
return
}
}
}
stream.executable private @predict_dispatch_40 {
stream.executable.export public @predict_dispatch_40
builtin.module {
func @predict_dispatch_40(%arg0: !stream.binding, %arg1: !stream.binding) {
%cst = arith.constant -1.000000e+00 : f32
%c256 = arith.constant 256 : index
%c0 = arith.constant 0 : index
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:256xf32>
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:256xi1>
%workgroup_size_0 = flow.dispatch.workgroup.size[0] : index
%workgroup_id_0 = flow.dispatch.workgroup.id[0] : index
%workgroup_count_0 = flow.dispatch.workgroup.count[0] : index
%2 = affine.apply #map0()[%workgroup_id_0, %workgroup_size_0]
%3 = affine.apply #map0()[%workgroup_count_0, %workgroup_size_0]
scf.for %arg2 = %2 to %c256 step %3 {
%4 = affine.min #map24(%arg2)[%workgroup_size_0]
%5 = flow.dispatch.tensor.load %0, offsets = [%arg2], sizes = [%4], strides = [1] : !flow.dispatch.tensor<readonly:256xf32> -> tensor<?xf32>
%6 = linalg.init_tensor [%4] : tensor<?xi1>
%7 = linalg.generic {indexing_maps = [#map5, #map5], iterator_types = ["parallel"]} ins(%5 : tensor<?xf32>) outs(%6 : tensor<?xi1>) {
^bb0(%arg3: f32, %arg4: i1): // no predecessors
%8 = arith.cmpf oeq, %arg3, %cst : f32
linalg.yield %8 : i1
} -> tensor<?xi1>
flow.dispatch.tensor.store %7, %1, offsets = [%arg2], sizes = [%4], strides = [1] : tensor<?xi1> -> !flow.dispatch.tensor<writeonly:256xi1>
}
return
}
}
}
stream.executable private @predict_dispatch_41 {
stream.executable.export public @predict_dispatch_41
builtin.module {
func @predict_dispatch_41(%arg0: !stream.binding, %arg1: !stream.binding) {
%true = arith.constant true
%c0 = arith.constant 0 : index
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:256xi1>
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:i32>
%2 = flow.dispatch.tensor.load %0, offsets = [], sizes = [], strides = [] : !flow.dispatch.tensor<readonly:256xi1> -> tensor<256xi1>
%3 = linalg.init_tensor [] : tensor<i32>
%4 = linalg.init_tensor [] : tensor<i1>
%5 = linalg.fill(%true, %4) : i1, tensor<i1> -> tensor<i1>
%6 = linalg.generic {indexing_maps = [#map5, #map6], iterator_types = ["reduction"]} ins(%2 : tensor<256xi1>) outs(%5 : tensor<i1>) {
^bb0(%arg2: i1, %arg3: i1): // no predecessors
%8 = arith.andi %arg2, %arg3 : i1
linalg.yield %8 : i1
} -> tensor<i1>
%7 = linalg.generic {indexing_maps = [#map7, #map7], iterator_types = []} ins(%6 : tensor<i1>) outs(%3 : tensor<i32>) {
^bb0(%arg2: i1, %arg3: i32): // no predecessors
%8 = arith.xori %arg2, %true : i1
%9 = arith.extui %8 : i1 to i32
linalg.yield %9 : i32
} -> tensor<i32>
flow.dispatch.tensor.store %7, %1, offsets = [], sizes = [], strides = [] : tensor<i32> -> !flow.dispatch.tensor<writeonly:i32>
return
}
}
}
func @predict(%arg0: !hal.buffer_view) -> !stream.resource<transient> attributes {iree.abi.stub} {
%c3 = arith.constant 3 : index
%c224 = arith.constant 224 : index
%c1 = arith.constant 1 : index
%c1_i32 = arith.constant 1 : i32
%c553648160_i32 = arith.constant 553648160 : i32
%c7 = arith.constant 7 : index
%c1024 = arith.constant 1024 : index
%c3136 = arith.constant 3136 : index
%c256 = arith.constant 256 : index
%c56 = arith.constant 56 : index
%c112 = arith.constant 112 : index
%c64 = arith.constant 64 : index
%c0_i32 = arith.constant 0 : i32
%cst = arith.constant 0xFF800000 : f32
%cst_0 = arith.constant 0.000000e+00 : f32
%c602112 = arith.constant 602112 : index
%c12 = arith.constant 12 : index
%c4 = arith.constant 4 : index
%c0 = arith.constant 0 : index
%c37632 = arith.constant 37632 : index
%c634800 = arith.constant 634800 : index
%c3211264 = arith.constant 3211264 : index
%c3268864 = arith.constant 3268864 : index
%c802816 = arith.constant 802816 : index
%c65536 = arith.constant 65536 : index
%c16384 = arith.constant 16384 : index
%c147456 = arith.constant 147456 : index
%c861184 = arith.constant 861184 : index
%_variables$7 = util.global.load @_variables$7 : !stream.resource<constant>
%_variables$7__size = util.global.load @_variables$7__size : index
%_variables$6 = util.global.load @_variables$6 : !stream.resource<constant>
%_variables$6__size = util.global.load @_variables$6__size : index
%_variables$590 = util.global.load @_variables$590 : !stream.resource<constant>
%_variables$590__size = util.global.load @_variables$590__size : index
%_variables$589 = util.global.load @_variables$589 : !stream.resource<constant>
%_variables$589__size = util.global.load @_variables$589__size : index
%_variables$588 = util.global.load @_variables$588 : !stream.resource<constant>
%_variables$588__size = util.global.load @_variables$588__size : index
%_variables$5 = util.global.load @_variables$5 : !stream.resource<constant>
%_variables$5__size = util.global.load @_variables$5__size : index
%_variables$443 = util.global.load @_variables$443 : !stream.resource<constant>
%_variables$442 = util.global.load @_variables$442 : !stream.resource<constant>
%_variables$442__size = util.global.load @_variables$442__size : index
%_variables$441 = util.global.load @_variables$441 : !stream.resource<constant>
%_variables$441__size = util.global.load @_variables$441__size : index
%_variables$440 = util.global.load @_variables$440 : !stream.resource<constant>
%_variables$439 = util.global.load @_variables$439 : !stream.resource<constant>
%_variables$439__size = util.global.load @_variables$439__size : index
%_variables$438 = util.global.load @_variables$438 : !stream.resource<constant>
%_variables$437 = util.global.load @_variables$437 : !stream.resource<constant>
%_variables$437__size = util.global.load @_variables$437__size : index
%_variables$436 = util.global.load @_variables$436 : !stream.resource<constant>
%_variables$436__size = util.global.load @_variables$436__size : index
%_variables$435 = util.global.load @_variables$435 : !stream.resource<constant>
%_variables$435__size = util.global.load @_variables$435__size : index
%_variables$434 = util.global.load @_variables$434 : !stream.resource<constant>
%_variables$434__size = util.global.load @_variables$434__size : index
%_variables$433 = util.global.load @_variables$433 : !stream.resource<constant>
%_variables$433__size = util.global.load @_variables$433__size : index
%_variables$432 = util.global.load @_variables$432 : !stream.resource<constant>
%_variables$432__size = util.global.load @_variables$432__size : index
%_variables$424 = util.global.load @_variables$424 : !stream.resource<constant>
%_variables$4 = util.global.load @_variables$4 : !stream.resource<constant>
%_variables$4__size = util.global.load @_variables$4__size : index
%_variables$3 = util.global.load @_variables$3 : !stream.resource<constant>
%_variables$3__size = util.global.load @_variables$3__size : index
%_variables$2 = util.global.load @_variables$2 : !stream.resource<constant>
%_variables$2__size = util.global.load @_variables$2__size : index
%_variables$136 = util.global.load @_variables$136 : !stream.resource<constant>
%_variables$130 = util.global.load @_variables$130 : !stream.resource<constant>
%_variables$124 = util.global.load @_variables$124 : !stream.resource<constant>
%_variables$118 = util.global.load @_variables$118 : !stream.resource<constant>
%_variables$112 = util.global.load @_variables$112 : !stream.resource<constant>
%_variables$105 = util.global.load @_variables$105 : !stream.resource<constant>
%_variables$105__size = util.global.load @_variables$105__size : index
%_variables$104 = util.global.load @_variables$104 : !stream.resource<constant>
%_variables$104__size = util.global.load @_variables$104__size : index
%_variables$1 = util.global.load @_variables$1 : !stream.resource<constant>
%_variables$1__size = util.global.load @_variables$1__size : index
%_variables$0 = util.global.load @_variables$0 : !stream.resource<constant>
%_variables$0__size = util.global.load @_variables$0__size : index
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c1, %c224, %c224, %c3]) type(%c553648160_i32) encoding(%c1_i32)
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<1x224x224x3xf32> in !stream.resource<external>{%c602112}
%1 = stream.async.dispatch @predict_dispatch_0::@predict_dispatch_0[%c3, %c224, %c224](%0, %_variables$424) : (!stream.resource<external>{%c602112}, !stream.resource<constant>{%c12}) -> !stream.resource<external>{%c602112}
%2 = stream.async.dispatch @predict_dispatch_1::@predict_dispatch_1[%c3, %c1, %c1](%_variables$424) : (!stream.resource<constant>{%c12}) -> !stream.resource<transient>{%c3}
%3 = stream.async.dispatch @predict_dispatch_2::@predict_dispatch_2[%c1, %c1, %c1](%2) : (!stream.resource<transient>{%c3}) -> !stream.resource<transient>{%c4}
%4 = stream.async.transfer %3 : !stream.resource<transient>{%c4} -> !stream.resource<staging>{%c4}
%5 = stream.async.load %4[%c0] : !stream.resource<staging>{%c4} -> i32
%6 = arith.cmpi eq, %5, %c0_i32 : i32
%7 = select %6, %0, %1 : !stream.resource<external>
%8 = stream.async.dispatch @predict_dispatch_3::@predict_dispatch_3[%c64, %c3, %c7](%_variables$590) : (!stream.resource<constant>{%_variables$590__size}) -> !stream.resource<transient>{%c37632}
%9 = stream.async.dispatch @predict_dispatch_4::@predict_dispatch_4[%c64, %c1, %c1](%8) : (!stream.resource<transient>{%c37632}) -> !stream.resource<transient>{%c256}
%10 = stream.async.dispatch @predict_dispatch_5::@predict_dispatch_5[%c64, %c3, %c7](%_variables$590, %9) : (!stream.resource<constant>{%_variables$590__size}, !stream.resource<transient>{%c256}) -> !stream.resource<transient>{%c37632}
%11 = stream.async.splat %cst_0 : f32 -> !stream.resource<transient>{%c634800}
%12 = stream.async.dispatch @predict_dispatch_6::@predict_dispatch_6[%c3, %c224, %c224](%11, %7) : (!stream.resource<transient>{%c634800}, !stream.resource<external>{%c602112}) -> %11{%c634800}
%13 = stream.async.dispatch @predict_dispatch_7::@predict_dispatch_7[%c64, %c112, %c112](%_variables$104, %_variables$105, %_variables$589, %_variables$588, %12, %10) : (!stream.resource<constant>{%_variables$104__size}, !stream.resource<constant>{%_variables$105__size}, !stream.resource<constant>{%_variables$589__size}, !stream.resource<constant>{%_variables$588__size}, !stream.resource<transient>{%c634800}, !stream.resource<transient>{%c37632}) -> !stream.resource<transient>{%c3211264}
%14 = stream.async.splat %cst : f32 -> !stream.resource<transient>{%c3268864}
%15 = stream.async.dispatch @predict_dispatch_8::@predict_dispatch_8[%c64, %c112, %c112](%14, %13) : (!stream.resource<transient>{%c3268864}, !stream.resource<transient>{%c3211264}) -> %14{%c3268864}
%16 = stream.async.dispatch @predict_dispatch_9::@predict_dispatch_9[%c64, %c56, %c56](%15) : (!stream.resource<transient>{%c3268864}) -> !stream.resource<transient>{%c802816}
%17 = stream.async.dispatch @predict_dispatch_10::@predict_dispatch_10[%c64, %c56, %c56](%16, %_variables$130) : (!stream.resource<transient>{%c802816}, !stream.resource<constant>{%c256}) -> !stream.resource<transient>{%c802816}
%18 = stream.async.dispatch @predict_dispatch_11::@predict_dispatch_11[%c64, %c1, %c1](%_variables$130) : (!stream.resource<constant>{%c256}) -> !stream.resource<transient>{%c64}
%19 = stream.async.dispatch @predict_dispatch_12::@predict_dispatch_12[%c1, %c1, %c1](%18) : (!stream.resource<transient>{%c64}) -> !stream.resource<transient>{%c4}
%20 = stream.async.transfer %19 : !stream.resource<transient>{%c4} -> !stream.resource<staging>{%c4}
%21 = stream.async.load %20[%c0] : !stream.resource<staging>{%c4} -> i32
%22 = arith.cmpi eq, %21, %c0_i32 : i32
%23 = select %22, %16, %17 : !stream.resource<transient>
%24 = stream.async.dispatch @predict_dispatch_13::@predict_dispatch_13[%c256, %c64, %c1](%_variables$443) : (!stream.resource<constant>{%c65536}) -> !stream.resource<transient>{%c65536}
%25 = stream.async.dispatch @predict_dispatch_14::@predict_dispatch_14[%c256, %c1, %c1](%24) : (!stream.resource<transient>{%c65536}) -> !stream.resource<transient>{%c1024}
%26 = stream.async.dispatch @predict_dispatch_15::@predict_dispatch_15[%c256, %c64, %c1](%_variables$443, %25) : (!stream.resource<constant>{%c65536}, !stream.resource<transient>{%c1024}) -> !stream.resource<transient>{%c65536}
%27 = stream.async.dispatch @predict_dispatch_16::@predict_dispatch_16[%c64, %c56, %c56](%16, %_variables$112) : (!stream.resource<transient>{%c802816}, !stream.resource<constant>{%c256}) -> !stream.resource<transient>{%c802816}
%28 = stream.async.dispatch @predict_dispatch_11::@predict_dispatch_11[%c64, %c1, %c1](%_variables$112) : (!stream.resource<constant>{%c256}) -> !stream.resource<transient>{%c64}
%29 = stream.async.dispatch @predict_dispatch_12::@predict_dispatch_12[%c1, %c1, %c1](%28) : (!stream.resource<transient>{%c64}) -> !stream.resource<transient>{%c4}
%30 = stream.async.transfer %29 : !stream.resource<transient>{%c4} -> !stream.resource<staging>{%c4}
%31 = stream.async.load %30[%c0] : !stream.resource<staging>{%c4} -> i32
%32 = arith.cmpi eq, %31, %c0_i32 : i32
%33 = select %32, %16, %27 : !stream.resource<transient>
%34 = stream.async.dispatch @predict_dispatch_19::@predict_dispatch_19[%c64, %c64, %c1](%_variables$438) : (!stream.resource<constant>{%c16384}) -> !stream.resource<transient>{%c16384}
%35 = stream.async.dispatch @predict_dispatch_20::@predict_dispatch_20[%c64, %c1, %c1](%34) : (!stream.resource<transient>{%c16384}) -> !stream.resource<transient>{%c256}
%36 = stream.async.dispatch @predict_dispatch_21::@predict_dispatch_21[%c64, %c64, %c1](%_variables$438, %35) : (!stream.resource<constant>{%c16384}, !stream.resource<transient>{%c256}) -> !stream.resource<transient>{%c16384}
%37 = stream.async.dispatch @predict_dispatch_22::@predict_dispatch_22[%c64, %c3136, %c1](%_variables$0, %_variables$1, %_variables$433, %_variables$432, %33, %36) : (!stream.resource<constant>{%_variables$0__size}, !stream.resource<constant>{%_variables$1__size}, !stream.resource<constant>{%_variables$433__size}, !stream.resource<constant>{%_variables$432__size}, !stream.resource<transient>{%c802816}, !stream.resource<transient>{%c16384}) -> !stream.resource<transient>{%c802816}
%38 = stream.async.dispatch @predict_dispatch_23::@predict_dispatch_23[%c64, %c3136, %c1](%37, %_variables$118) : (!stream.resource<transient>{%c802816}, !stream.resource<constant>{%c256}) -> !stream.resource<transient>{%c802816}
%39 = stream.async.dispatch @predict_dispatch_11::@predict_dispatch_11[%c64, %c1, %c1](%_variables$118) : (!stream.resource<constant>{%c256}) -> !stream.resource<transient>{%c64}
%40 = stream.async.dispatch @predict_dispatch_12::@predict_dispatch_12[%c1, %c1, %c1](%39) : (!stream.resource<transient>{%c64}) -> !stream.resource<transient>{%c4}
%41 = stream.async.transfer %40 : !stream.resource<transient>{%c4} -> !stream.resource<staging>{%c4}
%42 = stream.async.load %41[%c0] : !stream.resource<staging>{%c4} -> i32
%43 = arith.cmpi eq, %42, %c0_i32 : i32
%44 = select %43, %37, %38 : !stream.resource<transient>
%45 = stream.async.dispatch @predict_dispatch_26::@predict_dispatch_26[%c64, %c64, %c3](%_variables$439) : (!stream.resource<constant>{%_variables$439__size}) -> !stream.resource<transient>{%c147456}
%46 = stream.async.dispatch @predict_dispatch_27::@predict_dispatch_27[%c64, %c1, %c1](%45) : (!stream.resource<transient>{%c147456}) -> !stream.resource<transient>{%c256}
%47 = stream.async.dispatch @predict_dispatch_28::@predict_dispatch_28[%c64, %c64, %c3](%_variables$439, %46) : (!stream.resource<constant>{%_variables$439__size}, !stream.resource<transient>{%c256}) -> !stream.resource<transient>{%c147456}
%48 = stream.async.splat %cst_0 : f32 -> !stream.resource<transient>{%c861184}
%49 = stream.async.dispatch @predict_dispatch_29::@predict_dispatch_29[%c64, %c56, %c56](%48, %44) : (!stream.resource<transient>{%c861184}, !stream.resource<transient>{%c802816}) -> %48{%c861184}
%50 = stream.async.dispatch @predict_dispatch_30::@predict_dispatch_30[%c64, %c56, %c56](%_variables$2, %_variables$3, %_variables$435, %_variables$434, %49, %47) : (!stream.resource<constant>{%_variables$2__size}, !stream.resource<constant>{%_variables$3__size}, !stream.resource<constant>{%_variables$435__size}, !stream.resource<constant>{%_variables$434__size}, !stream.resource<transient>{%c861184}, !stream.resource<transient>{%c147456}) -> !stream.resource<transient>{%c802816}
%51 = stream.async.dispatch @predict_dispatch_10::@predict_dispatch_10[%c64, %c56, %c56](%50, %_variables$124) : (!stream.resource<transient>{%c802816}, !stream.resource<constant>{%c256}) -> !stream.resource<transient>{%c802816}
%52 = stream.async.dispatch @predict_dispatch_11::@predict_dispatch_11[%c64, %c1, %c1](%_variables$124) : (!stream.resource<constant>{%c256}) -> !stream.resource<transient>{%c64}
%53 = stream.async.dispatch @predict_dispatch_12::@predict_dispatch_12[%c1, %c1, %c1](%52) : (!stream.resource<transient>{%c64}) -> !stream.resource<transient>{%c4}
%54 = stream.async.transfer %53 : !stream.resource<transient>{%c4} -> !stream.resource<staging>{%c4}
%55 = stream.async.load %54[%c0] : !stream.resource<staging>{%c4} -> i32
%56 = arith.cmpi eq, %55, %c0_i32 : i32
%57 = select %56, %50, %51 : !stream.resource<transient>
%58 = stream.async.dispatch @predict_dispatch_13::@predict_dispatch_13[%c256, %c64, %c1](%_variables$440) : (!stream.resource<constant>{%c65536}) -> !stream.resource<transient>{%c65536}
%59 = stream.async.dispatch @predict_dispatch_14::@predict_dispatch_14[%c256, %c1, %c1](%58) : (!stream.resource<transient>{%c65536}) -> !stream.resource<transient>{%c1024}
%60 = stream.async.dispatch @predict_dispatch_15::@predict_dispatch_15[%c256, %c64, %c1](%_variables$440, %59) : (!stream.resource<constant>{%c65536}, !stream.resource<transient>{%c1024}) -> !stream.resource<transient>{%c65536}
%61 = stream.async.dispatch @predict_dispatch_37::@predict_dispatch_37[%c256, %c3136, %c1](%57, %60) : (!stream.resource<transient>{%c802816}, !stream.resource<transient>{%c65536}) -> !stream.resource<transient>{%c3211264}
%62 = stream.async.dispatch @predict_dispatch_38::@predict_dispatch_38[%c256, %c3136, %c1](%_variables$6, %_variables$7, %_variables$442, %_variables$441, %61, %_variables$4, %_variables$5, %_variables$437, %_variables$436, %23, %26) : (!stream.resource<constant>{%_variables$6__size}, !stream.resource<constant>{%_variables$7__size}, !stream.resource<constant>{%_variables$442__size}, !stream.resource<constant>{%_variables$441__size}, !stream.resource<transient>{%c3211264}, !stream.resource<constant>{%_variables$4__size}, !stream.resource<constant>{%_variables$5__size}, !stream.resource<constant>{%_variables$437__size}, !stream.resource<constant>{%_variables$436__size}, !stream.resource<transient>{%c802816}, !stream.resource<transient>{%c65536}) -> !stream.resource<transient>{%c3211264}
%63 = stream.async.dispatch @predict_dispatch_39::@predict_dispatch_39[%c256, %c3136, %c1](%62, %_variables$136) : (!stream.resource<transient>{%c3211264}, !stream.resource<constant>{%c1024}) -> !stream.resource<transient>{%c3211264}
%64 = stream.async.dispatch @predict_dispatch_40::@predict_dispatch_40[%c256, %c1, %c1](%_variables$136) : (!stream.resource<constant>{%c1024}) -> !stream.resource<transient>{%c256}
%65 = stream.async.dispatch @predict_dispatch_41::@predict_dispatch_41[%c1, %c1, %c1](%64) : (!stream.resource<transient>{%c256}) -> !stream.resource<transient>{%c4}
%66 = stream.async.transfer %65 : !stream.resource<transient>{%c4} -> !stream.resource<staging>{%c4}
%67 = stream.async.load %66[%c0] : !stream.resource<staging>{%c4} -> i32
%68 = arith.cmpi eq, %67, %c0_i32 : i32
%69 = select %68, %62, %63 : !stream.resource<transient>
return %69 : !stream.resource<transient>
}
}
A minimal reproduction of the issue:
stream.executable private @ex {
stream.executable.export public @dispatch0
stream.executable.export public @dispatch1
stream.executable.export public @dispatch2
builtin.module {
func @dispatch0(%arg0: !stream.binding, %arg1: !stream.binding) {
return
}
func @dispatch1(%arg0: !stream.binding, %arg1: !stream.binding) {
return
}
func @dispatch2(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding) {
return
}
}
}
func @main(%arg0: i1) -> !stream.resource<transient> {
%c1 = arith.constant 1 : index
%c128 = arith.constant 128 : index
%cst = arith.constant 0xFF800000 : f32
%0 = stream.async.splat %cst : f32 -> !stream.resource<transient>{%c128}
%1 = stream.async.dispatch @ex::@dispatch0[%c1, %c1, %c1](%0) : (!stream.resource<transient>{%c128}) -> !stream.resource<transient>{%c128}
%2 = stream.async.dispatch @ex::@dispatch1[%c1, %c1, %c1](%1) : (!stream.resource<transient>{%c128}) -> !stream.resource<transient>{%c128}
%3 = select %arg0, %1, %2 : !stream.resource<transient>
%4 = stream.async.dispatch @ex::@dispatch2[%c1, %c1, %c1](%1, %3) : (!stream.resource<transient>{%c128}, !stream.resource<transient>{%c128}) -> !stream.resource<transient>{%c128}
return %4 : !stream.resource<transient>
}
Input program: https://gist.github.com/stellaraccident/dc82380c55e68fbd726ce22b430b236d
Error:
Note that this model isn't great — in a real inference situation, a number of the things inducing branchiness here would be constants and would fold away. However, it is important that this works at this level of generality.