The dumped IR before the loop transformations are applied:
module {
  func.func @top(%arg0: memref<10x10xi32>) -> memref<8x8xi32> attributes {itypes = "s", otypes = "s"} {
    %0 = hcl.create_op_handle "compute_1"
    %1 = hcl.create_loop_handle %0, "y"
    %2 = hcl.create_loop_handle %0, "x"
    %3 = hcl.create_loop_handle %0, "rx_1"
    %4 = hcl.create_loop_handle %0, "rx_0"
    %5 = memref.alloc() {name = "compute_1"} : memref<8x8xi32>
    affine.for %arg1 = 0 to 8 {
      affine.for %arg2 = 0 to 8 {
        %9 = memref.alloc() {name = "sum_rv"} : memref<1xi32>
        %c0 = arith.constant 0 : index
        %c0_i32 = arith.constant 0 : i32
        affine.store %c0_i32, %9[%c0] {to = "sum_rv"} : memref<1xi32>
        affine.for %arg3 = 0 to 3 {
          affine.for %arg4 = 0 to 3 {
            %11 = affine.load %arg0[%arg1 + %arg3, %arg2 + %arg4] {from = "compute_0"} : memref<10x10xi32>
            %12 = affine.load %9[%c0] {from = "sum_rv"} : memref<1xi32>
            %13 = arith.addi %11, %12 : i32
            affine.store %13, %9[%c0] {to = "sum_rv"} : memref<1xi32>
          } {loop_name = "rx_1", reduction}
        } {loop_name = "rx_0", reduction}
        %c0_0 = arith.constant 0 : index
        %10 = affine.load %9[%c0_0] {from = "sum_rv"} : memref<1xi32>
        affine.store %10, %5[%arg1, %arg2] {to = "compute_1"} : memref<8x8xi32>
      } {loop_name = "x"}
    } {loop_name = "y", op_name = "compute_1"}
    %outer, %inner = hcl.split(%2, 4)
    hcl.reorder(%outer, %1, %inner)
    %6 = hcl.reuse_at(%arg0 : memref<10x10xi32>, %1) -> memref<1xf32>
    %7 = hcl.reuse_at(%6 : memref<1xf32>, %inner) -> memref<1xf32>
    hcl.partition(%6 : memref<1xf32>, CompletePartition, 2, 0)
    hcl.partition(%7 : memref<1xf32>, CompletePartition, 0, 0)
    %8 = hcl.reshape %5 : memref<8x8xi32> to memref<8x2x4xi32>
    hcl.pipeline(%1, 1)
    return %5 : memref<8x8xi32>
  }
}
The error happens during the first reuse_at's buffer shift. Note that the outer loop has gone through two transformations:
%outer, %inner = hcl.split(%2, 4)
hcl.reorder(%outer, %1, %inner)
The x loop is first split into 2x4, and then the new outer loop of extent 2 is reordered to the outermost level. These transformations change the number of non-reduction loops from 2 to 3. The affine load operation reading from the target memref does not take the loop transformations into account, so it builds three indices:
https://github.com/cornell-zhang/hcl-dialect-prototype/blob/5c10ee6b80ed85e853c2e984e079634bd8a9f711/lib/Transforms/LoopTransformations.cpp#L2024-L2031
Hence the assertion failure map.getNumInputs() == mapOperands.size() && "inconsistent index info", because we are using three indices to index a rank-2 memref.
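To see the mismatch concretely, here is a small standalone sketch (my illustration, not code from the repo) of the iteration space after the two transformations: three non-reduction induction variables walk the space, but the rank-2 memref still needs exactly two index expressions, with the split variables recombined as x = 4 * xo + xi.
# Iteration order after split(x, 4) + reorder(xo, y, xi)
for xo in range(2):
    for y in range(8):
        for xi in range(4):
            x = 4 * xo + xi  # the split IVs must recombine into one index
            # correct load:  A[y + r, x + c]  -- two indices for a rank-2 memref
            # broken builder: one index per surrounding loop (xo, y, xi),
            # i.e. three operands, hence "inconsistent index info"
            assert 0 <= x < 8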
This test/issue is about applying reuse_at after the outer non-reduction loops are transformed. In the previous commits, I implemented mechanisms to identify the target memref indices from the transformed loops' induction variables, and corrected a few steps to make sure that the loop bounds are updated correctly, that the load indices from the target memref are correct, and that the store indices to the output memref are correct.
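For reference, here is a minimal HeteroCL sketch of this scenario, reconstructed from the dumped IR above (the partition, reshape, and pipeline steps are omitted, and the exact schedule calls are my best guess):
import heterocl as hcl

hcl.init()
A = hcl.placeholder((10, 10))
r = hcl.reduce_axis(0, 3)
c = hcl.reduce_axis(0, 3)
B = hcl.compute((8, 8), lambda y, x: hcl.sum(A[y + r, x + c], axis=[r, c]))
s = hcl.create_schedule([A, B])
# Transform the non-reduction loops first: split x into 2x4, then move
# the new outer loop to the outermost level...
xo, xi = s[B].split(B.axis[1], 4)
s[B].reorder(xo, B.axis[0], xi)
# ...then apply reuse_at on the already-transformed nest.
LB = s.reuse_at(A, s[B], B.axis[0])
WB = s.reuse_at(LB, s[B], xi)
f = hcl.build(s)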
@zzzDavid Can you try this avgpool example? It raises the same error, but it points to the code you added in these recent commits.
import heterocl as hcl
import numpy as np

bs = 4
ic, oc = 16, 16
ih, iw = 8, 8
kh, kw = 2, 2
stride = 2
oh, ow = (ih - kh) // stride + 1, (iw - kw) // stride + 1
dtype = hcl.Float()

def test_avgpool_nchw():
    hcl.init(dtype)
    A = hcl.placeholder((bs, ic, ih, iw))

    def avgpool(A):
        rh = hcl.reduce_axis(0, kh)
        rw = hcl.reduce_axis(0, kw)
        B = hcl.compute(
            (bs, oc, oh, ow),
            lambda n, c, h, w: hcl.sum(
                A[n, c, h * stride + rh, w * stride + rw],
                axis=[rh, rw],
                dtype=dtype,
            ) / (kh * kw),
            name="B",
            dtype=dtype,
        )
        return B

    s = hcl.create_schedule([A], avgpool)
    B = avgpool.B
    LB = s.reuse_at(A, s[B], B.axis[2])   # line buffer along h
    WB = s.reuse_at(LB, s[B], B.axis[3])  # window buffer along w
    print(hcl.lower(s))
    f = hcl.build(s)

    np_A = np.random.random((bs, ic, ih, iw))
    np_C = np.zeros((bs, oc, oh, ow), dtype="float")
    # Reference average pooling in NCHW layout
    for n in range(0, bs):
        for c in range(0, oc):
            for y in range(0, oh):
                for x in range(0, ow):
                    for rh in range(0, kh):
                        for rw in range(0, kw):
                            np_C[n][c][y][x] += (
                                np_A[n][c][y * stride + rh][x * stride + rw]
                            ) / (kh * kw)
    hcl_A = hcl.asarray(np_A, dtype=dtype)
    hcl_C = hcl.asarray(np_C, dtype=dtype)
    f(hcl_A, hcl_C)
    # print(np_C, hcl_C.asnumpy())
    assert np.allclose(np_C, hcl_C.asnumpy())
    print("Passed!")

if __name__ == "__main__":
    test_avgpool_nchw()
There was an issue with building the load op from the target reuse buffer when stride > 1. This is fixed by 835d3e0afe7dac9efa42ec84055afb39a52f24ab, and now this test case passes, along with the other HeteroCL test cases.
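For intuition, here is a standalone 1-D sketch of the strided read pattern (my illustration of a generic shift buffer, not the dialect's actual codegen): with stride s, each output step must shift s fresh inputs into the reuse buffer, so the loads have to fold the stride into their indices.
def strided_window_sums(row, k, s):
    # Slide a k-wide window with stride s (assuming s <= k), reusing a
    # small shift buffer instead of re-reading the input each time.
    n_out = (len(row) - k) // s + 1
    wb = list(row[:k])                 # window buffer holds the first k inputs
    outs = [sum(wb)]
    for x in range(1, n_out):
        for j in range(k - s, k):      # shift in s fresh inputs per output
            wb.pop(0)
            wb.append(row[x * s + j])  # stride folded into the load index
        outs.append(sum(wb))
    return outs

row = list(range(10))
assert strided_window_sums(row, 3, 2) == [sum(row[2 * x:2 * x + 3]) for x in range(4)]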
However, fracbnn gives wrong results with reuse_at. There must be cases in fracbnn that are not covered by the current test suite. I'm working to find out why it's not passing.
The fracbnn issue above was again caused by reuse_at with stride > 1: the constant loop bound update did not take the stride into account. This is fixed by 87ee51469d67eda96be95570c4752581977c15b9.
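As a quick sanity check on the bound arithmetic (mine, matching the tests in this thread): the output extent of a windowed read is floor((in_size - kernel) / stride) + 1, so the updated constant loop bounds should come out as follows.
def out_extent(in_size, kernel, stride):
    return (in_size - kernel) // stride + 1

assert out_extent(8, 2, 2) == 4    # avgpool above: ih=8, kh=2, stride=2 -> oh=4
assert out_extent(10, 3, 2) == 4   # conv2D test below: 10x10 input, 3x3 window, stride 2 -> 4x4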
I have tested with fracbnn, and the results are now correct.
I will add the following test case to the test suite:
def test_conv2D_lb_wb_stride_2():
    hcl.init()
    A = hcl.placeholder((10, 10))
    r = hcl.reduce_axis(0, 3)
    c = hcl.reduce_axis(0, 3)
    B = hcl.compute((4, 4), lambda y, x: hcl.sum(A[y * 2 + r, x * 2 + c], axis=[r, c]))
    s = hcl.create_schedule([A, B])  # include B so it is part of the schedule's outputs
    LB = s.reuse_at(A, s[B], B.axis[0])
    WB = s.reuse_at(LB, s[B], B.axis[1])
    f = hcl.build(s)
    np_A = np.random.randint(0, 10, size=(10, 10))
    np_B = np.zeros((4, 4), dtype="int")
    np_C = np.zeros((4, 4), dtype="int")
    # Reference 3x3 windowed sum with stride 2
    for y in range(0, 4):
        for x in range(0, 4):
            for r in range(0, 3):
                for c in range(0, 3):
                    np_C[y][x] += np_A[y * 2 + r][x * 2 + c]
    hcl_A = hcl.asarray(np_A)
    hcl_B = hcl.asarray(np_B)
    f(hcl_A, hcl_B)
    np_B = hcl_B.asnumpy()
    assert np.array_equal(np_B, np_C)
Added both the average pooling and the stride-2 reuse_at tests to the test suite: cornell-zhang/heterocl@e6e23bb9858fadf6c288c17bbaa15507b3b66b33
Related test:
mlir/test_schedule_memory.py::test_conv2D_lb_wb_schedule
Error message: