cornell-zhang / heterocl

HeteroCL: A Multi-Paradigm Programming Infrastructure for Software-Defined Heterogeneous Computing
https://cornell-zhang.github.io/heterocl/
Apache License 2.0
324 stars 92 forks source link

Fail to Reorder Reduction Loops #489

Open sqPoseidon opened 1 year ago

sqPoseidon commented 1 year ago

In the packed_conv2d_nchw function, there are four reduction loops: in_channel, kernel_h, kernel_w, and bitwidth. When I try to move the output-channel loop inside the reduction loops, I get the following error message:

heterocl-mlir/hcl-dialect/llvm-project/mlir/lib/Dialect/Affine/Utils/LoopUtils.cpp:1496: 
unsigned int mlir::permuteLoops(llvm::MutableArrayRef<mlir::AffineForOp>, llvm::ArrayRef<unsigned int>): 
Assertion `false && "invalid permutation map"' failed.

Here's the example:

import heterocl as hcl
import heterocl.op.bnn as bnn
import numpy as np

def test_bconv_popcnt():
    """Reproduce the loop-reorder failure on a bit-packed binary convolution.

    Builds a HeteroCL schedule that binarizes the input and weight tensors,
    packs them along the channel axis, and runs ``bnn.packed_conv2d_nchw``.
    The schedule then reorders the loops of the ``conv_conv2d`` stage so that
    the output-channel loop sits among the reduction loops; per the
    accompanying issue report this triggers the ``invalid permutation map``
    assertion in ``mlir::permuteLoops``.

    If the build succeeds, the result is compared against a NumPy reference
    convolution computed on the +1/-1 binarized inputs.
    """
    # Problem configuration. All sizes are fixed for this reproducer.
    packing_factor=8
    out_channel = 64
    strides = (1, 1)
    padding = (1, 1)
    in_channel = 8
    # Number of channels packed into one word; capped by the channel count.
    bitwidth = min(in_channel, packing_factor)
    in_dtype = hcl.Float()
    out_dtype = hcl.Float()
    in_shape = (1, in_channel, 3, 3) # n, c, h, w
    weight_shape = (out_channel, in_channel, 3, 3) # o, i, h, w
    # With a 3x3 kernel, stride 1 and padding 1 the spatial size stays 3x3.
    out_shape = (1, out_channel, 3, 3)

    # Algorithm traced by hcl.create_schedule: binarize -> pack -> packed conv.
    def conv(data, weight):
        # Binarize both operands to 1-bit values: 1 where the element is > 0,
        # otherwise 0.
        data = hcl.compute(
            data.shape,
            lambda *args: hcl.select(data[args] > 0, 1, 0),
            name="data",
            dtype=hcl.UInt(1),
        )
        weight = hcl.compute(
            weight.shape,
            lambda *args: hcl.select(weight[args] > 0, 1, 0),
            name="weight",
            dtype=hcl.UInt(1),
        )
        # pack along channel dimension
        packed_data = hcl.pack(
            data,
            axis=1,
            factor=bitwidth,
            name="conv_packed",
            dtype=hcl.UInt(bitwidth),
        )
        # NOTE(review): this pack reuses the stage name "conv_packed" from the
        # data pack above -- confirm whether the duplicate name is intended.
        packed_weight = hcl.pack(
            weight,
            axis=1,
            factor=bitwidth,
            name="conv_packed",
            dtype=hcl.UInt(bitwidth),
        )
        # The stage name "conv_conv2d" is looked up on `conv` below to obtain
        # the stage handle used for scheduling.
        return bnn.packed_conv2d_nchw(
            packed_data,
            packed_weight,
            strides=strides,
            padding=padding,
            name="conv_conv2d",
            out_dtype=out_dtype,
        )

    data = hcl.placeholder(in_shape, "data", dtype=in_dtype)
    weight = hcl.placeholder(weight_shape, "weight", dtype=in_dtype)
    s = hcl.create_schedule([data, weight], conv)

    # Fetch the convolution stage by the name given to packed_conv2d_nchw.
    B = getattr(conv, "conv_conv2d")
    print("B.axis: ", B.axis) # nn, ff, yy, xx, conv_conv2d_rc, conv_conv2d_rx, conv_conv2d_ry
    # s[B].reorder(B.axis[0], B.axis[2], B.axis[1])
    # Move the output-channel axis (B.axis[1]) in among the reduction axes.
    # Going by the axis listing above, the requested order is
    # nn, yy, xx, conv_conv2d_rc, ff, conv_conv2d_rx, conv_conv2d_ry.
    # NOTE(review): the issue text mentions four reduction loops while only
    # three reduction axes are listed above -- confirm against B.axis output.
    s[B].reorder(B.axis[0], B.axis[2], B.axis[3], B.axis[4], B.axis[1], B.axis[5], B.axis[6])

    f = hcl.build(s)
    print(f.host_src)

    # Random integer inputs in [0, 10); zeros binarize to the "negative" value.
    a_np = np.random.randint(0, 10, in_shape)
    b_np = np.random.randint(0, 10, weight_shape)

    hcl_a = hcl.asarray(a_np, dtype=in_dtype)
    hcl_b = hcl.asarray(b_np, dtype=in_dtype)
    # Output buffer, filled in by the built function.
    hcl_c = hcl.asarray(np.zeros(out_shape), dtype=hcl.Float())

    f(hcl_a, hcl_b, hcl_c)

    # NumPy reference: direct convolution over +1/-1 binarized operands.
    n, c, h, w = in_shape
    # `i` unpacked here is overwritten by the outermost loop index below.
    o, i, kh, kw = weight_shape
    # binarize a_np, b_np
    a_np = np.where(a_np > 0, 1, -1)
    b_np = np.where(b_np > 0, 1, -1)
    # pad a_np
    # Pads one zero on each side of the two spatial axes only; the widths are
    # hard-coded here rather than taken from `padding`.
    a_np = np.pad(a_np, ((0, 0), (0, 0), (1, 1), (1, 1)), 'constant')
    # calculate convolution
    baseline_output = np.zeros((n, o, h, w))
    # Loop order: batch, out channel, out row, out col, in channel, kernel
    # row, kernel col. The k + p / l + q indexing assumes unit stride.
    for i in range(n):
        for j in range(o):
            for k in range(h):
                for l in range(w):
                    for m in range(c):
                        for p in range(kh):
                            for q in range(kw):
                                baseline_output[i][j][k][l] += a_np[i][m][k + p][l + q] * b_np[j][m][p][q]

    # The HeteroCL result must match the reference within np.allclose defaults.
    assert np.allclose(hcl_c.asnumpy(), baseline_output)

# Run the reproducer as soon as the script is executed.
test_bconv_popcnt()
zzzDavid commented 1 year ago

This seems like a limitation of mlir::permuteLoops. I will look into this and provide more detail.

chhzh123 commented 1 year ago

It's actually our limitation. Currently we place the reduction variable outside all the reduction loops, which makes the inner loop nest imperfect, so we cannot directly permute those reduction loops with the spatial loops.