On an example generator taken directly from `test/performance/nested_vectorization_gemm.cpp`, `flatten_nested_ramps()` takes approximately 5 minutes on my machine:
#include "Halide.h"
namespace {
// Halide generator that reproduces the nested-vectorization GEMM schedule.
//
// Algorithm: prod(x, y) += cast<int32_t>(f(x, r)) * g(r, y) for r in [0, 128),
// i.e. a product of two 8-bit matrices accumulated into 32-bit integers.
// Note that the reduction extent (128) is hard-coded and independent of `size`.
//
// Generator params:
//   size - extent that prod is bounded to in both x and y (default 1024).
// Inputs:
//   f, g - 2-D uint8 buffers.
// Output:
//   prod - 2-D int32 buffer.
class HalideDP : public Halide::Generator<HalideDP> {
public:
GeneratorParam<int> size{"size", 1024};
Input<Buffer<uint8_t>> f{"f", 2};
Input<Buffer<uint8_t>> g{"g", 2};
Output<Buffer<int32_t>> prod{"prod", 2};
// Defines both the algorithm and the schedule. The order of the scheduling
// calls below matters; do not reorder them.
void generate() {
Halide::Var x("x"), y("y");
// matrix multiplication
// Reduction over r = 0..127.
Halide::RDom r(0, 128);
prod(x, y) += cast<int32_t>(f(x, r)) * g(r, y);
// xi/yi are the inner tile variables; ro/ri are the outer/inner halves of
// the split reduction variable.
Halide::Var xi("xi"), yi("yi");
Halide::RVar ro("ro"), ri("ri");
// x86 schedule. Exploits the ability of pmaddwd
// to pull one arg from memory. Because we'll be
// intentionally spilling, the tile will be
// absurdly large for a gemm.
// All tile and split sizes below are derived from this value.
const int vec = natural_vector_size<uint8_t>();
// Constrain the output to a size x size region starting at the origin.
prod.bound(x, 0, size)
.bound(y, 0, size);
// Wrapper around prod: tiled vec x (vec / 2), vectorized across xi and
// fully unrolled across yi.
prod.in()
.tile(x, y, xi, yi, vec, vec / 2)
.vectorize(xi)
.unroll(yi);
// Wrappers around the inputs, computed inside prod's ro loop (f) and y loop
// (g); each is vectorized over its first dimension and unrolled over its second.
f.in().compute_at(prod, ro).vectorize(Halide::_0).unroll(Halide::_1);
g.in().compute_at(prod, y).vectorize(Halide::_0).unroll(Halide::_1);
// prod itself is computed per tile of the wrapper (at the wrapper's x loop).
// The calls before .update() schedule the pure definition; the calls after
// it schedule the += update stage.
prod.compute_at(prod.in(), x)
.vectorize(x)
.unroll(y)
.update()
// Split the reduction into ro (outer) and ri (inner, extent vec / 2).
.split(r, ro, ri, vec / 2)
// Loop order, listed innermost first: ri, x, y, ro.
.reorder(ri, x, y, ro)
.vectorize(x)
.unroll(y)
// atomic() permits vectorizing across the reduction variable ri.
.atomic()
// Vectorize ri by a factor of 2 inside the already-vectorized x loop
// (the nested vectorization), then unroll what remains of ri.
.vectorize(ri, 2)
.unroll(ri);
}
};
} // namespace
// Registers the HalideDP generator under the name "halide_dp".
HALIDE_REGISTER_GENERATOR(HalideDP, halide_dp)
Without passing `-t 0` to the generator, it times out at some point while generating LLVM bitcode for the function. The stmt code is quite large; I suspect something is just inefficient in `flatten_nested_ramps`.
On an example generator taken directly from `test/performance/nested_vectorization_gemm.cpp`, `flatten_nested_ramps()` takes approximately 5 minutes on my machine:
Build:
Without passing `-t 0` to the generator, it times out at some point while generating LLVM bitcode for the function. The stmt code is quite large; I suspect something is just inefficient in `flatten_nested_ramps`.