Open steven-johnson opened 3 years ago
The reason t209
isn't hoisted is because loop invariants are currently hoisted top-down instead of bottom-up, which is a relatively easy fix (that hopefully shouldn't break anything). The reason t153
isn't hoisted is a bit trickier (or just requires a more involved fix) - currently, if a LetStmt or Let is hoisted, the variables that depend on it still view that variable as varying with the current For loop.
A prepass doesn't work that well, because LICM mucks things up by introducing a bunch of LetStmts in the wrong place. I have a possible fix that generates the following IR:
produce output {
let t197 = (float32)sin_f32((float32)rot_radians)
let t194 = (float32)cos_f32((float32)rot_radians)
let t201 = (float32)roi_y_max - (float32)roi_y_min
let t202 = (float32)roi_x_min - (float32)rot_center_x
let t200 = (float32)roi_x_max - (float32)roi_x_min
let t204 = (input.min.1*input.stride.1) + input.min.0
let t206 = input.extent.1 + input.min.1
let t203 = input.extent.0 + input.min.0
let t198 = float32((0 - input.extent.0))
let t195 = float32(input.extent.0)
let t196 = float32(dst_width)
let t199 = float32(dst_height)
let t147 = (t194*t195)/t196
let t148 = int32((float32)round_f32(t147*65536.000000f))
let t150 = (t197*t198)/t199
let t152 = (t197*t200)/t196
let t153 = int32((float32)round_f32(t152*65536.000000f))
let t154 = (t194*t201)/t199
for (output.s0.y, 0, 128) {
let t209 = output.s0.y*output.stride.1
let t208 = (output.s0.y*int32((float32)round_f32(t154*65536.000000f))) + int32((float32)round_f32(((((t152 + t154)*0.500000f) + (((t197*t202) + (((float32)roi_y_min - (float32)rot_center_y)*t194)) + (float32)rot_center_y))*65536.000000f) + -32768.000000f))
let t207 = (output.s0.y*int32((float32)round_f32(t150*65536.000000f))) + int32((float32)round_f32(((((t147 + t150)*0.500000f) + ((((float32)rot_center_y*t197) + ((t194*t202) - ((float32)roi_y_min*t197))) + (float32)rot_center_x))*65536.000000f) + -32768.000000f))
for (output.s0.x.x, 0, 16) {
output[ramp((output.s0.x.x*8) + t209, 1, 8)] = input[(max(min((int32x8)shift_right(ramp(((output.s0.x.x*t153)*8) + t208, t153, 8), x8((uint32)16)), x8(t206 + -1)), x8(input.min.1))*x8(input.stride.1)) + (max(min((int32x8)shift_right(ramp(((output.s0.x.x*t148)*8) + t207, t148, 8), x8((uint32)16)), x8(t203 + -1)), x8(input.min.0)) - x8(t204))]
}
}
}
}
But I am hesitant to introduce it as a PR because it's an ugly fix - it requires running the LetStmt lifter both before and after LICM runs, when I think LICM should be fixed. Would appreciate feedback on that
Oh hmm, that actually seems to have removed some of the invariants that LICM should have gathered somehow...
Imho, the following code should automatically hoist the "invalid" Func to the y loop. I found a case in a much more complex pipeline where LLVM fails to hoist a similar expression. Plus the statements become very hard to read, especially when unrolling is used. Of course with unrolling, common subexpression elimination is probably also needed. Manual hoisting with compute_at has some downsides too, for example it will generate unreasonable overhead (function call) when using the -profile target (imagine hoisting out of a dimension with only an extent of 4).
#include "Halide.h"
using namespace Halide;
using namespace Halide::ConciseCasts;
class HoistTestGenerator : public Halide::Generator<HoistTestGenerator> {
public:
Input<int32_t> numRows{"numRows"}, numCols{"numCols"};
Input<Buffer<uint8_t, 1>> flags{"flags"};
Input<Buffer<uint16_t, 1>> input1{"input1"};
Input<Buffer<uint16_t, 2>> input2{"input2"};
Output<Buffer<uint16_t, 2>> output{"output"};
GeneratorParam<bool> manualHoist{"manualHoist", false};
Func invalid{"invalid"};
Var x{"x"}, y{"y"};
void generate() {
invalid(y) = flags(y) == 64;
output(x, y) = select(invalid(y), 0, input1(x) * input2(x, y));
}
void schedule() {
Expr nR = numRows;
Expr nC = numCols;
flags .dim(0).set_bounds(0, nR).set_stride(1);
input1.dim(0).set_bounds(0, nC).set_stride(1);
input2.dim(0).set_bounds(0, nC).set_stride(1);
input2.dim(1).set_bounds(0, nR).set_stride(nC);
output.dim(0).set_bounds(0, nC).set_stride(1);
output.dim(1).set_bounds(0, nR).set_stride(nC);
if (manualHoist) {
invalid
.compute_at(output, y)
.store_in(MemoryType::Register)
;
}
}
};
HALIDE_REGISTER_GENERATOR(HoistTestGenerator, hoist_test_generator)
manualHoist=false
produce output {
for (output.s0.y, 0, numRows) {
for (output.s0.x, 0, numCols) {
let t5 = (numCols*output.s0.y) + output.s0.x
output[t5] = select(flags[output.s0.y] == (uint8)64, (uint16)0, input1[output.s0.x]*input2[t5])
}
}
}
manualHoist=true
produce output {
for (output.s0.y, 0, numRows) {
allocate invalid[uint8 * 1] in Register
produce invalid {
invalid[0] = uint8((flags[output.s0.y] == (uint8)64))
}
consume invalid {
let t7 = numCols*output.s0.y
for (output.s0.x, 0, numCols) {
let t6 = output.s0.x + t7
output[t6] = select(uint1(invalid[0]), (uint16)0, input1[output.s0.x]*input2[t6])
}
}
free invalid
}
}
Consider the fragment of a rotate-and-crop pipeline seen here:
If you execute this with HL_DEBUG_CODEGEN=1, you'll get something like:
If you examine this closely, there are things that could be hoisted out of both loops, but are not. e.g.,
t209
is loop-invariant but is recalculated inside they
loop.t153
is also loop-invariant but is recalculated inside the inner loop.The cause of this is not clear to me from brief experimentation -- it appears that the logic in
LiftLoopInvariants::visit_lets
is deliberately conservative -- but there's clearly room for improvement here. (@abadams suggests that perhaps letstmts that don't depend on the loop variable should be lifted in a prepass, which might be an easy experiment for someone to try)