Closed mcourteaux closed 1 week ago
Funnily enough, in that context the simplifier is smart enough to simplify t78 > 0 to true, because it wouldn't be in the loop if the extent were less than 1, but it currently only simplifies loop_var < loop_max if loop_max is a constant. We could address this with another scoped_truth here for the non-constant case: https://github.com/halide/Halide/blob/main/src/Simplify_Stmts.cpp#L243
Nice, I gave it a go, and it works! Will PR it! :smile:
Went from this:
to:
Okay, so this bit me again. Look at the following:
Now instead of if (loop_var < extent)
it's if (loop_var + 1 <= extent)
. How would you make sure that all variants of this same thing are actually being optimized, @abadams ? The scoped_truth we added doesn't seem to work here because of the +1 in combination with =.
Well, the simplifier could normalize x + 1 <= y to x < y for ints.
But where are these ifs coming from? It might be better to not inject them in the first place.
This time, something like:
output.specialize(output.dim(0).extent() >= 32 && output.dim(1).extent() >= 16)
.gpu_tile(x, y, xo, yo, xi, yi, 32, 16, TailStrategy::ShiftInwards)
.vectorize(xi, 4) // deleting this get's rid of one if like this, but introduces another
.gpu_blocks(c) // block over color-channels
;
I wanted to make a specialized version of this pipelines for image sizes that will fit at least one tile.
I tried this but it didn't repro:
#include "Halide.h"
using namespace Halide;
int main(int argc, char **argv) {
Func f;
Var x, y, c, xo, yo, xi, yi;
f(x, y, c) = 0.f;
f.specialize(f.output_buffer().dim(0).extent() >= 32 && f.output_buffer().dim(1).extent() >= 16)
.gpu_tile(x, y, xo, yo, xi, yi, 32, 16, TailStrategy::ShiftInwards)
.vectorize(xi, 4) // irrelevant here
.gpu_blocks(c) // block over color-channels
;
f.compile_jit(Target{"host-cuda"});
return 0;
}
Is there some simple repro you can share?
I'll construct a repro later (11pm here). How do you check the conceptual stmt file from this? With HL_DEBUG_CODEGEN?
I have a repro:
#include "Halide.h"
#include <iostream>
#include <stdio.h>
using namespace Halide;
int main(int argc, char **argv) {
Var x("x"), y("y");
Func f("f");
f(x, y) = x * y;
Var xo{"xo"}, yo{"yo"}, xi{"xi"}, yi{"yi"};
f
.compute_root()
.specialize(f.output_buffer().dim(0).extent() >= 32)
.never_partition_all()
.gpu_tile(x, y, xo, yo, xi, yi, 32, 16, TailStrategy::ShiftInwards)
.vectorize(xi, 4, TailStrategy::ShiftInwards)
;
// We should add this:
//f.compile_to_conceptual_stmt("random_ifs.stmt", {}, StmtOutputFormat::Text, Halide::Target{"host-cuda"});
// But we don't have it yet, so, here is a hack:
Target t{"host-cuda"};
Halide::Module m = Halide::Pipeline(f).compile_to_module({}, "", t);
m.compile({
{ Halide::OutputFileType::conceptual_stmt, "random_ifs.stmt" }
});
printf("Success!\n");
return 0;
}
produces random_ifs.stmt
:
assert(reinterpret<uint64>((struct halide_buffer_t *)f.buffer) != (uint64)0, halide_error_buffer_argument_is_null("f"))
let f = (void *)_halide_buffer_get_host((struct halide_buffer_t *)f.buffer)
let f.type = (uint32)_halide_buffer_get_type((struct halide_buffer_t *)f.buffer)
let f.dimensions = _halide_buffer_get_dimensions((struct halide_buffer_t *)f.buffer)
let f.min.0 = _halide_buffer_get_min((struct halide_buffer_t *)f.buffer, 0)
let f.extent.0 = _halide_buffer_get_extent((struct halide_buffer_t *)f.buffer, 0)
let f.stride.0 = _halide_buffer_get_stride((struct halide_buffer_t *)f.buffer, 0)
let f.min.1 = _halide_buffer_get_min((struct halide_buffer_t *)f.buffer, 1)
let f.extent.1 = _halide_buffer_get_extent((struct halide_buffer_t *)f.buffer, 1)
let f.stride.1 = _halide_buffer_get_stride((struct halide_buffer_t *)f.buffer, 1)
let f.extent.1.required = f.extent.1 - select(f.extent.0 < 32, 0, min(f.extent.1, 16) + -16)
let f.min.1.required.s = select(f.extent.0 < 32, 0, min(f.extent.1, 16) + -16)
if ((uint1)_halide_buffer_is_bounds_query((struct halide_buffer_t *)f.buffer)) {
(struct halide_buffer_t *)_halide_buffer_init((struct halide_buffer_t *)f.buffer, (struct halide_dimension_t *)_halide_buffer_get_shape((struct halide_buffer_t *)f.buffer), reinterpret<(void *)>((uint64)0), (uint64)0, reinterpret<(struct halide_device_interface_t *)>((uint64)0), 0, 32, 2, (struct halide_dimension_t *)make_struct(f.min.0, f.extent.0, 1, 0, f.min.1 + f.min.1.required.s, f.extent.1.required, f.extent.0, 0), (uint64)0)
}
if (!(uint1)_halide_buffer_is_bounds_query((struct halide_buffer_t *)f.buffer)) {
assert(f.type == (uint32)73728, halide_error_bad_type("Output buffer f", f.type, (uint32)73728))
assert(f.dimensions == 2, halide_error_bad_dimensions("Output buffer f", f.dimensions, 2))
assert(0 <= f.extent.0, halide_error_buffer_extents_negative("Output buffer f", 0, f.extent.0))
assert((0 <= f.min.1.required.s) && ((f.extent.1.required + f.min.1.required.s) <= f.extent.1), let t31 = (f.min.1 + f.min.1.required.s) in halide_error_access_out_of_bounds("Output buffer f", 1, t31, (t31 + f.extent.1.required) + -1, f.min.1, (f.extent.1 + f.min.1) + -1))
assert(0 <= f.extent.1, halide_error_buffer_extents_negative("Output buffer f", 1, f.extent.1))
assert(f.stride.0 == 1, halide_error_constraint_violated("f.stride.0", f.stride.0, "1", 1))
let f.total_extent.1 = int64(f.extent.1)*int64(f.extent.0)
assert((uint64)abs(int64(f.extent.0)) <= (uint64)2147483647, halide_error_buffer_allocation_too_large("f", (uint64)abs(int64(f.extent.0)), (uint64)2147483647))
assert((uint64)abs(int64(f.extent.1)*int64(f.stride.1)) <= (uint64)2147483647, halide_error_buffer_allocation_too_large("f", (uint64)abs(int64(f.extent.1)*int64(f.stride.1)), (uint64)2147483647))
assert(f.total_extent.1 <= (int64)2147483647, halide_error_buffer_extents_too_large("f", f.total_extent.1, (int64)2147483647))
assert(f != reinterpret<(void *)>((uint64)0), halide_error_host_is_null("Output buffer f"))
produce f {
if (32 <= f.extent.0) {
let halide_copy_to_device_result = halide_copy_to_device((struct halide_buffer_t *)f.buffer, (struct halide_device_interface_t const *)halide_cuda_device_interface())
assert(halide_copy_to_device_result == 0, halide_copy_to_device_result)
let t27 = (f.extent.0 + 31)/32
let t26 = (f.extent.1 + 15)/16
gpu_block<CUDA> (f.s0.y.yo.block_id_y, 0, t26) {
gpu_block<CUDA> (f.s0.x.xo.block_id_x, 0, t27) {
if (((f.s0.y.yo.block_id_y + 1) <= t26) && ((f.s0.x.xo.block_id_x + 1) <= t27)) {
gpu_thread<CUDA> (.thread_id_y, 0, 16) {
gpu_thread<CUDA> (.thread_id_x, 0, 8) {
let f.s0.y.yi.base.s = min(f.s0.y.yo.block_id_y*16, f.extent.1 + -16)
let f.s0.x.xi.base.s = min(f.s0.x.xo.block_id_x*32, f.extent.0 + -32)
let t25 = (f.min.1 + f.s0.y.yi.base.s) + .thread_id_y
f[ramp((.thread_id_x*4) + ((f.stride.1*t25) + (f.s0.x.xi.base.s - (f.min.1*f.stride.1))), 1, 4)] = ramp(((.thread_id_x*4) + (f.min.0 + f.s0.x.xi.base.s))*t25, t25, 4)
}
}
}
}
}
_halide_buffer_set_device_dirty((struct halide_buffer_t *)f.buffer, (uint1)1)
} else {
let halide_copy_to_host_result = halide_copy_to_host((struct halide_buffer_t *)f.buffer)
assert(halide_copy_to_host_result == 0, halide_copy_to_host_result)
for (f.s0.y.rebased, 0, f.extent.1) {
let t30 = f.s0.y.rebased*f.stride.1
let t29 = f.min.1 + f.s0.y.rebased
for (f.s0.x.rebased, 0, f.extent.0) {
f[f.s0.x.rebased + t30] = (f.min.0 + f.s0.x.rebased)*t29
}
}
_halide_buffer_set_host_dirty((struct halide_buffer_t *)f.buffer, (uint1)1)
}
}
}
UPDATE: Removing the vectorize
directive gives one condition less in the if:
produce f {
if (32 <= f.extent.0) {
let halide_copy_to_device_result = halide_copy_to_device((struct halide_buffer_t *)f.buffer, (struct halide_device_interface_t const *)halide_cuda_device_interface())
assert(halide_copy_to_device_result == 0, halide_copy_to_device_result)
let t18 = (f.extent.0 + 31)/32
let t17 = (f.extent.1 + 15)/16
gpu_block<CUDA> (f.s0.y.yo.block_id_y, 0, t17) {
if ((f.s0.y.yo.block_id_y + 1) <= t17) {
gpu_block<CUDA> (f.s0.x.xo.block_id_x, 0, t18) {
gpu_thread<CUDA> (.thread_id_y, 0, 16) {
gpu_thread<CUDA> (.thread_id_x, 0, 32) {
let f.s0.y.yi.base = min(f.s0.y.yo.block_id_y*16, f.extent.1 + -16)
let f.s0.x.xi.base = min(f.s0.x.xo.block_id_x*32, f.extent.0 + -32)
let t16 = .thread_id_y + f.s0.y.yi.base
f[((f.stride.1*t16) + f.s0.x.xi.base) + .thread_id_x] = (.thread_id_x + f.s0.x.xi.base)*t16
}
}
}
}
}
_halide_buffer_set_device_dirty((struct halide_buffer_t *)f.buffer, (uint1)1)
} else {
let halide_copy_to_host_result = halide_copy_to_host((struct halide_buffer_t *)f.buffer)
assert(halide_copy_to_host_result == 0, halide_copy_to_host_result)
for (f.s0.y, 0, f.extent.1) {
let t19 = f.s0.y*f.stride.1
for (f.s0.x, 0, f.extent.0) {
f[f.s0.x + t19] = f.s0.x*f.s0.y
}
}
_halide_buffer_set_host_dirty((struct halide_buffer_t *)f.buffer, (uint1)1)
}
}
It's a bug. The if statements shouldn't be there. If you specialize and then have more splits on one side of the specialization than the other, lowering incorrectly thinks the Func is part of a compute_with fused group and throws in if statements for safety.
I have no clue what's going on.
I'm seeing this:![image](https://github.com/halide/Halide/assets/845012/4c9a8458-6481-4551-9bbf-15f50db99f70)
Look at the
if (t84)
and trace thet84
backwards:t84 = loop_var < t78
loop_var
goes from0
tot78
So t84 should always be false. It's the first time I'm seeing something like this.
I went back randomly to d9668c5bc to test and compare, but it produces the same result. I don't have time to make a repro right now, but wanted to document this already.
I'm not 100% sure, but I believe LLVM does notice this, and just removes the entire else-branch of this code. (It produces a non-vectorized else-branch).