mcourteaux commented 1 week ago

I have no clue what's going on.

I'm seeing this:

Look at the if (t84) and trace the t84 backwards:

t84 = loop_var < t78
loop_var goes from 0 to t78

So t84 should always be false. It's the first time I'm seeing something like this.

I went back randomly to d9668c5bc to test and compare, but it produces the same result. I don't have time to make a repro right now, but wanted to document this already.

I'm not 100% sure, but I believe LLVM does notice this, and just removes the entire else-branch of this code. (It produces a non-vectorized else-branch).

abadams commented 1 week ago

Funnily enough, in that context the simplifier is smart enough to simplify t78 > 0 to true, because it wouldn't be in the loop if the extent were less than 1, but it currently only simplifies loop_var < loop_max if loop_max is a constant. We could address this with another scoped_truth here for the non-constant case: https://github.com/halide/Halide/blob/main/src/Simplify_Stmts.cpp#L243

mcourteaux commented 1 week ago

Nice, I gave it a go, and it works! Will PR it! :smile:

Went from this:

to:

mcourteaux commented 1 week ago

Okay, so this bit me again. Look at the following:

Now instead of if (loop_var < extent) it's if (loop_var + 1 <= extent). How would you make sure that all variants of this same thing are actually being optimized, @abadams ? The scoped_truth we added doesn't seem to work here because of the +1 in combination with =.

abadams commented 1 week ago

Well, the simplifier could normalize x + 1 <= y to x < y for ints.

But where are these ifs coming from? It might be better to not inject them in the first place.

mcourteaux commented 1 week ago

This time, something like:

output.specialize(output.dim(0).extent() >= 32 && output.dim(1).extent() >= 16)
  .gpu_tile(x, y, xo, yo, xi, yi, 32, 16, TailStrategy::ShiftInwards)
  .vectorize(xi, 4) // deleting this get's rid of one if like this, but introduces another
  .gpu_blocks(c) // block over color-channels
  ;

I wanted to make a specialized version of this pipelines for image sizes that will fit at least one tile.

abadams commented 1 week ago

I tried this but it didn't repro:

#include "Halide.h"

using namespace Halide;

int main(int argc, char **argv) {
    Func f;
    Var x, y, c, xo, yo, xi, yi;

    f(x, y, c) = 0.f;

    f.specialize(f.output_buffer().dim(0).extent() >= 32 && f.output_buffer().dim(1).extent() >= 16)
        .gpu_tile(x, y, xo, yo, xi, yi, 32, 16, TailStrategy::ShiftInwards)
        .vectorize(xi, 4)  // irrelevant here
        .gpu_blocks(c)     // block over color-channels
        ;

    f.compile_jit(Target{"host-cuda"});

    return 0;
}

Is there some simple repro you can share?

mcourteaux commented 1 week ago

I'll construct a repro later (11pm here). How do you check the conceptual stmt file from this? With HL_DEBUG_CODEGEN?

mcourteaux commented 1 week ago

I have a repro:

#include "Halide.h"
#include <iostream>
#include <stdio.h>

using namespace Halide;

int main(int argc, char **argv) {

    Var x("x"), y("y");
    Func f("f");

    f(x, y) = x * y;

    Var xo{"xo"}, yo{"yo"}, xi{"xi"}, yi{"yi"};
    f
      .compute_root()
      .specialize(f.output_buffer().dim(0).extent() >= 32)
      .never_partition_all()
      .gpu_tile(x, y, xo, yo, xi, yi, 32, 16, TailStrategy::ShiftInwards)
      .vectorize(xi, 4, TailStrategy::ShiftInwards)
      ;

    // We should add this:
    //f.compile_to_conceptual_stmt("random_ifs.stmt", {}, StmtOutputFormat::Text, Halide::Target{"host-cuda"});
    // But we don't have it yet, so, here is a hack:
    Target t{"host-cuda"};
    Halide::Module m = Halide::Pipeline(f).compile_to_module({}, "", t);
    m.compile({
            { Halide::OutputFileType::conceptual_stmt, "random_ifs.stmt" }
            });

    printf("Success!\n");
    return 0;
}

produces random_ifs.stmt:

assert(reinterpret<uint64>((struct halide_buffer_t *)f.buffer) != (uint64)0, halide_error_buffer_argument_is_null("f"))
let f = (void *)_halide_buffer_get_host((struct halide_buffer_t *)f.buffer)
let f.type = (uint32)_halide_buffer_get_type((struct halide_buffer_t *)f.buffer)
let f.dimensions = _halide_buffer_get_dimensions((struct halide_buffer_t *)f.buffer)
let f.min.0 = _halide_buffer_get_min((struct halide_buffer_t *)f.buffer, 0)
let f.extent.0 = _halide_buffer_get_extent((struct halide_buffer_t *)f.buffer, 0)
let f.stride.0 = _halide_buffer_get_stride((struct halide_buffer_t *)f.buffer, 0)
let f.min.1 = _halide_buffer_get_min((struct halide_buffer_t *)f.buffer, 1)
let f.extent.1 = _halide_buffer_get_extent((struct halide_buffer_t *)f.buffer, 1)
let f.stride.1 = _halide_buffer_get_stride((struct halide_buffer_t *)f.buffer, 1)
let f.extent.1.required = f.extent.1 - select(f.extent.0 < 32, 0, min(f.extent.1, 16) + -16)
let f.min.1.required.s = select(f.extent.0 < 32, 0, min(f.extent.1, 16) + -16)
if ((uint1)_halide_buffer_is_bounds_query((struct halide_buffer_t *)f.buffer)) {
 (struct halide_buffer_t *)_halide_buffer_init((struct halide_buffer_t *)f.buffer, (struct halide_dimension_t *)_halide_buffer_get_shape((struct halide_buffer_t *)f.buffer), reinterpret<(void *)>((uint64)0), (uint64)0, reinterpret<(struct halide_device_interface_t *)>((uint64)0), 0, 32, 2, (struct halide_dimension_t *)make_struct(f.min.0, f.extent.0, 1, 0, f.min.1 + f.min.1.required.s, f.extent.1.required, f.extent.0, 0), (uint64)0)
}
if (!(uint1)_halide_buffer_is_bounds_query((struct halide_buffer_t *)f.buffer)) {
 assert(f.type == (uint32)73728, halide_error_bad_type("Output buffer f", f.type, (uint32)73728))
 assert(f.dimensions == 2, halide_error_bad_dimensions("Output buffer f", f.dimensions, 2))
 assert(0 <= f.extent.0, halide_error_buffer_extents_negative("Output buffer f", 0, f.extent.0))
 assert((0 <= f.min.1.required.s) && ((f.extent.1.required + f.min.1.required.s) <= f.extent.1), let t31 = (f.min.1 + f.min.1.required.s) in halide_error_access_out_of_bounds("Output buffer f", 1, t31, (t31 + f.extent.1.required) + -1, f.min.1, (f.extent.1 + f.min.1) + -1))
 assert(0 <= f.extent.1, halide_error_buffer_extents_negative("Output buffer f", 1, f.extent.1))
 assert(f.stride.0 == 1, halide_error_constraint_violated("f.stride.0", f.stride.0, "1", 1))
 let f.total_extent.1 = int64(f.extent.1)*int64(f.extent.0)
 assert((uint64)abs(int64(f.extent.0)) <= (uint64)2147483647, halide_error_buffer_allocation_too_large("f", (uint64)abs(int64(f.extent.0)), (uint64)2147483647))
 assert((uint64)abs(int64(f.extent.1)*int64(f.stride.1)) <= (uint64)2147483647, halide_error_buffer_allocation_too_large("f", (uint64)abs(int64(f.extent.1)*int64(f.stride.1)), (uint64)2147483647))
 assert(f.total_extent.1 <= (int64)2147483647, halide_error_buffer_extents_too_large("f", f.total_extent.1, (int64)2147483647))
 assert(f != reinterpret<(void *)>((uint64)0), halide_error_host_is_null("Output buffer f"))
 produce f {
  if (32 <= f.extent.0) {
   let halide_copy_to_device_result = halide_copy_to_device((struct halide_buffer_t *)f.buffer, (struct halide_device_interface_t const *)halide_cuda_device_interface())
   assert(halide_copy_to_device_result == 0, halide_copy_to_device_result)
   let t27 = (f.extent.0 + 31)/32
   let t26 = (f.extent.1 + 15)/16
   gpu_block<CUDA> (f.s0.y.yo.block_id_y, 0, t26) {
    gpu_block<CUDA> (f.s0.x.xo.block_id_x, 0, t27) {
     if (((f.s0.y.yo.block_id_y + 1) <= t26) && ((f.s0.x.xo.block_id_x + 1) <= t27)) {
      gpu_thread<CUDA> (.thread_id_y, 0, 16) {
       gpu_thread<CUDA> (.thread_id_x, 0, 8) {
        let f.s0.y.yi.base.s = min(f.s0.y.yo.block_id_y*16, f.extent.1 + -16)
        let f.s0.x.xi.base.s = min(f.s0.x.xo.block_id_x*32, f.extent.0 + -32)
        let t25 = (f.min.1 + f.s0.y.yi.base.s) + .thread_id_y
        f[ramp((.thread_id_x*4) + ((f.stride.1*t25) + (f.s0.x.xi.base.s - (f.min.1*f.stride.1))), 1, 4)] = ramp(((.thread_id_x*4) + (f.min.0 + f.s0.x.xi.base.s))*t25, t25, 4)
       }
      }
     }
    }
   }
   _halide_buffer_set_device_dirty((struct halide_buffer_t *)f.buffer, (uint1)1)
  } else {
   let halide_copy_to_host_result = halide_copy_to_host((struct halide_buffer_t *)f.buffer)
   assert(halide_copy_to_host_result == 0, halide_copy_to_host_result)
   for (f.s0.y.rebased, 0, f.extent.1) {
    let t30 = f.s0.y.rebased*f.stride.1
    let t29 = f.min.1 + f.s0.y.rebased
    for (f.s0.x.rebased, 0, f.extent.0) {
     f[f.s0.x.rebased + t30] = (f.min.0 + f.s0.x.rebased)*t29
    }
   }
   _halide_buffer_set_host_dirty((struct halide_buffer_t *)f.buffer, (uint1)1)
  }
 }
}

UPDATE: Removing the vectorize directive gives one condition less in the if:

 produce f {
  if (32 <= f.extent.0) {
   let halide_copy_to_device_result = halide_copy_to_device((struct halide_buffer_t *)f.buffer, (struct halide_device_interface_t const *)halide_cuda_device_interface())
   assert(halide_copy_to_device_result == 0, halide_copy_to_device_result)
   let t18 = (f.extent.0 + 31)/32
   let t17 = (f.extent.1 + 15)/16
   gpu_block<CUDA> (f.s0.y.yo.block_id_y, 0, t17) {
    if ((f.s0.y.yo.block_id_y + 1) <= t17) {
     gpu_block<CUDA> (f.s0.x.xo.block_id_x, 0, t18) {
      gpu_thread<CUDA> (.thread_id_y, 0, 16) {
       gpu_thread<CUDA> (.thread_id_x, 0, 32) {
        let f.s0.y.yi.base = min(f.s0.y.yo.block_id_y*16, f.extent.1 + -16)
        let f.s0.x.xi.base = min(f.s0.x.xo.block_id_x*32, f.extent.0 + -32)
        let t16 = .thread_id_y + f.s0.y.yi.base
        f[((f.stride.1*t16) + f.s0.x.xi.base) + .thread_id_x] = (.thread_id_x + f.s0.x.xi.base)*t16
       }
      }
     }
    }
   }
   _halide_buffer_set_device_dirty((struct halide_buffer_t *)f.buffer, (uint1)1)
  } else {
   let halide_copy_to_host_result = halide_copy_to_host((struct halide_buffer_t *)f.buffer)
   assert(halide_copy_to_host_result == 0, halide_copy_to_host_result)
   for (f.s0.y, 0, f.extent.1) {
    let t19 = f.s0.y*f.stride.1
    for (f.s0.x, 0, f.extent.0) {
     f[f.s0.x + t19] = f.s0.x*f.s0.y
    }
   }
   _halide_buffer_set_host_dirty((struct halide_buffer_t *)f.buffer, (uint1)1)
  }
 }

abadams commented 1 week ago

It's a bug. The if statements shouldn't be there. If you specialize and then have more splits on one side of the specialization than the other, lowering incorrectly thinks the Func is part of a compute_with fused group and throws in if statements for safety.

abadams commented 1 week ago

halide / Halide

Simplifier fails to optimize for-loop endpoint. #8303

8317