halide / Halide

a language for fast, portable data-parallel computation
https://halide-lang.org

Fix injection of GPU buffers that do not go by a Func name (i.e. alloc groups). #8333

Open · mcourteaux opened this 4 days ago

When, for whatever reason, an allocation group providing fused storage for multiple Funcs that were originally intended for MemoryType::GPUShared gets lifted out of the GPU-block loops and ends up in Heap memory instead, the profiling injection logic assumed that the buffer came from a Func of the same name. The buffer was then incorrectly determined to be on the stack, because that logic ignored the custom_new and custom_delete attributes of the Allocate node.
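For context, here is a paraphrased sketch of the relevant fields on the IR node (my reading of src/IR.h, not a verbatim copy): when new_expr is defined, the allocation is produced by evaluating that expression and released through free_function, which is exactly the custom_new / custom_delete pair visible in the Stmt dump below, so a size-based stack heuristic does not apply to it.

struct Allocate : public StmtNode<Allocate> {
    std::string name;
    Type type;
    MemoryType memory_type;
    std::vector<Expr> extents;
    Expr condition;
    Expr new_expr;             // prints as "custom_new { ... }" when defined
    std::string free_function; // prints as "custom_delete { ... }" when set
    Stmt body;
    // ...
};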

Consider this example (also included as a new test):

#include "Halide.h"

using namespace Halide;

int main(int argc, char *argv[]) {

    Target t = get_jit_target_from_environment();
    if (!t.has_gpu_feature()) {
        printf("[SKIP] GPU not enabled\n");
        return 0;
    }

    Var x{"x"}, y{"y"};

    Func f1{"f1"}, f2{"f2"};
    f1(x, y) = cast<float>(x + y);
    f2(x, y) = f1(x, y) * 2;

    Func result{"result"};
    result(x, y) = f2(x, y);

    Var xo{"xo"}, yo{"yo"}, xi{"xi"}, yi{"yi"};
    result
        .compute_root()
        .gpu_tile(x, y, xo, yo, xi, yi, 16, 16)
        .reorder(xi, yi, xo, yo)
        ;

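    // Storing f1 and f2 on the heap (rather than in GPUShared) is what gets
    // their fused allocation group lifted out of the GPU-block loops.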
    f2.compute_at(result, xo)
        .gpu_threads(x, y)
        .store_in(MemoryType::Heap)
        ;

    f1.compute_at(result, xo)
        .gpu_threads(x, y)
        .store_in(MemoryType::Heap)
        ;

    result.print_loop_nest();

    t.set_feature(Target::Profile); // Make sure profiling is enabled!
    result.compile_jit(t);

    printf("Success!\n");
    return 0;
}

This produces the following Stmt right before the Profiling pass:

assert(reinterpret<uint64>((struct halide_buffer_t *)result.buffer) != (uint64)0, halide_error_buffer_argument_is_null("result"))
let result = (void *)_halide_buffer_get_host((struct halide_buffer_t *)result.buffer)
let result.type = (uint32)_halide_buffer_get_type((struct halide_buffer_t *)result.buffer)
let result.dimensions = _halide_buffer_get_dimensions((struct halide_buffer_t *)result.buffer)
let result.min.0 = _halide_buffer_get_min((struct halide_buffer_t *)result.buffer, 0)
let result.extent.0 = _halide_buffer_get_extent((struct halide_buffer_t *)result.buffer, 0)
let result.stride.0 = _halide_buffer_get_stride((struct halide_buffer_t *)result.buffer, 0)
let result.min.1 = _halide_buffer_get_min((struct halide_buffer_t *)result.buffer, 1)
let result.extent.1 = _halide_buffer_get_extent((struct halide_buffer_t *)result.buffer, 1)
let result.stride.1 = _halide_buffer_get_stride((struct halide_buffer_t *)result.buffer, 1)
if ((uint1)_halide_buffer_is_bounds_query((struct halide_buffer_t *)result.buffer)) {
 (struct halide_buffer_t *)_halide_buffer_init((struct halide_buffer_t *)result.buffer, (struct halide_dimension_t *)_halide_buffer_get_shape((struct halide_buffer_t *)result.buffer), reinterpret<(void *)>((uint64)0), (uint64)0, reinterpret<(struct halide_device_interface_t *)>((uint64)0), 2, 32, 2, (struct halide_dimension_t *)make_struct((min(result.extent.0, 16) + result.min.0) + -16, max(result.extent.0, 16), 1, 0, (min(result.extent.1, 16) + result.min.1) + -16, max(result.extent.1, 16), max(result.extent.0, 16), 0), (uint64)0)
}
if (!(uint1)_halide_buffer_is_bounds_query((struct halide_buffer_t *)result.buffer)) {
 assert(result.type == (uint32)73730, halide_error_bad_type("Output buffer result", result.type, (uint32)73730))
 assert(result.dimensions == 2, halide_error_bad_dimensions("Output buffer result", result.dimensions, 2))
 assert((16 <= result.extent.0) && (((max(result.extent.0, 16) + (min(result.extent.0, 16) + result.min.0)) + -16) <= (result.extent.0 + result.min.0)), halide_error_access_out_of_bounds("Output buffer result", 0, (min(result.extent.0, 16) + result.min.0) + -16, (max(result.extent.0, 16) + (min(result.extent.0, 16) + result.min.0)) + -17, result.min.0, (result.extent.0 + result.min.0) + -1))
 assert((16 <= result.extent.1) && (((max(result.extent.1, 16) + (min(result.extent.1, 16) + result.min.1)) + -16) <= (result.extent.1 + result.min.1)), halide_error_access_out_of_bounds("Output buffer result", 1, (min(result.extent.1, 16) + result.min.1) + -16, (max(result.extent.1, 16) + (min(result.extent.1, 16) + result.min.1)) + -17, result.min.1, (result.extent.1 + result.min.1) + -1))
 assert(result.stride.0 == 1, halide_error_constraint_violated("result.stride.0", result.stride.0, "1", 1))
 let result.total_extent.1 = int64(result.extent.1)*int64(result.extent.0)
 assert((uint64)abs(int64(result.extent.0)) <= (uint64)2147483647, halide_error_buffer_allocation_too_large("result", (uint64)abs(int64(result.extent.0)), (uint64)2147483647))
 assert((uint64)abs(int64(result.extent.1)*int64(result.stride.1)) <= (uint64)2147483647, halide_error_buffer_allocation_too_large("result", (uint64)abs(int64(result.extent.1)*int64(result.stride.1)), (uint64)2147483647))
 assert(result.total_extent.1 <= (int64)2147483647, halide_error_buffer_extents_too_large("result", result.total_extent.1, (int64)2147483647))
 profiling_enable_instance_marker()
 produce result {
  let halide_copy_to_device_result = halide_copy_to_device((struct halide_buffer_t *)result.buffer, (struct halide_device_interface_t const *)halide_cuda_device_interface())
  assert(halide_copy_to_device_result == 0, halide_copy_to_device_result)
  allocate allocgroup__f1$0.0__f2$0.1.buffer[float32 * 1]
   custom_new { let t13 = (struct halide_dimension_t *)make_struct(0, 512, 1, 0, 0, (result.extent.0 + 15)/16, 512, 0, 0, (result.extent.1 + 15)/16, ((result.extent.0 + 15)/16)*512, 0) in (struct halide_buffer_t *)_halide_buffer_init((struct halide_buffer_t *)alloca(size_of_halide_buffer_t()), t13, reinterpret<(void *)>((uint64)0), (uint64)0, reinterpret<(struct halide_device_interface_t *)>((uint64)0), 2, 32, 3, t13, (uint64)0) }
   custom_delete { halide_device_free_as_destructor(allocgroup__f1$0.0__f2$0.1.buffer); }
  let t14 = halide_device_malloc(allocgroup__f1$0.0__f2$0.1.buffer, (struct halide_device_interface_t const *)halide_cuda_device_interface())
  assert(t14 == 0, t14)
  let allocgroup__f1$0.0__f2$0.1 = (void *)_halide_buffer_get_device(allocgroup__f1$0.0__f2$0.1.buffer)
  gpu_block<CUDA> (result.s0.y.yo.block_id_y, 0, (result.extent.1 + 15)/16) {
   gpu_block<CUDA> (result.s0.x.xo.block_id_x, 0, (result.extent.0 + 15)/16) {
    gpu_thread<CUDA> (.thread_id_y, 0, 16) {
     gpu_thread<CUDA> (.thread_id_x, 0, 16) {
      let result.s0.y.yi.base.s = min(result.s0.y.yo.block_id_y*16, result.extent.1 + -16)
      let result.s0.x.xi.base.s = min(result.s0.x.xo.block_id_x*16, result.extent.0 + -16)
      produce f1$0 {
       allocgroup__f1$0.0__f2$0.1[(((((result.extent.0 + 15)/16)*result.s0.y.yo.block_id_y) + result.s0.x.xo.block_id_x)*512) + ((.thread_id_y*16) + .thread_id_x)] = float32((((result.min.0 + result.s0.x.xi.base.s) + .thread_id_x) + ((result.min.1 + result.s0.y.yi.base.s) + .thread_id_y)))
      }
      gpu_thread_barrier(1)
      produce f2$0 {
       consume f1$0 {
        allocgroup__f1$0.0__f2$0.1[((((((result.extent.0 + 15)/16)*result.s0.y.yo.block_id_y) + result.s0.x.xo.block_id_x)*512) + ((.thread_id_y*16) + .thread_id_x)) + 256] = allocgroup__f1$0.0__f2$0.1[(((((result.extent.0 + 15)/16)*result.s0.y.yo.block_id_y) + result.s0.x.xo.block_id_x)*512) + ((.thread_id_y*16) + .thread_id_x)]*2.000000f
       }
      }
      gpu_thread_barrier(1)
      consume f2$0 {
       result[((((result.min.1 + result.s0.y.yi.base.s) + .thread_id_y)*result.stride.1) + ((result.min.0 + result.s0.x.xi.base.s) + .thread_id_x)) - ((result.min.1*result.stride.1) + result.min.0)] = allocgroup__f1$0.0__f2$0.1[((((((result.extent.0 + 15)/16)*result.s0.y.yo.block_id_y) + result.s0.x.xo.block_id_x)*512) + ((.thread_id_y*16) + .thread_id_x)) + 256]
      }
     }
    }
   }
  }
  free allocgroup__f1$0.0__f2$0.1.buffer
  _halide_buffer_set_device_dirty((struct halide_buffer_t *)result.buffer, (uint1)1)
 }
}

Notice how allocgroup__f1$0.0__f2$0.1.buffer sits outside the outermost GPU-block loop. When this buffer did not get lifted out of the kernel, profiling was not an issue, because the profiler does not traverse the IR inside GPU loops.
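A minimal sketch of why that matters (a hypothetical mutator shape, not the actual Profiling.cpp code): instrumentation that stops at device loops never sees allocations nested inside them, so only allocations hoisted above the gpu_block loops, like the one here, get visited at all.

Stmt visit(const For *op) override {
    if (op->device_api != DeviceAPI::None) {
        // Device code has no profiler runtime, so GPU loops are left
        // untouched; an Allocate nested inside one is never instrumented.
        return op;
    }
    return IRMutator::visit(op);
}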

The offending line, reached when instrumenting the Allocate node, was:

https://github.com/halide/Halide/blob/461c12871f336fe6f57b55d6a297f13ef209161b/src/Profiling.cpp#L274

There, the allocation is incorrectly determined to be on_stack=true.

This PR checks whether the Allocate node has a custom_new expression and, if so, overrides on_stack to false.
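In IR terms, the fix amounts to something like the following (a paraphrased sketch of the idea, not the literal diff):

// An Allocate that carries a custom_new expression is serviced by that
// expression (here: a halide_buffer_t handed to halide_device_malloc),
// so whatever the size heuristic concluded, it cannot live on the stack.
if (op->new_expr.defined()) {
    on_stack = false;
}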

@abadams I wonder if we can't simply rely on the Allocate node's memory_type to determine on_stack, or is that still Auto at that point?
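For illustration, the alternative being asked about might look roughly like this; treating MemoryType::Stack and MemoryType::Register as the stack-like cases is my guess, and the whole thing hinges on whether memory_type has already been resolved by this point in lowering:

// Hypothetical alternative (not what this PR does): classify using the
// node's declared memory type instead of a size heuristic. Only sound if
// lowering has already resolved MemoryType::Auto to a concrete choice
// by the time the profiler runs.
bool on_stack = (op->memory_type == MemoryType::Stack ||
                 op->memory_type == MemoryType::Register);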