This is the Halide stmt with the schedule above.
gpu_block<CUDA> (out.s0.j.jo.block_id_y, 0, t52) {
 gpu_block<CUDA> (out.s0.i.io.block_id_x, 0, t53) {
  gpu_thread<CUDA> (.thread_id_y, 0, 16) {
   gpu_thread<CUDA> (.thread_id_x, 0, 16) {
    let out.s0.j.ji.base.s = min(out.s0.j.jo.block_id_y*16, out.extent.1 + -16)
    allocate prod.0[float32 * 1]
    allocate B_im_global_wrapper$0.2[int32 * 8]
    allocate A_im_global_wrapper$0.1[float32 * 8]
    let out.s0.i.ii.base.s = min(out.s0.i.io.block_id_x*16, out.extent.0 + -16)
    produce prod {
     prod.0[0] = 0.000000f
     let t58 = (((out.s0.j.ji.base.s + t55) + .thread_id_y)*B.stride.1) - B.min.0
     let t57 = (out.s0.i.ii.base.s + t54) + .thread_id_x
     for (prod.s1.r12$x.ko, 0, 32) {
      produce A_im_global_wrapper$0 {
       let t59 = prod.s1.r12$x.ko*8
       for (A_im_global_wrapper$0.s0._1.rebased, 0, 8) {
        A_im_global_wrapper$0.1[A_im_global_wrapper$0.s0._1.rebased] = A[((A_im_global_wrapper$0.s0._1.rebased + t59)*A.stride.1) + t57]
       }
      }
      produce B_im_global_wrapper$0 {
       let t60 = (prod.s1.r12$x.ko*8) + t58
       for (B_im_global_wrapper$0.s0._0.rebased, 0, 8) {
        B_im_global_wrapper$0.2[B_im_global_wrapper$0.s0._0.rebased] = B[B_im_global_wrapper$0.s0._0.rebased + t60]
       }
      }
      consume B_im_global_wrapper$0 {
       consume A_im_global_wrapper$0 {
        for (prod.s1.r12$x.ki, 0, 8) {
         prod.0[0] = prod.0[0] + (A_im_global_wrapper$0.1[prod.s1.r12$x.ki]*float32(B_im_global_wrapper$0.2[prod.s1.r12$x.ki]))
        }
       }
      }
     }
     free B_im_global_wrapper$0.2
     free A_im_global_wrapper$0.1
    }
    consume prod {
     out[((((out.min.1 + out.s0.j.ji.base.s) + .thread_id_y)*out.stride.1) + (out.s0.i.ii.base.s - (out.min.1*out.stride.1))) + .thread_id_x] = prod.0[0]
    }
    free prod.0
   }
  }
 }
}
You need to schedule the loading of the data so that it is parallelized across the threads:
A.in_().compute_at(prod, ko)
B.in_().compute_at(prod, ko)
Should become something like:
A.in_().compute_at(prod, ko).gpu_threads(hl._0)
B.in_().compute_at(prod, ko).gpu_threads(hl._0)
Where _0, _1, _2, ... are special variables in Halide used to refer to the dimensions of an input buffer (as opposed to a Func, where you can name the dimensions with variables).
I haven't fully studied your code; you might need _1 in some cases. Try it and see what the stmt looks like, and you'll figure out how this directive interacts with the data loading.
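Putting the pieces together, a complete schedule along these lines would look roughly like the sketch below (Python bindings). This is only a sketch based on what I can infer from the stmt above, not your exact pipeline: the Func/Var names, the 16x16 tile size, the split of 8 on the reduction, and the explicit store_in(GPUShared) are all assumptions.

import halide as hl

# Sketch only: names, types, tile sizes, and the reduction extent are guesses
# inferred from the stmt above, not the original pipeline.
A = hl.ImageParam(hl.Float(32), 2, "A")
B = hl.ImageParam(hl.Int(32), 2, "B")

i, j = hl.Var("i"), hl.Var("j")
io, jo, ii, ji = hl.Var("io"), hl.Var("jo"), hl.Var("ii"), hl.Var("ji")
k = hl.RDom([(0, 256)])               # assumed reduction extent (32 * 8)
ko, ki = hl.RVar("ko"), hl.RVar("ki")

prod = hl.Func("prod")
prod[i, j] = hl.cast(hl.Float(32), 0)
prod[i, j] += A[i, k.x] * hl.cast(hl.Float(32), B[k.x, j])

out = hl.Func("out")
out[i, j] = prod[i, j]

# 16x16 thread blocks, one output element per thread (matching the stmt above).
out.split(i, io, ii, 16).split(j, jo, ji, 16)
out.reorder(ii, ji, io, jo)
out.gpu_blocks(io, jo).gpu_threads(ii, ji)

# Accumulate per thread, and split the reduction so there is a ko loop to
# stage the input tiles at.
prod.compute_at(out, ii)
prod.update(0).split(k.x, ko, ki, 8)

# Stage A and B once per ko iteration in GPU shared memory, and let the
# threads of the block cooperate on the load by marking a dimension of each
# wrapper as gpu_threads (_0 here; _1 may be the right choice depending on
# which dimension of the staged tile should be spread across the threads).
A.in_().compute_at(prod, ko).store_in(hl.MemoryType.GPUShared).gpu_threads(hl._0)
B.in_().compute_at(prod, ko).store_in(hl.MemoryType.GPUShared).gpu_threads(hl._0)

After a change like this, re-inspecting the stmt (or calling out.print_loop_nest()) should show whether the wrapper allocations have moved out of the per-thread scope.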
Hi, I'm trying to cooperatively fetch the tiles of the input matrices for matrix multiplication in CUDA.
Below is my hand-written CUDA kernel, which I want to reproduce using Halide:
And this is the Halide kernel I wrote to reproduce that CUDA implementation.
However, this schedule ends up allocating A and B per thread, not in shared memory. Is there a way to cooperatively load A and B into shared memory during the ko loop? I tried A.in_().compute_at(prod, ko).store_in(hl.MemoryType.GPUShared) and A.in_().compute_at(prod, ko).store_at(out, ji), but neither produced the desired code, and the second one also resulted in a compiler error.
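In code form, the two schedules I tried (same prod, ko, out and ji as above):

# Attempt 1: request GPU shared memory explicitly.
# Result: the wrapper is still allocated per thread, not per block.
A.in_().compute_at(prod, ko).store_in(hl.MemoryType.GPUShared)

# Attempt 2: hoist the wrapper's storage to the ji loop of out.
# Result: compiler error.
A.in_().compute_at(prod, ko).store_at(out, ji)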