aradi / fypp

Python powered Fortran preprocessor
http://fypp.readthedocs.io
BSD 2-Clause "Simplified" License

Kernel launcher #39

Open ivan-pi opened 8 months ago

ivan-pi commented 8 months ago

A colleague of mine used C++ variadic templates to mimic a kernel launcher:

#include <omp.h>

// Host-side stand-in for CUDA's dim3 (an assumption; CUDA sources would
// use the built-in type)
struct dim3 { int x = 1, y = 1, z = 1; };

template<typename F, typename... Ts>
void launch2D(const dim3 &numBlocks, const dim3 &blockDim, F &f, Ts&&... ts)
{
    // One OpenMP parallel region per block; each thread in the region
    // plays the role of one CUDA thread within that block.
    for (int bx = 0; bx < numBlocks.x; ++bx)
    for (int by = 0; by < numBlocks.y; ++by)
    {
        #pragma omp parallel num_threads(blockDim.x*blockDim.y)
        {
            const int tn = omp_get_thread_num();
            const int tx = tn % blockDim.x;  // x is the fast axis, as in CUDA
            const int ty = tn / blockDim.x;
            f(numBlocks, blockDim, {bx, by}, {tx, ty}, ts...);
        }
    }
}

// ...

    const dim3 threadsperBlock {BlockSize,BlockSize};
    const dim3 numBlocks{N/threadsperBlock.x,N/threadsperBlock.y};
    launch2D(numBlocks, threadsperBlock, matrix_multiplication_kernel<BlockSize>, a.data(), b.data(), c.data(), N);

This is akin to the CUDA triple-chevron launch, where the grid/block configuration is attached directly to the kernel:

matrix_multiplication_kernel<BlockSize><<<numBlocks,threadsperBlock>>>(a.data(), b.data(), c.data(), N);

I suppose it's possible to do something similar with Fypp, Fortran and OpenMP/OpenACC/CUDA. I came up with the following solution, but it lacks encapsulation:

#:def LAUNCH1D(kernel, n)
block
    integer :: i
    !$omp parallel do simd
    do i = 1, ${n}$
        $:kernel
    end do
    !$omp end parallel do simd
end block
#:enddef

#:call LAUNCH1D
y(i) = a*x(i) + y(i)
#:nextarg
n
#:endcall
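
For reference, with the kernel y(i) = a*x(i) + y(i) and n as the loop bound, the call above expands to:

block
    integer :: i
    !$omp parallel do simd
    do i = 1, n
        y(i) = a*x(i) + y(i)
    end do
    !$omp end parallel do simd
end block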
ivan-pi commented 8 months ago

Maybe something like this could work:

#:mute

#:def comma(a,b)
${a}$, ${b}$
#:enddef

#:def unpack(*pos)
  #:if len(pos) > 1
    #:set res = comma(pos[0],unpack(*pos[1:]))
    $:res
  #:elif len(pos) == 1
    $:pos[0]
  #:endif
#:enddef

#:def launch1d(name,kernel,params,args)
subroutine launch_${name}$(numBlocks, blockDim, ${args}$)
    use omp_lib
    type(dim3), value :: numBlocks, blockDim

! vvvv params vvvv
$:params
! ^^^^^^^^^^^^^^^

    type(dim3) :: blockIdx, threadIdx
    integer :: bx, tx

do bx = 1, numBlocks%x
    blockIdx = dim3(bx)
!$omp parallel num_threads(blockDim%x) private(tx, threadIdx)
    tx = omp_get_thread_num()
    threadIdx = dim3(tx + 1)  ! omp_get_thread_num() is 0-based, threadIdx%x is 1-based

! vvv kernel part vvv
$:kernel
! ^^^^^^^^^^^^^^^^^^^

!$omp end parallel
end do

end subroutine
#:enddef

#:def launch(numBlocks,tpb,name,*args)
#:set arg_list = unpack(*args)
call launch_${name}$ (dim3(${numBlocks}$), dim3(${tpb}$), &
    ${arg_list}$)
#:enddef

#:endmute

! --- program starts here ---

program main
implicit none

integer, parameter :: n = 100
real :: a, x(n), y(n)

type :: dim3
    integer :: x, y=1, z=1
end type

type(dim3) :: threadsPerBlock, numBlocks

a = 1.0
threadsPerBlock = dim3( 64 )
numBlocks = dim3( n / 64 + 1 )

! Kernel definitions are found in the contains section
call launch_axpy(numBlocks, threadsPerBlock, &
    n, a, x, y)

! Macro to help launching
@:launch( n/64 + 1, 64, axpy, n, a, x, y)

contains

#:block launch1d(name="axpy")
#:contains args
    n, a, x, y
#:contains params
    integer, value :: n
    real, value :: a
    real, intent(in) :: x(n)
    real, intent(inout) :: y(n)
    integer :: i
#:contains kernel
    i = blockDim%x*(blockIdx%x - 1) + threadIdx%x
    if (i <= n) then
        y(i) = a*x(i) + y(i)
    end if
#:endblock

end program
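
Here the @:launch(...) line expands to the same call as the explicit one above:

call launch_axpy (dim3(n/64 + 1), dim3(64), &
    n, a, x, y)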
ivan-pi commented 8 months ago

Another possibility would be to follow a more SYCL-like approach:

#:if defined('OMP_KERNEL')

#:def parallel_for(name,kernel,params,args)
! --- OMP Kernel ---
subroutine ${name}$(${args}$)
$:params
!$omp parallel do
do i = 1, n
    $:kernel
end do
!$omp end parallel do
end subroutine
! ------------------
#:enddef

#:else 
#:def parallel_for(name,kernel,params,args)
! --- CUDA Kernel ---
subroutine ${name}$(${args}$)
$:params
call ${name}$_d<<<n/64 + 1,64>>>(${args}$)
end subroutine

attributes(global) subroutine ${name}$_d(${args}$)
$:params
    i = blockDim%x*(blockIdx%x - 1) + threadIdx%x
    if (i <= n) then  ! guard the partial last block
        $:kernel
    end if
end subroutine
! -------------------
#:enddef
#:endif

#:block parallel_for(name="axpy")
#:contains args
    n, a, x, y
#:contains params
    integer, intent(in) :: n
    real, intent(in) :: a, x(n)
    real, intent(inout) :: y(n)
    integer :: i
#:contains kernel
    y(i) = a*x(i) + y(i)
#:endblock

Depending on the definition, this generates:

! --- OMP Kernel ---
subroutine axpy(n, a, x, y)
    integer, intent(in) :: n
    real, intent(in) :: a, x(n)
    real, intent(inout) :: y(n)
    integer :: i
!$omp parallel do
do i = 1, n
    y(i) = a*x(i) + y(i)
end do
!$omp end parallel do
end subroutine
! ------------------

or

! --- CUDA Kernel ---
subroutine axpy(n, a, x, y)
    integer, intent(in) :: n
    real, intent(in) :: a, x(n)
    real, intent(inout) :: y(n)
    integer :: i
call axpy_d<<<n/64 + 1,64>>>(n, a, x, y)
end subroutine

attributes(global) subroutine axpy_d(n, a, x, y)
    integer, intent(in) :: n
    real, intent(in) :: a, x(n)
    real, intent(inout) :: y(n)
    integer :: i
    i = blockDim%x*(blockIdx%x - 1) + threadIdx%x
    if (i <= n) then  ! guard the partial last block
        y(i) = a*x(i) + y(i)
    end if
end subroutine
! -------------------

I think it has some potential. :)

aradi commented 8 months ago

@ivan-pi That's a very nice idea, thanks for posting!

Actually, you can reduce the redundancy somewhat if you extract the argument list from the argument definition block. If each argument is defined on its own line, with every property apart from the name given as an attribute, this is easy to do (see the argument_list() macro):

#:mute

#! Extracts the list of the arguments from a block of argument definitions
#:def argument_list(argdefs)
#:set arglines = argdefs.split("\n")
#:set args = [argline.split("::")[-1].strip() for argline in arglines]
$:", ".join(args)
#:enddef
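
For instance, a line like real, dimension(n), intent(in) :: x splits at the '::' and contributes x, so the argument block used further below yields the list n, a, x, y.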

#:if defined('OMP_KERNEL')

#:def parallel_for(name, kernel, arguments)
! --- OMP Kernel ---
subroutine ${name}$(${argument_list(arguments)}$)
    $:arguments
    integer :: i  ! loop index, declared by the template itself
    !$omp parallel do
    do i = 1, n
        $:kernel
    end do
    !$omp end parallel do
end subroutine
! ------------------
#:enddef

#:else 
#:def parallel_for(name, kernel, arguments)
! --- CUDA Kernel ---
subroutine ${name}$(${argument_list(arguments)}$)
    $:arguments
    call ${name}$_d<<<n/64 + 1,64>>>(${argument_list(arguments)}$)
end subroutine

attributes(global) subroutine ${name}$_d(${argument_list(arguments)}$)
    $:arguments
    integer :: i  ! thread index, declared by the template itself
    i = blockDim%x*(blockIdx%x - 1) + threadIdx%x
    if (i <= n) then
        $:kernel
    end if
end subroutine
! -------------------
#:enddef
#:endif

#:endmute

#:block parallel_for(name="axpy")
#:contains arguments
    #! put one argument per line
    #! use '::' to separate names from attributes
    #! define every attribute before the '::'
    #! (e.g. use 'real, dimension(n) :: x' instead of 'real :: x(n)')
    integer, intent(in) :: n
    real, intent(in) :: a
    real, dimension(n), intent(in) :: x
    real, dimension(n), intent(inout) :: y
#:contains kernel
    y(i) = a*x(i) + y(i)
#:endblock
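
With OMP_KERNEL defined, this block expands to:

! --- OMP Kernel ---
subroutine axpy(n, a, x, y)
    integer, intent(in) :: n
    real, intent(in) :: a
    real, dimension(n), intent(in) :: x
    real, dimension(n), intent(inout) :: y
    integer :: i  ! loop index, declared by the template itself
    !$omp parallel do
    do i = 1, n
        y(i) = a*x(i) + y(i)
    end do
    !$omp end parallel do
end subroutine
! ------------------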
ivan-pi commented 8 months ago

Excellent! Thanks @aradi.

I was missing a rule to help extract dimensions from the declarations. Ideally, the range of the kernel (the 1d, 2d, 3d, or higher index space) should be configurable and countable, so that we can generate nested OpenMP loops with a collapse clause, or a different block division in the CUDA kernel.

The basic version should be functional first; **kwargs could then be used to add clauses specific to each programming model, e.g. omp_clause=schedule(dynamic), or reduction specifiers. Data locality clauses like the following may also be needed:

!$omp declare target (x,y)
!$acc declare present(x,y)
!@cuf attributes(device) :: x, y

When it comes to memory allocation, I was thinking it would be cleanest to have a device_allocate() routine that returns a pointer and calls cudaMalloc/acc_malloc/omp_target_alloc underneath. Alternatively, a macro could add the model-specific directives to the declarations/allocate statements.

It would be interesting to see how far one can get with this approach. My idea was roughly to follow what SYCL has: https://registry.khronos.org/SYCL/specs/sycl-2020/html/sycl-2020.html#_parallel_for_invoke
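
A minimal sketch of such a wrapper, assuming the OpenMP 5.0 Fortran interface to omp_target_alloc (the name device_allocate and the function form are purely illustrative):

! Illustrative sketch only; assumes OpenMP 5.0+ Fortran bindings.
function device_allocate(nbytes, device) result(ptr)
    use iso_c_binding, only: c_ptr, c_size_t, c_int
    use omp_lib
    integer(c_size_t), intent(in) :: nbytes
    integer(c_int), intent(in) :: device
    type(c_ptr) :: ptr
    ! OpenMP backend shown; a Fypp #:if branch would dispatch to
    ! cudaMalloc or acc_malloc for the other programming models.
    ptr = omp_target_alloc(nbytes, device)
end function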

ivan-pi commented 8 months ago

Probably some implicit rules would be needed, e.g. that i, j, k are the indices into the kernel range. My last attempt at GEMM looked like this:

#:block parallel_for(d=2,name="my_gemm")
#:contains args
    k, a, b, c
#:contains range
    m, n
#:contains params
    integer, value :: k
    real, intent(in) :: a(m,k), b(k,n)
    real, intent(inout) :: c(m,n)
    integer :: i, j
#:contains kernel
    c(i,j) = dot_product(a(i,:),b(:,j))
#:endblock

and I used an explicit dimension argument d to tell Fypp how many loops are needed. The template body then looked like this:

!$omp parallel do collapse(${d}$)
#:if d == 1
do i = 1, dims(1)
    $:kernel
end do
#:elif d == 2
do j = 1, dims(2)
    do i = 1, dims(1)
        $:kernel
    end do
end do
#:endif
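
For d = 2 and the kernel above, this expands to:

!$omp parallel do collapse(2)
do j = 1, dims(2)
    do i = 1, dims(1)
        c(i,j) = dot_product(a(i,:),b(:,j))
    end do
end do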
aradi commented 8 months ago

I'd suggest avoiding implicit rules, if possible. I think that by using a block construct and generating the variables on the fly, one can get away without them. Also, why not generate the d loops automatically instead of having an #:if clause for every possible case?

#:set kernel = "! Some kernel"
#:set ndims = 3

block
integer :: ${", ".join([f"i{idim}" for idim in range(1, ndims + 1)])}$
#:for idim in range(1, ndims + 1)
do i${idim}$ = 1, dims(${idim}$)
#:endfor
$:kernel
#:for _ in range(ndims)
end do
#:endfor
end block
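
For ndims = 3 this renders as:

block
integer :: i1, i2, i3
do i1 = 1, dims(1)
do i2 = 1, dims(2)
do i3 = 1, dims(3)
! Some kernel
end do
end do
end do
end block

(Given Fortran's column-major storage, one might generate the loops in reverse order so that i1 is innermost; with a collapse clause the distinction matters less.)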