Open ivan-pi opened 8 months ago
Maybe something like this could work:
#:mute
#:def comma(a,b)
${a}$, ${b}$
#:enddef
#:def unpack(*pos)
#:if len(pos) > 1
#:set res = comma(pos[0],unpack(*pos[1:]))
$:res
#:elif len(pos) == 1
$:pos[0]
#:endif
#:enddef
#:def launch1d(name,kernel,params,args)
subroutine launch_${name}$(numBlocks, blockDim, ${args}$)
use omp_lib
type(dim3), value :: numBlocks, blockDim
! vvvv params vvvv
$:params
! ^^^^^^^^^^^^^^^
type(dim3) :: blockIdx, threadIdx
integer :: bx, tx
do bx = 1, numBlocks%x
blockIdx = dim3(bx)
!$omp parallel num_threads(blockDim%x)
tx = omp_get_thread_num()
threadIdx = dim3(tx)
! vvv kernel part vvv
$:kernel
! ^^^^^^^^^^^^^^^^^^^
!$omp end parallel
end do
end subroutine
#:enddef
#:def launch(numBlocks,tpb,name,*args)
#:set arg_list = unpack(*args)
call launch_${name}$ (dim3(${numBlocks}$), dim3(${tpb}$), &
${arg_list}$)
#:enddef
#:endmute
! --- program starts here ---
program main
implicit none
integer, parameter :: n = 100
real :: a, x(n), y(n)
type :: dim3
integer :: x, y=1, z=1
end type
type(dim3) :: threadsPerBlock, numBlocks
a = 1.0
threadsPerBlock = dim3( n )
numBlocks = dim3( n / 64 + 1 )
! Kernel definitions are found in the contains section
call launch_axpy(threadsPerBlock,numBlocks, &
n, a, x, y)
! Macro to help launching
@:launch( n/64 + 1, 64, axpy, n, a, x, y)
contains
#:block launch1d(name="axpy")
#:contains args
n, a, x, y
#:contains params
integer, value :: n
real, value :: a
real, intent(in) :: x(n)
real, intent(inout) :: y(n)
integer :: i
#:contains kernel
i = blockDim%x*(blockIdx%x - 1) + threadIdx%x
if (i < n) then
y(i) = a*x(i) + y(i)
end if
#:endblock
end program
Another possibility would be to follow a more SYCL-like approach:
#:if defined('OMP_KERNEL')
#:def parallel_for(name,kernel,params,args)
! --- OMP Kernel ---
subroutine ${name}$(${args}$)
$:params
!$omp parallel for
do i = 1, n
$:kernel
end do
!$omp end parallel for
end subroutine
! ------------------
#:enddef
#:else
#:def parallel_for(name,kernel,params,args)
! --- CUDA Kernel ---
subroutine ${name}$(${args}$)
$:params
call ${name}$_d<<<n/64 + 1,64>>>(${args}$)
end subroutine
attribute(device) subroutine ${name}$_d(${args}$)
$:params
i = blockDim%x*(blockIdx%x - 1) + threadIdx%x
$:kernel
end subroutine
! -------------------
#:enddef
#:endif
#:block parallel_for(name="axpy")
#:contains args
n, a, x, y
#:contains params
integer, intent(in) :: n
real, intent(in) :: a, x(n)
real, intent(inout) :: y(n)
integer :: i
#:contains kernel
y(i) = a*x(i) + y(i)
#:endblock
Depending on the definition, this generates:
! --- OMP Kernel ---
subroutine axpy( n, a, x, y)
integer, intent(in) :: n
real, intent(in) :: a, x(n)
real, intent(inout) :: y(n)
integer :: i
!$omp parallel for
do i = 1, n
y(i) = a*x(i) + y(i)
end do
!$omp end parallel for
end subroutine
! ------------------
or
! --- CUDA Kernel ---
subroutine axpy( n, a, x, y)
integer, intent(in) :: n
real, intent(in) :: a, x(n)
real, intent(inout) :: y(n)
integer :: i
call axpy_d<<<n/64 + 1,64>>>( n, a, x, y)
end subroutine
attribute(device) subroutine axpy_d( n, a, x, y)
integer, intent(in) :: n
real, intent(in) :: a, x(n)
real, intent(inout) :: y(n)
integer :: i
i = blockDim%x*(blockIdx%x - 1) + threadIdx%x
y(i) = a*x(i) + y(i)
end subroutine
! -------------------
I think it has some potential. :)
@ivan-pi That's a very nice idea, thanks for posting!
Actually, you can reduce the redundancy somewhat, if you extract the argument list from the argument definition block. If one defines each argument on its own line and defines all properties (apart from the name) as attributes, it is easily possible (see the `argument_list()` macro):
#:mute
#! Extracts the list of the arguments from a block of argument definitions
#:def argument_list(argdefs)
#:set arglines = argdefs.split("\n")
#:set args = [argline.split("::")[-1].strip() for argline in arglines]
$:", ".join(args)
#:enddef
#:if defined('OMP_KERNEL')
#:def parallel_for(name, kernel, arguments)
! --- OMP Kernel ---
subroutine ${name}$(${argument_list(arguments)}$)
$:arguments
!$omp parallel for
do i = 1, n
$:kernel
end do
!$omp end parallel for
end subroutine
! ------------------
#:enddef
#:else
#:def parallel_for(name, kernel, arguments)
! --- CUDA Kernel ---
subroutine ${name}$(${argument_list(arguments)}$)
$:arguments
call ${name}$_d<<<n/64 + 1,64>>>(${argument_list(arguments)}$)
end subroutine
attribute(device) subroutine ${name}$_d(${argument_list(arguments)}$)
$:arguments
i = blockDim%x*(blockIdx%x - 1) + threadIdx%x
$:kernel
end subroutine
! -------------------
#:enddef
#:endif
#:endmute
#:block parallel_for(name="axpy")
#:contains arguments
#! put one argument per line
#! use '::' to separate names from attributes
#! define every attribute before the '::'
#! (e.g. use 'real, dimension(n) :: x' instead of 'real :: x(n)')
integer, intent(in) :: n
real, intent(in) :: a
real, dimension(n), intent(in) :: x
real, dimension(n), intent(inout) :: y
integer :: i
#:contains kernel
y(i) = a*x(i) + y(i)
#:endblock
Excellent! Thanks @aradi.
I was missing a rule to help extract dimensions out of the declarations. Ideally, the range of the kernel (as in the 1d-3d (or higher) index space) should be configurable and countable, so that we can generate nested OpenMP loops with a collapse clause, or a different block division in the CUDA kernel.
The basic version should be functional, and then `**kwargs` could be used to add clauses specific to each programming model, e.g. `omp_clause=schedule(dynamic)`, or reduction specifiers. Also things like these data locality clauses may be needed:
!$omp declare target (x,y)
!$acc declare present(x,y)
!$cuf attribute(device) :: x, y
When it comes to memory allocation, I was thinking it would be cleanest to have a `device_allocate()` subroutine that returns a pointer and calls `cudaMalloc`/`accMalloc`/`omp_target_alloc`. Or alternatively a macro which adds the model-specific directives to the declarations/allocate statements.
It would be interesting to see how far can one get with this approach. My idea was roughly to follow what SYCL has: https://registry.khronos.org/SYCL/specs/sycl-2020/html/sycl-2020.html#_parallel_for_invoke
Probably some implicit rules might be needed, e.g. `i`, `j`, `k` are the indexes into the kernel range. My last attempt at GEMM looked like this:
#:block parallel_for(d=2,name="my_gemm")
#:contains args
k, a, b, c
#:contains range
m, n
#:contains params
integer, value :: k
real, intent(in) :: a(m,n), b(n,k)
real, intent(inout) :: c(m,k)
integer :: i
#:contains kernel
c(i,j) = dot_product(a(i,:),b(:,j))
#:endblock
and I used an explicit dimension argument `d` to tell Fypp how many loops are needed. The template body then looked like this:
!$omp parallel for collapse(${d}$)
#:if d == 1
do i = 1, dims(1)
$:kernel
end do
#:elif d == 2
do j = 1, dims(2)
do i = 1, dims(1)
$:kernel
end do
end do
#:endif
I'd rather suggest avoiding implicit rules, if possible. I think, by using `block` and generating the variables on the fly, one can get away without them. Also, why not generate the `d` loops automatically instead of having an `#:if` clause for every possible case?
#:set kernel = "! Some kernel"
#:set ndims = 3
block
integer :: ${", ".join([f"i{idim}" for idim in range(1, ndims + 1)])}$
#:for idim in range(1, ndims + 1)
do i${idim}$ = 1, dims(${idim}$)
#:endfor
$:kernel
#:for _ in range(ndims)
end do
#:endfor
end block
A colleague of mine has used the variadic templates of C++ to mimic a kernel launcher:
This is kind of like the CUDA triple chevron
I suppose it's possible to do something similar with Fypp, Fortran and OpenMP/OpenACC/CUDA. I came up with the following solution, but it lacks encapsulation: