CliMA / ClimaCore.jl

CliMA model dycore
https://clima.github.io/ClimaCore.jl/dev
Apache License 2.0
86 stars 8 forks source link

Improve CUDA kernels #1128

Open simonbyrne opened 1 year ago

simonbyrne commented 1 year ago

Using Fortran-style 1D (linear) indexing on the parent array, with any required assertions done upstream, might be easiest for some kernels. E.g.:

"""
    Base.copyto!(dest::IJFH, bc)

Copy `bc` into `dest` by launching the `knl_copyto!` CUDA kernel over the
parent arrays, using 1D linear indexing.

The launch uses at most 256 threads per block and enough blocks to provide at
least one thread per element of `parent(dest)`. Returns `dest`.

Any shape or compatibility assertions between `dest` and `bc` are expected to
be done upstream; none are performed here.
"""
function Base.copyto!(
    dest::IJFH{S, Nij},
    bc::Union{IJFH{S, Nij, A}, Base.Broadcast.Broadcasted{IJFHStyle{Nij, A}}},
) where {S, Nij, A <: CUDA.CuArray}
    nitems = length(parent(dest))
    # Nothing to copy. Returning early also avoids `cld(0, 0)` (a DivideError)
    # and an invalid zero-thread kernel launch.
    nitems == 0 && return dest
    max_threads = 256 # can be higher if conditions permit
    nthreads = min(max_threads, nitems)
    # Round up so the final, possibly partial, block still covers the tail.
    nblocks = cld(nitems, nthreads)
    # NOTE(review): assumes `parent` is defined for the `Broadcasted` case and
    # yields an array at least as long as `parent(dest)` — confirm.
    pdest, pbc = parent(dest), parent(bc)
    CUDA.@cuda threads = (nthreads) blocks = (nblocks) knl_copyto!(pdest, pbc)
    return dest
end

"""
    knl_copyto!(dest, src)

CUDA kernel: each thread copies one element, `dest[i] = src[i]`, where `i` is
the thread's 1-based global linear index in a 1D launch. Threads whose index
exceeds `length(dest)` do nothing. Returns `nothing`.
"""
function knl_copyto!(dest, src)
    nitems = length(dest)
    # 1-based global index: position within the block plus the offset of all
    # preceding blocks.
    gidx = threadIdx().x + (blockIdx().x - 1) * blockDim().x

    # `<=` (not `<`) so the last element is copied too; surplus threads in the
    # final block fall through.
    if gidx <= nitems
        # Bounds checks are skipped: `gidx` is within `dest` by the guard above.
        # NOTE(review): assumes `length(src) >= length(dest)` — to be asserted
        # by the caller.
        @inbounds dest[gidx] = src[gidx]
    end
    return nothing
end

_Originally posted by @sriharshakandala in https://github.com/CliMA/ClimaCore.jl/pull/767#discussion_r1106263606_

sriharshakandala commented 7 months ago

We can try

cartidx = CartesianIndices(dest)[gidx]