Improve CUDA kernels - GitHub issues

Using Fortran-style 1D (linear) indexing on the parent array, with any required assertions (e.g. matching sizes) done upstream, might be the easiest approach for some kernels. E.g.:

"""
    Base.copyto!(dest::IJFH, bc)

Copy `bc` into `dest` by launching `knl_copyto!` over the flat (linearly indexed)
parent arrays, using one GPU thread per element of `parent(dest)`.

No size or shape checks are performed here; any required assertions are expected to
have been done upstream. Returns `dest`.
"""
function Base.copyto!(
    dest::IJFH{S, Nij},
    bc::Union{IJFH{S, Nij, A}, Base.Broadcast.Broadcasted{IJFHStyle{Nij, A}}},
) where {S, Nij, A <: CUDA.CuArray}
    pdest = parent(dest)
    nitems = length(pdest)
    # Nothing to copy. This also avoids `cld(0, 0)` (a `DivideError`) and a
    # kernel launch with zero threads.
    nitems == 0 && return dest
    max_threads = 256 # can be higher if conditions permit
    nthreads = min(max_threads, nitems)
    # Enough blocks to cover every element; the last block may be partially used.
    nblocks = cld(nitems, nthreads)
    # NOTE(review): for the `Broadcasted` case this relies on a `parent` method
    # being defined elsewhere — confirm it returns a linearly indexable object.
    pbc = parent(bc)
    CUDA.@cuda threads = (nthreads) blocks = (nblocks) knl_copyto!(pdest, pbc)
    return dest
end

"""
    knl_copyto!(dest, src)

CUDA kernel that copies `src` into `dest` element-wise using linear indexing.
Each thread handles at most one element; threads whose global index exceeds
`length(dest)` do nothing.

`src` is assumed to have at least `length(dest)` linearly indexable elements
(checked upstream, not here). Returns `nothing`.
"""
function knl_copyto!(dest, src)
    nitems = length(dest)
    # 1-based global linear index of this thread across all blocks.
    gidx = threadIdx().x + (blockIdx().x - 1) * blockDim().x

    # `<=` (not `<`): `gidx` is 1-based, so the last element has index `nitems`.
    if gidx <= nitems
        @inbounds dest[gidx] = src[gidx]
    end
    return nothing
end

_Originally posted by @sriharshakandala in https://github.com/CliMA/ClimaCore.jl/pull/767#discussion_r1106263606_

CliMA / ClimaCore.jl

Improve CUDA kernels #1128