Closed ChrisRackauckas closed 3 years ago
I think I have similar issues when directly using CUDAnative instead of GPUifyLoops.
using GPUifyLoops, CUDAnative, CuArrays
using BenchmarkTools
#slicing does not work
function main1!(x_out, x_in)
index = (blockIdx().x - 1) * blockDim().x + threadIdx().x
stride = blockDim().x * gridDim().x
for i = index:stride:size(x_in,2)
@inbounds @views begin
x_out[:,i] = x_in[:,i] # .= also does not work
end
end
return nothing
end
x_in = cu(rand(2,100_000_000))
x_out = similar(x_in)
@launch CUDA() threads=1024 blocks=16 main1!(x_out,x_in)
Similarly vchuravy/GPUifyLoops.jl#87 also fails without using GPUifyLoops. And I am assuming #41 also.
# @inbounds has to be used twice
function dynamics!(x_out,x_in)
@inbounds begin
x_out[1] = x_in[1] + x_in[2]
x_out[2] = 1.1*x_in[2]
end
return nothing
end
function dynamics_no_inbounds!(x_out,x_in)
x_out[1] = x_in[1] + x_in[2]
x_out[2] = 1.1*x_in[2]
return nothing
end
function main2!(x_out, x_in)
index = (blockIdx().x - 1) * blockDim().x + threadIdx().x
stride = blockDim().x * gridDim().x
for i = index:stride:size(x_in,2)
@inbounds @views begin
dynamics!(x_out[:,i],x_in[:,i])
end
end
return nothing
end
function main3!(x_out, x_in)
index = (blockIdx().x - 1) * blockDim().x + threadIdx().x
stride = blockDim().x * gridDim().x
for i = index:stride:size(x_in,2)
@inbounds @views begin
dynamics_no_inbounds!(x_out[:,i],x_in[:,i])
end
end
return nothing
end
x_in = cu(rand(2,100_000_000))
x_out = similar(x_in)
@launch CUDA() threads=1024 blocks=16 main2!(x_out,x_in)
# @btime CuArrays.@sync begin @launch CUDA() threads=1024 blocks=16 main2!(x_out,x_in) end #20 ms (GPUifyloops is 110ms)
@launch CUDA() threads=1024 blocks=16 main3!(x_out,x_in)
Fixed with KernelAbstractions.jl
@vchuravy do you know about this one?