mcabbott / Tullio.jl


GPU Kernel Compilation Failed with Interpolations #167

Open roflmaostc opened 1 year ago

roflmaostc commented 1 year ago

Hi,

The following Pluto code fails. It's an issue with Interpolations (I checked that), but custom CUDA kernels do work with Interpolations.

The CPU version runs fine (though unfortunately not faster than three nested, threaded for loops).

Any idea what's going on? I was hoping to avoid writing a custom CUDA kernel. I'm tagging @maleadt since he suggested that I use Tullio :laughing:


# ╔═╡ 6c535b92-cf32-11ed-3b4b-b5530c170589
using Tullio, LoopVectorization, CUDA, Adapt, KernelAbstractions, CUDAKernels, Interpolations

# ╔═╡ 633d1ddc-131d-465a-ac8c-c382c972dc31
function radon_tullio(I, θs, zs)
    sinogram = similar(I, length(zs), length(θs))
    fill!(sinogram, 0)

    midpoint = size(I, 1) ÷ 2 + 1

    if I isa CuArray
        I_int = linear_interpolation((1:size(I, 1), 1:size(I, 2)), I) # critical
        I_int = adapt(CuArray{Float32}, I_int) # critical
    else
        I_int = linear_interpolation((1:size(I, 1), 1:size(I, 2)), I)
    end
    sθ = sin.(θs)
    cθ = cos.(θs)

    @tullio  sinogram[is, iθ] = @inbounds(begin
        x =  zs[z] * sθ[iθ] + zs[is] * cθ[iθ] + midpoint
        y = -zs[z] * cθ[iθ] + zs[is] * sθ[iθ] + midpoint
        I_int(y, x) # critical
    end)

    return sinogram ./ maximum(sinogram)
end
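
For reference: since the index z appears only on the right-hand side, @tullio sums over it, so the expression above should compute the same thing as this plain nested-loop version (a CPU-only sketch for clarity, not part of the original notebook):

# CPU-only sketch of what the @tullio call computes: an implicit sum over z
# (uses linear_interpolation from Interpolations, loaded above)
function radon_loops(I, θs, zs)
    I_int = linear_interpolation((1:size(I, 1), 1:size(I, 2)), I)
    midpoint = size(I, 1) ÷ 2 + 1
    sθ, cθ = sin.(θs), cos.(θs)
    sinogram = zeros(eltype(I), length(zs), length(θs))
    for iθ in eachindex(θs), is in eachindex(zs)
        acc = zero(eltype(I))
        for z in eachindex(zs)
            x =  zs[z] * sθ[iθ] + zs[is] * cθ[iθ] + midpoint
            y = -zs[z] * cθ[iθ] + zs[is] * sθ[iθ] + midpoint
            acc += I_int(y, x)
        end
        sinogram[is, iθ] = acc
    end
    return sinogram ./ maximum(sinogram)
end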

# ╔═╡ 93a73653-3973-45f4-b06c-830cdc94b7fb
@time sinogram_t2 = radon_tullio(CUDA.rand(2,2), CuArray(range(1f0, 1.1f0, 2)), CuArray(range(-0.1f0,0.1f0,2)));

# ╔═╡ 404be392-362d-404f-ac53-274592d5641c
@time sinogram_t = radon_tullio(rand(3,3), range(1f0, 1.1f0, 2), range(-0.1f0,0.1f0,2));

KernelError: passing and using non-bitstype argument

Argument 1 to your kernel function is of type Main.var"workspace#28".var"#gpu_##🇨🇺#604#3"{Int64}, which is not isbits:

.I_int is of type Core.Box which is not isbits.

.contents is of type Any which is not isbits.

    check_invocation(::GPUCompiler.CompilerJob)@validation.jl:88
    macro expansion@driver.jl:154[inlined]
    macro expansion@TimerOutput.jl:253[inlined]
    macro expansion@driver.jl:152[inlined]
    var"#emit_julia#112"(::Bool, ::typeof(GPUCompiler.emit_julia), ::GPUCompiler.CompilerJob)@utils.jl:83
    emit_julia@utils.jl:77[inlined]
    cufunction_compile(::GPUCompiler.CompilerJob, ::LLVM.Context)@execution.jl:359
    #221@execution.jl:354[inlined]
    JuliaContext(::CUDA.var"#221#222"{GPUCompiler.CompilerJob{GPUCompiler.PTXCompilerTarget, CUDA.CUDACompilerParams, GPUCompiler.FunctionSpec{Main.var"workspace#28".var"#gpu_##🇨🇺#604#3"{Int64}, Tuple{KernelAbstractions.CompilerMetadata{KernelAbstractions.NDIteration.DynamicSize, KernelAbstractions.NDIteration.DynamicCheck, Nothing, CartesianIndices{2, Tuple{Base.OneTo{Int64}, Base.OneTo{Int64}}}, KernelAbstractions.NDIteration.NDRange{2, KernelAbstractions.NDIteration.DynamicSize, KernelAbstractions.NDIteration.DynamicSize, CartesianIndices{2, Tuple{Base.OneTo{Int64}, Base.OneTo{Int64}}}, CartesianIndices{2, Tuple{Base.OneTo{Int64}, Base.OneTo{Int64}}}}}, CUDA.CuDeviceMatrix{Float32, 1}, CUDA.CuDeviceVector{Float32, 1}, CUDA.CuDeviceVector{Float32, 1}, CUDA.CuDeviceVector{Float32, 1}, Base.OneTo{Int64}, Base.OneTo{Int64}, Base.OneTo{Int64}, Nothing, Bool}}}})@driver.jl:76
    cufunction_compile(::GPUCompiler.CompilerJob)@execution.jl:353
    cached_compilation(::Dict{UInt64, Any}, ::GPUCompiler.CompilerJob, ::typeof(CUDA.cufunction_compile), ::typeof(CUDA.cufunction_link))@cache.jl:90
    var"#cufunction#218"(::Nothing, ::Bool, ::Base.Pairs{Symbol, Nothing, Tuple{Symbol}, NamedTuple{(:maxthreads,), Tuple{Nothing}}}, ::typeof(CUDA.cufunction), ::Main.var"workspace#28".var"#gpu_##🇨🇺#604#3"{Int64}, ::Type{Tuple{KernelAbstractions.CompilerMetadata{KernelAbstractions.NDIteration.DynamicSize, KernelAbstractions.NDIteration.DynamicCheck, Nothing, CartesianIndices{2, Tuple{Base.OneTo{Int64}, Base.OneTo{Int64}}}, KernelAbstractions.NDIteration.NDRange{2, KernelAbstractions.NDIteration.DynamicSize, KernelAbstractions.NDIteration.DynamicSize, CartesianIndices{2, Tuple{Base.OneTo{Int64}, Base.OneTo{Int64}}}, CartesianIndices{2, Tuple{Base.OneTo{Int64}, Base.OneTo{Int64}}}}}, CUDA.CuDeviceMatrix{Float32, 1}, CUDA.CuDeviceVector{Float32, 1}, CUDA.CuDeviceVector{Float32, 1}, CUDA.CuDeviceVector{Float32, 1}, Base.OneTo{Int64}, Base.OneTo{Int64}, Base.OneTo{Int64}, Nothing, Bool}})@execution.jl:306
    macro expansion@execution.jl:102[inlined]
    var"#_#23"(::Tuple{Int64, Int64}, ::CUDAKernels.CudaEvent, ::Nothing, ::Function, ::KernelAbstractions.Kernel{CUDAKernels.CUDADevice{false, false}, KernelAbstractions.NDIteration.DynamicSize, KernelAbstractions.NDIteration.DynamicSize, Main.var"workspace#28".var"#gpu_##🇨🇺#604#3"{Int64}}, ::CUDA.CuArray{Float32, 2, CUDA.Mem.DeviceBuffer}, ::Vararg{Any})@CUDAKernels.jl:283
    𝒜𝒸𝓉!@Other: 1172[inlined]
    𝒜𝒸𝓉!@Other: 1169[inlined]
    threader(::Main.var"workspace#28".var"#𝒜𝒸𝓉!#1"{Int64}, ::Type{CUDA.CuArray{Float32, N, CUDA.Mem.DeviceBuffer} where N}, ::CUDA.CuArray{Float32, 2, CUDA.Mem.DeviceBuffer}, ::Tuple{CUDA.CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}, CUDA.CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}, CUDA.CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}}, ::Tuple{Base.OneTo{Int64}, Base.OneTo{Int64}}, ::Tuple{Base.OneTo{Int64}}, ::Function, ::Int64, ::Nothing)@eval.jl:104
    macro expansion@Other: 1004[inlined]
    radon_tullio(::CUDA.CuArray{Float32, 2, CUDA.Mem.DeviceBuffer}, ::CUDA.CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}, ::CUDA.CuArray{Float32, 1, CUDA.Mem.DeviceBuffer})@Other: 17
    macro expansion@Local: 262[inlined]
    top-level scope@Local: 1[inlined]
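
The ".I_int is of type Core.Box" part points at a known Julia closure issue rather than anything GPU-specific: because I_int is assigned on more than one line inside the if/else, the closure that @tullio builds captures it as a boxed variable, which is not isbits. A minimal sketch of the same pattern, with made-up names and no Tullio involved:

# f is assigned more than once, so the closure below captures it as a Core.Box
function boxed_example(flag)
    if flag
        f = sin
        f = cos          # second assignment is what triggers the boxing
    else
        f = sin
    end
    return x -> f(x)     # f is captured as a (non-isbits) Core.Box
end

# a single assignment avoids the box
function unboxed_example(flag)
    f = flag ? cos : sin # assigned exactly once
    return x -> f(x)     # f is captured by value
end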
roflmaostc commented 1 year ago

Splitting it into two functions removes the boxing:

# ╔═╡ 633d1ddc-131d-465a-ac8c-c382c972dc31
function radon_tullio(I, θs, zs)
    sinogram = similar(I, length(zs), length(θs))
    fill!(sinogram, 0)

    midpoint = size(I, 1) ÷ 2 + 1

    if I isa CuArray
        I_2 = linear_interpolation((1:size(I, 1), 1:size(I, 2)), I)
        I_int = adapt(CuArray{Float32}, I_2)
        @show "cuda"
    else
        I_int = linear_interpolation((1:size(I, 1), 1:size(I, 2)), I)
    end
    sθ = sin.(θs)
    cθ = cos.(θs)

    t(θs, zs, sinogram, midpoint, I_int)
    return sinogram ./ maximum(sinogram)
end

# ╔═╡ 4e0f0aa8-614d-4bc3-bc93-94ba1235b1d7
function t(θs, zs, sinogram, midpoint, I_int)

    sθ = sin.(θs)
    cθ = cos.(θs)

    @tullio  sinogram[is, iθ] = @inbounds(begin
        x =  zs[z] * sθ[iθ] + zs[is] * cθ[iθ] + midpoint
        y = -zs[z] * cθ[iθ] + zs[is] * sθ[iθ] + midpoint
        I_int(y, x)
    end)
end
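
An alternative fix that avoids the extra function would be to give I_int a single assignment, since the boxing comes from the variable being assigned several times before @tullio captures it. A sketch of that variant (the function name is made up, same assumptions about linear_interpolation and adapt as above, not tested on a GPU here):

# Sketch: assign I_int exactly once so the @tullio closure does not box it
function radon_tullio_singleassign(I, θs, zs)
    sinogram = similar(I, length(zs), length(θs))
    fill!(sinogram, 0)
    midpoint = size(I, 1) ÷ 2 + 1

    itp = linear_interpolation((1:size(I, 1), 1:size(I, 2)), I)
    I_int = I isa CuArray ? adapt(CuArray{Float32}, itp) : itp   # single assignment

    sθ = sin.(θs)
    cθ = cos.(θs)

    @tullio sinogram[is, iθ] = @inbounds(begin
        x =  zs[z] * sθ[iθ] + zs[is] * cθ[iθ] + midpoint
        y = -zs[z] * cθ[iθ] + zs[is] * sθ[iθ] + midpoint
        I_int(y, x)
    end)

    return sinogram ./ maximum(sinogram)
end

Rebinding the variable in a let block just before the @tullio call (the workaround from the Julia performance tips on captured variables) should work as well.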
roflmaostc commented 1 year ago

So it works now, but the performance is no better than the threaded nested for loops on the CPU.

Is a 200x200 array with ranges of length 200 too small?
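
One thing worth ruling out before concluding anything from @time: the first call mostly measures compilation, and CUDA kernels launch asynchronously, so the GPU timing needs a warm-up call and CUDA.@sync. A sketch of a fairer comparison (the sizes and ranges below are made up, chosen so the sampled points stay inside the interpolation grid):

using BenchmarkTools  # CUDA, Tullio, Interpolations etc. are loaded above

img_cpu = rand(Float32, 200, 200)
θs_cpu  = range(0f0, Float32(π), 200)
zs_cpu  = range(-70f0, 70f0, 200)
img_gpu, θs_gpu, zs_gpu = CuArray(img_cpu), CuArray(θs_cpu), CuArray(zs_cpu)

radon_tullio(img_cpu, θs_cpu, zs_cpu)   # warm-up: triggers CPU compilation
radon_tullio(img_gpu, θs_gpu, zs_gpu)   # warm-up: triggers GPU compilation

@btime radon_tullio($img_cpu, $θs_cpu, $zs_cpu)
@btime CUDA.@sync radon_tullio($img_gpu, $θs_gpu, $zs_gpu)

At 200x200 with 200 angles the kernel is also fairly small, so launch overhead and the final maximum/normalisation may take a noticeable share of the GPU time; a larger problem size would probably make the comparison more informative.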