omlins / ParallelStencil.jl

Package for writing high-level code for parallel high-performance stencil computations that can be deployed on both GPUs and CPUs

GPUCompiler error when running 3D diffusion example on the GPU #64

Closed · muendlein closed this issue 2 years ago

muendlein commented 2 years ago

While trying out the provided example, I receive the following error (Julia 1.8):

ERROR: MethodError: no method matching return_types(::GPUArrays.var"#5#6", ::Type{Tuple{CUDA.CuKernelContext, CUDA.CuDeviceArray{Float64, 3, 1}, Float64}}, ::GPUCompiler.GPUInterpreter)
Closest candidates are:
  return_types(::Any, ::Any; world, interp) at reflection.jl:1294
  return_types(::Any) at reflection.jl:1294
luraess commented 2 years ago

Thanks for reporting @muendlein! Could you be more specific about the package versions you used, the exact code example you ran, etc.? That would make it easier to locate the issue.

muendlein commented 2 years ago

Running the following code triggers the error at @zeros(nx, ny, nz):

const USE_GPU = true
using ParallelStencil
using ParallelStencil.FiniteDifferences3D
@static if USE_GPU
    @init_parallel_stencil(CUDA, Float64, 3);
else
    @init_parallel_stencil(Threads, Float64, 3);
end

@parallel function diffusion3D_step!(T2, T, Ci, lam, dt, dx, dy, dz)
    @inn(T2) = @inn(T) + dt*(lam*@inn(Ci)*(@d2_xi(T)/dx^2 + @d2_yi(T)/dy^2 + @d2_zi(T)/dz^2));
    return
end

function diffusion3D()
# Physics
lam        = 1.0;                                        # Thermal conductivity
cp_min     = 1.0;                                        # Minimal heat capacity
lx, ly, lz = 10.0, 10.0, 10.0;                           # Length of domain in dimensions x, y and z.

# Numerics
nx, ny, nz = 256, 256, 256;                              # Number of gridpoints in dimensions x, y and z
nt         = 100;                                        # Number of time steps
dx         = lx/(nx-1);                                  # Space step in x-dimension
dy         = ly/(ny-1);                                  # Space step in y-dimension
dz         = lz/(nz-1);                                  # Space step in z-dimension

# Array initializations
T   = @zeros(nx, ny, nz);
T2  = @zeros(nx, ny, nz);
Ci  = @zeros(nx, ny, nz);

# Initial conditions (heat capacity and temperature with two Gaussian anomalies each)
Ci .= 1.0./( cp_min .+ Data.Array([5*exp(-(((ix-1)*dx-lx/1.5))^2-(((iy-1)*dy-ly/2))^2-(((iz-1)*dz-lz/1.5))^2) +
                                   5*exp(-(((ix-1)*dx-lx/3.0))^2-(((iy-1)*dy-ly/2))^2-(((iz-1)*dz-lz/1.5))^2) for ix=1:size(T,1), iy=1:size(T,2), iz=1:size(T,3)]) )
T  .= Data.Array([100*exp(-(((ix-1)*dx-lx/2)/2)^2-(((iy-1)*dy-ly/2)/2)^2-(((iz-1)*dz-lz/3.0)/2)^2) +
                   50*exp(-(((ix-1)*dx-lx/2)/2)^2-(((iy-1)*dy-ly/2)/2)^2-(((iz-1)*dz-lz/1.5)/2)^2) for ix=1:size(T,1), iy=1:size(T,2), iz=1:size(T,3)])
T2 .= T;                                                 # Initialize T2 as well to get correct boundary conditions.

# Time loop
dt = min(dx^2,dy^2,dz^2)*cp_min/lam/8.1;                 # Time step for 3D heat diffusion (see the note below the snippet)
for it = 1:nt
    @parallel diffusion3D_step!(T2, T, Ci, lam, dt, dx, dy, dz);
    T, T2 = T2, T;
end

end

diffusion3D()
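
Aside, for anyone adapting this snippet: the dt line keeps the time step below the explicit stability limit for d-dimensional diffusion with maximal diffusivity D = lam/cp_min, the standard von Neumann bound

    \Delta t \;\le\; \frac{\min(\Delta x, \Delta y, \Delta z)^2}{2\,d\,D}, \qquad D = \frac{\lambda}{c_{p,\min}}, \quad d = 3

With d = 3 this gives dt <= min(dx,dy,dz)^2*cp_min/(6*lam); dividing by 8.1 instead of 6 presumably leaves a safety margin.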

Full stack trace:

Stacktrace:
  [1] check_method(job::GPUCompiler.CompilerJob)
    @ GPUCompiler \.julia\packages\GPUCompiler\I9fZc\src\validation.jl:19 
  [2] macro expansion
    @ \.julia\packages\TimerOutputs\4yHI4\src\TimerOutput.jl:253 [inlined]
  [3] macro expansion
    @ \.julia\packages\GPUCompiler\I9fZc\src\driver.jl:89 [inlined]       
  [4] emit_julia(job::GPUCompiler.CompilerJob)
    @ GPUCompiler \.julia\packages\GPUCompiler\I9fZc\src\utils.jl:64      
  [5] cufunction_compile(job::GPUCompiler.CompilerJob)
    @ CUDA \.julia\packages\CUDA\5jdFl\src\compiler\execution.jl:324
  [6] cached_compilation(cache::Dict{UInt64, Any}, job::GPUCompiler.CompilerJob, compiler::typeof(CUDA.cufunction_compile), linker::typeof(CUDA.cufunction_link))
    @ GPUCompiler \.julia\packages\GPUCompiler\I9fZc\src\cache.jl:90
  [7] cufunction(f::GPUArrays.var"#5#6", tt::Type{Tuple{CUDA.CuKernelContext, CUDA.CuDeviceArray{Float64, 3, 1}, Float64}}; name::Nothing, kwargs::Base.Pairs{Symbol, Union{}, Tuple{}, NamedTuple{(), Tuple{}}})
    @ CUDA \.julia\packages\CUDA\5jdFl\src\compiler\execution.jl:297
  [8] cufunction
    @ \.julia\packages\CUDA\5jdFl\src\compiler\execution.jl:290 [inlined]
  [9] macro expansion
    @ \.julia\packages\CUDA\5jdFl\src\compiler\execution.jl:102 [inlined]
 [10] #launch_heuristic#282
    @ \.julia\packages\CUDA\5jdFl\src\gpuarrays.jl:17 [inlined]
 [11] gpu_call(::GPUArrays.var"#5#6", ::CUDA.CuArray{Float64, 3, CUDA.Mem.DeviceBuffer}, ::Float64; target::CUDA.CuArray{Float64, 3, CUDA.Mem.DeviceBuffer}, elements::Nothing, threads::Nothing, blocks::Nothing, name::Nothing)
    @ GPUArrays \.julia\packages\GPUArrays\fqD8z\src\device\execution.jl:61
 [12] gpu_call
    @ \.julia\packages\GPUArrays\fqD8z\src\device\execution.jl:34 [inlined]
 [13] fill!(A::CUDA.CuArray{Float64, 3, CUDA.Mem.DeviceBuffer}, x::Float64)
    @ GPUArrays \.julia\packages\GPUArrays\fqD8z\src\host\construction.jl:14
 [14] zeros(::Type, ::Int64, ::Vararg{Int64})
    @ CUDA \.julia\packages\CUDA\5jdFl\src\array.jl:551
 [15] diffusion3D()
    @ Main \Desktop\julia\stencil_test_2.jl:29
 [16] top-level scope
    @ \Desktop\julia\stencil_test_2.jl:49
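
Note that frames [13]-[14] of the trace show the error being raised inside fill! called from CUDA.zeros, i.e. during plain array allocation, before any ParallelStencil kernel is involved. If that is right, a minimal two-liner should hit the same failing code path without ParallelStencil at all (a sketch, untested here):

using CUDA
A = CUDA.zeros(Float64, 256, 256, 256)   # zeros allocates the array, then fill!(A, 0.0) launches the failing GPU kernel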

Installed packages:

⌅ [052768ef] CUDA v3.8.5
  [4d7a3746] ImplicitGlobalGrid v0.12.0
  [da04e1cc] MPI v0.19.2
  [94395366] ParallelStencil v0.6.0
  [91a5bcdd] Plots v1.31.7

CUDA Version: "11.6.0"

Please let me know if additional information is required.

muendlein commented 2 years ago

Using the following package versions does work without an error:

  [052768ef] CUDA v3.12.0
⌃ [4d7a3746] ImplicitGlobalGrid v0.11.0
⌅ [da04e1cc] MPI v0.18.2
  [94395366] ParallelStencil v0.6.0
  [91a5bcdd] Plots v1.31.7

luraess commented 2 years ago

I was going to suggest trying to update the packages. Glad it worked!
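
For reference, a minimal sketch of that update path, assuming a standard project environment (the exact versions Pkg resolves depend on the registry state and compat bounds):

using Pkg
Pkg.update()                              # bring all packages to their latest compatible versions
# or pin CUDA to the version reported above as working:
Pkg.add(name="CUDA", version="3.12.0")
Pkg.status()                              # confirm the resolved versions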