WaterLily-jl / ParametricBodies.jl

MIT License
8 stars 6 forks source link

mapping between 3D-2D breaks on the GPU #16

Open marinlauber opened 1 week ago

marinlauber commented 1 week ago

Using the mapping to extrude/revolve a 2D curve in 3D space via the mapping function breaks on the GPU.

See the script below with an example that runs on the CPU but fails on the GPU


using WaterLily
using StaticArrays
using ParametricBodies
using CUDA

function make_sim(L=2^5;Re=250,mem=Array,U=1,T=Float32)
    # Define simulation size, geometry dimensions, viscosity
    center,r = SA_F32[L,L,L], L÷2

    # NURBS points, weights and knot vector for a circle
    cps = SA_F32[1 1 0 -1 -1 -1  0  1 1
                 0 1 1  1  0 -1 -1 -1 0]*r
    weights = SA_F32[1.,√2/2,1.,√2/2,1.,√2/2,1.,√2/2,1.]
    knots =   SA_F32[0,0,0,1/4,1/4,1/2,1/2,3/4,3/4,1,1,1]
    curve = NurbsCurve(cps,knots,weights)

    function map(x::SVector{3,X},t::T) where {X,T} # Map the 3D space into a 2D curve 
        y = x - center
        return SA{promote_type(X,T)}[y[1],√(y[2]^2+y[3]^2)]
    end
    body = ParametricBody(curve;map,scale=1f0)
    Simulation((3L,2L,2L),(U,0,0),L;ν=U*L/Re,body,mem,T)
end

# make a simulation on the CPU
sim = make_sim(2^5;mem=Array); # works
sim = make_sim(2^5;mem=CuArray); # fails
marinlauber commented 1 week ago

The issue is linked to the measure function.

function measure(body::AbstractParametricBody,x,t;fastd²=Inf)
    # curve props and velocity in ξ-frame
    d,n,dotS = curve_props(body,x,t;fastd²)
    d^2 > fastd² && return d,zero(x),zero(x)
1   dξdt = dotS-ForwardDiff.derivative(t->body.map(x,t),t)

    # Convert to x-frame with dξ/dx⁻¹ (d has already been scaled)
2   dξdx = ForwardDiff.jacobian(x->body.map(x,t),x)
3   return (d,dξdx\n/body.scale,dξdx\dξdt)
end

First, ForwardDiff breaks on line 1 because the input is SVector{3} but the output is SVector{2}. This throws a _throw_size_mismatch(as...) @ StaticArrays error from within KernelAbstraction (in KernelAbstractions.NDIteration.DynamicCheck it seems), see below. Interestingly, line 2 does not throw an error.

Line 3 also breaks (if line 1 is commented and line 3 is return (d,dξdx\n/body.scale,zero(x))) this time with a "call through a literal pointer" error... Explicitly enforcing the size of n as n = SVector{length(x)}(dξdx\n/body.scale) results in the same error. This seems to be linked to the ldiv (\).

If line 3 is changed to return (d,zero(x),zero(x)) the code runs as expected.

The full _throw_size_mismatch error

1-element ExceptionStack:
LoadError: InvalidIRError: compiling MethodInstance for (::WaterLily.var"#gpu_##kern_#552#227"{WaterLily.var"#fill!#224"{3, Float32, Float32, Int64, ParametricBody{Float32, NurbsLocator{NurbsCurve{2, 2, SMatrix{2, 9, Float32, 18}, SVector{12, Float32}, SVector{9, Float32}}, SVector{2, Float32}}, NurbsCurve{2, 2, SMatrix{2, 9, Float32, 18}, SVector{12, Float32}, SVector{9, Float32}}, ParametricBodies.var"#7#9"{NurbsCurve{2, 2, SMatrix{2, 9, Float32, 18}, SVector{12, Float32}, SVector{9, Float32}}}, var"#map#12"{SVector{3, Float32}}}, Int64}})(::KernelAbstractions.CompilerMetadata{KernelAbstractions.NDIteration.DynamicSize, KernelAbstractions.NDIteration.DynamicCheck, Nothing, CartesianIndices{3, Tuple{Base.OneTo{Int64}, Base.OneTo{Int64}, Base.OneTo{Int64}}}, KernelAbstractions.NDIteration.NDRange{3, KernelAbstractions.NDIteration.DynamicSize, KernelAbstractions.NDIteration.StaticSize{(64, 1, 1)}, CartesianIndices{3, Tuple{Base.OneTo{Int64}, Base.OneTo{Int64}, Base.OneTo{Int64}}}, Nothing}}, ::CuDeviceArray{Float32, 4, 1}, ::CuDeviceArray{Float32, 5, 1}, ::CuDeviceArray{Float32, 4, 1}, ::CuDeviceArray{Float32, 3, 1}, ::CartesianIndex{3}) resulted in invalid LLVM IR
Reason: unsupported dynamic function invocation (call to _throw_size_mismatch(as...) @ StaticArrays ~/.julia/packages/StaticArrays/MSJcA/src/traits.jl:116)
Stacktrace:
  [1] same_size
    @ ~/.julia/packages/StaticArrays/MSJcA/src/traits.jl:111
  [2] macro expansion
    @ ~/.julia/packages/StaticArrays/MSJcA/src/mapreduce.jl:76
  [3] _map
    @ ~/.julia/packages/StaticArrays/MSJcA/src/mapreduce.jl:42
  [4] map
    @ ~/.julia/packages/StaticArrays/MSJcA/src/mapreduce.jl:39
  [5] -
    @ ~/.julia/packages/StaticArrays/MSJcA/src/linalg.jl:16
  [6] #measure#1
    @ ~/Workspace/ParametricBodies.jl/src/ParametricBodies.jl:17
  [7] measure
    @ ~/Workspace/ParametricBodies.jl/src/ParametricBodies.jl:13
  [8] fill!
    @ ~/Workspace/WaterLily/src/Body.jl:37
  [9] macro expansion
    @ ~/Workspace/WaterLily/src/util.jl:111
 [10] gpu_
    @ ~/.julia/packages/KernelAbstractions/60cqT/src/macros.jl:95
 [11] gpu_
    @ ./none:0
Hint: catch this exception as `err` and call `code_typed(err; interactive = true)` to introspect the erronous code with Cthulhu.jl
Stacktrace:
  [1] check_ir(job::GPUCompiler.CompilerJob{GPUCompiler.PTXCompilerTarget, CUDA.CUDACompilerParams}, args::LLVM.Module)
    @ GPUCompiler ~/.julia/packages/GPUCompiler/Y4hSX/src/validation.jl:147
  [2] macro expansion
    @ ~/.julia/packages/GPUCompiler/Y4hSX/src/driver.jl:458 [inlined]
  [3] macro expansion
    @ ~/.julia/packages/TimerOutputs/Lw5SP/src/TimerOutput.jl:253 [inlined]
  [4] macro expansion
    @ ~/.julia/packages/GPUCompiler/Y4hSX/src/driver.jl:457 [inlined]
  [5] emit_llvm(job::GPUCompiler.CompilerJob; libraries::Bool, toplevel::Bool, optimize::Bool, cleanup::Bool, only_entry::Bool, validate::Bool)
    @ GPUCompiler ~/.julia/packages/GPUCompiler/Y4hSX/src/utils.jl:103
  [6] emit_llvm
    @ ~/.julia/packages/GPUCompiler/Y4hSX/src/utils.jl:97 [inlined]
  [7] codegen(output::Symbol, job::GPUCompiler.CompilerJob; libraries::Bool, toplevel::Bool, optimize::Bool, cleanup::Bool, strip::Bool, validate::Bool, only_entry::Bool, parent_job::Nothing)
    @ GPUCompiler ~/.julia/packages/GPUCompiler/Y4hSX/src/driver.jl:136
  [8] codegen
    @ ~/.julia/packages/GPUCompiler/Y4hSX/src/driver.jl:115 [inlined]
  [9] compile(target::Symbol, job::GPUCompiler.CompilerJob; libraries::Bool, toplevel::Bool, optimize::Bool, cleanup::Bool, strip::Bool, validate::Bool, only_entry::Bool)
    @ GPUCompiler ~/.julia/packages/GPUCompiler/Y4hSX/src/driver.jl:111
 [10] compile
    @ ~/.julia/packages/GPUCompiler/Y4hSX/src/driver.jl:103 [inlined]
 [11] #1145
    @ ~/.julia/packages/CUDA/Tl08O/src/compiler/compilation.jl:254 [inlined]
 [12] JuliaContext(f::CUDA.var"#1145#1148"{GPUCompiler.CompilerJob{GPUCompiler.PTXCompilerTarget, CUDA.CUDACompilerParams}}; kwargs::@Kwargs{})
    @ GPUCompiler ~/.julia/packages/GPUCompiler/Y4hSX/src/driver.jl:52
 [13] JuliaContext(f::Function)
    @ GPUCompiler ~/.julia/packages/GPUCompiler/Y4hSX/src/driver.jl:42
 [14] compile(job::GPUCompiler.CompilerJob)
    @ CUDA ~/.julia/packages/CUDA/Tl08O/src/compiler/compilation.jl:253
 [15] actual_compilation(cache::Dict{Any, CuFunction}, src::Core.MethodInstance, world::UInt64, cfg::GPUCompiler.CompilerConfig{GPUCompiler.PTXCompilerTarget, CUDA.CUDACompilerParams}, compiler::typeof(CUDA.compile), linker::typeof(CUDA.link))
    @ GPUCompiler ~/.julia/packages/GPUCompiler/Y4hSX/src/execution.jl:237
 [16] cached_compilation(cache::Dict{Any, CuFunction}, src::Core.MethodInstance, cfg::GPUCompiler.CompilerConfig{GPUCompiler.PTXCompilerTarget, CUDA.CUDACompilerParams}, compiler::Function, linker::Function)
    @ GPUCompiler ~/.julia/packages/GPUCompiler/Y4hSX/src/execution.jl:151
 [17] macro expansion
    @ ~/.julia/packages/CUDA/Tl08O/src/compiler/execution.jl:369 [inlined]
 [18] macro expansion
    @ ./lock.jl:267 [inlined]
 [19] cufunction(f::WaterLily.var"#gpu_##kern_#552#227"{WaterLily.var"#fill!#224"{3, Float32, Float32, Int64, ParametricBody{Float32, NurbsLocator{NurbsCurve{2, 2, SMatrix{2, 9, Float32, 18}, SVector{12, Float32}, SVector{9, Float32}}, SVector{2, Float32}}, NurbsCurve{2, 2, SMatrix{2, 9, Float32, 18}, SVector{12, Float32}, SVector{9, Float32}}, ParametricBodies.var"#7#9"{NurbsCurve{2, 2, SMatrix{2, 9, Float32, 18}, SVector{12, Float32}, SVector{9, Float32}}}, var"#map#12"{SVector{3, Float32}}}, Int64}}, tt::Type{Tuple{KernelAbstractions.CompilerMetadata{KernelAbstractions.NDIteration.DynamicSize, KernelAbstractions.NDIteration.DynamicCheck, Nothing, CartesianIndices{3, Tuple{Base.OneTo{Int64}, Base.OneTo{Int64}, Base.OneTo{Int64}}}, KernelAbstractions.NDIteration.NDRange{3, KernelAbstractions.NDIteration.DynamicSize, KernelAbstractions.NDIteration.StaticSize{(64, 1, 1)}, CartesianIndices{3, Tuple{Base.OneTo{Int64}, Base.OneTo{Int64}, Base.OneTo{Int64}}}, Nothing}}, CuDeviceArray{Float32, 4, 1}, CuDeviceArray{Float32, 5, 1}, CuDeviceArray{Float32, 4, 1}, CuDeviceArray{Float32, 3, 1}, CartesianIndex{3}}}; kwargs::@Kwargs{always_inline::Bool, maxthreads::Int64})
    @ CUDA ~/.julia/packages/CUDA/Tl08O/src/compiler/execution.jl:364
 [20] macro expansion
    @ ~/.julia/packages/CUDA/Tl08O/src/compiler/execution.jl:112 [inlined]
 [21] (::KernelAbstractions.Kernel{CUDABackend, KernelAbstractions.NDIteration.StaticSize{(64,)}, KernelAbstractions.NDIteration.DynamicSize, WaterLily.var"#gpu_##kern_#552#227"{WaterLily.var"#fill!#224"{3, Float32, Float32, Int64, ParametricBody{Float32, NurbsLocator{NurbsCurve{2, 2, SMatrix{2, 9, Float32, 18}, SVector{12, Float32}, SVector{9, Float32}}, SVector{2, Float32}}, NurbsCurve{2, 2, SMatrix{2, 9, Float32, 18}, SVector{12, Float32}, SVector{9, Float32}}, ParametricBodies.var"#7#9"{NurbsCurve{2, 2, SMatrix{2, 9, Float32, 18}, SVector{12, Float32}, SVector{9, Float32}}}, var"#map#12"{SVector{3, Float32}}}, Int64}}})(::CuArray{Float32, 4, CUDA.DeviceMemory}, ::Vararg{Any}; ndrange::Tuple{Int64, Int64, Int64}, workgroupsize::Nothing)
    @ CUDA.CUDAKernels ~/.julia/packages/CUDA/Tl08O/src/CUDAKernels.jl:103
 [22] (::WaterLily.var"##kern#551#225"{WaterLily.var"#fill!#224"{3, Float32, Float32, Int64, ParametricBody{Float32, NurbsLocator{NurbsCurve{2, 2, SMatrix{2, 9, Float32, 18}, SVector{12, Float32}, SVector{9, Float32}}, SVector{2, Float32}}, NurbsCurve{2, 2, SMatrix{2, 9, Float32, 18}, SVector{12, Float32}, SVector{9, Float32}}, ParametricBodies.var"#7#9"{NurbsCurve{2, 2, SMatrix{2, 9, Float32, 18}, SVector{12, Float32}, SVector{9, Float32}}}, var"#map#12"{SVector{3, Float32}}}, Int64}, Flow{3, Float32, CuArray{Float32, 3, CUDA.DeviceMemory}, CuArray{Float32, 4, CUDA.DeviceMemory}, CuArray{Float32, 5, CUDA.DeviceMemory}}})(μ₀::CuArray{Float32, 4, CUDA.DeviceMemory}, μ₁::CuArray{Float32, 5, CUDA.DeviceMemory}, V::CuArray{Float32, 4, CUDA.DeviceMemory}, σ::CuArray{Float32, 3, CUDA.DeviceMemory}, ::Val{16})
    @ WaterLily ~/Workspace/WaterLily/src/util.jl:114
 [23] macro expansion
    @ ~/Workspace/WaterLily/src/util.jl:116 [inlined]
 [24] measure!(a::Flow{3, Float32, CuArray{Float32, 3, CUDA.DeviceMemory}, CuArray{Float32, 4, CUDA.DeviceMemory}, CuArray{Float32, 5, CUDA.DeviceMemory}}, body::ParametricBody{Float32, NurbsLocator{NurbsCurve{2, 2, SMatrix{2, 9, Float32, 18}, SVector{12, Float32}, SVector{9, Float32}}, SVector{2, Float32}}, NurbsCurve{2, 2, SMatrix{2, 9, Float32, 18}, SVector{12, Float32}, SVector{9, Float32}}, ParametricBodies.var"#7#9"{NurbsCurve{2, 2, SMatrix{2, 9, Float32, 18}, SVector{12, Float32}, SVector{9, Float32}}}, var"#map#12"{SVector{3, Float32}}}; t::Float32, ϵ::Int64)
    @ WaterLily ~/Workspace/WaterLily/src/Body.jl:50
 [25] Simulation(dims::Tuple{Int64, Int64, Int64}, u_BC::Tuple{Int64, Int64, Int64}, L::Int64; Δt::Float64, ν::Float64, g::Nothing, U::Nothing, ϵ::Int64, perdir::Tuple{}, uλ::Nothing, exitBC::Bool, body::ParametricBody{Float32, NurbsLocator{NurbsCurve{2, 2, SMatrix{2, 9, Float32, 18}, SVector{12, Float32}, SVector{9, Float32}}, SVector{2, Float32}}, NurbsCurve{2, 2, SMatrix{2, 9, Float32, 18}, SVector{12, Float32}, SVector{9, Float32}}, ParametricBodies.var"#7#9"{NurbsCurve{2, 2, SMatrix{2, 9, Float32, 18}, SVector{12, Float32}, SVector{9, Float32}}}, var"#map#12"{SVector{3, Float32}}}, T::Type, mem::Type)
    @ WaterLily ~/Workspace/WaterLily/src/WaterLily.jl:75
 [26] make_sim(L::Int64; Re::Int64, mem::Type, U::Int64, T::Type)
    @ Main ~/Workspace/ParametricBodies.jl/example/ThreeD_Sphere.jl:23
 [27] top-level scope
    @ ~/Workspace/ParametricBodies.jl/example/ThreeD_Sphere.jl:1
 [28] eval
    @ ./boot.jl:385 [inlined]
 [29] include_string(mapexpr::typeof(REPL.softscope), mod::Module, code::String, filename::String)
    @ Base ./loading.jl:2070
 [30] invokelatest(::Any, ::Any, ::Vararg{Any}; kwargs::@Kwargs{})
    @ Base ./essentials.jl:887
 [31] invokelatest(::Any, ::Any, ::Vararg{Any})
    @ Base ./essentials.jl:884
 [32] inlineeval(m::Module, code::String, code_line::Int64, code_column::Int64, file::String; softscope::Bool)
    @ VSCodeServer ~/.vscode/extensions/julialang.language-julia-1.124.2/scripts/packages/VSCodeServer/src/eval.jl:271
 [33] (::VSCodeServer.var"#69#74"{Bool, Bool, Bool, Module, String, Int64, Int64, String, VSCodeServer.ReplRunCodeRequestParams})()
    @ VSCodeServer ~/.vscode/extensions/julialang.language-julia-1.124.2/scripts/packages/VSCodeServer/src/eval.jl:181
 [34] withpath(f::VSCodeServer.var"#69#74"{Bool, Bool, Bool, Module, String, Int64, Int64, String, VSCodeServer.ReplRunCodeRequestParams}, path::String)
    @ VSCodeServer ~/.vscode/extensions/julialang.language-julia-1.124.2/scripts/packages/VSCodeServer/src/repl.jl:276
 [35] (::VSCodeServer.var"#68#73"{Bool, Bool, Bool, Module, String, Int64, Int64, String, VSCodeServer.ReplRunCodeRequestParams})()
    @ VSCodeServer ~/.vscode/extensions/julialang.language-julia-1.124.2/scripts/packages/VSCodeServer/src/eval.jl:179
 [36] hideprompt(f::VSCodeServer.var"#68#73"{Bool, Bool, Bool, Module, String, Int64, Int64, String, VSCodeServer.ReplRunCodeRequestParams})
    @ VSCodeServer ~/.vscode/extensions/julialang.language-julia-1.124.2/scripts/packages/VSCodeServer/src/repl.jl:38
 [37] (::VSCodeServer.var"#67#72"{Bool, Bool, Bool, Module, String, Int64, Int64, String, VSCodeServer.ReplRunCodeRequestParams})()
    @ VSCodeServer ~/.vscode/extensions/julialang.language-julia-1.124.2/scripts/packages/VSCodeServer/src/eval.jl:150
 [38] with_logstate(f::Function, logstate::Any)
    @ Base.CoreLogging ./logging.jl:515
 [39] with_logger
    @ ./logging.jl:627 [inlined]
 [40] (::VSCodeServer.var"#66#71"{VSCodeServer.ReplRunCodeRequestParams})()
    @ VSCodeServer ~/.vscode/extensions/julialang.language-julia-1.124.2/scripts/packages/VSCodeServer/src/eval.jl:263
 [41] #invokelatest#2
    @ Base ./essentials.jl:887 [inlined]
 [42] invokelatest(::Any)
    @ Base ./essentials.jl:884
 [43] (::VSCodeServer.var"#64#65")()
    @ VSCodeServer ~/.vscode/extensions/julialang.language-julia-1.124.2/scripts/packages/VSCodeServer/src/eval.jl:34
in expression starting at /home/marin/Workspace/ParametricBodies.jl/example/ThreeD_Sphere.jl:1