Open Sbozzolo opened 5 days ago
@Sbozzolo and I chatted offline about this, here's a recap:
In this case, the threads/blocks are being passed to `auto_launch!`, and `auto = false`, so it's an issue with https://github.com/CliMA/ClimaCore.jl/blob/ad457270882fddc58fc76c72e84b432e49c130ec/ext/cuda/data_layouts_copyto.jl#L52-L67. We could try out the new/generic version I implemented recently (https://github.com/CliMA/ClimaCore.jl/blob/ad457270882fddc58fc76c72e84b432e49c130ec/ext/cuda/data_layouts_copyto.jl#L113); I think that would work.
Some of the new kernels were merged because we simply didn't have support, and the remaining ones didn't replace the old ones because they were slightly slower than the old ones.
If this error is only happening for the single-column case, then maybe we can just replace the kernel for that single-column case. I see no issue with this.
launch(::CUDA.CuFunction, ::CUDA.KernelState, ::ClimaCore.DataLayouts.VIJFH
Ah, this is actually a `VIJFH` datalayout! Perhaps unfortunately, ClimaAtmos's column model is actually still an extruded FD space: https://github.com/CliMA/ClimaAtmos.jl/blob/77d7f0beeb300f1d4c2147158c66c6fe69ba8c84/src/solver/type_getters.jl#L203-L225. I wonder if that should be updated. IIRC, there is a CPU performance implication of having a horizontal space for purely single columns (and this is especially likely to be true on the GPU).
Using the patch in #1859 does get us past the posted error, but then we fail on dispatch (which is an easy fix):
ERROR: Scalar indexing is disallowed.
Invocation of getindex resulted in scalar indexing of a GPU array.
This is typically caused by calling an iterating implementation of a method.
Such implementations *do not* execute on the GPU, but very slowly on the CPU,
and therefore should be avoided.
If you want to allow scalar iteration, use `allowscalar` or `@allowscalar`
to enable scalar iteration globally or for the operations in question.
Stacktrace:
[1] error(s::String)
@ Base ./error.jl:35
[2] errorscalar(op::String)
@ GPUArraysCore ~/.julia/packages/GPUArraysCore/GMsgk/src/GPUArraysCore.jl:155
[3] _assertscalar(op::String, behavior::GPUArraysCore.ScalarIndexing)
@ GPUArraysCore ~/.julia/packages/GPUArraysCore/GMsgk/src/GPUArraysCore.jl:128
[4] assertscalar(op::String)
@ GPUArraysCore ~/.julia/packages/GPUArraysCore/GMsgk/src/GPUArraysCore.jl:116
[5] getindex(A::CUDA.CuArray{Float32, 5, CUDA.Mem.DeviceBuffer}, I::Int64)
@ GPUArrays ~/.julia/packages/GPUArrays/OqrUV/src/host/indexing.jl:48
[6] scalar_getindex(::CUDA.CuArray{Float32, 5, CUDA.Mem.DeviceBuffer}, ::Int64, ::Vararg{Int64})
@ GPUArrays ~/.julia/packages/GPUArrays/OqrUV/src/host/indexing.jl:34
[7] _getindex
@ ~/.julia/packages/GPUArrays/OqrUV/src/host/indexing.jl:17 [inlined]
[8] getindex
@ ~/.julia/packages/GPUArrays/OqrUV/src/host/indexing.jl:15 [inlined]
[9] getindex
@ ./subarray.jl:290 [inlined]
[10] _getindex
@ ./abstractarray.jl:1341 [inlined]
[11] getindex
@ ./abstractarray.jl:1291 [inlined]
[12] get_struct
@ ~/CliMA/ClimaCore.jl/src/DataLayouts/struct.jl:218 [inlined]
[13] macro expansion
@ ~/CliMA/ClimaCore.jl/src/DataLayouts/struct.jl:200 [inlined]
[14] macro expansion
@ ./none:0 [inlined]
[15] get_struct
@ ./none:0 [inlined]
[16] getindex
@ ~/CliMA/ClimaCore.jl/src/DataLayouts/DataLayouts.jl:1026 [inlined]
[17] getindex
@ ~/CliMA/ClimaCore.jl/src/DataLayouts/DataLayouts.jl:1033 [inlined]
[18] _broadcast_getindex
@ ./broadcast.jl:662 [inlined]
[19] _getindex
@ ./broadcast.jl:706 [inlined]
[20] _getindex
@ ./broadcast.jl:705 [inlined]
[21] _broadcast_getindex
@ ./broadcast.jl:681 [inlined]
[22] getindex
@ ./broadcast.jl:636 [inlined]
[23] copyto!(dest::ClimaCore.DataLayouts.VF{…}, bc::Base.Broadcast.Broadcasted{…})
@ ClimaCore.DataLayouts ~/CliMA/ClimaCore.jl/src/DataLayouts/broadcast.jl:576
[24] copyto!(dest::ClimaCore.DataLayouts.VIJFH{…}, bc::Base.Broadcast.Broadcasted{…})
@ ClimaCore.DataLayouts ~/CliMA/ClimaCore.jl/src/DataLayouts/broadcast.jl:604
[25] copyto!
@ ~/CliMA/ClimaCore.jl/src/Fields/broadcast.jl:149 [inlined]
[26] materialize!
@ ./broadcast.jl:914 [inlined]
[27] materialize!
@ ./broadcast.jl:911 [inlined]
[28] macro expansion
@ ~/CliMA/ClimaAtmos.jl/src/callbacks/callbacks.jl:81 [inlined]
[29] rrtmgp_model_callback!(integrator::ClimaTimeSteppers.DistributedODEIntegrator{…})
@ ClimaAtmos ~/.julia/packages/NVTX/pfSOQ/src/macro.jl:194
[30] AtmosCallback
@ ~/CliMA/ClimaAtmos.jl/src/solver/types.jl:455 [inlined]
[31] #239
@ ~/CliMA/ClimaAtmos.jl/src/callbacks/callback_helpers.jl:49 [inlined]
[32] initialize!(u::ClimaCore.Fields.FieldVector{…}, t::Float32, integrator::ClimaTimeSteppers.DistributedODEIntegrator{…}, any_modified::Bool, c::SciMLBase.DiscreteCallback{…}, cs::SciMLBase.DiscreteCallback{…})
@ DiffEqBase ~/.julia/packages/DiffEqBase/yM6LF/src/callbacks.jl:13
[33] initialize!(::ClimaCore.Fields.FieldVector{…}, ::Float32, ::ClimaTimeSteppers.DistributedODEIntegrator{…}, ::Bool, ::SciMLBase.DiscreteCallback{…}, ::SciMLBase.DiscreteCallback{…}, ::Vararg{…})
@ DiffEqBase ~/.julia/packages/DiffEqBase/yM6LF/src/callbacks.jl:14
[34] initialize!(::ClimaCore.Fields.FieldVector{…}, ::Float32, ::ClimaTimeSteppers.DistributedODEIntegrator{…}, ::Bool, ::SciMLBase.DiscreteCallback{…}, ::SciMLBase.DiscreteCallback{…}, ::Vararg{…})
@ DiffEqBase ~/.julia/packages/DiffEqBase/yM6LF/src/callbacks.jl:14
[35] initialize!(::ClimaCore.Fields.FieldVector{…}, ::Float32, ::ClimaTimeSteppers.DistributedODEIntegrator{…}, ::Bool, ::SciMLBase.DiscreteCallback{…}, ::SciMLBase.DiscreteCallback{…}, ::Vararg{…})
@ DiffEqBase ~/.julia/packages/DiffEqBase/yM6LF/src/callbacks.jl:14
[36] initialize! (repeats 2 times)
@ ~/.julia/packages/DiffEqBase/yM6LF/src/callbacks.jl:14 [inlined]
[37] initialize!(cb::SciMLBase.CallbackSet{…}, u::ClimaCore.Fields.FieldVector{…}, t::Float32, integrator::ClimaTimeSteppers.DistributedODEIntegrator{…})
@ DiffEqBase ~/.julia/packages/DiffEqBase/yM6LF/src/callbacks.jl:7
[38] __init(::SciMLBase.ODEProblem{…}, ::ClimaTimeSteppers.IMEXAlgorithm{…}; dt::Float32, tstops::Tuple{}, saveat::Vector{…}, save_everystep::Bool, callback::SciMLBase.CallbackSet{…}, advance_to_tstop::Bool, save_func::ClimaTimeSteppers.var"#36#38", dtchangeable::Bool, stepstop::Int64, kwargs::@Kwargs{…})
@ ClimaTimeSteppers ~/.julia/packages/ClimaTimeSteppers/UtGqn/src/integrators.jl:124
[39] __init
@ ~/.julia/packages/ClimaTimeSteppers/UtGqn/src/integrators.jl:68 [inlined]
[40] #init_call#40
@ ~/.julia/packages/DiffEqBase/yM6LF/src/solve.jl:530 [inlined]
[41] init_call
@ ~/.julia/packages/DiffEqBase/yM6LF/src/solve.jl:503 [inlined]
[42] #init_up#43
@ ~/.julia/packages/DiffEqBase/yM6LF/src/solve.jl:564 [inlined]
[43] init_up
@ ~/.julia/packages/DiffEqBase/yM6LF/src/solve.jl:551 [inlined]
[44] #init#41
@ ~/.julia/packages/DiffEqBase/yM6LF/src/solve.jl:544 [inlined]
[45] macro expansion
@ ~/CliMA/ClimaAtmos.jl/src/solver/type_getters.jl:708 [inlined]
[46] macro expansion
@ ./timing.jl:503 [inlined]
[47] macro expansion
@ ~/CliMA/ClimaAtmos.jl/src/utils/utilities.jl:331 [inlined]
[48] get_simulation(config::ClimaAtmos.AtmosConfig{Float32, ClimaParams.ParamDict{…}, Dict{…}, ClimaComms.SingletonCommsContext{…}, Tuple{…}})
@ ClimaAtmos ~/CliMA/ClimaAtmos.jl/src/solver/type_getters.jl:707
[49] top-level scope
@ REPL[7]:1
[50] top-level scope
@ ~/.julia/packages/CUDA/XUdwt/src/initialization.jl:209
Some type information was truncated. Use `show(err)` to see complete types.
There seems to be an issue with thread allocations on GPUs.
Steps to reproduce, in ClimaAtmos, on `clima`:
Error:
Maybe related to `auto_launch!`?