CliMA / ClimaCore.jl

CliMA model dycore
https://clima.github.io/ClimaCore.jl/dev
Apache License 2.0
79 stars 7 forks source link

Higher resolution column cases cannot be run on GPU #1854

Open Sbozzolo opened 5 days ago

Sbozzolo commented 5 days ago

There seems to be an issue with thread allocations on GPUs.

Steps to reproduce, in ClimaAtmos, on clima

import ClimaAtmos as CA; CA.ClimaComms.@import_required_backends
simulation = CA.get_simulation(CA.AtmosConfig("config/model_configs/single_column_radiative_equilibrium_gray.yml"))

Error:

ERROR: Number of threads in z-dimension exceeds device limit (70 > 64).
Stacktrace:
  [1] error(s::String)
    @ Base ./error.jl:35
  [2] 
    @ CUDA ~/.julia/packages/CUDA/75aiI/lib/cudadrv/execution.jl:97
  [3] launch(::CUDA.CuFunction, ::CUDA.KernelState, ::ClimaCore.DataLayouts.VIJFH{…}, ::Base.Broadcast.Broadcasted{…}; blocks::Tuple{…}, threads::Tuple{…}, cooperative::Bool, shmem::Int64, stream::CUDA.CuStream)
    @ CUDA ~/.julia/packages/CUDA/75aiI/lib/cudadrv/execution.jl:73
  [4] launch
    @ ~/.julia/packages/CUDA/75aiI/lib/cudadrv/execution.jl:52 [inlined]
  [5] #972
    @ ~/.julia/packages/CUDA/75aiI/lib/cudadrv/execution.jl:189 [inlined]
  [6] macro expansion
    @ ~/.julia/packages/CUDA/75aiI/lib/cudadrv/execution.jl:149 [inlined]
  [7] macro expansion
    @ ./none:0 [inlined]
  [8] convert_arguments
    @ ./none:0 [inlined]
  [9] #cudacall#971
    @ ~/.julia/packages/CUDA/75aiI/lib/cudadrv/execution.jl:191 [inlined]
 [10] cudacall
    @ ~/.julia/packages/CUDA/75aiI/lib/cudadrv/execution.jl:187 [inlined]
 [11] macro expansion
    @ ~/.julia/packages/CUDA/75aiI/src/compiler/execution.jl:268 [inlined]
 [12] macro expansion
    @ ./none:0 [inlined]
 [13] call
    @ ./none:0 [inlined]
 [14] (::CUDA.HostKernel{…})(::ClimaCore.DataLayouts.VIJFH{…}, ::Base.Broadcast.Broadcasted{…}; threads::Tuple{…}, blocks::Tuple{…}, kwargs::@Kwargs{})
    @ CUDA ~/.julia/packages/CUDA/75aiI/src/compiler/execution.jl:390
 [15] kwcall(::NamedTuple, kernel::CUDA.HostKernel, args::Vararg{Any, N}) where N
    @ CUDA ~/.julia/packages/CUDA/75aiI/src/compiler/execution.jl:389 [inlined]
 [16] macro expansion
    @ ~/.julia/packages/CUDA/75aiI/src/compiler/execution.jl:114 [inlined]
 [17] auto_launch!(f!::typeof(ClimaCoreCUDAExt.knl_copyto!), args::Tuple{…}, data::ClimaCore.DataLayouts.VIJFH{…}; auto::Bool, threads_s::Tuple{…}, blocks_s::Tuple{…}, always_inline::Bool, caller::Symbol)
    @ ClimaCoreCUDAExt ~/.julia/packages/ClimaCore/ANgUC/ext/cuda/cuda_utils.jl:58
 [18] auto_launch!
    @ ~/.julia/packages/ClimaCore/ANgUC/ext/cuda/cuda_utils.jl:38 [inlined]
 [19] copyto!
    @ ~/.julia/packages/ClimaCore/ANgUC/ext/cuda/data_layouts.jl:71 [inlined]
 [20] copy
    @ ./broadcast.jl:928 [inlined]
 [21] materialize
    @ ./broadcast.jl:903 [inlined]
 [22] _ExtrudedFiniteDifferenceGrid(horizontal_grid::ClimaCore.Grids.SpectralElementGrid2D{…}, vertical_grid::ClimaCore.Grids.FiniteDifferenceGrid{…}, hypsography::ClimaCore.Grids.Flat, global_geometry::ClimaCore.Geometry.CartesianGlobalGeometry)
    @ ClimaCore.Grids ~/.julia/packages/ClimaCore/ANgUC/src/Grids/extruded.jl:102
 [23] #18
    @ ~/.julia/packages/ClimaCore/ANgUC/src/Grids/extruded.jl:86 [inlined]
 [24] get!(default::ClimaCore.Grids.var"#18#19"{…}, h::Dict{…}, key::Tuple{…})
    @ Base ./dict.jl:479
 [25] ExtrudedFiniteDifferenceGrid
    @ ClimaCore.Grids ~/.julia/packages/ClimaCore/ANgUC/src/Grids/extruded.jl:76 [inlined]
 [26] #ExtrudedFiniteDifferenceGrid#17
    @ ClimaCore.Grids ~/.julia/packages/ClimaCore/ANgUC/src/Grids/extruded.jl:61 [inlined]
 [27] make_hybrid_spaces(h_space::ClimaCore.Spaces.SpectralElementSpace2D{…}, z_max::Float32, z_elem::Int64, z_stretch::ClimaCore.Meshes.GeneralizedExponentialStretching{…}; surface_warp::Nothing, topo_smoothing::Bool, deep::Bool, parsed_args::Dict{…})
    @ ClimaAtmos ~/ClimaAtmos.jl/src/utils/common_spaces.jl:124
 [28] get_spaces(parsed_args::Dict{…}, params::ClimaAtmos.Parameters.ClimaAtmosParameters{…}, comms_ctx::ClimaComms.SingletonCommsContext{…})
    @ ClimaAtmos ~/ClimaAtmos.jl/src/solver/type_getters.jl:224
 [29] get_simulation(config::ClimaAtmos.AtmosConfig{…})
    @ ClimaAtmos ~/ClimaAtmos.jl/src/solver/type_getters.jl:628
 [30] top-level scope
    @ REPL[6]:1

caused by: CUDA error: invalid argument (code 1, ERROR_INVALID_VALUE)
Stacktrace:
  [1] throw_api_error(res::CUDA.cudaError_enum)
    @ CUDA ~/.julia/packages/CUDA/75aiI/lib/cudadrv/libcuda.jl:30
  [2] check
    @ ~/.julia/packages/CUDA/75aiI/lib/cudadrv/libcuda.jl:37 [inlined]
  [3] cuLaunchKernel
    @ ~/.julia/packages/CUDA/75aiI/lib/utils/call.jl:34 [inlined]
  [4] (::CUDA.var"#966#967"{…})(kernelParams::Vector{…})
    @ CUDA ~/.julia/packages/CUDA/75aiI/lib/cudadrv/execution.jl:66
  [5] macro expansion
    @ ~/.julia/packages/CUDA/75aiI/lib/cudadrv/execution.jl:33 [inlined]
  [6] macro expansion
    @ ./none:0 [inlined]
  [7] pack_arguments(::CUDA.var"#966#967"{…}, ::CUDA.KernelState, ::ClimaCore.DataLayouts.VIJFH{…}, ::Base.Broadcast.Broadcasted{…})
    @ CUDA ./none:0
  [8] launch(::CUDA.CuFunction, ::CUDA.KernelState, ::ClimaCore.DataLayouts.VIJFH{…}, ::Base.Broadcast.Broadcasted{…}; blocks::Tuple{…}, threads::Tuple{…}, cooperative::Bool, shmem::Int64, stream::CUDA.CuStream)
    @ CUDA ~/.julia/packages/CUDA/75aiI/lib/cudadrv/execution.jl:59
  [9] launch
    @ ~/.julia/packages/CUDA/75aiI/lib/cudadrv/execution.jl:52 [inlined]
 [10] #972
    @ ~/.julia/packages/CUDA/75aiI/lib/cudadrv/execution.jl:189 [inlined]
 [11] macro expansion
    @ ~/.julia/packages/CUDA/75aiI/lib/cudadrv/execution.jl:149 [inlined]
 [12] macro expansion
    @ ./none:0 [inlined]
 [13] convert_arguments
    @ ./none:0 [inlined]
 [14] #cudacall#971
    @ ~/.julia/packages/CUDA/75aiI/lib/cudadrv/execution.jl:191 [inlined]
 [15] cudacall
    @ ~/.julia/packages/CUDA/75aiI/lib/cudadrv/execution.jl:187 [inlined]
 [16] macro expansion
    @ ~/.julia/packages/CUDA/75aiI/src/compiler/execution.jl:268 [inlined]
 [17] macro expansion
    @ ./none:0 [inlined]
 [18] call
    @ ./none:0 [inlined]
 [19] (::CUDA.HostKernel{…})(::ClimaCore.DataLayouts.VIJFH{…}, ::Base.Broadcast.Broadcasted{…}; threads::Tuple{…}, blocks::Tuple{…}, kwargs::@Kwargs{})
    @ CUDA ~/.julia/packages/CUDA/75aiI/src/compiler/execution.jl:390
 [20] kwcall(::NamedTuple, kernel::CUDA.HostKernel, args::Vararg{Any, N}) where N
    @ CUDA ~/.julia/packages/CUDA/75aiI/src/compiler/execution.jl:389 [inlined]
 [21] macro expansion
    @ ~/.julia/packages/CUDA/75aiI/src/compiler/execution.jl:114 [inlined]
 [22] auto_launch!(f!::typeof(ClimaCoreCUDAExt.knl_copyto!), args::Tuple{…}, data::ClimaCore.DataLayouts.VIJFH{…}; auto::Bool, threads_s::Tuple{…}, blocks_s::Tuple{…}, always_inline::Bool, caller::Symbol)
    @ ClimaCoreCUDAExt ~/.julia/packages/ClimaCore/ANgUC/ext/cuda/cuda_utils.jl:58
 [23] auto_launch!
    @ ~/.julia/packages/ClimaCore/ANgUC/ext/cuda/cuda_utils.jl:38 [inlined]
 [24] copyto!
    @ ~/.julia/packages/ClimaCore/ANgUC/ext/cuda/data_layouts.jl:71 [inlined]
 [25] copy
    @ ./broadcast.jl:928 [inlined]
 [26] materialize
    @ ./broadcast.jl:903 [inlined]
 [27] _ExtrudedFiniteDifferenceGrid(horizontal_grid::ClimaCore.Grids.SpectralElementGrid2D{…}, vertical_grid::ClimaCore.Grids.FiniteDifferenceGrid{…}, hypsography::ClimaCore.Grids.Flat, global_geometry::ClimaCore.Geometry.CartesianGlobalGeometry)
    @ ClimaCore.Grids ~/.julia/packages/ClimaCore/ANgUC/src/Grids/extruded.jl:102
 [28] #18
    @ ~/.julia/packages/ClimaCore/ANgUC/src/Grids/extruded.jl:86 [inlined]
 [29] get!(default::ClimaCore.Grids.var"#18#19"{…}, h::Dict{…}, key::Tuple{…})
    @ Base ./dict.jl:479
 [30] ExtrudedFiniteDifferenceGrid
    @ ClimaCore.Grids ~/.julia/packages/ClimaCore/ANgUC/src/Grids/extruded.jl:76 [inlined]
 [31] #ExtrudedFiniteDifferenceGrid#17
    @ ClimaCore.Grids ~/.julia/packages/ClimaCore/ANgUC/src/Grids/extruded.jl:61 [inlined]
 [32] make_hybrid_spaces(h_space::ClimaCore.Spaces.SpectralElementSpace2D{…}, z_max::Float32, z_elem::Int64, z_stretch::ClimaCore.Meshes.GeneralizedExponentialStretching{…}; surface_warp::Nothing, topo_smoothing::Bool, deep::Bool, parsed_args::Dict{…})
    @ ClimaAtmos ~/ClimaAtmos.jl/src/utils/common_spaces.jl:124
 [33] get_spaces(parsed_args::Dict{…}, params::ClimaAtmos.Parameters.ClimaAtmosParameters{…}, comms_ctx::ClimaComms.SingletonCommsContext{…})
    @ ClimaAtmos ~/ClimaAtmos.jl/src/solver/type_getters.jl:224
 [34] get_simulation(config::ClimaAtmos.AtmosConfig{…})
    @ ClimaAtmos ~/ClimaAtmos.jl/src/solver/type_getters.jl:628
 [35] top-level scope
    @ REPL[6]:1
Some type information was truncated. Use `show(err)` to see complete types.

Maybe related to auto_launch!?

charleskawczynski commented 5 days ago

@Sbozzolo and I chatted offline about this, here's a recap:

In this case, the threads/blocks are being passed to auto_launch!, and auto = false, so it's an issue with https://github.com/CliMA/ClimaCore.jl/blob/ad457270882fddc58fc76c72e84b432e49c130ec/ext/cuda/data_layouts_copyto.jl#L52-L67. We could try out the new/generic version I implemented recently (https://github.com/CliMA/ClimaCore.jl/blob/ad457270882fddc58fc76c72e84b432e49c130ec/ext/cuda/data_layouts_copyto.jl#L113), I think that would work.

Some of the new kernels were merged because we simply didn't have support for those cases before; the remaining ones did not replace the old kernels because they were slightly slower.

charleskawczynski commented 4 days ago

If this error is only happening for the single-column case, then maybe we can just swap out the kernel for that case alone. I see no issue with this.

charleskawczynski commented 2 days ago

> launch(::CUDA.CuFunction, ::CUDA.KernelState, ::ClimaCore.DataLayouts.VIJFH

Ah, this is actually a VIJFH datalayout! Perhaps unfortunately, ClimaAtmos's column model is actually still an extruded FD space: https://github.com/CliMA/ClimaAtmos.jl/blob/77d7f0beeb300f1d4c2147158c66c6fe69ba8c84/src/solver/type_getters.jl#L203-L225. I wonder if that should be updated. IIRC, there is a CPU performance implication of having a horizontal space for purely single columns (this is especially likely true on the GPU).

Using the patch in #1859 does get us past the posted error, but then we fail on dispatch (which is an easy fix):

ERROR: Scalar indexing is disallowed.
Invocation of getindex resulted in scalar indexing of a GPU array.
This is typically caused by calling an iterating implementation of a method.
Such implementations *do not* execute on the GPU, but very slowly on the CPU,
and therefore should be avoided.

If you want to allow scalar iteration, use `allowscalar` or `@allowscalar`
to enable scalar iteration globally or for the operations in question.
Stacktrace:
  [1] error(s::String)
    @ Base ./error.jl:35
  [2] errorscalar(op::String)
    @ GPUArraysCore ~/.julia/packages/GPUArraysCore/GMsgk/src/GPUArraysCore.jl:155
  [3] _assertscalar(op::String, behavior::GPUArraysCore.ScalarIndexing)
    @ GPUArraysCore ~/.julia/packages/GPUArraysCore/GMsgk/src/GPUArraysCore.jl:128
  [4] assertscalar(op::String)
    @ GPUArraysCore ~/.julia/packages/GPUArraysCore/GMsgk/src/GPUArraysCore.jl:116
  [5] getindex(A::CUDA.CuArray{Float32, 5, CUDA.Mem.DeviceBuffer}, I::Int64)
    @ GPUArrays ~/.julia/packages/GPUArrays/OqrUV/src/host/indexing.jl:48
  [6] scalar_getindex(::CUDA.CuArray{Float32, 5, CUDA.Mem.DeviceBuffer}, ::Int64, ::Vararg{Int64})
    @ GPUArrays ~/.julia/packages/GPUArrays/OqrUV/src/host/indexing.jl:34
  [7] _getindex
    @ ~/.julia/packages/GPUArrays/OqrUV/src/host/indexing.jl:17 [inlined]
  [8] getindex
    @ ~/.julia/packages/GPUArrays/OqrUV/src/host/indexing.jl:15 [inlined]
  [9] getindex
    @ ./subarray.jl:290 [inlined]
 [10] _getindex
    @ ./abstractarray.jl:1341 [inlined]
 [11] getindex
    @ ./abstractarray.jl:1291 [inlined]
 [12] get_struct
    @ ~/CliMA/ClimaCore.jl/src/DataLayouts/struct.jl:218 [inlined]
 [13] macro expansion
    @ ~/CliMA/ClimaCore.jl/src/DataLayouts/struct.jl:200 [inlined]
 [14] macro expansion
    @ ./none:0 [inlined]
 [15] get_struct
    @ ./none:0 [inlined]
 [16] getindex
    @ ~/CliMA/ClimaCore.jl/src/DataLayouts/DataLayouts.jl:1026 [inlined]
 [17] getindex
    @ ~/CliMA/ClimaCore.jl/src/DataLayouts/DataLayouts.jl:1033 [inlined]
 [18] _broadcast_getindex
    @ ./broadcast.jl:662 [inlined]
 [19] _getindex
    @ ./broadcast.jl:706 [inlined]
 [20] _getindex
    @ ./broadcast.jl:705 [inlined]
 [21] _broadcast_getindex
    @ ./broadcast.jl:681 [inlined]
 [22] getindex
    @ ./broadcast.jl:636 [inlined]
 [23] copyto!(dest::ClimaCore.DataLayouts.VF{…}, bc::Base.Broadcast.Broadcasted{…})
    @ ClimaCore.DataLayouts ~/CliMA/ClimaCore.jl/src/DataLayouts/broadcast.jl:576
 [24] copyto!(dest::ClimaCore.DataLayouts.VIJFH{…}, bc::Base.Broadcast.Broadcasted{…})
    @ ClimaCore.DataLayouts ~/CliMA/ClimaCore.jl/src/DataLayouts/broadcast.jl:604
 [25] copyto!
    @ ~/CliMA/ClimaCore.jl/src/Fields/broadcast.jl:149 [inlined]
 [26] materialize!
    @ ./broadcast.jl:914 [inlined]
 [27] materialize!
    @ ./broadcast.jl:911 [inlined]
 [28] macro expansion
    @ ~/CliMA/ClimaAtmos.jl/src/callbacks/callbacks.jl:81 [inlined]
 [29] rrtmgp_model_callback!(integrator::ClimaTimeSteppers.DistributedODEIntegrator{…})
    @ ClimaAtmos ~/.julia/packages/NVTX/pfSOQ/src/macro.jl:194
 [30] AtmosCallback
    @ ~/CliMA/ClimaAtmos.jl/src/solver/types.jl:455 [inlined]
 [31] #239
    @ ~/CliMA/ClimaAtmos.jl/src/callbacks/callback_helpers.jl:49 [inlined]
 [32] initialize!(u::ClimaCore.Fields.FieldVector{…}, t::Float32, integrator::ClimaTimeSteppers.DistributedODEIntegrator{…}, any_modified::Bool, c::SciMLBase.DiscreteCallback{…}, cs::SciMLBase.DiscreteCallback{…})
    @ DiffEqBase ~/.julia/packages/DiffEqBase/yM6LF/src/callbacks.jl:13
 [33] initialize!(::ClimaCore.Fields.FieldVector{…}, ::Float32, ::ClimaTimeSteppers.DistributedODEIntegrator{…}, ::Bool, ::SciMLBase.DiscreteCallback{…}, ::SciMLBase.DiscreteCallback{…}, ::Vararg{…})
    @ DiffEqBase ~/.julia/packages/DiffEqBase/yM6LF/src/callbacks.jl:14
 [34] initialize!(::ClimaCore.Fields.FieldVector{…}, ::Float32, ::ClimaTimeSteppers.DistributedODEIntegrator{…}, ::Bool, ::SciMLBase.DiscreteCallback{…}, ::SciMLBase.DiscreteCallback{…}, ::Vararg{…})
    @ DiffEqBase ~/.julia/packages/DiffEqBase/yM6LF/src/callbacks.jl:14
 [35] initialize!(::ClimaCore.Fields.FieldVector{…}, ::Float32, ::ClimaTimeSteppers.DistributedODEIntegrator{…}, ::Bool, ::SciMLBase.DiscreteCallback{…}, ::SciMLBase.DiscreteCallback{…}, ::Vararg{…})
    @ DiffEqBase ~/.julia/packages/DiffEqBase/yM6LF/src/callbacks.jl:14
 [36] initialize! (repeats 2 times)
    @ ~/.julia/packages/DiffEqBase/yM6LF/src/callbacks.jl:14 [inlined]
 [37] initialize!(cb::SciMLBase.CallbackSet{…}, u::ClimaCore.Fields.FieldVector{…}, t::Float32, integrator::ClimaTimeSteppers.DistributedODEIntegrator{…})
    @ DiffEqBase ~/.julia/packages/DiffEqBase/yM6LF/src/callbacks.jl:7
 [38] __init(::SciMLBase.ODEProblem{…}, ::ClimaTimeSteppers.IMEXAlgorithm{…}; dt::Float32, tstops::Tuple{}, saveat::Vector{…}, save_everystep::Bool, callback::SciMLBase.CallbackSet{…}, advance_to_tstop::Bool, save_func::ClimaTimeSteppers.var"#36#38", dtchangeable::Bool, stepstop::Int64, kwargs::@Kwargs{…})
    @ ClimaTimeSteppers ~/.julia/packages/ClimaTimeSteppers/UtGqn/src/integrators.jl:124
 [39] __init
    @ ~/.julia/packages/ClimaTimeSteppers/UtGqn/src/integrators.jl:68 [inlined]
 [40] #init_call#40
    @ ~/.julia/packages/DiffEqBase/yM6LF/src/solve.jl:530 [inlined]
 [41] init_call
    @ ~/.julia/packages/DiffEqBase/yM6LF/src/solve.jl:503 [inlined]
 [42] #init_up#43
    @ ~/.julia/packages/DiffEqBase/yM6LF/src/solve.jl:564 [inlined]
 [43] init_up
    @ ~/.julia/packages/DiffEqBase/yM6LF/src/solve.jl:551 [inlined]
 [44] #init#41
    @ ~/.julia/packages/DiffEqBase/yM6LF/src/solve.jl:544 [inlined]
 [45] macro expansion
    @ ~/CliMA/ClimaAtmos.jl/src/solver/type_getters.jl:708 [inlined]
 [46] macro expansion
    @ ./timing.jl:503 [inlined]
 [47] macro expansion
    @ ~/CliMA/ClimaAtmos.jl/src/utils/utilities.jl:331 [inlined]
 [48] get_simulation(config::ClimaAtmos.AtmosConfig{Float32, ClimaParams.ParamDict{…}, Dict{…}, ClimaComms.SingletonCommsContext{…}, Tuple{…}})
    @ ClimaAtmos ~/CliMA/ClimaAtmos.jl/src/solver/type_getters.jl:707
 [49] top-level scope
    @ REPL[7]:1
 [50] top-level scope
    @ ~/.julia/packages/CUDA/XUdwt/src/initialization.jl:209
Some type information was truncated. Use `show(err)` to see complete types.