simone-silvestri opened 2 months ago
Some ideas to reduce the issue: reduce the size of the `similarity_theory` argument.

Switching the grid to a `LatitudeLongitudeGrid` leads to
julia> coupled_model = OceanSeaIceModel(ocean; atmosphere, radiation)
ERROR: Kernel invocation uses too much parameter memory.
4.391 KiB exceeds the 4.000 KiB limit imposed by sm_70 / PTX v7.8.
Relevant parameters:
[1] __ctx__::KernelAbstractions.CompilerMetadata{Oceananigans.Utils.OffsetStaticSize{(0:51, 0:51)}, KernelAbstractions.NDIteration.DynamicCheck, Nothing, Nothing, KernelAbstractions.NDIteration.NDRange{2, KernelAbstractions.NDIteration.StaticSize{(4, 4)}, KernelAbstractions.NDIteration.StaticSize{(16, 16)}, Tuple{Int64, Int64}, Oceananigans.Utils.KernelOffsets{Tuple{Int64, Int64}}}} uses 32 bytes
[2] similarity_theory::SimilarityTheoryTurbulentFluxes{Float64, ClimaOcean.OceanSeaIceModels.CrossRealmFluxes.SimilarityScales{ClimaOcean.OceanSeaIceModels.CrossRealmFluxes.MomentumStabilityFunction{Float64}, ClimaOcean.OceanSeaIceModels.CrossRealmFluxes.ScalarStabilityFunction{Float64}, ClimaOcean.OceanSeaIceModels.CrossRealmFluxes.ScalarStabilityFunction{Float64}}, ClimaOcean.OceanSeaIceModels.PrescribedAtmospheres.PrescribedAtmosphereThermodynamicsParameters{Float64}, ClimaOcean.OceanSeaIceModels.CrossRealmFluxes.ClasiusClapyeronSaturation, Float64, ClimaOcean.OceanSeaIceModels.CrossRealmFluxes.SimilarityScales{ClimaOcean.OceanSeaIceModels.CrossRealmFluxes.MomentumRoughnessLength{Float64, ClimaOcean.OceanSeaIceModels.CrossRealmFluxes.TemperatureDependentAirViscosity{Float64}}, ClimaOcean.OceanSeaIceModels.CrossRealmFluxes.ScalarRoughnessLength{Float64, ClimaOcean.OceanSeaIceModels.CrossRealmFluxes.TemperatureDependentAirViscosity{Float64}, ClimaOcean.OceanSeaIceModels.CrossRealmFluxes.ReynoldsScalingFunction{Float64}}, ClimaOcean.OceanSeaIceModels.CrossRealmFluxes.ScalarRoughnessLength{Float64, ClimaOcean.OceanSeaIceModels.CrossRealmFluxes.TemperatureDependentAirViscosity{Float64}, ClimaOcean.OceanSeaIceModels.CrossRealmFluxes.ReynoldsScalingFunction{Float64}}}, ClimaOcean.OceanSeaIceModels.CrossRealmFluxes.LogarithmicSimilarityProfile, ClimaOcean.OceanSeaIceModels.CrossRealmFluxes.RelativeVelocity, @NamedTuple{latent_heat::Field{Center, Center, Nothing, Nothing, Nothing, Nothing, OffsetArrays.OffsetArray{Float64, 3, CUDA.CuDeviceArray{Float64, 3, 1}}, Float64, Nothing, Nothing, Nothing}, sensible_heat::Field{Center, Center, Nothing, Nothing, Nothing, Nothing, OffsetArrays.OffsetArray{Float64, 3, CUDA.CuDeviceArray{Float64, 3, 1}}, Float64, Nothing, Nothing, Nothing}, water_vapor::Field{Center, Center, Nothing, Nothing, Nothing, Nothing, OffsetArrays.OffsetArray{Float64, 3, CUDA.CuDeviceArray{Float64, 3, 1}}, Float64, Nothing, Nothing, Nothing}, x_momentum::Field{Center, Center, Nothing, Nothing, Nothing, Nothing, OffsetArrays.OffsetArray{Float64, 3, CUDA.CuDeviceArray{Float64, 3, 1}}, Float64, Nothing, Nothing, Nothing}, y_momentum::Field{Center, Center, Nothing, Nothing, Nothing, Nothing, OffsetArrays.OffsetArray{Float64, 3, CUDA.CuDeviceArray{Float64, 3, 1}}, Float64, Nothing, Nothing, Nothing}}} uses 984 bytes
[3] grid::ImmersedBoundaryGrid{Float64, Periodic, Bounded, Bounded, LatitudeLongitudeGrid{Float64, Periodic, Bounded, Bounded, OffsetArrays.OffsetVector{Float64, CUDA.CuDeviceVector{Float64, 1}}, Float64, Float64, Float64, OffsetArrays.OffsetVector{Float64, CUDA.CuDeviceVector{Float64, 1}}, OffsetArrays.OffsetVector{Float64, StepRangeLen{Float64, Base.TwicePrecision{Float64}, Base.TwicePrecision{Float64}, Int64}}, OffsetArrays.OffsetVector{Float64, StepRangeLen{Float64, Base.TwicePrecision{Float64}, Base.TwicePrecision{Float64}, Int64}}, OffsetArrays.OffsetVector{Float64, CUDA.CuDeviceVector{Float64, 1}}, Nothing}, GridFittedBottom{Field{Center, Center, Nothing, Nothing, Nothing, Nothing, OffsetArrays.OffsetArray{Float64, 3, CUDA.CuDeviceArray{Float64, 3, 1}}, Float64, Nothing, Nothing, Nothing}, Oceananigans.ImmersedBoundaries.CenterImmersedCondition}, CUDA.CuDeviceVector{Tuple{UInt8, UInt8, UInt8}, 1}, CUDA.CuDeviceVector{Tuple{UInt8, UInt8}, 1}, Nothing} uses 968 bytes
[4] clock::@NamedTuple{time::Float64, last_Δt::Float64, last_stage_Δt::Float64, iteration::Int64, stage::Int64} uses 40 bytes
[5] ocean_state::@NamedTuple{u::SubArray{Float64, 2, OffsetArrays.OffsetArray{Float64, 3, CUDA.CuDeviceArray{Float64, 3, 1}}, Tuple{Base.Slice{OffsetArrays.IdOffsetRange{Int64, Base.OneTo{Int64}}}, Base.Slice{OffsetArrays.IdOffsetRange{Int64, Base.OneTo{Int64}}}, Int64}, true}, v::SubArray{Float64, 2, OffsetArrays.OffsetArray{Float64, 3, CUDA.CuDeviceArray{Float64, 3, 1}}, Tuple{Base.Slice{OffsetArrays.IdOffsetRange{Int64, Base.OneTo{Int64}}}, Base.Slice{OffsetArrays.IdOffsetRange{Int64, Base.OneTo{Int64}}}, Int64}, true}, w::SubArray{Float64, 2, OffsetArrays.OffsetArray{Float64, 3, CUDA.CuDeviceArray{Float64, 3, 1}}, Tuple{Base.Slice{OffsetArrays.IdOffsetRange{Int64, Base.OneTo{Int64}}}, Base.Slice{OffsetArrays.IdOffsetRange{Int64, Base.OneTo{Int64}}}, Int64}, true}, T::SubArray{Float64, 2, OffsetArrays.OffsetArray{Float64, 3, CUDA.CuDeviceArray{Float64, 3, 1}}, Tuple{Base.Slice{OffsetArrays.IdOffsetRange{Int64, Base.OneTo{Int64}}}, Base.Slice{OffsetArrays.IdOffsetRange{Int64, Base.OneTo{Int64}}}, Int64}, true}, S::SubArray{Float64, 2, OffsetArrays.OffsetArray{Float64, 3, CUDA.CuDeviceArray{Float64, 3, 1}}, Tuple{Base.Slice{OffsetArrays.IdOffsetRange{Int64, Base.OneTo{Int64}}}, Base.Slice{OffsetArrays.IdOffsetRange{Int64, Base.OneTo{Int64}}}, Int64}, true}, e::SubArray{Float64, 2, OffsetArrays.OffsetArray{Float64, 3, CUDA.CuDeviceArray{Float64, 3, 1}}, Tuple{Base.Slice{OffsetArrays.IdOffsetRange{Int64, Base.OneTo{Int64}}}, Base.Slice{OffsetArrays.IdOffsetRange{Int64, Base.OneTo{Int64}}}, Int64}, true}} uses 768 bytes
[7] atmos_state::@NamedTuple{u::OffsetArrays.OffsetArray{Float32, 4, CUDA.CuDeviceArray{Float32, 4, 1}}, v::OffsetArrays.OffsetArray{Float32, 4, CUDA.CuDeviceArray{Float32, 4, 1}}, T::OffsetArrays.OffsetArray{Float32, 4, CUDA.CuDeviceArray{Float32, 4, 1}}, q::OffsetArrays.OffsetArray{Float32, 4, CUDA.CuDeviceArray{Float32, 4, 1}}, r::OffsetArrays.OffsetArray{Float32, 4, CUDA.CuDeviceArray{Float32, 4, 1}}, p::OffsetArrays.OffsetArray{Float32, 4, CUDA.CuDeviceArray{Float32, 4, 1}}} uses 528 bytes
[8] atmos_grid::Oceananigans.Grids.ZRegularLLG{Float32, Periodic, Bounded, Flat, OffsetArrays.OffsetMatrix{Float32, CUDA.CuDeviceMatrix{Float32, 1}}, OffsetArrays.OffsetVector{Float32, CUDA.CuDeviceVector{Float32, 1}}, OffsetArrays.OffsetVector{Float32, CUDA.CuDeviceVector{Float32, 1}}, OffsetArrays.OffsetVector{Float32, CUDA.CuDeviceVector{Float32, 1}}, Float32, OffsetArrays.OffsetVector{Float32, CUDA.CuDeviceVector{Float32, 1}}, OffsetArrays.OffsetVector{Float32, CUDA.CuDeviceVector{Float32, 1}}, StepRangeLen{Float64, Base.TwicePrecision{Float64}, Base.TwicePrecision{Float64}, Int64}, Nothing} uses 1024 bytes
[9] atmos_times::StepRangeLen{Float64, Base.TwicePrecision{Float64}, Base.TwicePrecision{Float64}, Int64} uses 48 bytes
[10] atmos_backend::JRA55NetCDFBackend uses 16 bytes
[11] atmos_time_indexing::Oceananigans.OutputReaders.Cyclical{Float64} uses 8 bytes
[12] atmosphere_reference_height::Float32 uses 4 bytes
[13] atmosphere_boundary_layer_height::Float32 uses 4 bytes
[14] atmos_thermodynamics_parameters::ClimaOcean.OceanSeaIceModels.PrescribedAtmospheres.PrescribedAtmosphereThermodynamicsParameters{Float32} uses 56 bytes
Note: use a newer CUDA to support more parameters on your device.
We could also simplify the kernel, I think.
On the active cells map issue --- this seems like a good idea in general, but requires changes in Oceananigans that pass the active cells map explicitly where needed, right? This seems like a relatively simple improvement so maybe we should do that regardless.
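For concreteness, here is a minimal sketch of what "passing the active cells map explicitly" could look like: the kernel name, the map layout (a device vector of `(i, j, k)` tuples), and the placeholder body are my assumptions, not the actual Oceananigans implementation.

```julia
using KernelAbstractions

# Launch a 1D kernel over a precomputed vector of active (i, j, k) indices that is passed
# as its own argument, instead of reaching into the (large) grid object inside the kernel.
@kernel function compute_tendency_over_active_cells!(Gc, active_cells_map)
    n = @index(Global, Linear)
    i, j, k = @inbounds active_cells_map[n]
    @inbounds Gc[i, j, k] = 0.0   # placeholder for the actual tendency computation
end
```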
> Note: use a newer CUDA to support more parameters on your device.

Amazing that the `atmos_grid` uses so much parameter space. Looking at this, I am not sure that removing the active cells map will do anything. The problem is the metrics?
The `ocean_state` is a clear target for improvement, because it looks like we are paying the price for passing `SubArray`s instead of simply passing in entire fields. It's easy to pass the whole field; we just have to change the index from 1 to the surface, `grid.Nz`.
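A minimal sketch of that change, assuming a KernelAbstractions-style kernel (names are illustrative, not the actual ClimaOcean kernel):

```julia
using KernelAbstractions

# Current approach (sketch): the kernel receives 2D surface views, e.g. view(u, :, :, grid.Nz);
# each SubArray-of-OffsetArray carries extra slice/offset bookkeeping into the kernel parameters.
@kernel function flux_from_surface_view!(flux, u_surface)
    i, j = @index(Global, NTuple)
    @inbounds flux[i, j] = u_surface[i, j]
end

# Proposed approach (sketch): pass the whole 3D field plus the surface index kNz = grid.Nz,
# and index the surface level inside the kernel.
@kernel function flux_from_whole_field!(flux, u, kNz)
    i, j = @index(Global, NTuple)
    @inbounds flux[i, j] = u[i, j, kNz]
end
```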
More dramatic savings could be achieved by unwrapping all the `OffsetArray`s. Many of the arrays have the same offsets, so it's redundant to pass them in for every property. Unfortunately, this would complicate writing the kernels, though.
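A rough sketch of that idea, under the assumption that every field shares the same halo offsets (the helper name is hypothetical):

```julia
using OffsetArrays

# Pass the raw parent arrays plus a single shared offsets tuple, instead of one OffsetArray
# wrapper (each carrying its own offsets) per field.
function unwrap_fields(fields::NamedTuple)
    raw     = map(parent, fields)                   # plain arrays, no per-field wrapper
    offsets = first.(axes(first(fields))) .- 1      # shared index offsets, passed once
    return raw, offsets
end

# Inside the kernel one would then index with an explicit shift,
# e.g. raw.u[i - offsets[1], j - offsets[2], k - offsets[3]].
```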
Also, the end of the error says:

> Note: use a newer CUDA to support more parameters on your device.

Do we know if simply updating CUDA will solve this problem, even for the old Titan Vs?
To start, we could unwrap the `OffsetArray`s of the grid metrics and change the grid metric operators (the `Δx`, `Δy`, `znode`, `xnode`, etc.) to include a displacement by the halo. We could dispatch on the type of the metric (`Array` vs `OffsetArray`) to figure out how to call the grid metric operator.
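A minimal sketch of the dispatch idea; the operator and field names (`Δxᶜᵃᵃ`, `grid.Hx`) mimic Oceananigans conventions but are assumptions here, not the actual implementation:

```julia
using OffsetArrays

# With an OffsetArray the halo offset is built into the indices; with a plain (unwrapped)
# array the operator must displace the index by the halo size Hx itself.
@inline metric(Δ::OffsetArrays.OffsetArray, i, Hx) = @inbounds Δ[i]
@inline metric(Δ::AbstractArray,            i, Hx) = @inbounds Δ[i + Hx]

# Hypothetical grid-metric operator built on the helper.
@inline Δxᶜᵃᵃ(i, j, k, grid) = metric(grid.Δxᶜᵃᵃ, i, grid.Hx)
```

On the GPU the unwrapped metric would show up as a `CuDeviceArray` inside the kernel, so in practice the dispatch might be on that type instead, as suggested later in this thread.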
For the MWE I get an error:
julia> include("parameter_space_test.jl")
Precompiling ClimaOcean
Info Given ClimaOcean was explicitly requested, output will be shown live
WARNING: using Units.day in module ECCO conflicts with an existing identifier.
28 dependencies successfully precompiled in 47 seconds. 221 already precompiled.
1 dependency had output during precompilation:
┌ ClimaOcean
│ [Output was shown above]
└
ERROR: LoadError: MethodError: no method matching (OrthogonalSphericalShellGrid{…} where {…})(::GPU, ::Type{…}; size::Tuple{…}, halo::Tuple{…}, z::Vector{…}, longitude::Tuple{…}, latitude::Tuple{…})
Closest candidates are:
(OrthogonalSphericalShellGrid{FT, TX, TY, TZ, A, R, FR, <:OrthogonalSphericalShellGrids.Tripolar, Arch} where {FT, TX, TY, TZ, A, R, FR, Arch})(::Any, ::DataType; size, southermost_latitude, halo, radius, z, north_poles_latitude, first_pole_longitude) got unsupported keyword arguments "longitude", "latitude"
@ OrthogonalSphericalShellGrids ~/.julia/packages/OrthogonalSphericalShellGrids/b8yUQ/src/tripolar_grid.jl:55
(OrthogonalSphericalShellGrid{FT, TX, TY, TZ, A, R, FR, <:OrthogonalSphericalShellGrids.Tripolar, Arch} where {FT, TX, TY, TZ, A, R, FR, Arch})(::Any; ...)
@ OrthogonalSphericalShellGrids ~/.julia/packages/OrthogonalSphericalShellGrids/b8yUQ/src/tripolar_grid.jl:55
(OrthogonalSphericalShellGrid{FT, TX, TY, TZ, A, R, FR, <:OrthogonalSphericalShellGrids.Tripolar, Arch} where {FT, TX, TY, TZ, A, R, FR, Arch})(::Distributed, ::DataType; halo, kwargs...)
@ OrthogonalSphericalShellGrids ~/.julia/packages/OrthogonalSphericalShellGrids/b8yUQ/src/distributed_tripolar_grid.jl:24
Stacktrace:
[1] kwerr(::@NamedTuple{size::Tuple{…}, halo::Tuple{…}, z::Vector{…}, longitude::Tuple{…}, latitude::Tuple{…}}, ::Type, ::GPU, ::Type)
@ Base ./error.jl:165
[2] top-level scope
@ ~/Projects/ClimaOcean.jl/parameter_space_test.jl:6
[3] include(fname::String)
@ Base.MainInclude ./client.jl:489
[4] top-level scope
@ REPL[1]:1
in expression starting at /home/greg/Projects/ClimaOcean.jl/parameter_space_test.jl:6
Some type information was truncated. Use `show(err)` to see complete types.
What version of `OrthogonalSphericalShellGrids` do I need? Also, should ClimaOcean depend on this?
> To start, we could unwrap the `OffsetArray`s of the grid metrics and change the grid metric operators (the `Δx`, `Δy`, `znode`, `xnode`, etc.) to include a displacement by the halo. We could dispatch on the type of the metric (`Array` vs `OffsetArray`) to figure out how to call the grid metric operator.

Interesting idea to build the offset into the operators. I think that would work. We would have to dispatch those operators on `CuArray` or something like that.
Is this right?
grid = TripolarGrid(arch;
size = (50, 50, 10),
halo = (7, 7, 7),
z = collect(-6000:600:0),
longitude = (0, 360),
latitude = (-75, 75))
The kwargs are wrong.
> Is this right?
>
> `grid = TripolarGrid(arch; size = (50, 50, 10), halo = (7, 7, 7), z = collect(-6000:600:0), longitude = (0, 360), latitude = (-75, 75))`
Ah no, the latitude and longitude are not parameters at the moment. The support is only for a complete "capped" earth, so the longitude is fixed (0 to 360), as is the maximum latitude, which is 90. The kwargs allow specifying where the poles are located (`first_pole_longitude` and `north_poles_latitude`) as well as the southernmost latitude (see the updated MWE below).
Can you update the MWE so that it works, please?
I have updated it.
Where does the `bottom_height` in this example come from? Confused...
Why would you use `z = collect(-6000:600:0)` versus `z = (-6000, 0)`?
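For reference, a small sketch of the two forms being compared; both define the same ten uniform 600 m layers, and the difference is only in how the vertical coordinate is stored:

```julia
z_as_vector = collect(-6000:600:0)  # 11 explicit interfaces; ends up as a device array on the GPU
z_as_extent = (-6000, 0)            # bounds only; uniform spacing computed from `size`, stored as a range
```

As noted later in the thread, the two end up costing almost the same parameter space (1.438 vs 1.469 KiB for the grid).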
Does `ocean_simulation` not use a good default split-explicit setup?
Getting a different error:
julia> include("parameter_space_test.jl")
[ Info: Regridding bathymetry from existing file ./ETOPO_2022_v1_60s_N90W180_surface.nc.
┌ Warning: The westernmost meridian of `target_grid` 0 does not coincide with the closest meridian of the bathymetry grid, -1.4210854715202004e-14.
└ @ ClimaOcean.Bathymetry ~/Projects/ClimaOcean.jl/src/Bathymetry.jl:147
┌ Warning: The southernmost parallel of `target_grid` -81.73682203400921 does not coincide with the closest parallel of the bathymetry grid, -81.73333333333332.
└ @ ClimaOcean.Bathymetry ~/Projects/ClimaOcean.jl/src/Bathymetry.jl:151
ERROR: LoadError: InvalidIRError: compiling MethodInstance for Oceananigans.Models.HydrostaticFreeSurfaceModels.gpu_compute_hydrostatic_free_surface_Gc!(::KernelAbstractions.CompilerMetadata{…}, ::OffsetArrays.OffsetArray{…}, ::ImmersedBoundaryGrid{…}, ::Oceananigans.ImmersedBoundaries.InteriorMap, ::Tuple{…}) resulted in invalid LLVM IR
Reason: unsupported dynamic function invocation (call to _advective_tracer_flux_x)
Stacktrace:
[1] δxᶜᵃᵃ
@ ~/.julia/packages/Oceananigans/OMBY0/src/Operators/difference_operators.jl:20
[2] div_Uc
@ ~/.julia/packages/Oceananigans/OMBY0/src/Advection/tracer_advection_operators.jl:95
[3] hydrostatic_free_surface_tracer_tendency
That is solved in `Oceananigans#main`.
> Does `ocean_simulation` not use a good default split-explicit setup?

Not for the tripolar grid. The tripolar grid requires a fixed `Δt` for the barotropic solver, while the default in ClimaOcean is a variable `Δt`.
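For reference, a sketch of the two configurations being contrasted, as I understand them; the `cfl` form is my assumption of roughly what the ClimaOcean default looks like, and the stand-in grid is only there to make the snippet self-contained:

```julia
using Oceananigans

grid = RectilinearGrid(size = (16, 16, 4), extent = (1e6, 1e6, 1e3))  # stand-in grid for illustration

# What the updated MWE below passes explicitly for the tripolar grid: a prescribed number of substeps.
tripolar_style = SplitExplicitFreeSurface(grid; substeps = 20)

# Assumed shape of the default being contrasted above, where the barotropic substepping is
# derived from a CFL criterion rather than prescribed.
default_style = SplitExplicitFreeSurface(grid; cfl = 0.7)
```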
> That is solved in `Oceananigans#main`.

I updated the original post to indicate which branches should be used.
> > Does `ocean_simulation` not use a good default split-explicit setup?
>
> Not for the tripolar grid. The tripolar grid requires a fixed `Δt` for the barotropic solver, while the default in ClimaOcean is a variable `Δt`.

Can you open an issue to describe how we need to generalize `ocean_simulation` to have good defaults that depend on the `grid` being used?
Ok, this code:
using Oceananigans
using ClimaOcean
using OrthogonalSphericalShellGrids
arch = GPU()
grid = TripolarGrid(arch;
size = (50, 50, 10),
halo = (7, 7, 7),
z = collect(-6000:600:0),
first_pole_longitude = 75,
north_poles_latitude = 55)
bottom_height = retrieve_bathymetry(grid;
minimum_depth = 10,
dir = "./",
interpolation_passes = 20,
connected_regions_allowed = 0)
grid = ImmersedBoundaryGrid(grid, GridFittedBottom(bottom_height); active_cells_map = true)
free_surface = SplitExplicitFreeSurface(grid; substeps = 20)
ocean = ocean_simulation(grid; free_surface)
model = ocean.model
backend = JRA55NetCDFBackend(4)
atmosphere = JRA55_prescribed_atmosphere(arch; backend)
radiation = Radiation(arch)
sea_ice = ClimaOcean.OceanSeaIceModels.MinimumTemperatureSeaIce()
coupled_model = OceanSeaIceModel(ocean, sea_ice; atmosphere, radiation)
produces
julia> include("parameter_space_test.jl")
[ Info: Regridding bathymetry from existing file ./ETOPO_2022_v1_60s_N90W180_surface.nc.
┌ Warning: The westernmost meridian of `target_grid` 0 does not coincide with the closest meridian of the bathymetry grid, -1.4210854715202004e-14.
└ @ ClimaOcean.Bathymetry ~/Projects/ClimaOcean.jl/src/Bathymetry.jl:147
┌ Warning: The southernmost parallel of `target_grid` -81.73682203400921 does not coincide with the closest parallel of the bathymetry grid, -81.73333333333332.
└ @ ClimaOcean.Bathymetry ~/Projects/ClimaOcean.jl/src/Bathymetry.jl:151
┌ Warning: This simulation will run forever as stop iteration = stop time = wall time limit = Inf.
└ @ Oceananigans.Simulations ~/.julia/packages/Oceananigans/llWXH/src/Simulations/simulation.jl:55
ERROR: LoadError: Kernel invocation uses too much parameter memory.
4.820 KiB exceeds the 4.000 KiB limit imposed by sm_70 / PTX v7.8.
Relevant parameters:
[1] __ctx__::KernelAbstractions.CompilerMetadata{Oceananigans.Utils.OffsetStaticSize{(0:51, 0:51)}, KernelAbstractions.NDIteration.DynamicCheck, Nothing, Nothing, KernelAbstractions.NDIteration.NDRange{2, KernelAbstractions.NDIteration.StaticSize{(4, 4)}, KernelAbstractions.NDIteration.StaticSize{(16, 16)}, Tuple{Int64, Int64}, Oceananigans.Utils.KernelOffsets{Tuple{Int64, Int64}}}} uses 32 bytes
[2] similarity_theory::SimilarityTheoryTurbulentFluxes{Float64, ClimaOcean.OceanSeaIceModels.CrossRealmFluxes.SimilarityScales{ClimaOcean.OceanSeaIceModels.CrossRealmFluxes.MomentumStabilityFunction{Float64}, ClimaOcean.OceanSeaIceModels.CrossRealmFluxes.ScalarStabilityFunction{Float64}, ClimaOcean.OceanSeaIceModels.CrossRealmFluxes.ScalarStabilityFunction{Float64}}, ClimaOcean.OceanSeaIceModels.PrescribedAtmospheres.PrescribedAtmosphereThermodynamicsParameters{Float64}, ClimaOcean.OceanSeaIceModels.CrossRealmFluxes.ClasiusClapyeronSaturation, Float64, ClimaOcean.OceanSeaIceModels.CrossRealmFluxes.SimilarityScales{ClimaOcean.OceanSeaIceModels.CrossRealmFluxes.MomentumRoughnessLength{Float64, ClimaOcean.OceanSeaIceModels.CrossRealmFluxes.TemperatureDependentAirViscosity{Float64}}, ClimaOcean.OceanSeaIceModels.CrossRealmFluxes.ScalarRoughnessLength{Float64, ClimaOcean.OceanSeaIceModels.CrossRealmFluxes.TemperatureDependentAirViscosity{Float64}, ClimaOcean.OceanSeaIceModels.CrossRealmFluxes.ReynoldsScalingFunction{Float64}}, ClimaOcean.OceanSeaIceModels.CrossRealmFluxes.ScalarRoughnessLength{Float64, ClimaOcean.OceanSeaIceModels.CrossRealmFluxes.TemperatureDependentAirViscosity{Float64}, ClimaOcean.OceanSeaIceModels.CrossRealmFluxes.ReynoldsScalingFunction{Float64}}}, ClimaOcean.OceanSeaIceModels.CrossRealmFluxes.LogarithmicSimilarityProfile, ClimaOcean.OceanSeaIceModels.CrossRealmFluxes.RelativeVelocity, @NamedTuple{latent_heat::Field{Center, Center, Nothing, Nothing, Nothing, Nothing, OffsetArrays.OffsetArray{Float64, 3, CUDA.CuDeviceArray{Float64, 3, 1}}, Float64, Nothing, Nothing, Nothing}, sensible_heat::Field{Center, Center, Nothing, Nothing, Nothing, Nothing, OffsetArrays.OffsetArray{Float64, 3, CUDA.CuDeviceArray{Float64, 3, 1}}, Float64, Nothing, Nothing, Nothing}, water_vapor::Field{Center, Center, Nothing, Nothing, Nothing, Nothing, OffsetArrays.OffsetArray{Float64, 3, CUDA.CuDeviceArray{Float64, 3, 1}}, Float64, Nothing, Nothing, Nothing}, x_momentum::Field{Center, Center, Nothing, Nothing, Nothing, Nothing, OffsetArrays.OffsetArray{Float64, 3, CUDA.CuDeviceArray{Float64, 3, 1}}, Float64, Nothing, Nothing, Nothing}, y_momentum::Field{Center, Center, Nothing, Nothing, Nothing, Nothing, OffsetArrays.OffsetArray{Float64, 3, CUDA.CuDeviceArray{Float64, 3, 1}}, Float64, Nothing, Nothing, Nothing}}} uses 984 bytes
[3] grid::ImmersedBoundaryGrid{Float64, Periodic, RightConnected, Bounded, OrthogonalSphericalShellGrid{Float64, Periodic, RightConnected, Bounded, OffsetArrays.OffsetMatrix{Float64, CUDA.CuDeviceMatrix{Float64, 1}}, OffsetArrays.OffsetVector{Float64, CUDA.CuDeviceVector{Float64, 1}}, OffsetArrays.OffsetVector{Float64, CUDA.CuDeviceVector{Float64, 1}}, OrthogonalSphericalShellGrids.Tripolar{Int64, Int64, Int64}, Nothing}, GridFittedBottom{Field{Center, Center, Nothing, Nothing, Nothing, Nothing, OffsetArrays.OffsetArray{Float64, 3, CUDA.CuDeviceArray{Float64, 3, 1}}, Float64, Nothing, Nothing, Nothing}, Oceananigans.ImmersedBoundaries.CenterImmersedCondition}, CUDA.CuDeviceVector{Tuple{UInt8, UInt8, UInt8}, 1}, CUDA.CuDeviceVector{Tuple{UInt8, UInt8}, 1}, Nothing} uses 1.469 KiB
[4] clock::@NamedTuple{time::Float64, last_Δt::Float64, last_stage_Δt::Float64, iteration::Int64, stage::Int64} uses 40 bytes
[5] ocean_state::@NamedTuple{u::SubArray{Float64, 2, OffsetArrays.OffsetArray{Float64, 3, CUDA.CuDeviceArray{Float64, 3, 1}}, Tuple{Base.Slice{OffsetArrays.IdOffsetRange{Int64, Base.OneTo{Int64}}}, Base.Slice{OffsetArrays.IdOffsetRange{Int64, Base.OneTo{Int64}}}, Int64}, true}, v::SubArray{Float64, 2, OffsetArrays.OffsetArray{Float64, 3, CUDA.CuDeviceArray{Float64, 3, 1}}, Tuple{Base.Slice{OffsetArrays.IdOffsetRange{Int64, Base.OneTo{Int64}}}, Base.Slice{OffsetArrays.IdOffsetRange{Int64, Base.OneTo{Int64}}}, Int64}, true}, w::SubArray{Float64, 2, OffsetArrays.OffsetArray{Float64, 3, CUDA.CuDeviceArray{Float64, 3, 1}}, Tuple{Base.Slice{OffsetArrays.IdOffsetRange{Int64, Base.OneTo{Int64}}}, Base.Slice{OffsetArrays.IdOffsetRange{Int64, Base.OneTo{Int64}}}, Int64}, true}, T::SubArray{Float64, 2, OffsetArrays.OffsetArray{Float64, 3, CUDA.CuDeviceArray{Float64, 3, 1}}, Tuple{Base.Slice{OffsetArrays.IdOffsetRange{Int64, Base.OneTo{Int64}}}, Base.Slice{OffsetArrays.IdOffsetRange{Int64, Base.OneTo{Int64}}}, Int64}, true}, S::SubArray{Float64, 2, OffsetArrays.OffsetArray{Float64, 3, CUDA.CuDeviceArray{Float64, 3, 1}}, Tuple{Base.Slice{OffsetArrays.IdOffsetRange{Int64, Base.OneTo{Int64}}}, Base.Slice{OffsetArrays.IdOffsetRange{Int64, Base.OneTo{Int64}}}, Int64}, true}, e::SubArray{Float64, 2, OffsetArrays.OffsetArray{Float64, 3, CUDA.CuDeviceArray{Float64, 3, 1}}, Tuple{Base.Slice{OffsetArrays.IdOffsetRange{Int64, Base.OneTo{Int64}}}, Base.Slice{OffsetArrays.IdOffsetRange{Int64, Base.OneTo{Int64}}}, Int64}, true}} uses 768 bytes
[7] atmos_state::@NamedTuple{u::OffsetArrays.OffsetArray{Float32, 4, CUDA.CuDeviceArray{Float32, 4, 1}}, v::OffsetArrays.OffsetArray{Float32, 4, CUDA.CuDeviceArray{Float32, 4, 1}}, T::OffsetArrays.OffsetArray{Float32, 4, CUDA.CuDeviceArray{Float32, 4, 1}}, q::OffsetArrays.OffsetArray{Float32, 4, CUDA.CuDeviceArray{Float32, 4, 1}}, r::OffsetArrays.OffsetArray{Float32, 4, CUDA.CuDeviceArray{Float32, 4, 1}}, p::OffsetArrays.OffsetArray{Float32, 4, CUDA.CuDeviceArray{Float32, 4, 1}}} uses 528 bytes
[8] atmos_grid::Oceananigans.Grids.ZRegularLLG{Float32, Periodic, Bounded, Flat, OffsetArrays.OffsetMatrix{Float32, CUDA.CuDeviceMatrix{Float32, 1}}, OffsetArrays.OffsetVector{Float32, CUDA.CuDeviceVector{Float32, 1}}, OffsetArrays.OffsetVector{Float32, CUDA.CuDeviceVector{Float32, 1}}, OffsetArrays.OffsetVector{Float32, CUDA.CuDeviceVector{Float32, 1}}, Float32, OffsetArrays.OffsetVector{Float32, CUDA.CuDeviceVector{Float32, 1}}, OffsetArrays.OffsetVector{Float32, CUDA.CuDeviceVector{Float32, 1}}, Nothing, Nothing} uses 928 bytes
[9] atmos_times::StepRangeLen{Float64, Base.TwicePrecision{Float64}, Base.TwicePrecision{Float64}, Int64} uses 48 bytes
[10] atmos_backend::JRA55NetCDFBackend uses 16 bytes
[11] atmos_time_indexing::Oceananigans.OutputReaders.Cyclical{Float64} uses 8 bytes
[12] atmosphere_reference_height::Float32 uses 4 bytes
[13] atmosphere_boundary_layer_height::Float32 uses 4 bytes
[14] atmos_thermodynamics_parameters::ClimaOcean.OceanSeaIceModels.PrescribedAtmospheres.PrescribedAtmosphereThermodynamicsParameters{Float32} uses 56 bytes
Note: use a newer CUDA to support more parameters on your device.
Slightly different from the original post, but close enough.
Here's a table of the parameter sizes for this example:
parameter | memory use |
---|---|
ctx | 32 bytes |
similarity_theory | 984 bytes |
grid | 1.469 KiB |
clock | 40 bytes |
ocean_state | 768 bytes |
atmos_state | 528 bytes |
atmos_grid | 928 bytes |
atmos_times | 48 bytes |
atmos_backend | 16 bytes |
atmos_time_indexing | 8 bytes |
atmosphere_reference_height | 4 bytes |
atmosphere_boundary_layer_height | 4 bytes |
atmos_thermodynamics_parameters | 56 bytes |
Two main ideas:

1. Pre-interpolate the atmos state. This will save about 1.5 KiB (but will also cost approx 768 bytes to pass in the saved state), resulting in a savings of about 700 bytes. (See the sketch below.)
2. Pass in pointers to the whole ocean array rather than a view of the surface. This might save 200-300 bytes in parameter space, because a view of an `OffsetArray` is more complex than just an `OffsetArray`.
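A rough sketch of idea 1; the kernel and argument names are hypothetical, and spatial interpolation between the JRA55 grid and the ocean grid is elided to keep the example short:

```julia
using KernelAbstractions

# Stage 1, run once per coupling step: interpolate each atmospheric field in time onto a
# pre-allocated surface array (ua is 4D: x, y, singleton z, time; n₁/n₂ bracket the current
# time and w is the interpolation weight). Only these interpolated arrays then enter the
# flux kernel, so atmos_grid, atmos_times, atmos_backend, and atmos_time_indexing drop out
# of its parameters.
@kernel function interpolate_atmos_field!(ua_surface, ua, n₁, n₂, w)
    i, j = @index(Global, NTuple)
    @inbounds ua_surface[i, j] = (1 - w) * ua[i, j, 1, n₁] + w * ua[i, j, 1, n₂]
end
```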
Note that changing `z = collect(-6000:600:0)` to `z = (-6000, 0)` saves very little parameter space (the `grid` goes from 1.469 KiB to 1.438 KiB). That may be the difference between an offset `StepRangeLen` and an offset `CuDeviceArray` (or two).
Apparently, removing the active cells map also does not save as much parameter space as I hoped. This is by changing `grid = ImmersedBoundaryGrid(grid, GridFittedBottom(bottom_height); active_cells_map = true)` to `grid = ImmersedBoundaryGrid(grid, GridFittedBottom(bottom_height))`:
Relevant parameters:
[1] __ctx__::KernelAbstractions.CompilerMetadata{Oceananigans.Utils.OffsetStaticSize{(0:51, 0:51)}, KernelAbstractions.NDIteration.DynamicCheck, Nothing, Nothing, KernelAbstractions.NDIteration.NDRange{2, KernelAbstractions.NDIteration.StaticSize{(4, 4)}, KernelAbstractions.NDIteration.StaticSize{(16, 16)}, Tuple{Int64, Int64}, Oceananigans.Utils.KernelOffsets{Tuple{Int64, Int64}}}} uses 32 bytes
[2] similarity_theory::SimilarityTheoryTurbulentFluxes{Float64, ClimaOcean.OceanSeaIceModels.CrossRealmFluxes.SimilarityScales{ClimaOcean.OceanSeaIceModels.CrossRealmFluxes.MomentumStabilityFunction{Float64}, ClimaOcean.OceanSeaIceModels.CrossRealmFluxes.ScalarStabilityFunction{Float64}, ClimaOcean.OceanSeaIceModels.CrossRealmFluxes.ScalarStabilityFunction{Float64}}, ClimaOcean.OceanSeaIceModels.PrescribedAtmospheres.PrescribedAtmosphereThermodynamicsParameters{Float64}, ClimaOcean.OceanSeaIceModels.CrossRealmFluxes.ClasiusClapyeronSaturation, Float64, ClimaOcean.OceanSeaIceModels.CrossRealmFluxes.SimilarityScales{ClimaOcean.OceanSeaIceModels.CrossRealmFluxes.MomentumRoughnessLength{Float64, ClimaOcean.OceanSeaIceModels.CrossRealmFluxes.TemperatureDependentAirViscosity{Float64}}, ClimaOcean.OceanSeaIceModels.CrossRealmFluxes.ScalarRoughnessLength{Float64, ClimaOcean.OceanSeaIceModels.CrossRealmFluxes.TemperatureDependentAirViscosity{Float64}, ClimaOcean.OceanSeaIceModels.CrossRealmFluxes.ReynoldsScalingFunction{Float64}}, ClimaOcean.OceanSeaIceModels.CrossRealmFluxes.ScalarRoughnessLength{Float64, ClimaOcean.OceanSeaIceModels.CrossRealmFluxes.TemperatureDependentAirViscosity{Float64}, ClimaOcean.OceanSeaIceModels.CrossRealmFluxes.ReynoldsScalingFunction{Float64}}}, ClimaOcean.OceanSeaIceModels.CrossRealmFluxes.LogarithmicSimilarityProfile, ClimaOcean.OceanSeaIceModels.CrossRealmFluxes.RelativeVelocity, @NamedTuple{latent_heat::Field{Center, Center, Nothing, Nothing, Nothing, Nothing, OffsetArrays.OffsetArray{Float64, 3, CUDA.CuDeviceArray{Float64, 3, 1}}, Float64, Nothing, Nothing, Nothing}, sensible_heat::Field{Center, Center, Nothing, Nothing, Nothing, Nothing, OffsetArrays.OffsetArray{Float64, 3, CUDA.CuDeviceArray{Float64, 3, 1}}, Float64, Nothing, Nothing, Nothing}, water_vapor::Field{Center, Center, Nothing, Nothing, Nothing, Nothing, OffsetArrays.OffsetArray{Float64, 3, CUDA.CuDeviceArray{Float64, 3, 1}}, Float64, Nothing, Nothing, Nothing}, x_momentum::Field{Center, Center, Nothing, Nothing, Nothing, Nothing, OffsetArrays.OffsetArray{Float64, 3, CUDA.CuDeviceArray{Float64, 3, 1}}, Float64, Nothing, Nothing, Nothing}, y_momentum::Field{Center, Center, Nothing, Nothing, Nothing, Nothing, OffsetArrays.OffsetArray{Float64, 3, CUDA.CuDeviceArray{Float64, 3, 1}}, Float64, Nothing, Nothing, Nothing}}} uses 984 bytes
[3] grid::ImmersedBoundaryGrid{Float64, Periodic, RightConnected, Bounded, OrthogonalSphericalShellGrid{Float64, Periodic, RightConnected, Bounded, OffsetArrays.OffsetMatrix{Float64, CUDA.CuDeviceMatrix{Float64, 1}}, OffsetArrays.OffsetVector{Float64, CUDA.CuDeviceVector{Float64, 1}}, OffsetArrays.OffsetVector{Float64, CUDA.CuDeviceVector{Float64, 1}}, OrthogonalSphericalShellGrids.Tripolar{Int64, Int64, Int64}, Nothing}, GridFittedBottom{Field{Center, Center, Nothing, Nothing, Nothing, Nothing, OffsetArrays.OffsetArray{Float64, 3, CUDA.CuDeviceArray{Float64, 3, 1}}, Float64, Nothing, Nothing, Nothing}, Oceananigans.ImmersedBoundaries.CenterImmersedCondition}, Nothing, Nothing, Nothing} uses 1.406 KiB
[4] clock::@NamedTuple{time::Float64, last_Δt::Float64, last_stage_Δt::Float64, iteration::Int64, stage::Int64} uses 40 bytes
[5] ocean_state::@NamedTuple{u::SubArray{Float64, 2, OffsetArrays.OffsetArray{Float64, 3, CUDA.CuDeviceArray{Float64, 3, 1}}, Tuple{Base.Slice{OffsetArrays.IdOffsetRange{Int64, Base.OneTo{Int64}}}, Base.Slice{OffsetArrays.IdOffsetRange{Int64, Base.OneTo{Int64}}}, Int64}, true}, v::SubArray{Float64, 2, OffsetArrays.OffsetArray{Float64, 3, CUDA.CuDeviceArray{Float64, 3, 1}}, Tuple{Base.Slice{OffsetArrays.IdOffsetRange{Int64, Base.OneTo{Int64}}}, Base.Slice{OffsetArrays.IdOffsetRange{Int64, Base.OneTo{Int64}}}, Int64}, true}, w::SubArray{Float64, 2, OffsetArrays.OffsetArray{Float64, 3, CUDA.CuDeviceArray{Float64, 3, 1}}, Tuple{Base.Slice{OffsetArrays.IdOffsetRange{Int64, Base.OneTo{Int64}}}, Base.Slice{OffsetArrays.IdOffsetRange{Int64, Base.OneTo{Int64}}}, Int64}, true}, T::SubArray{Float64, 2, OffsetArrays.OffsetArray{Float64, 3, CUDA.CuDeviceArray{Float64, 3, 1}}, Tuple{Base.Slice{OffsetArrays.IdOffsetRange{Int64, Base.OneTo{Int64}}}, Base.Slice{OffsetArrays.IdOffsetRange{Int64, Base.OneTo{Int64}}}, Int64}, true}, S::SubArray{Float64, 2, OffsetArrays.OffsetArray{Float64, 3, CUDA.CuDeviceArray{Float64, 3, 1}}, Tuple{Base.Slice{OffsetArrays.IdOffsetRange{Int64, Base.OneTo{Int64}}}, Base.Slice{OffsetArrays.IdOffsetRange{Int64, Base.OneTo{Int64}}}, Int64}, true}, e::SubArray{Float64, 2, OffsetArrays.OffsetArray{Float64, 3, CUDA.CuDeviceArray{Float64, 3, 1}}, Tuple{Base.Slice{OffsetArrays.IdOffsetRange{Int64, Base.OneTo{Int64}}}, Base.Slice{OffsetArrays.IdOffsetRange{Int64, Base.OneTo{Int64}}}, Int64}, true}} uses 768 bytes
[7] atmos_state::@NamedTuple{u::OffsetArrays.OffsetArray{Float32, 4, CUDA.CuDeviceArray{Float32, 4, 1}}, v::OffsetArrays.OffsetArray{Float32, 4, CUDA.CuDeviceArray{Float32, 4, 1}}, T::OffsetArrays.OffsetArray{Float32, 4, CUDA.CuDeviceArray{Float32, 4, 1}}, q::OffsetArrays.OffsetArray{Float32, 4, CUDA.CuDeviceArray{Float32, 4, 1}}, r::OffsetArrays.OffsetArray{Float32, 4, CUDA.CuDeviceArray{Float32, 4, 1}}, p::OffsetArrays.OffsetArray{Float32, 4, CUDA.CuDeviceArray{Float32, 4, 1}}} uses 528 bytes
[8] atmos_grid::Oceananigans.Grids.ZRegularLLG{Float32, Periodic, Bounded, Flat, OffsetArrays.OffsetMatrix{Float32, CUDA.CuDeviceMatrix{Float32, 1}}, OffsetArrays.OffsetVector{Float32, CUDA.CuDeviceVector{Float32, 1}}, OffsetArrays.OffsetVector{Float32, CUDA.CuDeviceVector{Float32, 1}}, OffsetArrays.OffsetVector{Float32, CUDA.CuDeviceVector{Float32, 1}}, Float32, OffsetArrays.OffsetVector{Float32, CUDA.CuDeviceVector{Float32, 1}}, OffsetArrays.OffsetVector{Float32, CUDA.CuDeviceVector{Float32, 1}}, StepRangeLen{Float64, Base.TwicePrecision{Float64}, Base.TwicePrecision{Float64}, Int64}, Nothing} uses 1024 bytes
[9] atmos_times::StepRangeLen{Float64, Base.TwicePrecision{Float64}, Base.TwicePrecision{Float64}, Int64} uses 48 bytes
[10] atmos_backend::JRA55NetCDFBackend uses 16 bytes
[11] atmos_time_indexing::Oceananigans.OutputReaders.Cyclical{Float64} uses 8 bytes
[12] atmosphere_reference_height::Float32 uses 4 bytes
[13] atmosphere_boundary_layer_height::Float32 uses 4 bytes
[14] atmos_thermodynamics_parameters::ClimaOcean.OceanSeaIceModels.PrescribedAtmospheres.PrescribedAtmosphereThermodynamicsParameters{Float32} uses 56 bytes
So the `grid` goes from 1.469 KiB to 1.406 KiB.
Our fluxes computation requires a lot of parameter space and fails on older GPUs when using a tripolar grid.
MWE: using both `Oceananigans#main` and `OrthogonalSphericalShellGrids#main`.

On a Titan V, it leads to the code failing with the error shown above.