JuliaDataCubes / YAXArrays.jl

Yet Another XArray-like Julia package
https://juliadatacubes.github.io/YAXArrays.jl/
Other
101 stars 17 forks source link

Using a larger than memory NetCDF file should not lead to getindex error from the backendlist #243

Open felixcremer opened 1 year ago

felixcremer commented 1 year ago

If the output is larger than memory and Zarr is not loaded, we currently get the following error. We should either fall back to another backend that is available or give a better error message.

# Dataset "lst"
# Select the "t2m" variable from `ds`; `ds` is created outside this snippet.

lst = ds[Variable="t2m"]

YAXArray with the following dimensions
longitude           Axis with 101 Elements from -80.0 to -55.0
latitude            Axis with 101 Elements from 65.0 to 40.0
number              Axis with 50 Elements from 1 to 50
time                Axis with 721 Elements from 2019-05-01T00:00:00 to 2019-10-28T00:00:00
units: K
Total size: 2.74 GB

# Time handling
# Reduce each entry of the "time" axis to its (year, month, day) tuple, then collect,
# for every distinct tuple, the positions along the axis that carry it.
time_to_index = getAxis("time", lst)
time_index = yearmonthday.(time_to_index)
new_dates = unique(time_index)
index_in_cube = map(day -> findall(==(day), time_index), new_dates)

# Functions
"""
    maximum_by_index(xout, xin; index_list = time_to_index)

For every `i` in `eachindex(index_list)`, store in `xout[i]` the maximum of the entries
`xin[index_list[i]]` that are neither `missing` nor `NaN`. Groups without any such entry
(including empty groups) are left at `NaN`.

# Arguments
- `xout`: output buffer, overwritten in place; it is indexed with the indices of
  `index_list`.
- `xin`: input values, indexed with the elements of `index_list`.

# Keywords
- `index_list`: collection of index groups into `xin`. The default refers to the global
  `time_to_index`.

# Returns
`nothing`; the result is written to `xout`.
"""
function maximum_by_index(xout, xin; index_list = time_to_index)
    xout .= NaN
    # `isnan(missing)` is `missing`, which cannot be used as a condition, so missing
    # values are rejected before `isnan` is evaluated.
    isusable(v) = !ismissing(v) && !isnan(v)
    for i in eachindex(index_list)
        # Lazy filter over a view: the group is neither copied nor materialised.
        usable = Iterators.filter(isusable, view(xin, index_list[i]))
        isempty(usable) && continue  # nothing to reduce, keep the NaN fill value
        xout[i] = maximum(usable)
    end
    return nothing
end

"""
    dates_builder(x)

Return a `Vector{Date}` with one `Date` per element of `x`.

Each element of `x` is a tuple of date parts such as the `(year, month, day)` tuples
returned by `yearmonthday`. All parts of the tuple are forwarded to the `Date`
constructor, so the day is kept; a `(year, month)` tuple still yields the first day of
that month.
"""
function dates_builder(x)
    # Splat the whole tuple so that the day component is not silently dropped.
    return Date[Date(parts...) for parts in x]
end

# Input dimension handed to the user function, and the output "time" axis built from
# one date per entry of `new_dates`.
Indims = InDims("time")
outdims = OutDims(RangeAxis("time", dates_builder(new_dates)))

# `index_list` is forwarded as a keyword argument to `maximum_by_index`.
t2m_daily_high = mapCube(
    maximum_by_index,
    lst;
    indims = Indims,
    outdims = outdims,
    index_list = index_in_cube,
    showprog = true,
)

The error message complains about a missing `:zarr` key, even though the input file is a NetCDF file.

┌ Warning: There are still cache misses
└ @ YAXArrays.DAT ~/.julia/packages/YAXArrays/au5n4/src/DAT/DAT.jl:1070
KeyError: key :zarr not found

Stacktrace:
  [1] getindex(h::OrderedCollections.OrderedDict{Symbol, Any}, key::Symbol)
    @ OrderedCollections ~/.julia/packages/OrderedCollections/PRayh/src/ordered_dict.jl:380
  [2] getbackend(oc::YAXArrays.DAT.OutputCube, ispar::Base.RefValue{Bool}, max_cache::Float64)
    @ YAXArrays.DAT ~/.julia/packages/YAXArrays/au5n4/src/DAT/DAT.jl:784
  [3] generateOutCube(oc::YAXArrays.DAT.OutputCube, ispar::Base.RefValue{Bool}, max_cache::Float64, loopcachesize::Tuple{Int64, Int64, Int64}, co::Tuple{Int64, Int64, Int64})
    @ YAXArrays.DAT ~/.julia/packages/YAXArrays/au5n4/src/DAT/DAT.jl:846
  [4] (::YAXArrays.DAT.var"#131#132"{YAXArrays.DAT.DATConfig{1, 1}, Tuple{Int64, Int64, Int64}, Tuple{Int64, Int64, Int64}})(c::YAXArrays.DAT.OutputCube)
    @ YAXArrays.DAT ~/.julia/packages/YAXArrays/au5n4/src/DAT/DAT.jl:842
  [5] foreach(f::YAXArrays.DAT.var"#131#132"{YAXArrays.DAT.DATConfig{1, 1}, Tuple{Int64, Int64, Int64}, Tuple{Int64, Int64, Int64}}, itr::Tuple{YAXArrays.DAT.OutputCube})
    @ Base ./abstractarray.jl:2694
  [6] generateOutCubes(dc::YAXArrays.DAT.DATConfig{1, 1})
    @ YAXArrays.DAT ~/.julia/packages/YAXArrays/au5n4/src/DAT/DAT.jl:841
  [7] mapCube(::typeof(maximum_by_index), ::Tuple{YAXArray{Union{Missing, Float64}, 4, DiskArrays.SubDiskArray{Union{Missing, Float64}, 4}, Vector{CubeAxis}}}; max_cache::Float64, indims::InDims, outdims::OutDims, inplace::Bool, ispar::Bool, debug::Bool, include_loopvars::Bool, showprog::Bool, irregular_loopranges::Bool, nthreads::Vector{Int64}, loopchunksize::Dict{Any, Any}, kwargs::Base.Pairs{Symbol, Vector{Vector{Int64}}, Tuple{Symbol}, NamedTuple{(:index_list,), Tuple{Vector{Vector{Int64}}}}})
    @ YAXArrays.DAT ~/.julia/packages/YAXArrays/au5n4/src/DAT/DAT.jl:472
  [8] #mapCube#36
    @ ~/.julia/packages/YAXArrays/au5n4/src/DAT/DAT.jl:303 [inlined]
  [9] top-level scope
    @ In[13]:1
 [10] eval
    @ ./boot.jl:373 [inlined]
 [11] include_string(mapexpr::typeof(REPL.softscope), mod::Module, code::String, filename::String)
    @ Base ./loading.jl:1196

Edit - The code works if I extract a single member of the ensemble (e.g. number=21):

# Same mapCube call, but the input is restricted to the single ensemble member number=21.
t2m_daily_high = mapCube(maximum_by_index, ds[Variable="t2m", number=21], indims = Indims, outdims = outdims; index_list = index_in_cube, showprog = true)

YAXArray with the following dimensions
time                Axis with 181 Elements from 2019-05-01 to 2019-10-01
longitude           Axis with 101 Elements from -80.0 to -55.0
latitude            Axis with 101 Elements from 65.0 to 40.0
Total size: 42.26 MB

Originally posted by @Balinus in https://github.com/JuliaDataCubes/YAXArrays.jl/issues/217#issuecomment-1471778989