JuliaDataCubes / YAXArrays.jl

Yet Another XArray-like Julia package
https://juliadatacubes.github.io/YAXArrays.jl/
Other
100 stars 16 forks source link

Estimating statistics per month #217

Open dpabon opened 1 year ago

dpabon commented 1 year ago

In case someone else needs it, here is my implementation for estimating the median per month:

using using EarthDataLab
using YAXArrays
using Statistics
earth_dataset = esdd()

# Checking the properties of the land surface temperature product
earth_dataset.land_surface_temperature.properties

#=
"long_name"                => "Land Surface Temperature"
  "time_coverage_end"        => "2011-12-31"
  "time_coverage_resolution" => "P8D"
  "time_coverage_start"      => "2002-05-21"
  "name"                     => "land_surface_temperature"
  "esa_cci_path"             => "NaN"
  "orig_version"             => "NaN"
=#

# MCD12Q1

lst = earth_dataset.land_surface_temperature[time = (Date("2002-05-21"),Date("2011-12-31"))]

# let's estimate the median per month:

time_to_index = getAxis("time", lst)

time_index = yearmonth.(time_to_index)

new_dates = unique(time_index)

index_in_cube = [findall(==(i), time_index) for i in unique(time_index)]

function median_by_index(xout, xin; index_list = time_to_index)
    #@show size(xin)
    #@show typeof(xin)
    xout .= NaN
    if !all(isnan, xin)
        for i in eachindex(index_list)
            if !all(isnan, xin[index_list[i]])
                xout[i] = median(filter(!isnan, xin[index_list[i]]))
            end
        end
    end

end 

function dates_builder(x)
    out = Date[]
    for i in eachindex(x)
        push!(out, Date(x[i][1], x[i][2]))
    end

    return out
end

Indims = InDims("Time")

outdims = OutDims(RangeAxis("time", dates_builder(new_dates)))

lst_monthly_high = mapCube(median_by_index, lst, indims = Indims, outdims = outdims; index_list = index_in_cube, showprog = true)
Balinus commented 1 year ago

Thanks for thise code! Very useful. I' m trying to do something similar to get daily maximum/minimum/mean values for 6-hourly dataset. Works great!

However, I have 50-member (so I have an additional dimention number). I'm not sure how to use your code to include another dimensions.

The original dimensions are :

YAXArray with the following dimensions
longitude           Axis with 101 Elements from -80.0 to -55.0
latitude            Axis with 101 Elements from 65.0 to 40.0
number              Axis with 50 Elements from 1 to 50
time                Axis with 721 Elements from 2019-05-01T00:00:00 to 2019-10-28T00:00:00
Variable            Axis with 16 elements: fg10 mn2t24 .. tclw mx2t24 
units: K
Total size: 43.84 GB

and I'm trying to have something like (181 time elements).

YAXArray with the following dimensions
longitude           Axis with 101 Elements from -80.0 to -55.0
latitude            Axis with 101 Elements from 65.0 to 40.0
number              Axis with 50 Elements from 1 to 50
time                Axis with 181 Elements from 2019-05-01T00:00:00 to 2019-10-28T00:00:00
Variable            Axis with 16 elements: fg10 mn2t24 .. tclw mx2t24 
units: K
Total size: 43.84 GB

My guess is something using InDims/OutDims but I must admint I don't understand yet everything. Or can I just add a loop in the function median_by_index over the number dimensions?

Any help would be much welcome! Cheers!

felixcremer commented 1 year ago

Have you tried to apply the function as is? YAXArrays should make the loop over the number dimension as well. If you want to keep min, Max and mean you could add an Stats dimension to the outdims. outdims = OutDims(RangeAxis("time", dates_builder(new_dates)), CategoricalAxis("stats",["min","max","mean"]))

Balinus commented 1 year ago

Thanks!

Yes, I tried and I get an error I have difficulty to understand. Here's my code and error msg.

# Dataset "lst"

lst = ds[Variable="t2m"]

YAXArray with the following dimensions
longitude           Axis with 101 Elements from -80.0 to -55.0
latitude            Axis with 101 Elements from 65.0 to 40.0
number              Axis with 50 Elements from 1 to 50
time                Axis with 721 Elements from 2019-05-01T00:00:00 to 2019-10-28T00:00:00
units: K
Total size: 2.74 GB

# Time handling
time_to_index = getAxis("time", lst)
time_index = yearmonthday.(time_to_index)
new_dates = unique(time_index)
index_in_cube = [findall(==(i), time_index) for i in unique(time_index)]

# Functions
function maximum_by_index(xout, xin; index_list = time_to_index)
    #@show size(xin)
    #@show typeof(xin)
    xout .= NaN
    if !all(isnan, xin)
        for i in eachindex(index_list)
            if !all(isnan, xin[index_list[i]])
                xout[i] = maximum(filter(!isnan, xin[index_list[i]]))
            end
        end
    end

end 

function dates_builder(x)
    out = Date[]
    for i in eachindex(x)
        push!(out, Date(x[i][1], x[i][2]))
    end

    return out
end

Indims = InDims("time")
outdims = OutDims(RangeAxis("time", dates_builder(new_dates)))

t2m_daily_high = mapCube(maximum_by_index, lst, indims = Indims, outdims = outdims; index_list = index_in_cube, showprog = true)

Error message about a key with Zarr (the file is a netCDF).

┌ Warning: There are still cache misses
└ @ YAXArrays.DAT ~/.julia/packages/YAXArrays/au5n4/src/DAT/DAT.jl:1070
KeyError: key :zarr not found

Stacktrace:
  [1] getindex(h::OrderedCollections.OrderedDict{Symbol, Any}, key::Symbol)
    @ OrderedCollections ~/.julia/packages/OrderedCollections/PRayh/src/ordered_dict.jl:380
  [2] getbackend(oc::YAXArrays.DAT.OutputCube, ispar::Base.RefValue{Bool}, max_cache::Float64)
    @ YAXArrays.DAT ~/.julia/packages/YAXArrays/au5n4/src/DAT/DAT.jl:784
  [3] generateOutCube(oc::YAXArrays.DAT.OutputCube, ispar::Base.RefValue{Bool}, max_cache::Float64, loopcachesize::Tuple{Int64, Int64, Int64}, co::Tuple{Int64, Int64, Int64})
    @ YAXArrays.DAT ~/.julia/packages/YAXArrays/au5n4/src/DAT/DAT.jl:846
  [4] (::YAXArrays.DAT.var"#131#132"{YAXArrays.DAT.DATConfig{1, 1}, Tuple{Int64, Int64, Int64}, Tuple{Int64, Int64, Int64}})(c::YAXArrays.DAT.OutputCube)
    @ YAXArrays.DAT ~/.julia/packages/YAXArrays/au5n4/src/DAT/DAT.jl:842
  [5] foreach(f::YAXArrays.DAT.var"#131#132"{YAXArrays.DAT.DATConfig{1, 1}, Tuple{Int64, Int64, Int64}, Tuple{Int64, Int64, Int64}}, itr::Tuple{YAXArrays.DAT.OutputCube})
    @ Base ./abstractarray.jl:2694
  [6] generateOutCubes(dc::YAXArrays.DAT.DATConfig{1, 1})
    @ YAXArrays.DAT ~/.julia/packages/YAXArrays/au5n4/src/DAT/DAT.jl:841
  [7] mapCube(::typeof(maximum_by_index), ::Tuple{YAXArray{Union{Missing, Float64}, 4, DiskArrays.SubDiskArray{Union{Missing, Float64}, 4}, Vector{CubeAxis}}}; max_cache::Float64, indims::InDims, outdims::OutDims, inplace::Bool, ispar::Bool, debug::Bool, include_loopvars::Bool, showprog::Bool, irregular_loopranges::Bool, nthreads::Vector{Int64}, loopchunksize::Dict{Any, Any}, kwargs::Base.Pairs{Symbol, Vector{Vector{Int64}}, Tuple{Symbol}, NamedTuple{(:index_list,), Tuple{Vector{Vector{Int64}}}}})
    @ YAXArrays.DAT ~/.julia/packages/YAXArrays/au5n4/src/DAT/DAT.jl:472
  [8] #mapCube#36
    @ ~/.julia/packages/YAXArrays/au5n4/src/DAT/DAT.jl:303 [inlined]
  [9] top-level scope
    @ In[13]:1
 [10] eval
    @ ./boot.jl:373 [inlined]
 [11] include_string(mapexpr::typeof(REPL.softscope), mod::Module, code::String, filename::String)
    @ Base ./loading.jl:1196

Edit - The code work if I extract a single member of the ensemble (e.g. number=21):

t2m_daily_high = mapCube(maximum_by_index, ds[Variable="t2m", number=21], indims = Indims, outdims = outdims; index_list = index_in_cube, showprog = true)

YAXArray with the following dimensions
time                Axis with 181 Elements from 2019-05-01 to 2019-10-01
longitude           Axis with 101 Elements from -80.0 to -55.0
latitude            Axis with 101 Elements from 65.0 to 40.0
Total size: 42.26 MB
felixcremer commented 1 year ago

This is only a problem with the size of the array. If you extract the computation can be handled in memory, but when you try to do the computation for all 50 ensembles it has to save the results somewhere and it tries to use the Zarr backend as a default, but this is not loaded. To circumvent this problem you can either load the Zarr package via using Zarr or you can force it to use the NetCDF backend by specifiying this in the OutDims.

outdims = OutDims(RangeAxis("time", dates_builder(new_dates)), backend=:netcdf)
felixcremer commented 1 year ago

Admittedly, this is a bad error message, therefore I opened a new issue to track this error message specifically.

Balinus commented 1 year ago

Ah, thanks! Indeed, it was simply a matter of loading the Zarr library. Thanks!

There was another problem that seems linked to the data. I currently have 16 variables in the Cube and some of them fails when I try to calculate the daily maximum values. For instance :

@time fg10_daily_high = mapCube(maximum_by_index, lst[Variable="fg10"], indims = indims, outdims = outdims; index_list = index_in_cube, showprog = true, max_cache=1.0e9)

return the following error:

┌ Warning: There are still cache misses
└ @ YAXArrays.DAT /gpfs/home/dl2594/.julia/packages/YAXArrays/au5n4/src/DAT/DAT.jl:1070
TaskFailedException

    nested task error: TypeError: non-boolean (Missing) used in boolean context
    Stacktrace:
     [1] maximum_by_index(xout::Vector{Union{Missing, Float64}}, xin::Vector{Union{Missing, Float64}}; index_list::Vector{Vector{Int64}})
       @ Main ./In[23]:7
     [2] innercode(f::Function, cI::CartesianIndex{3}, xinBC::Tuple{YAXArrays.YAXTools.PickAxisArray{Union{Missing, Float64}, 4, Array{Union{Missing, Float64}, 4}, (1, 2, 3, Colon()), nothing}}, xoutBC::Tuple{YAXArrays.YAXTools.PickAxisArray{Union{Missing, Float64}, 4, Array{Union{Missing, Float64}, 4}, (Colon(), 1, 2, 3), nothing}}, filters::Tuple{Tuple{YAXArrays.DAT.AllMissing}}, inwork::Tuple{Vector{Vector{Union{Missing, Float64}}}}, outwork::Tuple{Vector{Vector{Union{Missing, Float64}}}}, axvalcreator::YAXArrays.DAT.NoLoopAxes, offscur::Tuple{Int64, Int64, Int64}, addargs::Tuple{}, kwargs::Base.Pairs{Symbol, Vector{Vector{Int64}}, Tuple{Symbol}, NamedTuple{(:index_list,), Tuple{Vector{Vector{Int64}}}}})
       @ YAXArrays.DAT ~/.julia/packages/YAXArrays/au5n4/src/DAT/DAT.jl:1123
     [3] macro expansion
       @ ~/.julia/packages/YAXArrays/au5n4/src/DAT/DAT.jl:1162 [inlined]
     [4] (::YAXArrays.DAT.var"#395#threadsfor_fun#175"{typeof(maximum_by_index), Tuple{YAXArrays.YAXTools.PickAxisArray{Union{Missing, Float64}, 4, Array{Union{Missing, Float64}, 4}, (1, 2, 3, Colon()), nothing}}, Tuple{YAXArrays.YAXTools.PickAxisArray{Union{Missing, Float64}, 4, Array{Union{Missing, Float64}, 4}, (Colon(), 1, 2, 3), nothing}}, Tuple{Tuple{YAXArrays.DAT.AllMissing}}, Tuple{Vector{Vector{Union{Missing, Float64}}}}, Tuple{Vector{Vector{Union{Missing, Float64}}}}, YAXArrays.DAT.NoLoopAxes, Tuple{}, Base.Pairs{Symbol, Vector{Vector{Int64}}, Tuple{Symbol}, NamedTuple{(:index_list,), Tuple{Vector{Vector{Int64}}}}}, Tuple{Int64, Int64, Int64}, CartesianIndices{3, Tuple{UnitRange{Int64}, UnitRange{Int64}, UnitRange{Int64}}}})(onethread::Bool)
       @ YAXArrays.DAT ./threadingconstructs.jl:85
     [5] (::YAXArrays.DAT.var"#395#threadsfor_fun#175"{typeof(maximum_by_index), Tuple{YAXArrays.YAXTools.PickAxisArray{Union{Missing, Float64}, 4, Array{Union{Missing, Float64}, 4}, (1, 2, 3, Colon()), nothing}}, Tuple{YAXArrays.YAXTools.PickAxisArray{Union{Missing, Float64}, 4, Array{Union{Missing, Float64}, 4}, (Colon(), 1, 2, 3), nothing}}, Tuple{Tuple{YAXArrays.DAT.AllMissing}}, Tuple{Vector{Vector{Union{Missing, Float64}}}}, Tuple{Vector{Vector{Union{Missing, Float64}}}}, YAXArrays.DAT.NoLoopAxes, Tuple{}, Base.Pairs{Symbol, Vector{Vector{Int64}}, Tuple{Symbol}, NamedTuple{(:index_list,), Tuple{Vector{Vector{Int64}}}}}, Tuple{Int64, Int64, Int64}, CartesianIndices{3, Tuple{UnitRange{Int64}, UnitRange{Int64}, UnitRange{Int64}}}})()
       @ YAXArrays.DAT ./threadingconstructs.jl:52

Stacktrace:
  [1] wait
    @ ./task.jl:334 [inlined]
  [2] threading_run(func::Function)
    @ Base.Threads ./threadingconstructs.jl:38
  [3] macro expansion
    @ ./threadingconstructs.jl:97 [inlined]
  [4] innerLoop(loopRanges::Tuple{UnitRange{Int64}, UnitRange{Int64}, UnitRange{Int64}}, f::Function, xinBC::Tuple{YAXArrays.YAXTools.PickAxisArray{Union{Missing, Float64}, 4, Array{Union{Missing, Float64}, 4}, (1, 2, 3, Colon()), nothing}}, xoutBC::Tuple{YAXArrays.YAXTools.PickAxisArray{Union{Missing, Float64}, 4, Array{Union{Missing, Float64}, 4}, (Colon(), 1, 2, 3), nothing}}, filters::Tuple{Tuple{YAXArrays.DAT.AllMissing}}, inwork::Tuple{Vector{Vector{Union{Missing, Float64}}}}, outwork::Tuple{Vector{Vector{Union{Missing, Float64}}}}, axvalcreator::YAXArrays.DAT.NoLoopAxes, addargs::Tuple{}, kwargs::Base.Pairs{Symbol, Vector{Vector{Int64}}, Tuple{Symbol}, NamedTuple{(:index_list,), Tuple{Vector{Vector{Int64}}}}})
    @ YAXArrays.DAT ~/.julia/packages/YAXArrays/au5n4/src/DAT/DAT.jl:1161
  [5] (::YAXArrays.DAT.var"#108#112"{YAXArrays.DAT.DATConfig{1, 1}})(r::Tuple{UnitRange{Int64}, UnitRange{Int64}, UnitRange{Int64}})
    @ YAXArrays.DAT ~/.julia/packages/YAXArrays/au5n4/src/DAT/DAT.jl:709
  [6] (::ProgressMeter.var"#56#59"{Distributed.RemoteChannel{Channel{Bool}}, YAXArrays.DAT.var"#108#112"{YAXArrays.DAT.DATConfig{1, 1}}})(x::Tuple{UnitRange{Int64}, UnitRange{Int64}, UnitRange{Int64}})
    @ ProgressMeter ~/.julia/packages/ProgressMeter/sN2xr/src/ProgressMeter.jl:1016
  [7] iterate
    @ ./generator.jl:47 [inlined]
  [8] _collect(c::DiskArrays.GridChunks{3}, itr::Base.Generator{DiskArrays.GridChunks{3}, ProgressMeter.var"#56#59"{Distributed.RemoteChannel{Channel{Bool}}, YAXArrays.DAT.var"#108#112"{YAXArrays.DAT.DATConfig{1, 1}}}}, #unused#::Base.EltypeUnknown, isz::Base.HasShape{3})
    @ Base ./array.jl:744
  [9] collect_similar(cont::DiskArrays.GridChunks{3}, itr::Base.Generator{DiskArrays.GridChunks{3}, ProgressMeter.var"#56#59"{Distributed.RemoteChannel{Channel{Bool}}, YAXArrays.DAT.var"#108#112"{YAXArrays.DAT.DATConfig{1, 1}}}})
    @ Base ./array.jl:653
 [10] map(f::Function, A::DiskArrays.GridChunks{3})
    @ Base ./abstractarray.jl:2849
 [11] macro expansion
    @ ~/.julia/packages/ProgressMeter/sN2xr/src/ProgressMeter.jl:1015 [inlined]
 [12] macro expansion
    @ ./task.jl:399 [inlined]
 [13] macro expansion
    @ ~/.julia/packages/ProgressMeter/sN2xr/src/ProgressMeter.jl:1014 [inlined]
 [14] macro expansion
    @ ./task.jl:399 [inlined]
 [15] progress_map(::Function, ::Vararg{Any}; mapfun::Function, progress::ProgressMeter.Progress, channel_bufflen::Int64, kwargs::Base.Pairs{Symbol, Union{}, Tuple{}, NamedTuple{(), Tuple{}}})
    @ ProgressMeter ~/.julia/packages/ProgressMeter/sN2xr/src/ProgressMeter.jl:1007
 [16] progress_map(::Function, ::Vararg{Any})
    @ ProgressMeter ~/.julia/packages/ProgressMeter/sN2xr/src/ProgressMeter.jl:1003
 [17] runLoop(dc::YAXArrays.DAT.DATConfig{1, 1}, showprog::Bool)
    @ YAXArrays.DAT ~/.julia/packages/YAXArrays/au5n4/src/DAT/DAT.jl:707
 [18] mapCube(::typeof(maximum_by_index), ::Tuple{YAXArray{Union{Missing, Float64}, 4, DiskArrays.SubDiskArray{Union{Missing, Float64}, 4}, Vector{CubeAxis}}}; max_cache::Float64, indims::InDims, outdims::OutDims, inplace::Bool, ispar::Bool, debug::Bool, include_loopvars::Bool, showprog::Bool, irregular_loopranges::Bool, nthreads::Vector{Int64}, loopchunksize::Dict{Any, Any}, kwargs::Base.Pairs{Symbol, Vector{Vector{Int64}}, Tuple{Symbol}, NamedTuple{(:index_list,), Tuple{Vector{Vector{Int64}}}}})
    @ YAXArrays.DAT ~/.julia/packages/YAXArrays/au5n4/src/DAT/DAT.jl:475
 [19] #mapCube#36
    @ ~/.julia/packages/YAXArrays/au5n4/src/DAT/DAT.jl:303 [inlined]
 [20] top-level scope
    @ ./timing.jl:220 [inlined]
 [21] top-level scope
    @ ./In[51]:0
 [22] eval
    @ ./boot.jl:373 [inlined]
 [23] include_string(mapexpr::typeof(REPL.softscope), mod::Module, code::String, filename::String)
    @ Base ./loading.jl:1196

If I remove the "bad" data, I can use the function as-is and do the calculations over all variables and member. That is like magic!

Balinus commented 1 year ago

The problem seems related to the presence of missing values. Is there a function or option in open_dataset where we can set missing values to NaN or something else? I've looked at documentation and InDims seems to have options about missing values, but I'm unsure how to use it(?)

Balinus commented 1 year ago

Found my solution. The function was basically filtering for NaN. I just modified it to test for missing values.

I tried some benchmark for a sub-sample of 1 month of 6-hourly data and performance is similar to what I can get from ds.resample(time="1D").max().compute(). About 7 seconds.

Sadly, when using the whole dataset (~720 days, 50 members, 5 variables), the function is not scaling well: 45sec for Python vs 365sec for current implementation, using same computer power for both processes.

There is a lot of allocations that I need to take care of!

 @time daily_high = mapCube(maximum_by_index, ds, indims = indims, outdims = outdims; index_list = index_in_cube, showprog = true, max_cache=1.0e9)
  8.219816 seconds (193.34 M allocations: 20.026 GiB, 20.80% gc time)

Is there a better resampling approach?

edit - Current function

function maximum_by_index(xout, xin; index_list = time_to_index)

    xout .= NaN    
    if !all(ismissing, xin)

        for i in eachindex(index_list)
            data = xin[index_list[i]]
            if !all(ismissing, data)               
                    xout[i] = maximum(filter(!ismissing, data))
            end
        end

    end

end 

function dates_builder(x)
    out = Date[]
    for i in eachindex(x)
        push!(out, Date(x[i][1], x[i][2]))
    end

    return out
end
dpabon commented 1 year ago

Hi @Balinus glad that my script was useful.

I'm still learning Julia, then my original implementation is not very efficient in terms of memory allocation.

But I played a bit, optimizing your function:

using EarthDataLab
using YAXArrays
using Statistics
using Zarr
using Dates
using BenchmarkTools
earth_dataset = esdd()

###################################

lst = earth_dataset.land_surface_temperature[time = (Date("2002-05-21"),Date("2011-12-31"))]

# let's estimate the median per month:

time_to_index = getAxis("time", lst)

time_index = yearmonth.(time_to_index)

new_dates = unique(time_index)

index_in_cube = [findall(==(i), time_index) for i in unique(time_index)]

# original function

function maximum_by_index(xout, xin; index_list = time_to_index)

    xout .= NaN    
    if !all(ismissing, xin)

        for i in eachindex(index_list)
            data = xin[index_list[i]]
            if !all(ismissing, data)               
                    xout[i] = maximum(filter(!ismissing, data))
            end
        end

    end

end 

lst

dummy_data =rand(442)

output_1 = zeros(length(index_in_cube))

@btime maximum_by_index(output_1, dummy_data; index_list = index_in_cube)

your function after running twice:

8.738 μs (234 allocations: 21.12 KiB)

your function + some tricks:

function maximum_by_index(xout, xin; index_list = time_to_index)

    xout .= NaN    
    if !all(ismissing, xin)
        for i in eachindex(index_list)
            data = view(xin, index_list[i])
            xout[i] = maximum(skipmissing(data))
        end

    end

end 

@btime maximum_by_index(output_2, dummy_data; index_list = index_in_cube)

produce:

1.546 μs (2 allocations: 32 bytes)

just to double-check:

output_1 == output_2
true

Maybe there is space for even more optimization, but optimizing functions in Julia sometimes is a rabbit's hole.

Balinus commented 1 year ago

Thanks @dpabon !

I will take a shot with your functions and see how it goes, cheers!

Balinus commented 1 year ago

Way better than the initial function!

I added a check on the daily values (i have 2-3 days with allmissing)

function maximum_by_index_view(xout, xin; index_list = time_to_index)

    xout .= NaN
    if !all(ismissing, xin)
        for i in eachindex(index_list)
            data = view(xin, index_list[i])
            if !all(ismissing, data)
                xout[i] = maximum(skipmissing(data))
            end
        end

    end

end 

Benchmark is now (for a subset of the initial data): 420ms vs 120ms. So, still a "speedup" of about 3-4x for Python versus an initial speedup of 8x. Anyway, I think that might be fast enough to continue with the current implementation and profit from julia's flexibility down the road, thanks!

Balinus commented 1 year ago

Is there a way to generalize this function under a "resampling" umbrella? I think extracting data on a general time sample represent a lot of what climate scientist needs to do, like seasonal/daily/monthly statistics (max, min, mean or any arbitrary function like a climate indicator).

dpabon commented 1 year ago

I'm not certain how you are benchmarking Julia vs Python, but be aware that the first time you run the function, Julia compiles the code then it takes a bit longer and the second time should be considered as the real time.

Yes, I can re-write a meta function that runs YAXArrays map cube below, where you can pass the parameters you suggest. But it's going to take some days. I'm planning to release a "YAXArrayToolbox.jl" or "YAXArrayRecipes.jl" (not definite name yet) package to perform different operations and spatial analysis using YAXArrays.

Balinus commented 1 year ago

No worries! I will certainly check your package when it's ready. The package will be very useful I think 😄

About benchmarking, I use BenchmarkTools and the @benchmark macro. For Python, I force the .compute() method to ensure that the actual calculation is done (not 100% it is though!)

dpabon commented 1 year ago

Hi @Balinus,

You can now use the aggregate_time() function in the YAXArraysToolbox.jl package. Any feedback is more than welcome.

image

Balinus commented 1 year ago

Nice! I will take a look!

TabeaW commented 1 year ago

aggregate_time(cube; time_axis = "time", new_resolution = "month", new_time_step=1, fun="mean", p=nothing, skipMissing=true, skipnan=true, showprog=true, max_cache="1GB") leads for me to an error

ERROR: On worker 2:
TypeError: non-boolean (Missing) used in boolean context
Stacktrace:
  [1] #mean_by_index_2#6
    @ ~/.julia/packages/YAXArraysToolbox/mTuzS/src/aggregate_time.jl:102
  [2] innercode
    @ ~/.julia/packages/YAXArrays/au5n4/src/DAT/DAT.jl:1123
  [3] innerLoop
    @ ~/.julia/packages/YAXArrays/au5n4/src/DAT/DAT.jl:1146
  [4] #107
    @ ~/.julia/packages/YAXArrays/au5n4/src/DAT/DAT.jl:701
  [5] fnew
    @ ~/.julia/packages/YAXArrays/au5n4/src/DAT/DAT.jl:665
  [6] #56
    @ ~/.julia/packages/ProgressMeter/sN2xr/src/ProgressMeter.jl:1016
  [7] #invokelatest#2
    @ ./essentials.jl:816
  [8] invokelatest
    @ ./essentials.jl:813
  [9] #110
    @ ~/julia-1.9.0/share/julia/stdlib/v1.9/Distributed/src/process_messages.jl:285
 [10] run_work_thunk
    @ ~/julia-1.9.0/share/julia/stdlib/v1.9/Distributed/src/process_messages.jl:70
 [11] macro expansion
    @ ~/julia-1.9.0/share/julia/stdlib/v1.9/Distributed/src/process_messages.jl:285 [inlined]
 [12] #109
    @ ./task.jl:514
Stacktrace:
  [1] (::Base.var"#988#990")(x::Task)
    @ Base ./asyncmap.jl:177
  [2] foreach(f::Base.var"#988#990", itr::Vector{Any})
    @ Base ./abstractarray.jl:3073
  [3] maptwice(wrapped_f::Function, chnl::Channel{Any}, worker_tasks::Vector{Any}, c::DiskArrays.GridChunks{2})
    @ Base ./asyncmap.jl:177
  [4] wrap_n_exec_twice
    @ ./asyncmap.jl:153 [inlined]
  [5] #async_usemap#973
    @ ./asyncmap.jl:103 [inlined]
  [6] async_usemap
    @ ./asyncmap.jl:84 [inlined]
  [7] #asyncmap#972
    @ ./asyncmap.jl:81 [inlined]
  [8] asyncmap
    @ ./asyncmap.jl:80 [inlined]
  [9] pmap(f::Function, p::WorkerPool, c::DiskArrays.GridChunks{2}; distributed::Bool, batch_size::Int64, on_error::Nothing, retry_delays::Vector{Any}, retry_check::Nothing)
    @ ~/julia-1.9.0/share/julia/stdlib/v1.9/Distributed/src/pmap.jl:126
 [10] pmap(f::Function, p::WorkerPool, c::DiskArrays.GridChunks{2})
    @ ~/julia-1.9.0/share/julia/stdlib/v1.9/Distributed/src/pmap.jl:99
 [11] macro expansion
    @ ~/.julia/packages/ProgressMeter/sN2xr/src/ProgressMeter.jl:1015 [inlined]
 [12] macro expansion
    @ ./task.jl:476 [inlined]
 [13] macro expansion
    @ ~/.julia/packages/ProgressMeter/sN2xr/src/ProgressMeter.jl:1014 [inlined]
 [14] macro expansion
    @ ./task.jl:476 [inlined]
 [15] progress_map(::Function, ::Vararg{Any}; mapfun::Function, progress::ProgressMeter.Progress, channel_bufflen::Int64, kwargs::Base.Pairs{Symbol, Union{}, Tuple{}, NamedTuple{(), Tuple{}}})
    @ ProgressMeter ~/.julia/packages/ProgressMeter/sN2xr/src/ProgressMeter.jl:1007
 [16] #progress_pmap#60
    @ ~/.julia/packages/ProgressMeter/sN2xr/src/ProgressMeter.jl:1032 [inlined]
 [17] progress_pmap
    @ ~/.julia/packages/ProgressMeter/sN2xr/src/ProgressMeter.jl:1032 [inlined]
 [18] pmap_with_data(f::Function, p::WorkerPool, c::DiskArrays.GridChunks{2}; initfunc::Function, progress::ProgressMeter.Progress, kwargs::Base.Pairs{Symbol, Union{}, Tuple{}, NamedTuple{(), Tuple{}}})
    @ YAXArrays.DAT ~/.julia/packages/YAXArrays/au5n4/src/DAT/DAT.jl:668
 [19] pmap_with_data(f::Function, c::DiskArrays.GridChunks{2}; initfunc::Function, kwargs::Base.Pairs{Symbol, ProgressMeter.Progress, Tuple{Symbol}, NamedTuple{(:progress,), Tuple{ProgressMeter.Progress}}})
    @ YAXArrays.DAT ~/.julia/packages/YAXArrays/au5n4/src/DAT/DAT.jl:673
 [20] runLoop(dc::YAXArrays.DAT.DATConfig{1, 1}, showprog::Bool)
    @ YAXArrays.DAT ~/.julia/packages/YAXArrays/au5n4/src/DAT/DAT.jl:698
 [21] mapCube(::typeof(YAXArraysToolbox.mean_by_index_2), ::Tuple{YAXArray{Union{Missing, Float32}, 3, DiskArrayTools.CFDiskArray{Float32, 3, Float32, ZArray{Float32, 3, Zarr.BloscCompressor, DirectoryStore}}, Vector{RangeAxis}}}; max_cache::Float64, indims::InDims, outdims::OutDims, inplace::Bool, ispar::Bool, debug::Bool, include_loopvars::Bool, showprog::Bool, irregular_loopranges::Bool, nthreads::Dict{Int64, Int64}, loopchunksize::Dict{Any, Any}, kwargs::Base.Pairs{Symbol, Vector{Vector{Int64}}, Tuple{Symbol}, NamedTuple{(:index_list,), Tuple{Vector{Vector{Int64}}}}})
    @ YAXArrays.DAT ~/.julia/packages/YAXArrays/au5n4/src/DAT/DAT.jl:475
 [22] #mapCube#36
    @ ~/.julia/packages/YAXArrays/au5n4/src/DAT/DAT.jl:303 [inlined]
 [23] aggregate_time(cube_in::YAXArray{Union{Missing, Float32}, 3, DiskArrayTools.CFDiskArray{Float32, 3, Float32, ZArray{Float32, 3, Zarr.BloscCompressor, DirectoryStore}}, Vector{RangeAxis}}; time_axis::String, new_resolution::String, new_time_step::Int64, fun::String, p::Nothing, skipMissing::Bool, skipnan::Bool, showprog::Bool, max_cache::String)
    @ YAXArraysToolbox ~/.julia/packages/YAXArraysToolbox/mTuzS/src/aggregate_time.jl:569
 [24] top-level scope
    @ REPL[8]:1
dpabon commented 1 year ago

Hi @TabeaW please can open a new issue in the YAXArraysToolbox.jl package?

felixcremer commented 1 year ago

We should update the example to 0.5 and add it to the docs.

TabeaW commented 1 year ago

See maybe https://github.com/JuliaDataCubes/YAXArrays.jl/issues/306 for monthly mean.

dpabon commented 1 year ago

We should update the example to 0.5 and add it to the docs.

Hi Felix,

Which one exactly? The one to compute the mean per month?

felixcremer commented 1 year ago

Yes that one. I tried to update it yesterday but I ran into some Ti issues.

dpabon commented 11 months ago

Hi Felix, here my implementation (already working with YAXArrays v0.5) https://github.com/dpabon/YAXArraysToolbox.jl/blob/migration_v0.5_YAXArrays/src/aggregate_time.jl