rafaqz / DimensionalData.jl

Named dimensions and indexing for julia arrays and other data
https://rafaqz.github.io/DimensionalData.jl/stable/
MIT License
271 stars 38 forks source link

Generalizing the stats functions #504

Open sethaxen opened 1 year ago

sethaxen commented 1 year ago

Would you be interested in turning the implementation defined in https://github.com/rafaqz/DimensionalData.jl/blob/eb222fbb232bd9a162d44128ebf17a6213430bba/src/stack/methods.jl#L172-L183 into a utility function? I frequently find myself wanting to map a reduction to a scalar across some dimensions across all layers in a stack. Another generalization is that I sometimes want to map a reduction to a Tuple or NamedTuple across the layers but then turn the output into a new dimension, with me providing the new dimension name. This would for example be useful for defining Statistics.quantile(::AbstractDimStack). Would something like this be useful?

rafaqz commented 1 year ago

Not sure I 100% understand the use case but it sounds good, I'll get it when I read the PR :)

felixcremer commented 1 year ago

This seems to be a bit similar in functionality to the YAXArrays.mapCube function. https://juliadatacubes.github.io/YAXArrays.jl/dev/api/#YAXArrays.DAT.mapCube-Tuple%7BFunction,%20Tuple,%20Vararg%7BAny%7D%7D

With which you can specify the input and output dimensions and get a reduction across them. mapCube should also work on YAXArrays.Datasets but we could make this also work on DimStacks, but for that I would need to look into the difference between DimStack and YAXArrays.Dataset

rafaqz commented 1 year ago

Ah interesting, was wondering what mapCube was doing in your examples.

@sethaxen it looks like that function could be improved as well. I think passing Int dims through to the DimArray layers on line 176 is actually a bug, it should instead get the Nth dimension from the stack and use that, or maybe just not allow Int at all.

sethaxen commented 1 year ago

Not sure I 100% understand the use case but it sounds good, I'll get it when I read the PR :)

Here's a very rough implementation to demonstrate how it could be used:

function reduce_over_dims(f, stack::DimensionalData.AbstractDimStack; dims=:, new_dim=nothing)
    layers = map(DimensionalData.layers(stack)) do var
        if dims isa Union{Colon,Int}
            return f(var; dims)
        end
        kept_dims = Dimensions.otherdims(var, dims)
        if isempty(kept_dims)
            y = DimensionalData.DimArray(fill(f(var)), ())
        else
            y = map(f, eachslice(var; dims=kept_dims))
        end
        if new_dim !== nothing && eltype(y) <: Union{NamedTuple,Tuple}
            ks = _eltype_keys(y)
            subarrs = map(ks) do k
                map(Base.Fix2(getindex, k), y)
            end
            return cat(subarrs...; dims=new_dim)
        else
            return y
        end
    end
    dims isa Union{Colon,Int} && return layers
    layerdims_new = map(Dimensions.dims, layers)
    refdims_new = Dimensions.combinedims((
        Dimensions.refdims(stack)..., Dimensions.dims(stack, _astuple(dims))...
    ))
    return DimensionalData.rebuild(
        stack;
        data=map(DimensionalData.data, layers),
        layerdims=layerdims_new,
        dims=Dimensions.combinedims(layerdims_new...),
        refdims=refdims_new,
        layermetadata=map(Dimensions.metadata, layers),
        metadata=DimensionalData.NoMetadata(),
    )
end

_astuple(x) = (x,)
_astuple(x::Tuple) = x

_eltype_keys(::AbstractArray{<:NamedTuple{K}}) where {K} = ntuple(identity, length(K))
_eltype_keys(::AbstractArray{<:Tuple{Vararg{<:Any,N}}}) where {N} = ntuple(identity, N)

mymean(s::AbstractDimStack; dims=:) = reduce_over_dims(mean, s; dims)
function myquantile(s::AbstractDimStack, p; dims=:, new_dim=:prob)
    reduce_over_dims(Base.Fix2(quantile, p), s; dims, new_dim)
end

Now we test it

julia> s = DimStack((; x=DimArray(randn(100, 2), (:a, :b)), y=DimArray(randn(100, 3, 4), (:a, :c, :d))))
DimStack with dimensions: Dim{:a}, Dim{:b}, Dim{:c}, Dim{:d}
and 2 layers:
  :x Float64 dims: Dim{:a}, Dim{:b} (100×2)
  :y Float64 dims: Dim{:a}, Dim{:c}, Dim{:d} (100×3×4)

julia> mymean(s; dims=:a)
DimStack with dimensions: Dim{:b}, Dim{:c}, Dim{:d}
and 2 layers:
  :x Float64 dims: Dim{:b} (2)
  :y Float64 dims: Dim{:c}, Dim{:d} (3×4)

julia> myquantile(s, (0.05, 0.5, 0.95); dims=:a)
DimStack with dimensions: Dim{:b}, Dim{:prob}, Dim{:c}, Dim{:d}
and 2 layers:
  :x Float64 dims: Dim{:b}, Dim{:prob} (2×3)
  :y Float64 dims: Dim{:c}, Dim{:d}, Dim{:prob} (3×4×3)

This seems to be a bit similar in functionality to the YAXArrays.mapCube function.

Yes indeed, that seems very similar. And if the user provides outDims but the output type of f is not an array but some other collection like a NamedTuple, will it be converted to an array?

In my case I would either put this utility function here or in my downstream package, since YAXArrays would be too heavy a dependency for me.

rafaqz commented 1 year ago

Ok that does seem pretty useful.

@sethaxen YAXArrays also depends on DimensionalData.jl so maybe we can even all use this if it works for everyone? Although I guess what YAXArrays really adds here is DiskArrays.jl based chunking and parallelism to do this for huge arrays.