JuliaData / SplitApplyCombine.jl

Split-apply-combine strategies for Julia
Other
144 stars 15 forks source link

comparison as new parameter of the group function #38

Open sprmnt21 opened 2 years ago

sprmnt21 commented 2 years ago

I would like to ask whether a version of the group function with an additional parameter, like the comparison argument of the innerjoin function, has ever been considered. This flexibility could be very useful on many occasions. Just to illustrate the idea (certainly not because I have the audacity to suggest how the feature should be developed), I submit a naive version of the function and some application examples. The name, following the pattern of the functions groupsum, groupprod, etc., is grouplocal, because groups are formed "locally" rather than globally.

"""
    grouplocal(groups, valori, comparison)

Split `valori` into contiguous runs, driven by the parallel collection of keys `groups`.

The first element opens group 1. Every following key `grp` is compared with the key of the
element that opened the current group (the group "leader", not the previous element) by
calling `comparison(leader, grp)`:

- if it returns `true`, the value is appended to the current group;
- otherwise a new group is opened and `grp` becomes the new leader.

# Arguments
- `groups`: keys, iterated in lockstep with `valori`.
- `valori`: values to distribute into groups.
- `comparison`: two-argument predicate `(leader, key) -> Bool`.

# Returns
A `Dictionary{Tuple{Int,Int},Vector{eltype(valori)}}` whose keys are `(g, r)`, where `g` is
the running group number and `r` is the position of the element that opened the group.
Empty input yields an empty dictionary. If the inputs differ in length, the extra elements
of the longer one are ignored.
"""
function grouplocal(groups, valori, comparison)
    T = eltype(valori)
    out = Dictionary{Tuple{Int,Int},Vector{T}}()
    # No elements means no groups; return the empty dictionary rather than
    # failing when reading the first element.
    (isempty(groups) || isempty(valori)) && return out
    g = 1
    st = first(groups)  # key of the current group's leader
    # Keep a handle on the current bucket so appending needs no further lookup.
    current = get!(Vector{T}, out, (g, 1))
    push!(current, first(valori))
    # Walk the remaining pairs lazily (no slice copies); `r` is the 1-based position.
    for (r, (grp, value)) in Iterators.drop(enumerate(zip(groups, valori)), 1)
        if !comparison(st, grp)
            g += 1
            st = grp
            current = get!(Vector{T}, out, (g, r))
        end
        push!(current, value)
    end
    return out
end

Groups led by an uppercase letter

# The same vector is passed as both the group keys and the values to group.
seq=['A','b','c','D','e','f']
uc(x,y)=islowercase(x)!=islowercase(y)  # true when exactly one of x, y is lowercase
grouplocal(seq,seq,uc)

julia> grouplocal(seq,seq,uc)
2-element Dictionary{Tuple{Int64, Int64}, Vector{Char}}
 (1, 1) │ ['A', 'b', 'c']
 (2, 4) │ ['D', 'e', 'f']

Alternating runs of odd and even numbers

# The same vector is passed as both the group keys and the values to group.
seq1=[2,31,3,43,2,32,3,45,5,3,6,8,54,7,8,6]
p(x,y)=isodd(x)==isodd(y) # true when x and y have the same parity
grouplocal(seq1,seq1,p)

julia> grouplocal(seq1,seq1,p)
7-element Dictionary{Tuple{Int64, Int64}, Vector{Int64}}
  (1, 1) │ [2]
  (2, 2) │ [31, 3, 43]
  (3, 5) │ [2, 32]
  (4, 7) │ [3, 45, 5, 3]
 (5, 11) │ [6, 8, 54]
 (6, 14) │ [7]
 (7, 15) │ [8, 6]

A difference greater than 2 between consecutive values starts a new group

seq2=[2,3,5,8,9,11,12,15,16,17,22]
d(x,y)=y<=2             # ignores x; true while the key y is at most 2
ds=[0;diff(seq2)]    # differences between consecutive elements, with 0 prepended
# Here the keys (ds) and the grouped values (seq2) are different collections.
grouplocal(ds,seq2,d)

julia> grouplocal(ds,seq2,d)
4-element Dictionary{Tuple{Int64, Int64}, Vector{Int64}}
  (1, 1) │ [2, 3, 5]
  (2, 4) │ [8, 9, 11, 12]
  (3, 8) │ [15, 16, 17]
 (4, 11) │ [22]

A new group starts at each element whose first 4 characters match those of the current group leader

# The same vector is passed as both the group keys and the values to group.
nv=["name1","val11","val12","val13","name2","val21","val22"]
# NOTE(review): `cmp` is also the name of a function in Base; a different name
# would avoid the clash.
# NOTE(review): `x[1:4]` indexes the string by code unit, so it presumably relies on
# every entry starting with at least four ASCII characters, as these do.
cmp(x,y)=x[1:4]!=y[1:4]  # true when the first four characters differ
grouplocal(nv,nv,cmp)

julia> grouplocal(nv,nv,cmp)
2-element Dictionary{Tuple{Int64, Int64}, Vector{String}}
 (1, 1) │ ["name1", "val11", "val12", "val13"]
 (2, 5) │ ["name2", "val21", "val22"]

"""
    grouplocalview1(groups, valori, comparison)

Split `valori` into contiguous runs and return them as a vector of sub-collections.

The first key in `groups` leads the first run. Each following key `grp` is compared with
the leader of the current run via `comparison(leader, grp)`; when that returns `false`, the
current run is closed and `grp` becomes the leader of a new run.

# Arguments
- `groups`: keys that decide where runs break.
- `valori`: collection indexed with the resulting ranges; it must support `valori[a:b]`
  for every position covered by `groups`.
- `comparison`: two-argument predicate `(leader, key) -> Bool`.

# Returns
A vector with one entry `valori[a:b]` per run, in order. Range indexing copies, so for
arrays the entries are independent copies rather than views. Empty `groups` yields an
empty vector.
"""
function grouplocalview1(groups, valori, comparison)
    ranges = UnitRange{Int}[]
    # No keys means no runs; skip reading the first key, which would throw.
    isempty(groups) && return [valori[r] for r in ranges]
    start = 1           # position where the current run began
    st = first(groups)  # key of the current run's leader
    for (i, grp) in Iterators.drop(enumerate(groups), 1)
        if !comparison(st, grp)
            # Close the current run just before `i` and open a new one at `i`.
            push!(ranges, start:(i - 1))
            start = i
            st = grp
        end
    end
    # The last run always extends to the final element.
    push!(ranges, start:length(groups))
    return [valori[r] for r in ranges]
end
andyferris commented 2 years ago

So the "local" semantic here is to break up "runs"?

I think there is definitely a space for this. There is an operation that usually goes by the name partition, which behaves like group except that the groups are assumed to be contiguous. It should be possible to somehow pass the "previous group key" or something like that into a comparison function.

It's worth considering what the semantics of the keys are for dictionaries and other iterable data structures (the r part).

Another idea - you could just return an array of (sub?) arrays, for example,