JuliaCollections / IterTools.jl

Common functional iterator patterns
Other
152 stars 28 forks source link

a proposal for a sort of partition by #99

Open sprmnt21 opened 1 year ago

sprmnt21 commented 1 year ago

Would it be useful to have an iterator that sits somewhere between `groupby` and `partition`? The use case is taking consecutive slices of varying lengths (given by `steps`) from an iterator. As the examples below show, a trailing slice that cannot be filled completely is dropped.

julia> itr=10:-1:1
10:-1:1

julia> steps=[1,2,3,2]
4-element Vector{Int64}:
 1
 2
 3
 2

julia> collect(partby(itr,steps))
4-element Vector{Tuple{Vararg{Int64}}}:
 (10,)
 (9, 8)
 (7, 6, 5)
 (4, 3)

julia> steps=[1,2,3,5]
4-element Vector{Int64}:
 1
 2
 3
 5

julia> collect(partby(itr,steps))
3-element Vector{Tuple{Vararg{Int64}}}:
 (10,)
 (9, 8)
 (7, 6, 5)

julia> steps=[4,2,3,5]
4-element Vector{Int64}:
 4
 2
 3
 5

julia> collect(partby(itr,steps))
3-element Vector{Tuple{Vararg{Int64}}}:
 (10, 9, 8, 7)
 (6, 5)
 (4, 3, 2)

julia> steps=[4,2,3,1, 5]
5-element Vector{Int64}:
 4
 2
 3
 1
 5

julia> collect(partby(itr,steps))
4-element Vector{Tuple{Vararg{Int64}}}:
 (10, 9, 8, 7)
 (6, 5)
 (4, 3, 2)
 (1,)

julia> steps=[2,3,1, 2,7]
5-element Vector{Int64}:
 2
 3
 1
 2
 7

julia> collect(partby(itr,steps))
4-element Vector{Tuple{Vararg{Int64}}}:
 (10, 9)
 (8, 7, 6)
 (5,)
 (4, 3)

julia> collect(partby(partition(itr,2,1),steps))
4-element Vector{Tuple{Vararg{Tuple{Int64, Int64}}}}:    
 ((10, 9), (9, 8))
 ((8, 7), (7, 6), (6, 5))
 ((5, 4),)
 ((4, 3), (3, 2))

#-------------

"""
    PartBy{Itr,Stp}

Pairs a source iterable `xs` with an iterable `steps`. Both are stored as given;
no validation happens in the default constructor.
"""
struct PartBy{Itr,Stp}
    xs::Itr
    steps::Stp
end
# Number of complete chunks that fit into `i` when chunk lengths are taken from `s`.
#
# The running sum of `s` gives the position at which each chunk would end; a chunk
# fits only if that position does not exceed `length(i)`.
function _length_partby(i, s)
    available = length(i)
    last_fit = findlast(<=(available), accumulate(+, s))
    # `findlast` returns `nothing` when not even the first chunk fits (or `s` is
    # empty); that means zero chunks, and `length` must always be an integer.
    return something(last_fit, 0)
end
# Each produced item is a tuple of elements of the wrapped iterable; the tuple length
# differs from chunk to chunk, hence `Vararg`.
function eltype(::Type{<:PartBy{Itr,Stp}}) where {Itr,Stp}
    return Tuple{Vararg{eltype(Itr)}}
end
# Size trait for `PartBy`.
#
# The `length` method needs `length(it.xs)` and a full pass over `it.steps`, so a
# length is only advertised when both wrapped iterators report a known, finite size.
# Otherwise (e.g. a filtered or infinite source) fall back to `SizeUnknown()` so that
# `collect` grows its result incrementally instead of calling `length`.
function IteratorSize(::Type{<:PartBy{I,S}}) where {I,S}
    sized = Union{Base.HasLength,Base.HasShape}
    if Base.IteratorSize(I) isa sized && Base.IteratorSize(S) isa sized
        return Base.HasLength()
    end
    return Base.SizeUnknown()
end
# Number of chunks the iterator yields, delegated to `_length_partby`.
function length(it::PartBy)
    return _length_partby(it.xs, it.steps)
end

"""
    partby(xs, steps)

Wrap `xs` and `steps` in a `PartBy` iterator, which yields consecutive chunks of `xs`
as tuples, taking the length of each chunk from the successive entries of `steps`.

# Arguments
- `xs`: the iterable to slice.
- `steps`: iterable of chunk lengths; every entry must be a positive integer.

# Throws
- `ArgumentError` if any entry of `steps` is not a positive integer.
"""
function partby(xs::I, steps::S) where {I,S}
    # Validate here, at the API boundary: a non-integer step would otherwise only
    # surface later, as a MethodError while allocating the chunk buffer.
    for step in steps
        if !(step isa Integer) || step <= 0
            throw(ArgumentError(
                "all steps must be positive integers, got $(repr(step)).",
            ))
        end
    end
    return PartBy{I,S}(xs, steps)
end

# Evaluate `ex` in the caller's scope. If the value is `nothing`, the function that
# contains the macro call returns `nothing` right away; otherwise the macro call
# evaluates to that value.
macro ifsomething(ex)
    return quote
        value = $(esc(ex))
        if value === nothing
            return nothing
        end
        value
    end
end

# Iteration protocol for `PartBy`.
#
# `state` is `nothing` on the first call. Afterwards it is `(xs_args, steps_state)`,
# where `xs_args` is the argument tuple splatted into the next `iterate(it.xs, ...)`
# call: `()` while nothing has been read from `it.xs` yet, `(xs_state,)` afterwards.
#
# Each call takes the next chunk length from `it.steps`, reads that many elements from
# `it.xs` and yields them as a tuple. Iteration ends when `it.steps` is exhausted, or
# when `it.xs` runs out before the current chunk is complete (the partial chunk is
# discarded).
function iterate(it::PartBy{I,S}, state=nothing) where {I,S}
    if state === nothing
        xs_args = ()
        next_step = iterate(it.steps)
    else
        xs_args, steps_state = state
        next_step = iterate(it.steps, steps_state)
    end
    next_step === nothing && return nothing   # no chunk lengths left
    n, steps_state = next_step

    chunk = Vector{eltype(I)}(undef, n)
    for k in 1:n
        next_x = iterate(it.xs, xs_args...)
        next_x === nothing && return nothing  # source exhausted mid-chunk
        chunk[k], x_state = next_x
        xs_args = (x_state,)
    end
    return (Tuple(chunk), (xs_args, steps_state))
end
sprmnt21 commented 1 year ago

Consider the following problem. Given a list of strings, find the groups of consecutive strings led by a string starting with "AT".

julia> itr=[randstring("ACTG") for _ in 1:20]
20-element Vector{String}:
 "ATTCCGAG"
 "CCCGTGGT"
 "TCAAGGGT"
 "ATTAGATC"
 "TCTTACAC"
 "TTTCCGCC"
 "TCCGACCG"
 "GTCAGCTA"
 "CATGTTGC"
 "GAGGAACG"
 "GTCAATGC"
 "TACTCATT"
 "ATACTCTA"
 "AATTCACA"
 "AATCATAT"
 "GTATACCT"
 "ATTTTACT"
 "TTCAGAAG"
 "GTTGATGA"
 "GACGGCGG"

julia> steps=diff([findall(startswith("AT"), itr);length(itr)])        
4-element Vector{Int64}:
 3
 9
 4
 3

julia> collect(partby(itr,steps))
4-element Vector{Tuple{Vararg{String}}}:
 ("ATTCCGAG", "CCCGTGGT", "TCAAGGGT")
 ("ATTAGATC", "TCTTACAC", "TTTCCGCC", "TCCGACCG", "GTCAGCTA", "CATGTTGC", "GAGGAACG", "GTCAATGC", "TACTCATT")
 ("ATACTCTA", "AATTCACA", "AATCATAT", "GTATACCT")
 ("ATTTTACT", "TTCAGAAG", "GTTGATGA")

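or better, using `length(itr)+1` as the final boundary so that the last string is not dropped, and adding a leading group when the first string does not match: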

# Indices of the strings that open a group.
st = findall(startswith(somesubstring), itr)
# Group boundaries always start at index 1, so anything before the first match forms
# its own leading group and an empty `st` gives a single group instead of failing on
# `st[1]`. They end one past the last index so the final string is included.
# `unique!` drops the duplicated 1 when the first string itself matches.
steps = diff(unique!([1; st; length(itr) + 1]))
collect(partby(itr, steps))