fslaborg / Deedle

Easy to use .NET library for data and time series manipulation and for scientific programming
http://fslab.org/Deedle/
BSD 2-Clause "Simplified" License
924 stars 196 forks source link

Suggestion to distinct rows by specified columns #558

Open ingted opened 1 year ago

ingted commented 1 year ago

https://stackoverflow.com/questions/70985428/deedle-distinct-by-column/75897557#75897557

In an RDBMS we sometimes run "select distinct col1, col2, col3 from xxx", but Deedle does not seem to offer an easy-to-use API for this.

How about this?

#r "nuget: Deedle, 3.0.0"
#load "Deedle.fsx"
open Deedle
/// Returns the rows of `frame` that are distinct with respect to the columns
/// listed in `keys`, keeping the first row encountered for every combination
/// of key-column values (comparable to SQL `SELECT DISTINCT` over `keys`).
///
/// Parameters:
///   frame     - the frame to de-duplicate.
///   keys      - column keys whose combined values decide whether two rows
///               count as duplicates.
///   distColId - key of a temporary helper column; it is joined onto the
///               frame to select the surviving rows and dropped again before
///               the result is returned.
///               NOTE(review): should not be the key of an existing column of
///               `frame`; that case is not handled here.
///
/// Returns a frame with the original columns and only the surviving rows.
let inline distinctFrame (frame: Frame<'R, 'C>) (keys: 'C seq) (distColId:'C) =

    // Pair every row key with the values that row holds in the key columns.
    let keyedRows =
        frame
        |> Frame.mapRows (fun (i:'R) r ->
            i, keys |> Seq.map r.TryGet |> Seq.toArray
        )

    // Row key of the first row seen for each distinct combination of values.
    // Materialised into an array because it is used twice below (as the keys
    // and as the values of the helper series); a lazy sequence would be
    // evaluated once per use.
    let firstRowKeys =
        keyedRows.Values
        |> Seq.distinctBy snd
        |> Seq.map fst
        |> Seq.toArray

    // Single-column frame indexed by the surviving row keys.
    let idx = Frame([distColId], [Series(firstRowKeys, firstRowKeys)])

    // The inner join keeps only rows present in both frames; the helper
    // column is then removed so only the original columns remain.
    frame.Join(idx, kind=JoinKind.Inner)
    |> Frame.dropCol distColId

open System.IO

// Sample input: a semicolon-separated table in which the row "b;2" occurs twice.
let data = "A;B\na;1\nb;2\nb;2\nc;3"

// Frame.ReadCsv is given a stream below, so expose the UTF-8 bytes of the
// text through an in-memory stream.
let bytes = System.Text.Encoding.UTF8.GetBytes(data)
let stream = new MemoryStream(bytes)

let df = Frame.ReadCsv(stream = stream, separators = ";", hasHeaders = true)

// De-duplicate on the combination of columns A and B.
distinctFrame df ["A"; "B"] "____distinctIdx____"
ingted commented 1 year ago

The result:

val data: string = "A;B
a;1
b;2
b;2
c;3"
val bytes: byte array =
  [|65uy; 59uy; 66uy; 10uy; 97uy; 59uy; 49uy; 10uy; 98uy; 59uy; 50uy; 10uy;
    98uy; 59uy; 50uy; 10uy; 99uy; 59uy; 51uy|]
val stream: IO.MemoryStream
val df: Frame<int,string> =

     A B 
0 -> a 1 
1 -> b 2 
2 -> b 2 
3 -> c 3 

4 rows x 2 columns
0 missing values
val it: Frame<int,string> =

     A B 
0 -> a 1 
1 -> b 2 
3 -> c 3 

3 rows x 2 columns
0 missing values
ingted commented 6 months ago
module Frame = 
    /// Returns the rows of `frame` that are distinct with respect to the
    /// columns listed in `keys`, keeping the first row encountered for every
    /// combination of key-column values. The frame is the last parameter so
    /// the function can be used at the end of a `|>` pipeline.
    ///
    /// Parameters:
    ///   keys      - column keys whose combined values decide whether two
    ///               rows count as duplicates.
    ///   distColId - key of a temporary helper column; it is joined onto the
    ///               frame to select the surviving rows and dropped again
    ///               before the result is returned.
    ///               NOTE(review): should not be the key of an existing
    ///               column of `frame`; that case is not handled here.
    ///   frame     - the frame to de-duplicate.
    let inline distinctFrame (keys: 'C seq) (distColId:'C) (frame: Frame<'R, 'C>) =

        // Pair every row key with the values that row holds in the key columns.
        let keyedRows =
            frame
            |> Frame.mapRows (fun (i:'R) r ->
                i, keys |> Seq.map r.TryGet |> Seq.toArray
            )

        // Row key of the first row seen for each distinct combination of
        // values. Materialised into an array because it is used twice below;
        // a lazy sequence would be evaluated once per use.
        let firstRowKeys =
            keyedRows.Values
            |> Seq.distinctBy snd
            |> Seq.map fst
            |> Seq.toArray

        // Single-column frame indexed by the surviving row keys.
        let idx = Frame([distColId], [Series(firstRowKeys, firstRowKeys)])

        // The inner join keeps only rows present in both frames; the helper
        // column is then removed so only the original columns remain.
        frame.Join(idx, kind=JoinKind.Inner)
        |> Frame.dropCol distColId

    /// Orders the rows of `frame` by an integer key computed from each row.
    ///
    /// Parameters:
    ///   mappingOrderKeys - computes the integer sort key of a row.
    ///   distColId        - column key under which the computed sort keys are
    ///                      joined onto the frame.
    ///   frame            - the frame to order.
    ///
    /// Returns the joined frame sorted with `Frame.sortRows` on `distColId`.
    /// The sort-key column is not removed, so it is part of the result.
    let inline orderBy (mappingOrderKeys: ObjectSeries<'C> -> int) (distColId:'C) (frame: Frame<'R, 'C>) =        
        // One sort key per row, indexed by the row keys of `frame`.
        let sortKeys = frame |> Frame.mapRowValues mappingOrderKeys

        frame.Join(Frame([distColId], [sortKeys]), kind=JoinKind.Inner)
        |> Frame.sortRows distColId

    /// Orders the rows of `frame` by the values of the columns in `keys`,
    /// compared as a tuple in the order the keys are given.
    ///
    /// Every distinct tuple of key-column values is ranked by its position in
    /// the sorted sequence of distinct tuples; rows holding equal tuples share
    /// a rank. The ranks are joined onto the frame as column `distColId` and
    /// the result is sorted on that column with `Frame.sortRows`. The rank
    /// column is not removed, so it is part of the result.
    ///
    /// Parameters:
    ///   keys      - column keys to order by.
    ///   distColId - column key under which the ranks are stored.
    ///   frame     - the frame to order.
    ///
    /// Each key-column value is cast to `System.IComparable`; a value that
    /// does not implement it makes the cast fail.
    /// NOTE(review): rows with a missing value in a key column are not handled
    /// specially; confirm how `row.[k]` behaves for them.
    let inline orderByCols (keys: 'C seq) (distColId:'C) (frame: Frame<'R, 'C>) =
        // For every row, the key-column values as an array of comparables.
        let colValues = 
            frame
            |> Frame.mapRowValues (fun row ->
                keys
                |> Seq.map (fun k -> row.[k] :?> System.IComparable)
                |> Seq.toArray
            )

        // Rank of each distinct value tuple within the sorted distinct tuples.
        let orderKeys =
            colValues.Values
            |> Seq.distinct
            |> Seq.sort
            |> Seq.mapi (fun i v -> v, i)
            |> Map

        // Replace every row's value tuple by its rank.
        let ordered =
            colValues
            |> Series.mapValues (fun v -> orderKeys.[v])

        let appendCol =
            Frame.ofColumns [distColId, ordered]

        frame.Join(appendCol, kind=JoinKind.Inner)
        |> Frame.sortRows distColId

orderBy & orderByCols functionality