TidierOrg / TidierDB.jl

Tidier database analysis in Julia, modeled after the dbplyr R package.
MIT License
46 stars 3 forks source link

Allows joins be able to accept TidierDB queries as the second table argument. #66

Closed drizk1 closed 1 month ago

drizk1 commented 1 month ago

So i accidentally discovered how to do this working on union. I have shown an example below. The issue is that it means that the table name for the joining table (if it is not a query) would need to be written as "table_name" or :table_name as opposed to just table_name - this would be a small breaking change, but possibly worth it. Should we go down this road?

df = DataFrame(id = [string('A' + i ÷ 26, 'A' + i % 26) for i in 0:9], 
                        groups = [i % 2 == 0 ? "aa" : "bb" for i in 1:10], 
                        value = repeat(1:5, 2), 
                        percent = 0.1:0.1:1.0);

df2 = DataFrame(id2 = ["AA", "AC", "AE", "AG", "AI", "AK", "AM"],
                category = ["X", "Y", "X", "Y", "X", "Y", "X"],
                score = [88, 92, 77, 83, 95, 68, 74]);
db = connect(duckdb());
copy_to(db, df, "df_mem");
copy_to(db, df2, "df_join");

df_mem = db_table(db, :df_mem);

table1_query =  @chain db_table(db, :df_join) @mutate(score1=100) @filter(score >90)

@chain t(df_mem) begin
         @full_join(t(table1_query) , id2, id)
         @collect
end
10×8 DataFrame
 Row │ id      groups  value  percent  id2      category  score    score1  
     │ String  String  Int64  Float64  String?  String?   Int64?   Int32?  
─────┼─────────────────────────────────────────────────────────────────────
   1 │ AC      bb          3      0.3  AC       Y              92      100
   2 │ AI      bb          4      0.9  AI       X              95      100
   3 │ AA      bb          1      0.1  missing  missing   missing  missing 
   4 │ AB      aa          2      0.2  missing  missing   missing  missing 
   5 │ AD      aa          4      0.4  missing  missing   missing  missing 
   6 │ AE      bb          5      0.5  missing  missing   missing  missing 
   7 │ AF      aa          1      0.6  missing  missing   missing  missing 
   8 │ AG      bb          2      0.7  missing  missing   missing  missing 
   9 │ AH      aa          3      0.8  missing  missing   missing  missing 
  10 │ AJ      aa          5      1.0  missing  missing   missing  missing 

Edit: I also need to add documentation that you all the joins support the joining arguments ability to be col - 1 etc. another feature I just realized

drizk1 commented 1 month ago

a slightly more involved example.

mtcars = db_table(db, mtcars_csv)

query = @chain t(mtcars) begin
    @group_by cyl
    @summarize begin
        across(mpg, (mean, minimum, maximum))
        num_cars = n()
        end
    @mutate begin
        efficiency = case_when(
            mean_mpg >= 25, "High",
            mean_mpg >= 15, "Moderate",
            "Low" )
      end
end;

query2 = @chain t(mtcars) @filter(mpg>20) @mutate(hp = hp *4);

@chain t(query) begin
    @left_join(t(query2), cyl, cyl)
    @group_by(efficiency)
    @summarize(avg_hp = mean(hp))
    @mutate(hp3 = avg_hp+1)
    @aside @show_query _
    @collect
end
2×3 DataFrame
 Row │ efficiency  avg_hp   hp3     
     │ String      Float64  Float64 
─────┼──────────────────────────────
   1 │ Moderate    440.0    441.0
   2 │ High        330.545  331.545