WIP: PoC for storing pure polars calls in attributes

etiennebacher commented 3 months ago

Close #96

Example:

library(tidypolars)
library(dplyr, warn.conflicts = FALSE)

iris |> 
  as_polars_df() |> 
  select(starts_with(c("Sep", "Pet"))) |> 
  mutate(x = (Petal.Length / Petal.Width) > 3) |> 
  filter(between(Sepal.Length, 4.5, 6.5), Petal.Length < 5) |> 
  head() |> 
  show_query()
#> Pure polars expression:
#> 
#> <data>$
#>   select(c("Sepal.Length", "Sepal.Width", "Petal.Length", "Petal.Width"))$
#>   with_columns(x = p$col("Petal.Length")$div(p$col("Petal.Width"))$gt(3))$
#>   filter(p$col("Sepal.Length")$is_between(lower_bound = 4.5, upper_bound = 6.5, closed = "both"), p$col("Petal.Length")$lt(5))$
#>   head(n = 6L)

TODO:

[ ] wrap up, pass tests
[ ] prepare snapshot tests
[ ] refactor, add comments
[ ] potentially write a vignette for future self + interested readers
[ ] wrong query for bind_cols_polars()
[x] class of original data shouldn't change

library(dplyr, warn.conflicts = FALSE)
library(tidypolars)

pl_relig_income <- as_polars_df(tidyr::relig_income)

class(pl_relig_income)
#> [1] "RPolarsDataFrame"

x <- pl_relig_income |>
  arrange(drat)

class(pl_relig_income)
#> [1] "tidypolars"       "RPolarsDataFrame"

eitsupi commented 3 months ago

The tidyquery package's code may be helpful here: https://github.com/ianmcook/tidyquery

The tidyquery package generates a dplyr query string from an SQL string and then executes the actual dplyr query. However, I believe the query that is displayed and the query that is executed are constructed separately (rather than the assembled string being evaluated as is).

etiennebacher commented 3 months ago

Need to check how this affects performance. The example below builds on https://stackoverflow.com/questions/78262759:

library(tidypolars)
library(dplyr, warn.conflicts = FALSE)
library(tidyr, warn.conflicts = FALSE)

pl_relig_income <- as_polars_df(tidyr::relig_income)

pl_relig_income |>
  pivot_longer(!religion, names_to = "income", values_to = "count") |> 
  drop_na() |> 
  arrange(religion, count) |> 
  show_query()
#> Pure polars expression:
#> 
#> <data>$
#>   melt(id_vars = "religion", value_vars = c("<$10k", "$10-20k", "$20-30k", "$30-40k", "$40-50k", "$50-75k", "$75-100k", "$100-150k", ">150k", "Don't know/refused"), variable_name = "income", value_name = "count")$sort("religion")$
#>   drop_nulls(character(0))$
#>   sort(c("religion", "count"), descending = c(FALSE, FALSE))

library(tidypolars)
library(dplyr, warn.conflicts = FALSE)

iris |> 
  as_polars_df() |> 
  filter(Sepal.Length > 5) |> 
  show_query()
#> Pure polars expression:
#> 
#> <data>$
#>   filter(p$col("Sepal.Length")$gt(5))

iris |> 
  as_polars_df() |> 
  filter(Species %in% c("setosa", "virginica")) |> 
  show_query()
#> Pure polars expression:
#> 
#> <data>$
#>   filter(p$col("Species")$is_in(p$lit(c("setosa", "virginica"))))

iris |> 
  as_polars_df() |> 
  filter(Sepal.Length > 5, Species %in% c("setosa", "virginica")) |> 
  show_query()
#> Pure polars expression:
#> 
#> <data>$
#>   filter(p$col("Sepal.Length")$gt(5), p$col("Species")$is_in(p$lit(c("setosa", "virginica"))))

etiennebacher / tidypolars

WIP: PoC for storing pure polars calls in attributes #103