duckdblabs / duckplyr

A drop-in replacement for dplyr, powered by DuckDB for performance.
https://duckdblabs.github.io/duckplyr/
Other
222 stars 12 forks source link

Implement `sum(na.rm = FALSE)` #130

Open andreranza opened 3 months ago

andreranza commented 3 months ago

Skip to: https://github.com/duckdblabs/duckplyr/issues/130#issuecomment-2015328003

library(tibble)
suppressPackageStartupMessages(library(duckplyr))

df1 <- tibble(
  type = c("A", "B", "A", "B"),
  amt = c(1638210122L, 1456061134L, 1571660787L, 1571660787L)
) 

try(
  df1 |> 
    as_duckplyr_df() |>
    summarise(amt_tot = sum(amt, na.rm = TRUE), .by = type)  
)
#> Error : {"exception_type":"Binder","exception_message":"No function matches the given name and argument types 'sum(INTEGER, BOOLEAN)'. You might need to add explicit type casts.\n\tCandidate functions:\n\tsum(DECIMAL) -> DECIMAL\n\tsum(SMALLINT) -> HUGEINT\n\tsum(INTEGER) -> HUGEINT\n\tsum(BIGINT) -> HUGEINT\n\tsum(HUGEINT) -> HUGEINT\n\tsum(DOUBLE) -> DOUBLE\n","name":"sum","candidates":"sum(DECIMAL) -> DECIMAL,sum(SMALLINT) -> HUGEINT,sum(INTEGER) -> HUGEINT,sum(BIGINT) -> HUGEINT,sum(HUGEINT) -> HUGEINT,sum(DOUBLE) -> DOUBLE","call":"sum(INTEGER, BOOLEAN)","error_subtype":"NO_MATCHING_FUNCTION"}

df2 <- tibble(
  type = c("A", "B", "A", "B"),
  amt = c(1638210122.2323, 1456061134.223, 1571660787.2321, 1571660787.2112)
)

try(
  df2 |> 
    as_duckplyr_df() |>
    summarise(amt_tot = sum(amt, na.rm = TRUE), .by = type)  
)
#> Error : {"exception_type":"Binder","exception_message":"No function matches the given name and argument types 'sum(DOUBLE, BOOLEAN)'. You might need to add explicit type casts.\n\tCandidate functions:\n\tsum(DECIMAL) -> DECIMAL\n\tsum(SMALLINT) -> HUGEINT\n\tsum(INTEGER) -> HUGEINT\n\tsum(BIGINT) -> HUGEINT\n\tsum(HUGEINT) -> HUGEINT\n\tsum(DOUBLE) -> DOUBLE\n","name":"sum","candidates":"sum(DECIMAL) -> DECIMAL,sum(SMALLINT) -> HUGEINT,sum(INTEGER) -> HUGEINT,sum(BIGINT) -> HUGEINT,sum(HUGEINT) -> HUGEINT,sum(DOUBLE) -> DOUBLE","call":"sum(DOUBLE, BOOLEAN)","error_subtype":"NO_MATCHING_FUNCTION"}

Created on 2024-03-22 with reprex v2.1.0

Session info ``` r sessioninfo::session_info() #> ─ Session info ─────────────────────────────────────────────────────────────── #> setting value #> version R version 4.3.2 (2023-10-31) #> os macOS Sonoma 14.3.1 #> system x86_64, darwin20 #> ui X11 #> language (EN) #> collate en_US.UTF-8 #> ctype en_US.UTF-8 #> tz Europe/Rome #> date 2024-03-22 #> pandoc 3.1.8 @ /usr/local/bin/ (via rmarkdown) #> #> ─ Packages ─────────────────────────────────────────────────────────────────── #> package * version date (UTC) lib source #> cli 3.6.2 2023-12-11 [1] CRAN (R 4.3.0) #> collections 0.3.7 2023-01-05 [1] CRAN (R 4.3.0) #> curl 5.2.0 2023-12-08 [1] CRAN (R 4.3.0) #> DBI 1.2.2 2024-02-16 [1] CRAN (R 4.3.2) #> digest 0.6.34 2024-01-11 [1] CRAN (R 4.3.0) #> dplyr 1.1.4 2023-11-17 [1] CRAN (R 4.3.0) #> duckdb 0.10.0 2024-03-13 [1] CRAN (R 4.3.2) #> duckplyr * 0.3.2 2024-03-17 [1] CRAN (R 4.3.2) #> evaluate 0.23 2023-11-01 [1] CRAN (R 4.3.0) #> fansi 1.0.6 2023-12-08 [1] CRAN (R 4.3.0) #> fastmap 1.1.1 2023-02-24 [1] CRAN (R 4.3.0) #> fs 1.6.3 2023-07-20 [1] CRAN (R 4.3.0) #> generics 0.1.3 2022-07-05 [1] CRAN (R 4.3.0) #> glue 1.7.0 2024-01-09 [1] CRAN (R 4.3.0) #> htmltools 0.5.7 2023-11-03 [1] CRAN (R 4.3.0) #> knitr 1.45 2023-10-30 [1] CRAN (R 4.3.0) #> lifecycle 1.0.4 2023-11-07 [1] CRAN (R 4.3.0) #> magrittr 2.0.3 2022-03-30 [1] CRAN (R 4.3.0) #> pillar 1.9.0 2023-03-22 [1] CRAN (R 4.3.0) #> pkgconfig 2.0.3 2019-09-22 [1] CRAN (R 4.3.0) #> purrr 1.0.2 2023-08-10 [1] CRAN (R 4.3.0) #> R.cache 0.16.0 2022-07-21 [1] CRAN (R 4.3.0) #> R.methodsS3 1.8.2 2022-06-13 [1] CRAN (R 4.3.0) #> R.oo 1.26.0 2024-01-24 [1] CRAN (R 4.3.2) #> R.utils 2.12.3 2023-11-18 [1] CRAN (R 4.3.0) #> R6 2.5.1 2021-08-19 [1] CRAN (R 4.3.0) #> reprex 2.1.0 2024-01-11 [1] CRAN (R 4.3.0) #> rlang 1.1.3 2024-01-10 [1] CRAN (R 4.3.0) #> rmarkdown 2.25 2023-09-18 [1] CRAN (R 4.3.0) #> rstudioapi 0.15.0 2023-07-07 [1] CRAN (R 4.3.0) #> sessioninfo 1.2.2 2021-12-06 [1] CRAN (R 4.3.0) #> styler 1.10.2 2023-08-29 [1] CRAN (R 4.3.0) #> tibble * 3.2.1 2023-03-20 [1] CRAN (R 4.3.0) #> tidyselect 1.2.1 2024-03-11 [1] CRAN (R 4.3.2) #> utf8 1.2.4 2023-10-22 [1] CRAN (R 4.3.0) #> vctrs 0.6.5 2023-12-01 [1] CRAN (R 4.3.0) #> withr 3.0.0 2024-01-16 [1] CRAN (R 4.3.0) #> xfun 0.42 2024-02-08 [1] CRAN (R 4.3.2) #> yaml 2.3.8 2023-12-11 [1] CRAN (R 4.3.0) #> #> ────────────────────────────────────────────────────────────────────────────── ```
andreranza commented 3 months ago

Actually, "large numbers" are not the problem here, rather na.rm = TRUE in sum(). Probably needs to be harmonized with dplyr's behaviour.

library(tibble)

df1 <- tibble(
  type = c("A", "B", "A", "B"),
  amt = c(20, 1, NA, 1)
)

try(
  df1 |>
    duckplyr::as_duckplyr_df() |>
    duckplyr::summarise(amt_tot = sum(amt, na.rm = TRUE), .by = type)  
)
#> No duckplyr fallback reports ready for upload.
#> Error : {"exception_type":"Binder","exception_message":"No function matches the given name and argument types 'sum(DOUBLE, BOOLEAN)'. You might need to add explicit type casts.\n\tCandidate functions:\n\tsum(DECIMAL) -> DECIMAL\n\tsum(SMALLINT) -> HUGEINT\n\tsum(INTEGER) -> HUGEINT\n\tsum(BIGINT) -> HUGEINT\n\tsum(HUGEINT) -> HUGEINT\n\tsum(DOUBLE) -> DOUBLE\n","name":"sum","candidates":"sum(DECIMAL) -> DECIMAL,sum(SMALLINT) -> HUGEINT,sum(INTEGER) -> HUGEINT,sum(BIGINT) -> HUGEINT,sum(HUGEINT) -> HUGEINT,sum(DOUBLE) -> DOUBLE","call":"sum(DOUBLE, BOOLEAN)","error_subtype":"NO_MATCHING_FUNCTION"}

# NAs are dropped by default
df1 |>
  duckplyr::as_duckplyr_df() |>
  duckplyr::summarise(amt_tot = sum(amt), .by = type)
#> materializing:
#> ---------------------
#> --- Relation Tree ---
#> ---------------------
#> Aggregate ["type", sum(amt)]
#>   r_dataframe_scan(0x7fd8e7bd4048)
#> 
#> ---------------------
#> -- Result Columns  --
#> ---------------------
#> - type (VARCHAR)
#> - amt_tot (DOUBLE)
#> 
#> # A tibble: 2 × 2
#>   type  amt_tot
#>   <chr>   <dbl>
#> 1 A          20
#> 2 B           2

df1 |>
  dplyr::summarise(amt_tot = sum(amt), .by = type)
#> # A tibble: 2 × 2
#>   type  amt_tot
#>   <chr>   <dbl>
#> 1 A          NA
#> 2 B           2

Created on 2024-03-22 with reprex v2.1.0

andreranza commented 3 months ago

@krlmlr, sorry for the follow-up. This latest reprex should make it clear.

I wonder whether duckplyr should beahave similarly to what I get from a connection (i.e. throwing a warning) in case of NA. As it is now we see three different cases:

df1 <- tibble::tibble( type = c("A", "B", "A", "B"), amt = c(20, 1, NA, 1) )

con <- dbConnect(duckdb(), read_only = FALSE) dbWriteTable(con, "tbl1", df1)

tbl1 <- tbl("tbl1", src = con)

from connection I get a warning which reminds me of the known "db feature"

tbl1 |> summarise(amt = sum(amt), .by = type)

> Warning: Missing values are always removed in SQL aggregation functions.

> Use na.rm = TRUE to silence this warning

> This warning is displayed once every 8 hours.

> # Source: SQL [2 x 2]

> # Database: DuckDB v0.10.0 [root@Darwin 23.3.0:R 4.3.2/:memory:]

> type amt

>

> 1 A 20

> 2 B 2

make it silent

tbl1 |> summarise(amt = sum(amt, na.rm = TRUE), .by = type)

> # Source: SQL [2 x 2]

> # Database: DuckDB v0.10.0 [root@Darwin 23.3.0:R 4.3.2/:memory:]

> type amt

>

> 1 A 20

> 2 B 2

withr::with_envvar( new = c(DUCKPLYR_FORCE = "TRUE"), code = { tbl1 |> collect() |> as_duckplyr_df() |> summarise(amt = sum(amt, na.rm = TRUE), .by = type) } )

> Error: {"exception_type":"Binder","exception_message":"No function matches the given name and argument types 'sum(DOUBLE, BOOLEAN)'. You might need to add explicit type casts.\n\tCandidate functions:\n\tsum(DECIMAL) -> DECIMAL\n\tsum(SMALLINT) -> HUGEINT\n\tsum(INTEGER) -> HUGEINT\n\tsum(BIGINT) -> HUGEINT\n\tsum(HUGEINT) -> HUGEINT\n\tsum(DOUBLE) -> DOUBLE\n","name":"sum","candidates":"sum(DECIMAL) -> DECIMAL,sum(SMALLINT) -> HUGEINT,sum(INTEGER) -> HUGEINT,sum(BIGINT) -> HUGEINT,sum(HUGEINT) -> HUGEINT,sum(DOUBLE) -> DOUBLE","call":"sum(DOUBLE, BOOLEAN)","error_subtype":"NO_MATCHING_FUNCTION"}

tbl1 |> collect() |> as_duckplyr_df() |> duckplyr::summarise(amt = sum(amt), .by = type)

> materializing:

> ---------------------

> --- Relation Tree ---

> ---------------------

> Aggregate ["type", sum(amt)]

> r_dataframe_scan(0x7fd1495425c8)

>

> ---------------------

> -- Result Columns --

> ---------------------

> - type (VARCHAR)

> - amt (DOUBLE)

>

> # A tibble: 2 × 2

> type amt

>

> 1 A 20

> 2 B 2

df1 |> dplyr::summarise(amt = sum(amt), .by = type)

> # A tibble: 2 × 2

> type amt

>

> 1 A NA

> 2 B 2

expectations:

- duckplyr 0.3.2 has the same behaviour of dplyr

(very unlikely it will happen since there's always a db under the hood)

- duckplyr 0.3.2 throws a warning as done from connection, which is not the case right now

- duckplyr 0.3.2 supports sum(..., na.rm = TRUE), does not throw an error and

na.rm = TRUE is used to silence the warning


<sup>Created on 2024-03-22 with [reprex v2.1.0](https://reprex.tidyverse.org)</sup>

<details style="margin-bottom:10px;">
<summary>
Session info
</summary>

``` r
sessioninfo::session_info()
#> ─ Session info ───────────────────────────────────────────────────────────────
#>  setting  value
#>  version  R version 4.3.2 (2023-10-31)
#>  os       macOS Sonoma 14.3.1
#>  system   x86_64, darwin20
#>  ui       X11
#>  language (EN)
#>  collate  en_US.UTF-8
#>  ctype    en_US.UTF-8
#>  tz       Europe/Rome
#>  date     2024-03-22
#>  pandoc   3.1.8 @ /usr/local/bin/ (via rmarkdown)
#> 
#> ─ Packages ───────────────────────────────────────────────────────────────────
#>  package     * version date (UTC) lib source
#>  blob          1.2.4   2023-03-17 [1] CRAN (R 4.3.0)
#>  cli           3.6.2   2023-12-11 [1] CRAN (R 4.3.0)
#>  collections   0.3.7   2023-01-05 [1] CRAN (R 4.3.0)
#>  curl          5.2.0   2023-12-08 [1] CRAN (R 4.3.0)
#>  DBI         * 1.2.2   2024-02-16 [1] CRAN (R 4.3.2)
#>  dbplyr        2.4.0   2023-10-26 [1] CRAN (R 4.3.0)
#>  digest        0.6.34  2024-01-11 [1] CRAN (R 4.3.0)
#>  dplyr       * 1.1.4   2023-11-17 [1] CRAN (R 4.3.0)
#>  duckdb      * 0.10.0  2024-03-13 [1] CRAN (R 4.3.2)
#>  duckplyr    * 0.3.2   2024-03-17 [1] CRAN (R 4.3.2)
#>  evaluate      0.23    2023-11-01 [1] CRAN (R 4.3.0)
#>  fansi         1.0.6   2023-12-08 [1] CRAN (R 4.3.0)
#>  fastmap       1.1.1   2023-02-24 [1] CRAN (R 4.3.0)
#>  fs            1.6.3   2023-07-20 [1] CRAN (R 4.3.0)
#>  generics      0.1.3   2022-07-05 [1] CRAN (R 4.3.0)
#>  glue          1.7.0   2024-01-09 [1] CRAN (R 4.3.0)
#>  htmltools     0.5.7   2023-11-03 [1] CRAN (R 4.3.0)
#>  knitr         1.45    2023-10-30 [1] CRAN (R 4.3.0)
#>  lifecycle     1.0.4   2023-11-07 [1] CRAN (R 4.3.0)
#>  magrittr      2.0.3   2022-03-30 [1] CRAN (R 4.3.0)
#>  pillar        1.9.0   2023-03-22 [1] CRAN (R 4.3.0)
#>  pkgconfig     2.0.3   2019-09-22 [1] CRAN (R 4.3.0)
#>  purrr         1.0.2   2023-08-10 [1] CRAN (R 4.3.0)
#>  R.cache       0.16.0  2022-07-21 [1] CRAN (R 4.3.0)
#>  R.methodsS3   1.8.2   2022-06-13 [1] CRAN (R 4.3.0)
#>  R.oo          1.26.0  2024-01-24 [1] CRAN (R 4.3.2)
#>  R.utils       2.12.3  2023-11-18 [1] CRAN (R 4.3.0)
#>  R6            2.5.1   2021-08-19 [1] CRAN (R 4.3.0)
#>  reprex        2.1.0   2024-01-11 [1] CRAN (R 4.3.0)
#>  rlang         1.1.3   2024-01-10 [1] CRAN (R 4.3.0)
#>  rmarkdown     2.25    2023-09-18 [1] CRAN (R 4.3.0)
#>  rstudioapi    0.15.0  2023-07-07 [1] CRAN (R 4.3.0)
#>  sessioninfo   1.2.2   2021-12-06 [1] CRAN (R 4.3.0)
#>  styler        1.10.2  2023-08-29 [1] CRAN (R 4.3.0)
#>  tibble        3.2.1   2023-03-20 [1] CRAN (R 4.3.0)
#>  tidyselect    1.2.1   2024-03-11 [1] CRAN (R 4.3.2)
#>  utf8          1.2.4   2023-10-22 [1] CRAN (R 4.3.0)
#>  vctrs         0.6.5   2023-12-01 [1] CRAN (R 4.3.0)
#>  withr         3.0.0   2024-01-16 [1] CRAN (R 4.3.0)
#>  xfun          0.42    2024-02-08 [1] CRAN (R 4.3.2)
#>  yaml          2.3.8   2023-12-11 [1] CRAN (R 4.3.0)
#> 
#> 
#> ──────────────────────────────────────────────────────────────────────────────

krlmlr commented 3 months ago

Good catch. We should fall back to base R for sum(na.rm = FALSE), which is the default, and accept na.rm = TRUE . Perhaps we can also fix this straight away by implementing an extension function.

All other aggregate functions with an na.rm argument are affected too.

CC @romainfrancois.