tidyverse / dtplyr

Data table backend for dplyr
https://dtplyr.tidyverse.org
Other
670 stars 57 forks source link

Using `TRUE` in `case_when` causes error if output vector length does not equal group size .N #339

Open eutwt opened 2 years ago

eutwt commented 2 years ago

Originally posted by @KesterJ in https://github.com/tidyverse/dtplyr/issues/300#issuecomment-1036179822

I've encountered a version of this issue that doesn't involve &&, and where group_by() is called after lazy_dt(). Reprex below:

library(dplyr, warn.conflicts = FALSE)
library(dtplyr, warn.conflicts = FALSE)

options(dplyr.summarise.inform = FALSE)

loans <- tibble(
  borrower_id = c(1,1,1,1,2,2),
  loan_id = c("A", "A", "B", "B", "C", "C"),
  year = c(2020, 2021, 2020, 2021, 2020, 2021),
  repayments = c(0, 0, 0, 200, 150, 50)
)

#In dplyr (works)
loans %>%
  group_by(borrower_id, year) %>%
  summarise(
    status = case_when(any(repayments > 0) ~ "Made repayments",
                       TRUE ~ "Did not make any repayments")
  ) %>%
  ungroup()
#> # A tibble: 4 x 3
#>   borrower_id  year status                     
#>         <dbl> <dbl> <chr>                      
#> 1           1  2020 Did not make any repayments
#> 2           1  2021 Made repayments            
#> 3           2  2020 Made repayments            
#> 4           2  2021 Made repayments

#In dtplyr (does not work)
loans %>%
  lazy_dt() %>%
  group_by(borrower_id, year) %>%
  summarise(
    status = case_when(any(repayments > 0) ~ "Made repayments",
                       TRUE ~ "Did not make any repayments")
  ) %>%
  ungroup() %>%
  as_tibble()
#> Error in fcase(any(repayments > 0), "Made repayments", rep(TRUE, .N), : Argument #3 has a different length than argument #1. Please make sure all logical conditions have the same length.

#In dtplyr with different grouping that includes only one row per group (works)
loans %>%
  lazy_dt() %>%
  group_by(loan_id, year) %>%
  summarise(
    status = case_when(any(repayments > 0) ~ "Made repayments",
                       TRUE ~ "Did not make any repayments")
  ) %>%
  ungroup() %>%
  as_tibble()
#> # A tibble: 6 x 3
#>   loan_id  year status                     
#>   <chr>   <dbl> <chr>                      
#> 1 A        2020 Did not make any repayments
#> 2 A        2021 Did not make any repayments
#> 3 B        2020 Did not make any repayments
#> 4 B        2021 Made repayments            
#> 5 C        2020 Made repayments            
#> 6 C        2021 Made repayments

Created on 2022-02-11 by the reprex package (v2.0.1)

eutwt commented 2 years ago

I think the only way to address this would be to assign the first argument and its length to variables, then pass them to `fcase()` with each `TRUE`/`T` condition replicated to that length (see the generated query below). But that seems like a bad idea.

library(dplyr, warn.conflicts = FALSE)
library(dtplyr, warn.conflicts = FALSE)
options(dplyr.summarise.inform = FALSE)

loans <- tibble(
  borrower_id = c(1,1,1,1,2,2),
  loan_id = c("A", "A", "B", "B", "C", "C"),
  year = c(2020, 2021, 2020, 2021, 2020, 2021),
  repayments = c(0, 0, 0, 200, 150, 50)
)

dtp_out <- 
  loans %>%
    lazy_dt() %>%
    group_by(borrower_id, year) %>%
    summarise(
      status = case_when(any(repayments > 0) ~ "Made repayments",
                         TRUE ~ "Did not make any repayments")
    ) 

dtp_out %>%
  ungroup() %>%
  as_tibble()
#> # A tibble: 4 × 3
#>   borrower_id  year status                     
#>         <dbl> <dbl> <chr>                      
#> 1           1  2020 Did not make any repayments
#> 2           1  2021 Made repayments            
#> 3           2  2020 Made repayments            
#> 4           2  2021 Made repayments

dtp_out %>% 
  show_query()
#> `_DT1`[, .(status = local({
#>     .dtp_case_arg1 <- any(repayments > 0)
#>     .dtp_case_len <- length(.dtp_case_arg1)
#>     fcase(.dtp_case_arg1, "Made repayments", rep(TRUE, .dtp_case_len), 
#>         "Did not make any repayments")
#> })), keyby = .(borrower_id, year)]

Created on 2022-02-12 by the reprex package (v2.0.1)