tidyverse / dtplyr

Data table backend for dplyr
https://dtplyr.tidyverse.org
Other
670 stars 57 forks source link

Custom suffixes don't work in `dplyr::full_join` with `dtplyr::lazy_dt` converted tibbles #382

Closed caparks2 closed 2 years ago

caparks2 commented 2 years ago

Custom suffixes passed to the `suffix` argument of `dplyr::full_join` are ignored when the `x` and `y` input data frames are first converted with `dtplyr::lazy_dt`: the returned data frame uses the default `.x` and `.y` suffixes instead. This appears to be related to closed Issue #40, which was fixed in Pull Request #48.

# Mini example data sets
# First table, assembled column by column: three character identifier
# columns plus a numeric `reads` column (10 rows).
df.CD4 <- tibble::tibble(
  cdr3_amino_acid = c(
    "CASRATSGRAYEQYF", "CASSPTPGQVNYGYTF", "CASSLGLAGLYEQYF",
    "CASRPTGTGELFF", "CASSDLQGEGEQYF", "CSARFPSVAVYNEQFF",
    "CASRDRGSYGYTF", "CASSAGTSGGSGDTQYF", "CASSGQGAQEQYF",
    "CASSSTSAGTDTQYF"
  ),
  v_resolved = c(
    "TRBV3", "TRBV4-1", "TRBV5-1", "TRBV11-3", "TRBV2-1",
    "TRBV20", "TRBV2-1", "TRBV9-1", "TRBV6-6", "TRBV12"
  ),
  j_resolved = c(
    "TRBJ2-7", "TRBJ1-2", "TRBJ2-7", "TRBJ2-2", "TRBJ2-7",
    "TRBJ2-1", "TRBJ1-2", "TRBJ2-3", "TRBJ2-7", "TRBJ2-3"
  ),
  reads = c(10, 4, 3, 3, 2, 2, 1, 1, 1, 1)
)

# Second table with the same four columns as df.CD4, assembled column by
# column (10 rows). Some identifier triples also occur in df.CD4, others
# do not, so inner and full joins return different row counts.
df.CD8 <- tibble::tibble(
  cdr3_amino_acid = c(
    "CASRPTGTGELFF", "CASRATSGRAYEQYF", "CASSDLQGEGEQYF",
    "CASSPTPGQVNYGYTF", "CASSLGLAGLYEQYF", "CSARFPSVAVYNEQFF",
    "CASSFDRDGYGYTF", "CASSEEVWGYTF", "CASTQSNTGELFF",
    "CASSPLTGPGREQFF"
  ),
  v_resolved = c(
    "TRBV11-3", "TRBV3", "TRBV2-1", "TRBV4-1", "TRBV5-1",
    "TRBV20", "TRBV7-2", "TRBV6-1", "TRBV6-1", "TRBV18-1"
  ),
  j_resolved = c(
    "TRBJ2-2", "TRBJ2-7", "TRBJ2-7", "TRBJ1-2", "TRBJ2-7",
    "TRBJ2-1", "TRBJ1-2", "TRBJ1-2", "TRBJ2-2", "TRBJ2-1"
  ),
  reads = c(100, 80, 30, 10, 4, 2, 2, 1, 1, 1)
)

# use data.table back-end for dplyr
# Each tibble is wrapped with dtplyr::lazy_dt(), passing the three
# identifier columns as `key_by`. The joins further down use the same three
# columns in `by`, which leaves `reads` as the only column present on both
# sides without being a join column.
dt.CD4 <- dtplyr::lazy_dt(
  x = df.CD4,
  key_by = c("cdr3_amino_acid", "v_resolved", "j_resolved")
)

dt.CD8 <- dtplyr::lazy_dt(
  x = df.CD8,
  key_by = c("cdr3_amino_acid", "v_resolved", "j_resolved")
)

# inner_join
# Join the two lazy tables on the three identifier columns. `reads` exists
# in both inputs, so `suffix` determines how the two copies are named.
dt.inner <- dplyr::inner_join(
  x = dt.CD4, y = dt.CD8,
  by = c("cdr3_amino_acid", "v_resolved", "j_resolved"),
  suffix = c(".CD4", ".CD8")
)

# inner_join works with custom suffixes
# Printing the lazy table shows the generated data.table call and a preview;
# in the output below the call ends in setnames(..., c("reads.CD4",
# "reads.CD8")) and the preview columns carry those names.
dt.inner
#> Source: local data table [6 x 5]
#> Call:   setnames(`_DT1`[`_DT2`, on = .(cdr3_amino_acid, v_resolved, j_resolved), 
#>     nomatch = NULL, allow.cartesian = TRUE], c("reads", "i.reads"
#> ), c("reads.CD4", "reads.CD8"))
#> 
#>   cdr3_amino_acid  v_resolved j_resolved reads.CD4 reads.CD8
#>   <chr>            <chr>      <chr>          <dbl>     <dbl>
#> 1 CASRATSGRAYEQYF  TRBV3      TRBJ2-7           10        80
#> 2 CASRPTGTGELFF    TRBV11-3   TRBJ2-2            3       100
#> 3 CASSDLQGEGEQYF   TRBV2-1    TRBJ2-7            2        30
#> 4 CASSLGLAGLYEQYF  TRBV5-1    TRBJ2-7            3         4
#> 5 CASSPTPGQVNYGYTF TRBV4-1    TRBJ1-2            4        10
#> 6 CSARFPSVAVYNEQFF TRBV20     TRBJ2-1            2         2
#> 
#> # Use as.data.table()/as.data.frame()/as_tibble() to access results

# full_join
# Same inputs, `by`, and `suffix` as the inner_join call above; only the
# join verb differs.
dt.full <- dplyr::full_join(
  x = dt.CD4, y = dt.CD8,
  by = c("cdr3_amino_acid", "v_resolved", "j_resolved"),
  suffix = c(".CD4", ".CD8")
)

# the custom suffixes ".CD4" and ".CD8" didn't work in full_join
#   it used the default ".x" and ".y" suffixes instead.
# In the output below, the generated merge() call contains no `suffixes`
# argument, and the preview columns are `reads.x` / `reads.y`.
dt.full
#> Source: local data table [14 x 5]
#> Call:   merge(`_DT1`, `_DT2`, all = TRUE, by.x = c("cdr3_amino_acid", 
#> "v_resolved", "j_resolved"), by.y = c("cdr3_amino_acid", "v_resolved", 
#> "j_resolved"), allow.cartesian = TRUE)
#> 
#>   cdr3_amino_acid   v_resolved j_resolved reads.x reads.y
#>   <chr>             <chr>      <chr>        <dbl>   <dbl>
#> 1 CASRATSGRAYEQYF   TRBV3      TRBJ2-7         10      80
#> 2 CASRDRGSYGYTF     TRBV2-1    TRBJ1-2          1      NA
#> 3 CASRPTGTGELFF     TRBV11-3   TRBJ2-2          3     100
#> 4 CASSAGTSGGSGDTQYF TRBV9-1    TRBJ2-3          1      NA
#> 5 CASSDLQGEGEQYF    TRBV2-1    TRBJ2-7          2      30
#> 6 CASSEEVWGYTF      TRBV6-1    TRBJ1-2         NA       1
#> # … with 8 more rows
#> # ℹ Use `print(n = ...)` to see more rows
#> 
#> # Use as.data.table()/as.data.frame()/as_tibble() to access results

# downstream code relying on custom suffixes in full_join is now broken.
# Intent: replace NA with 0 in every column whose name contains ".CD4" or
# ".CD8".
# NOTE(review): the error below refers to `reads.CD4`, so the selection did
# match a column of that name even though the printed table shows
# `reads.x` / `reads.y`. Presumably the lazy table's recorded column names
# use the custom suffixes while the generated merge() call does not --
# confirm against the dtplyr join implementation.
dplyr::mutate(dt.full, dplyr::across(
  .cols = tidyr::matches("\\.CD4|\\.CD8"),
  .fns = ~ tidyr::replace_na(.x, 0)
))
#> Error in fcoalesce(reads.CD4, 0): object 'reads.CD4' not found

Created on 2022-08-04 by the reprex package (v2.0.1)

Session info

``` r
sessioninfo::session_info()
#> ─ Session info ───────────────────────────────────────────────────────────────
#>  setting  value
#>  version  R version 4.2.0 (2022-04-22)
#>  os       macOS Big Sur/Monterey 10.16
#>  system   x86_64, darwin17.0
#>  ui       X11
#>  language (EN)
#>  collate  en_US.UTF-8
#>  ctype    en_US.UTF-8
#>  tz       America/New_York
#>  date     2022-08-04
#>  pandoc   2.17.1.1 @ /Applications/RStudio.app/Contents/MacOS/quarto/bin/ (via rmarkdown)
#>
#> ─ Packages ───────────────────────────────────────────────────────────────────
#>  package     * version     date (UTC) lib source
#>  cli           3.3.0       2022-04-25 [1] CRAN (R 4.2.0)
#>  crayon        1.5.1       2022-03-26 [1] CRAN (R 4.2.0)
#>  data.table    1.14.3      2022-08-04 [1] local
#>  digest        0.6.29      2021-12-01 [1] CRAN (R 4.2.0)
#>  dplyr         1.0.99.9000 2022-08-04 [1] Github (tidyverse/dplyr@0dd7eea)
#>  dtplyr        1.2.1.9000  2022-08-04 [1] Github (tidyverse/dtplyr@cf7c2d8)
#>  evaluate      0.15        2022-02-18 [1] CRAN (R 4.2.0)
#>  fansi         1.0.3       2022-03-24 [1] CRAN (R 4.2.0)
#>  fastmap       1.1.0       2021-01-25 [1] CRAN (R 4.2.0)
#>  fs            1.5.2       2021-12-08 [1] CRAN (R 4.2.0)
#>  generics      0.1.3       2022-07-05 [1] CRAN (R 4.2.0)
#>  glue          1.6.2       2022-02-24 [1] CRAN (R 4.2.0)
#>  highr         0.9         2021-04-16 [1] CRAN (R 4.2.0)
#>  htmltools     0.5.3       2022-07-18 [1] CRAN (R 4.2.0)
#>  knitr         1.39        2022-04-26 [1] CRAN (R 4.2.0)
#>  lifecycle     1.0.1       2021-09-24 [1] CRAN (R 4.2.0)
#>  magrittr      2.0.3       2022-03-30 [1] CRAN (R 4.2.0)
#>  pillar        1.8.0       2022-07-18 [1] CRAN (R 4.2.0)
#>  pkgconfig     2.0.3       2019-09-22 [1] CRAN (R 4.2.0)
#>  purrr         0.3.4       2020-04-17 [1] CRAN (R 4.2.0)
#>  R.cache       0.16.0      2022-07-21 [1] CRAN (R 4.2.0)
#>  R.methodsS3   1.8.2       2022-06-13 [1] CRAN (R 4.2.0)
#>  R.oo          1.25.0      2022-06-12 [1] CRAN (R 4.2.0)
#>  R.utils       2.12.0      2022-06-28 [1] CRAN (R 4.2.0)
#>  R6            2.5.1       2021-08-19 [1] CRAN (R 4.2.0)
#>  rematch2      2.1.2       2020-05-01 [1] CRAN (R 4.2.0)
#>  reprex        2.0.1       2021-08-05 [1] CRAN (R 4.2.0)
#>  rlang         1.0.4       2022-07-12 [1] CRAN (R 4.2.0)
#>  rmarkdown     2.14        2022-04-25 [1] CRAN (R 4.2.0)
#>  rstudioapi    0.13        2020-11-12 [1] CRAN (R 4.2.0)
#>  sessioninfo   1.2.2       2021-12-06 [1] CRAN (R 4.2.0)
#>  stringi       1.7.8       2022-07-11 [1] CRAN (R 4.2.0)
#>  stringr       1.4.0       2019-02-10 [1] CRAN (R 4.2.0)
#>  styler        1.7.0       2022-03-13 [1] CRAN (R 4.2.0)
#>  tibble        3.1.8       2022-07-22 [1] CRAN (R 4.2.0)
#>  tidyr         1.2.0       2022-02-01 [1] CRAN (R 4.2.0)
#>  tidyselect    1.1.2.9000  2022-08-04 [1] Github (r-lib/tidyselect@190f80d)
#>  utf8          1.2.2       2021-07-24 [1] CRAN (R 4.2.0)
#>  vctrs         0.4.1.9000  2022-08-04 [1] Github (r-lib/vctrs@fe37f93)
#>  withr         2.5.0       2022-03-03 [1] CRAN (R 4.2.0)
#>  xfun          0.31        2022-05-10 [1] CRAN (R 4.2.0)
#>  yaml          2.3.5       2022-02-21 [1] CRAN (R 4.2.0)
#>
#>  [1] /Users/xxxxxx/Library/R/x86_64/4.2/library
#>  [2] /Library/Frameworks/R.framework/Versions/4.2/Resources/library
#>
#> ──────────────────────────────────────────────────────────────────────────────
```
markfairbanks commented 2 years ago

Smaller reprex:

# Attach dplyr without the masking messages. The argument name is written
# in full instead of relying on partial matching of `w` to `warn.conflicts`.
library(dplyr, warn.conflicts = FALSE)
library(dtplyr)

# Two lazy tables that share the join column `a` and both have a non-join
# column `b`, so the join has to disambiguate `b` with a suffix.
df1 <- lazy_dt(tibble(a = c("a", "b"), b = 1:2))
df2 <- lazy_dt(tibble(a = c("a", "b"), b = 3:4))

# Expected columns: a, b.one, b.two. The output below shows b.x / b.y.
df1 %>%
  full_join(df2, by = "a", suffix = c(".one", ".two"))
#> Source: local data table [2 x 3]
#> Call:   merge(`_DT1`, `_DT2`, all = TRUE, by.x = "a", by.y = "a", allow.cartesian = TRUE)
#> 
#>   a       b.x   b.y
#>   <chr> <int> <int>
#> 1 a         1     3
#> 2 b         2     4
#> 
#> # Use as.data.table()/as.data.frame()/as_tibble() to access results
eutwt commented 2 years ago

Thanks for the bug report, @caparks2! This should now work in the development version on GitHub:

# Attach dplyr without the masking messages. The argument name is written
# in full instead of relying on partial matching of `w` to `warn.conflicts`.
library(dplyr, warn.conflicts = FALSE)
library(dtplyr)

# Two lazy tables that share the join column `a` and both have a non-join
# column `b`, so the join has to disambiguate `b` with a suffix.
df1 <- lazy_dt(tibble(a = c("a", "b"), b = 1:2))
df2 <- lazy_dt(tibble(a = c("a", "b"), b = 3:4))

# In the output below, the generated merge() call carries
# `suffixes = c(".one", ".two")` and the columns are b.one / b.two.
df1 %>%
  full_join(df2, by = "a", suffix = c(".one", ".two"))
#> Source: local data table [2 x 3]
#> Call:   merge(`_DT1`, `_DT2`, all = TRUE, by.x = "a", by.y = "a", allow.cartesian = TRUE, 
#>     suffixes = c(".one", ".two"))
#> 
#>   a     b.one b.two
#>   <chr> <int> <int>
#> 1 a         1     3
#> 2 b         2     4
#> 
#> # Use as.data.table()/as.data.frame()/as_tibble() to access results

Created on 2022-08-11 by the reprex package (v2.0.1.9000)