Allows explicit mapping of data types to columns when reading CSV files (via the `colTypes` argument of `readCsv`), to avoid the issues described in #59. Here is a reprex for the code in this branch:
# Reprex: write a small data frame to CSV with CohortGenerator::saveIncremental(),
# then read it back with CohortGenerator::readCsv() with and without explicit
# column types, and check how the resulting tibbles join.
# Lines starting with `#>` are the captured output of the preceding statement.
library(dplyr)
#>
#> Attaching package: 'dplyr'
#> The following objects are masked from 'package:stats':
#>
#> filter, lag
#> The following objects are masked from 'package:base':
#>
#> intersect, setdiff, setequal, union
# Build a two-row data frame whose `description` column holds only empty strings.
df <- data.frame(database_id = "optum",
cohort_id = 27,
rule_sequence = 0,
name = "No COPD piror",
description = "")
df <- rbind(df, data.frame(database_id = "optum",
cohort_id = 28,
rule_sequence = 0,
name = "No piror asthma",
description = ""))
df
#> database_id cohort_id rule_sequence name description
#> 1 optum 27 0 No COPD piror
#> 2 optum 28 0 No piror asthma
# Save the data frame to a temporary CSV file.
tf <- tempfile(fileext = ".csv")
CohortGenerator::saveIncremental(data = df,
fileName = tf,
database_id = df$database_id,
cohort_id = df$cohort_id)
# readr guesses description is logical: with no colTypes given, the all-empty
# column comes back as <lgl> NA (see the output below)
df2 <- CohortGenerator::readCsv(file = tf)
df2
#> # A tibble: 2 x 5
#> databaseId cohortId ruleSequence name description
#> <chr> <dbl> <dbl> <chr> <lgl>
#> 1 optum 27 0 No COPD piror NA
#> 2 optum 28 0 No piror asthma NA
# explicitly declare the column types, one code per column in column order;
# per the output below, 'c' yields <chr> and 'd' yields <dbl>, so description
# is now read as character
df3 <- CohortGenerator::readCsv(
file = tf,
colTypes = list('c', 'd', 'd', 'c', 'c')
)
df3
#> # A tibble: 2 x 5
#> databaseId cohortId ruleSequence name description
#> <chr> <dbl> <dbl> <chr> <chr>
#> 1 optum 27 0 No COPD piror <NA>
#> 2 optum 28 0 No piror asthma <NA>
# do it again for testing join conditions
df4 <- CohortGenerator::readCsv(
file = tf,
colTypes = list('c', 'd', 'd', 'c', 'c')
)
# test joining data frames with the same description data type
# (no `by` is supplied, so the join uses all shared columns, as the message shows)
df5 <- df3 %>% inner_join(df4)
#> Joining with `by = join_by(databaseId, cohortId, ruleSequence, name,
#> description)`
df5
#> # A tibble: 2 x 5
#> databaseId cohortId ruleSequence name description
#> <chr> <dbl> <dbl> <chr> <chr>
#> 1 optum 27 0 No COPD piror <NA>
#> 2 optum 28 0 No piror asthma <NA>
# test joining data frames with different description data type
# (<chr> in df3 vs <lgl> in df2; the joined description column is <chr>)
df6 <- df3 %>% inner_join(df2)
#> Joining with `by = join_by(databaseId, cohortId, ruleSequence, name,
#> description)`
df6
#> # A tibble: 2 x 5
#> databaseId cohortId ruleSequence name description
#> <chr> <dbl> <dbl> <chr> <chr>
#> 1 optum 27 0 No COPD piror <NA>
#> 2 optum 28 0 No piror asthma <NA>
# test joining data frames with different description data type (order doesn't matter)
df7 <- df2 %>% inner_join(df3)
#> Joining with `by = join_by(databaseId, cohortId, ruleSequence, name,
#> description)`
df7
#> # A tibble: 2 x 5
#> databaseId cohortId ruleSequence name description
#> <chr> <dbl> <dbl> <chr> <chr>
#> 1 optum 27 0 No COPD piror <NA>
#> 2 optum 28 0 No piror asthma <NA>
Allows for explicit data type to column mapping to avoid issues as described in #59. Here is a reprex for the code in this branch:
Created on 2024-06-05 with reprex v2.1.0