OHDSI / CohortGenerator

An R package for instantiating cohorts using data in the CDM.
https://ohdsi.github.io/CohortGenerator/
11 stars 10 forks source link

Expose col_types from readr #160

Closed anthonysena closed 1 month ago

anthonysena commented 1 month ago

Allows callers to explicitly map columns to data types, which avoids the type-guessing issues described in #59. Here is a reprex for the code in this branch:

# Reprex: write a small data frame whose `description` column holds only
# empty strings, read it back with and without explicit column types, and
# compare the resulting `description` type (also after joining the results).
library(dplyr)
#> 
#> Attaching package: 'dplyr'
#> The following objects are masked from 'package:stats':
#> 
#>     filter, lag
#> The following objects are masked from 'package:base':
#> 
#>     intersect, setdiff, setequal, union
# Build a two-row data frame; `description` is an empty string in both rows.
df <- data.frame(database_id = "optum",
                 cohort_id = 27,
                 rule_sequence = 0,
                 name = "No COPD piror",
                 description = "")
df <- rbind(df, data.frame(database_id = "optum",
                           cohort_id = 28,
                           rule_sequence = 0,
                           name = "No piror asthma",
                           description = ""))
df
#>   database_id cohort_id rule_sequence            name description
#> 1       optum        27             0   No COPD piror            
#> 2       optum        28             0 No piror asthma

# NOTE(review): presumably writes `df` to the temporary CSV, with database_id
# and cohort_id acting as the incremental keys -- confirm against the
# saveIncremental() documentation.
tf <- tempfile(fileext = ".csv")
CohortGenerator::saveIncremental(data = df,
                                 fileName = tf,
                                 database_id = df$database_id,
                                 cohort_id = df$cohort_id)

# readr guesses description is logical
# (the printed result below also shows the column names in camelCase and the
# empty descriptions read back as NA)
df2 <- CohortGenerator::readCsv(file = tf)
df2
#> # A tibble: 2 x 5
#>   databaseId cohortId ruleSequence name            description
#>   <chr>         <dbl>        <dbl> <chr>           <lgl>      
#> 1 optum            27            0 No COPD piror   NA         
#> 2 optum            28            0 No piror asthma NA

# explicitly declare the column types, one entry per column in file order;
# per the printed result, 'c' yields <chr> and 'd' yields <dbl>
df3 <- CohortGenerator::readCsv(
  file = tf,
  colTypes = list('c', 'd', 'd', 'c', 'c')
)
df3
#> # A tibble: 2 x 5
#>   databaseId cohortId ruleSequence name            description
#>   <chr>         <dbl>        <dbl> <chr>           <chr>      
#> 1 optum            27            0 No COPD piror   <NA>       
#> 2 optum            28            0 No piror asthma <NA>

# do it again for testing join conditions
# (same file, same explicit column types as df3)
df4 <- CohortGenerator::readCsv(
  file = tf,
  colTypes = list('c', 'd', 'd', 'c', 'c')
)

# test joining data frames with the same description data type
# (no `by` is given, so the join uses all shared columns, as the message shows)
df5 <- df3 %>% inner_join(df4)
#> Joining with `by = join_by(databaseId, cohortId, ruleSequence, name,
#> description)`
df5
#> # A tibble: 2 x 5
#>   databaseId cohortId ruleSequence name            description
#>   <chr>         <dbl>        <dbl> <chr>           <chr>      
#> 1 optum            27            0 No COPD piror   <NA>       
#> 2 optum            28            0 No piror asthma <NA>

# test joining data frames with different description data type
# (character in df3 vs. logical in df2; the result prints as <chr>)
df6 <- df3 %>% inner_join(df2)
#> Joining with `by = join_by(databaseId, cohortId, ruleSequence, name,
#> description)`
df6
#> # A tibble: 2 x 5
#>   databaseId cohortId ruleSequence name            description
#>   <chr>         <dbl>        <dbl> <chr>           <chr>      
#> 1 optum            27            0 No COPD piror   <NA>       
#> 2 optum            28            0 No piror asthma <NA>

# test joining data frames with different description data type (order doesn't matter)
df7 <- df2 %>% inner_join(df3)
#> Joining with `by = join_by(databaseId, cohortId, ruleSequence, name,
#> description)`
df7
#> # A tibble: 2 x 5
#>   databaseId cohortId ruleSequence name            description
#>   <chr>         <dbl>        <dbl> <chr>           <chr>      
#> 1 optum            27            0 No COPD piror   <NA>       
#> 2 optum            28            0 No piror asthma <NA>

Created on 2024-06-05 with reprex v2.1.0