pharmaverse / admiral

ADaM in R Asset Library
https://pharmaverse.github.io/admiral
Apache License 2.0
224 stars 64 forks source link

Union_all creates duplicates in ad_advs template #1363

Closed morrishj closed 2 years ago

morrishj commented 2 years ago

What happened?

_unionall(advs) in advs template (line 261) appears to be creating duplicates of any rows where EOTFL = "Y" by joining the filtered rows [line 263: filter(EOTFL == "Y") ] back to the advs dataset. Adding _arrange(., EOTFL) %>% distinct(., USUBJID, VSSEQ, VISITNUM, .keepall = TRUE) after union_all resolves this.

Session Information

sessionInfo() R version 4.1.3 (2022-03-10) Platform: x86_64-pc-linux-gnu (64-bit) Running under: Ubuntu 20.04.4 LTS

Matrix products: default BLAS: /usr/lib/x86_64-linux-gnu/blas/libblas.so.3.9.0 LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.9.0

locale: [1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C LC_TIME=en_US.UTF-8 LC_COLLATE=en_US.UTF-8
[5] LC_MONETARY=en_US.UTF-8 LC_MESSAGES=en_US.UTF-8 LC_PAPER=en_US.UTF-8 LC_NAME=C
[9] LC_ADDRESS=C LC_TELEPHONE=C LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C

attached base packages: [1] stats graphics grDevices utils datasets methods base

other attached packages: [1] rice_3.0 shiny_1.7.1 viewenhance_0.1.0 devtools_2.4.3 usethis_2.1.6 diffdf_1.0.4
[7] stringr_1.4.0 lubridate_1.8.0 admiral_0.7.0 dplyr_1.0.9

loaded via a namespace (and not attached): [1] bslib_0.3.1 tidyselect_1.1.2 remotes_2.4.2 purrr_0.3.4 haven_2.5.0 vctrs_0.4.1
[7] generics_0.1.2 miniUI_0.1.1.1 htmltools_0.5.2 utf8_1.2.2 rlang_1.0.3 pkgbuild_1.3.1
[13] jquerylib_0.1.4 pillar_1.7.0 later_1.3.0 glue_1.6.2 DBI_1.1.3 sessioninfo_1.2.2 [19] lifecycle_1.0.1 fontawesome_0.2.2 memoise_2.0.1 forcats_0.5.1 tzdb_0.3.0 callr_3.7.0
[25] fastmap_1.1.0 httpuv_1.6.5 ps_1.7.1 curl_4.3.2 fansi_1.0.3 Rcpp_1.0.8.3
[31] readr_2.1.2 xtable_1.8-4 promises_1.2.0.1 cachem_1.0.6 pkgload_1.3.0 jsonlite_1.8.0
[37] mime_0.12 fs_1.5.2 hms_1.1.1 digest_0.6.29 stringi_1.7.6 processx_3.6.1
[43] getPass_0.2-2 cli_3.3.0 tools_4.1.3 sass_0.4.1 magrittr_2.0.3 tibble_3.1.7
[49] crayon_1.5.1 tidyr_1.2.0 pkgconfig_2.0.3 ellipsis_0.3.2 prettyunits_1.1.1 assertthat_0.2.1 [55] httr_1.4.3 rstudioapi_0.13 R6_2.5.1 compiler_4.1.3

Reproducible Example

---- Lookup tables ----

Assign PARAMCD, PARAM, and PARAMN

param_lookup <- tibble::tribble( ~VSTESTCD, ~PARAMCD, ~PARAM, ~PARAMN, "SYSBP", "SYSBP", "Systolic Blood Pressure (mmHg)", 1, "DIABP", "DIABP", "Diastolic Blood Pressure (mmHg)", 2, "PULSE", "PUL", "Pulse Rate (beats/min)", 3, "WEIGHT", "WGHT", "Weight (kg)", 4, "HEIGHT", "HGHT", "Height (cm)", 5, "TEMP", "TEMP", "Temperature (C)", 6, "RESP", "RESP", "Respiratory Rate (breaths/min)", 7, "OXYSAT", "OXYSAT", "Oxygen Saturation (%)", 8 ) attr(param_lookup$VSTESTCD, "label") <- "Vital Signs Test Short Name"

Assign ANRLO/HI, A1LO/HI

range_lookup <- tibble::tribble( ~PARAMCD, ~ANRLO, ~ANRHI, ~A1LO, ~A1HI, "SYSBP", 90, 130, 70, 140, "DIABP", 60, 80, 40, 90, "PUL", 60, 100, 40, 110, "TEMP", 36.5, 37.5, 35, 38, "RESP", 8, 20, NA, NA )

ASSIGN AVALCAT1

avalcat_lookup <- tibble::tribble( ~PARAMCD, ~AVALCA1N, ~AVALCAT1, "HGHT", 1, ">100 cm", "HGHT", 2, "<= 100 cm" )

---- User defined functions ----

Here are some examples of how you can create your own functions that

operates on vectors, which can be used in mutate().

format_avalcat1n <- function(param, aval) { case_when( param == "HGHT" & aval > 140 ~ 1, param == "HGHT" & aval <= 140 ~ 2 ) }

---- Derivations ----

Get list of ADSL vars required for derivations

adsl_vars <- vars(TRTSDT, TRTEDT, TRT01A, TRT01P)

advs <- vs %>%

Join ADSL with VS (need TRTSDT for ADY derivation)

derive_vars_merged( dataset_add = adsl, new_vars = adsl_vars, by_vars = vars(STUDYID, USUBJID) ) %>%

Calculate ADT, ADY

derive_vars_dt( new_vars_prefix = "A", dtc = VSDTC, flag_imputation = "none" ) %>%

calculate astdy and aendy

derive_vars_dy(reference_date = TRTSDT, source_vars = vars(ADT)) %>%

calculate adtm

derive_vars_dtm(new_vars_prefix = "A", dtc=VSDTC, flag_imputation="both")

advs <- advs %>%

Add PARAMCD only - add PARAM etc later

derive_vars_merged( dataset_add = param_lookup, new_vars = vars(PARAMCD), by_vars = vars(VSTESTCD) ) %>%

Calculate AVAL and AVALC

mutate( AVAL = VSSTRESN, AVALC = VSSTRESC, AVALU = VSSTRESU )

Derive new parameters based on existing records. Note that, for the following

three derive_param_*() functions, only the variables specified in by_vars will

be populated in the newly created records.

Derive Mean Arterial Pressure

derive_param_map(

by_vars = vars(STUDYID, USUBJID, !!!adsl_vars, VISIT, VISITNUM, ADT, ADY, VSTPT, VSTPTNUM),

set_values_to = vars(PARAMCD = "MAP"),

get_unit_expr = VSSTRESU,

filter = VSSTAT != "NOT DONE" | is.na(VSSTAT)

) %>%

Derive Body Surface Area

derive_param_bsa(

by_vars = vars(STUDYID, USUBJID, !!!adsl_vars, VISIT, VISITNUM, ADT, ADY, VSTPT, VSTPTNUM),

method = "Mosteller",

set_values_to = vars(PARAMCD = "BSA"),

get_unit_expr = VSSTRESU,

filter = VSSTAT != "NOT DONE" | is.na(VSSTAT)

) %>%

Derive Body Surface Area

derive_param_bmi(

by_vars = vars(STUDYID, USUBJID, !!!adsl_vars, VISIT, VISITNUM, ADT, ADY, VSTPT, VSTPTNUM),

set_values_to = vars(PARAMCD = "BMI"),

get_unit_expr = VSSTRESU,

filter = VSSTAT != "NOT DONE" | is.na(VSSTAT)

)

get visit info

advs <- advs %>%

Derive Timing

mutate( AVISIT = ifelse(!is.na(ADTM) & ADTM<= TRTSDT, "BASELINE", VISIT) )

advs <- advs %>%

Derive Timing

mutate( AVISIT = ifelse(AVISIT != "BASELINE", ifelse(str_detect(VISIT,"(?i)Unscheduled"), "Unscheduled", AVISIT), AVISIT) )

advs <- advs %>%

Derive Timing

mutate( AVISITN = ifelse(AVISIT == "BASELINE", 0, VISITNUM) )

Derive a new record as a summary record (e.g. mean of the triplicates at each time point)

commented out as we don't use triplicates

advs <- advs %>%

derive_summary_records(

by_vars = vars(STUDYID, USUBJID, !!!adsl_vars, PARAMCD, AVISITN, AVISIT, ADT, ADY),

filter = !is.na(AVAL),

analysis_var = AVAL,

summary_fun = mean,

set_values_to = vars(DTYPE = "AVERAGE")

)

advs <- advs %>%

Calculate ONTRTFL

derive_var_ontrtfl( start_date = ADT, ref_start_date = TRTSDT, ref_end_date = TRTEDT, filter_pre_timepoint = AVISIT == "BASELINE" )

Calculate ANRIND : requires the reference ranges ANRLO, ANRHI

Also accommodates the ranges A1LO, A1HI

advs <- advs %>% derive_vars_merged(dataset_add = range_lookup, by_vars = vars(PARAMCD)) %>%

Calculate ANRIND

derive_var_anrind()

Derive baseline flags

advs <- advs %>%

ADVS dataset on eICE has BASETYPE set to LAST Period 01

Calculate BASETYPE

derive_var_basetype( basetypes = rlang::exprs( "LAST" = !is.na(AVISIT), "NA" = is.na(AVISIT) ) ) %>%

Calculate ABLFL

restrict_derivation( derivation = derive_var_extreme_flag, args = params( by_vars = vars(STUDYID, USUBJID, PARAMCD), order = vars(ADTM, VSSPID, VSSEQ), new_var = ABLFL, mode = "last" ), filter = (!is.na(AVAL) & ADT <= TRTSDT & !is.na(BASETYPE))

DAPM3 for ABLFL says "...within the baseline visit window ADVS.AVISIT = 'BASELINE'] "

)

Derive baseline information

advs <- advs %>%

Calculate BASE

derive_var_base( by_vars = vars(STUDYID, USUBJID, PARAMCD, BASETYPE), source_var = AVAL, new_var = BASE ) %>%

Calculate BASEC

derive_var_base( by_vars = vars(STUDYID, USUBJID, PARAMCD, BASETYPE), source_var = AVALC, new_var = BASEC ) %>%

Calculate BNRIND

derive_var_base( by_vars = vars(STUDYID, USUBJID, PARAMCD, BASETYPE), source_var = ANRIND, new_var = BNRIND ) %>%

Calculate CHG

derive_var_chg() %>%

Calculate PCHG

derive_var_pchg()

ANL01FL: Flag last result within an AVISIT and ATPT for post-baseline records

advs <- advs %>% restrict_derivation( derivation = derive_var_extreme_flag, args = params( new_var = ANL01FL, by_vars = vars(USUBJID, PARAMCD, AVISIT), order = vars(ADT, AVAL, VSSEQ), mode = "last" ), filter = !is.na(AVISITN) & ONTRTFL == "Y" )

Get treatment information

advs <- advs %>%

Create End of Treatment Record

restrict_derivation( derivation = derive_var_extreme_flag, args = params( by_vars = vars(STUDYID, USUBJID, PARAMCD), order = vars(ADT, VSSEQ), new_var = EOTFL, mode = "last" ), filter = (ADT > TRTSDT & ADT <= TRTEDT) ) %>% filter(EOTFL == "Y") %>% mutate( AVISIT = "End of Treatment", AVISITN = 99 ) %>% union_all(advs) %>% arrange(., EOTFL) %>% distinct(., USUBJID, VSSEQ, VISITNUM, .keep_all = TRUE) %>% select(-EOTFL) %>%

Assign TRTA, TRTP

mutate( TRTP = TRT01P, TRTA = TRT01A )

thomas-neitmann commented 2 years ago

@morrishj Could you add a fix for this to your open PR #1362? WHile your solution using distinct() certainly does the job I wonder if we should just use union_all(filter(advs, EOTFL != "Y" | is.na(EOTFL)) instead to be more explicit of what's actually going on.

thomas-neitmann commented 2 years ago

As mentioned by @oreillb1 this behavior of duplicating records for the end of treatment visit is intentional.