metrumresearchgroup / yspec

Data Specification for Pharmacometrics
https://metrumresearchgroup.github.io/yspec
5 stars 2 forks source link

ys_factors #141

Closed kylebaron closed 1 year ago

kylebaron commented 1 year ago

Summary

Specifics

Example

library(dplyr)
#> 
#> Attaching package: 'dplyr'
#> The following objects are masked from 'package:stats':
#> 
#>     filter, lag
#> The following objects are masked from 'package:base':
#> 
#>     intersect, setdiff, setequal, union
library(yspec)
options(pillar.width = Inf)

data <- ys_help$data() %>% as_tibble()
spec <- ys_help$spec()

ys_add_factors() adds factor columns

ys_add_factors(data, spec) %>% head(5)
#> # A tibble: 5 × 38
#>   C       NUM    ID  SUBJ  TIME   SEQ   CMT  EVID   AMT    DV   AGE    WT  CRCL
#>   <lgl> <int> <int> <int> <dbl> <int> <int> <int> <int> <dbl> <dbl> <dbl> <dbl>
#> 1 NA        1     1     1  0        0     1     1     5   0    28.0  55.2  114.
#> 2 NA        2     1     1  0.61     1     2     0    NA  61.0  28.0  55.2  114.
#> 3 NA        3     1     1  1.15     1     2     0    NA  91.0  28.0  55.2  114.
#> 4 NA        4     1     1  1.73     1     2     0    NA 122.   28.0  55.2  114.
#> 5 NA        5     1     1  2.15     1     2     0    NA 126.   28.0  55.2  114.
#>     ALB   BMI   AAG   SCR   AST   ALT    HT    CP  TAFD   TAD  LDOS   MDV   BLQ
#>   <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <int> <dbl> <dbl> <int> <int> <int>
#> 1   4.4  21.7  106.  1.14  11.9  12.7  160.     0  0     0        5     1     0
#> 2   4.4  21.7  106.  1.14  11.9  12.7  160.     0  0.61  0.61     5     0     0
#> 3   4.4  21.7  106.  1.14  11.9  12.7  160.     0  1.15  1.15     5     0     0
#> 4   4.4  21.7  106.  1.14  11.9  12.7  160.     0  1.73  1.73     5     0     0
#> 5   4.4  21.7  106.  1.14  11.9  12.7  160.     0  2.15  2.15     5     0     0
#>   PHASE STUDY RF    C_f   SEQ_f       EVID_f      CP_f   MDV_f       BLQ_f   
#>   <int> <int> <chr> <fct> <fct>       <fct>       <fct>  <fct>       <fct>   
#> 1     1     1 norm  <NA>  observation dose        normal missing     above QL
#> 2     1     1 norm  <NA>  dose        observation normal non-missing above QL
#> 3     1     1 norm  <NA>  dose        observation normal non-missing above QL
#> 4     1     1 norm  <NA>  dose        observation normal non-missing above QL
#> 5     1     1 norm  <NA>  dose        observation normal non-missing above QL
#>   PHASE_f STUDY_f RF_f  
#>   <fct>   <fct>   <fct> 
#> 1 1       SAD     Normal
#> 2 1       SAD     Normal
#> 3 1       SAD     Normal
#> 4 1       SAD     Normal
#> 5 1       SAD     Normal

ys_factors() converts columns to factors and adds columns holding the original values

ys_factors(data, spec) %>% head(5)
#> # A tibble: 5 × 38
#>   C       NUM    ID  SUBJ  TIME SEQ           CMT EVID          AMT    DV   AGE
#>   <fct> <int> <int> <int> <dbl> <fct>       <int> <fct>       <int> <dbl> <dbl>
#> 1 <NA>      1     1     1  0    observation     1 dose            5   0    28.0
#> 2 <NA>      2     1     1  0.61 dose            2 observation    NA  61.0  28.0
#> 3 <NA>      3     1     1  1.15 dose            2 observation    NA  91.0  28.0
#> 4 <NA>      4     1     1  1.73 dose            2 observation    NA 122.   28.0
#> 5 <NA>      5     1     1  2.15 dose            2 observation    NA 126.   28.0
#>      WT  CRCL   ALB   BMI   AAG   SCR   AST   ALT    HT CP      TAFD   TAD  LDOS
#>   <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <fct>  <dbl> <dbl> <int>
#> 1  55.2  114.   4.4  21.7  106.  1.14  11.9  12.7  160. normal  0     0        5
#> 2  55.2  114.   4.4  21.7  106.  1.14  11.9  12.7  160. normal  0.61  0.61     5
#> 3  55.2  114.   4.4  21.7  106.  1.14  11.9  12.7  160. normal  1.15  1.15     5
#> 4  55.2  114.   4.4  21.7  106.  1.14  11.9  12.7  160. normal  1.73  1.73     5
#> 5  55.2  114.   4.4  21.7  106.  1.14  11.9  12.7  160. normal  2.15  2.15     5
#>   MDV         BLQ      PHASE STUDY RF     C_v   SEQ_v EVID_v  CP_v MDV_v BLQ_v
#>   <fct>       <fct>    <fct> <fct> <fct>  <lgl> <int>  <int> <int> <int> <int>
#> 1 missing     above QL 1     SAD   Normal NA        0      1     0     1     0
#> 2 non-missing above QL 1     SAD   Normal NA        1      0     0     0     0
#> 3 non-missing above QL 1     SAD   Normal NA        1      0     0     0     0
#> 4 non-missing above QL 1     SAD   Normal NA        1      0     0     0     0
#> 5 non-missing above QL 1     SAD   Normal NA        1      0     0     0     0
#>   PHASE_v STUDY_v RF_v 
#>     <int>   <int> <chr>
#> 1       1       1 norm 
#> 2       1       1 norm 
#> 3       1       1 norm 
#> 4       1       1 norm 
#> 5       1       1 norm
ys_factors(data, spec) %>% 
  as_tibble() %>% 
  count(EVID, EVID_v, CP, CP_v)
#> # A tibble: 8 × 5
#>   EVID        EVID_v CP      CP_v     n
#>   <fct>        <int> <fct>  <int> <int>
#> 1 observation      0 normal     0  2760
#> 2 observation      0 Pugh1      1   150
#> 3 observation      0 Pugh2      2   150
#> 4 observation      0 Pugh3      3   150
#> 5 dose             1 normal     0  1120
#> 6 dose             1 Pugh1      1    10
#> 7 dose             1 Pugh2      2    10
#> 8 dose             1 Pugh3      3    10

Created on 2023-07-18 with reprex v2.0.2

andersone1 commented 1 year ago

Hey @kylebaron,

I plan to get to this review soon - just knocking out some client work this week.

andersone1 commented 1 year ago

@kylebaron

The code looks good - I have a comment about a particular way this may get used, specifically:

ys_factors(data, spec, EVID, .keep_values = TRUE, .suffix = "")

The key part is .keep_values = TRUE, .suffix = ""

Do we want to disallow this option (or potentially set .suffix to NULL under the hood if it's equal to "" and .keep_values = TRUE)?

Leaving it as is may be OK too.

kylebaron commented 1 year ago

@andersone1 - good question; let me confirm what the behavior is in this case.

kylebaron commented 1 year ago

@andersone1 - you are right; the factors were getting overwritten; I updated the logic in the code and added a test for this.