njtierney / naniar

Tidy data structures, summaries, and visualisations for missing data
http://naniar.njtierney.com/
Other
652 stars 53 forks source link

Update imputers and provide documentation for impute functions #324

Closed njtierney closed 1 year ago

njtierney commented 1 year ago

Description

This updates the impute functions and documents how to use them with `across()`.

This also involved deprecating `shadow_shift()` in favour of `impute_below()` and changing various parts of the code base accordingly.

Related Issue

resolves #262 resolves #193

Example

This adds more detail on the impute functions and shows how to use them with `across()`, e.g.,

library(naniar)
library(dplyr)
#> 
#> Attaching package: 'dplyr'
#> The following objects are masked from 'package:stats':
#> 
#>     filter, lag
#> The following objects are masked from 'package:base':
#> 
#>     intersect, setdiff, setequal, union
vec_fct <- factor(LETTERS[1:10])
vec_fct[sample(1:10, 3)] <- NA
vec_fct
#>  [1] A    B    C    <NA> E    F    G    <NA> I    <NA>
#> Levels: A B C D E F G H I J
impute_factor(vec_fct, "wat")
#>  [1] A   B   C   wat E   F   G   wat I   wat
#> Levels: A B C D E F G H I J wat
vec <- rnorm(10)
vec[sample(1:10, 3)] <- NA
vec
#>  [1]  0.5629153         NA -0.0867603  0.3899923  2.0253902  0.8396355
#>  [7]  0.4184700  0.3804955         NA         NA
impute_below(vec)
#>  [1]  0.5629153 -0.2858669 -0.0867603  0.3899923  2.0253902  0.8396355
#>  [7]  0.4184700  0.3804955 -0.3127560 -0.3335501
impute_fixed(vec, -99)
#>  [1]   0.5629153 -99.0000000  -0.0867603   0.3899923   2.0253902   0.8396355
#>  [7]   0.4184700   0.3804955 -99.0000000 -99.0000000
impute_mean(vec)
#>  [1]  0.5629153  0.6471627 -0.0867603  0.3899923  2.0253902  0.8396355
#>  [7]  0.4184700  0.3804955  0.6471627  0.6471627
impute_median(vec)
#>  [1]  0.5629153  0.4184700 -0.0867603  0.3899923  2.0253902  0.8396355
#>  [7]  0.4184700  0.3804955  0.4184700  0.4184700
impute_mode(vec)
#>  [1]  0.5629153  0.4307643 -0.0867603  0.3899923  2.0253902  0.8396355
#>  [7]  0.4184700  0.3804955  0.4307643  0.4307643
impute_mode(vec_fct)
#>  [1] A B C A E F G A I A
#> Levels: A B C D E F G H I J
impute_zero(vec)
#>  [1]  0.5629153  0.0000000 -0.0867603  0.3899923  2.0253902  0.8396355
#>  [7]  0.4184700  0.3804955  0.0000000  0.0000000

dat <- tibble(
  num = rnorm(10),
  int = rpois(10, 5),
  unif = runif(10),
  small = rpois(10, 8) / 1000,
  large = rpois(10, 1e6),
  neg = -rpois(10, 10),
  geom = rgeom(10, 0.1),
  fct = factor(LETTERS[1:10])
) %>%
  mutate(
    across(
      everything(),
      \(x) set_prop_miss(x, prop = 0.25)
    )
  )

dat
#> # A tibble: 10 × 8
#>        num   int    unif  small   large   neg  geom fct  
#>      <dbl> <int>   <dbl>  <dbl>   <int> <int> <int> <fct>
#>  1  0.0791     3  0.750   0.01       NA   -10     2 A    
#>  2  0.295      7  0.0258 NA          NA    -8     2 B    
#>  3  1.10       5  0.406   0.012  999918   -13    18 C    
#>  4 -1.57       1  0.677   0.01  1000710    NA     0 D    
#>  5 -0.324     NA  0.116   0.007  999753   -13     3 E    
#>  6 NA          1 NA       0.007  999087    -6    NA F    
#>  7 -1.62       3  0.262   0.008 1000407   -21    12 G    
#>  8 -1.50      NA NA       0.004 1001663    NA    NA H    
#>  9 -0.721      7  0.323   0.008  999725    -4     1 <NA> 
#> 10 NA          6  0.375  NA     1001373   -13     1 <NA>

dat %>%
  nabular() %>%
  mutate(
    num = impute_fixed(num, -9999),
    int = impute_zero(int),
    fct = impute_factor(fct, "out")
  )
#> # A tibble: 10 × 16
#>         num   int    unif  small   large   neg  geom fct   num_NA int_NA unif_NA
#>       <dbl> <dbl>   <dbl>  <dbl>   <int> <int> <int> <fct> <fct>  <fct>  <fct>  
#>  1  7.91e-2     3  0.750   0.01       NA   -10     2 A     !NA    !NA    !NA    
#>  2  2.95e-1     7  0.0258 NA          NA    -8     2 B     !NA    !NA    !NA    
#>  3  1.10e+0     5  0.406   0.012  999918   -13    18 C     !NA    !NA    !NA    
#>  4 -1.57e+0     1  0.677   0.01  1000710    NA     0 D     !NA    !NA    !NA    
#>  5 -3.24e-1     0  0.116   0.007  999753   -13     3 E     !NA    NA     !NA    
#>  6 -1.00e+4     1 NA       0.007  999087    -6    NA F     NA     !NA    NA     
#>  7 -1.62e+0     3  0.262   0.008 1000407   -21    12 G     !NA    !NA    !NA    
#>  8 -1.50e+0     0 NA       0.004 1001663    NA    NA H     !NA    NA     NA     
#>  9 -7.21e-1     7  0.323   0.008  999725    -4     1 out   !NA    !NA    !NA    
#> 10 -1.00e+4     6  0.375  NA     1001373   -13     1 out   NA     !NA    !NA    
#> # ℹ 5 more variables: small_NA <fct>, large_NA <fct>, neg_NA <fct>,
#> #   geom_NA <fct>, fct_NA <fct>

dat %>%
  mutate(
    across(
      c("num", "int", "unif"),
      \(x) impute_mean(x)
    )
  )
#> # A tibble: 10 × 8
#>        num   int   unif  small   large   neg  geom fct  
#>      <dbl> <dbl>  <dbl>  <dbl>   <int> <int> <int> <fct>
#>  1  0.0791  3    0.750   0.01       NA   -10     2 A    
#>  2  0.295   7    0.0258 NA          NA    -8     2 B    
#>  3  1.10    5    0.406   0.012  999918   -13    18 C    
#>  4 -1.57    1    0.677   0.01  1000710    NA     0 D    
#>  5 -0.324   4.12 0.116   0.007  999753   -13     3 E    
#>  6 -0.533   1    0.367   0.007  999087    -6    NA F    
#>  7 -1.62    3    0.262   0.008 1000407   -21    12 G    
#>  8 -1.50    4.12 0.367   0.004 1001663    NA    NA H    
#>  9 -0.721   7    0.323   0.008  999725    -4     1 <NA> 
#> 10 -0.533   6    0.375  NA     1001373   -13     1 <NA>

dat %>%
  mutate(
    across(
      where(is.numeric),
      \(x) impute_mean(x)
    )
  )
#> # A tibble: 10 × 8
#>        num   int   unif   small    large   neg  geom fct  
#>      <dbl> <dbl>  <dbl>   <dbl>    <dbl> <dbl> <dbl> <fct>
#>  1  0.0791  3    0.750  0.01    1000330.   -10  2    A    
#>  2  0.295   7    0.0258 0.00825 1000330.    -8  2    B    
#>  3  1.10    5    0.406  0.012    999918    -13 18    C    
#>  4 -1.57    1    0.677  0.01    1000710    -11  0    D    
#>  5 -0.324   4.12 0.116  0.007    999753    -13  3    E    
#>  6 -0.533   1    0.367  0.007    999087     -6  4.88 F    
#>  7 -1.62    3    0.262  0.008   1000407    -21 12    G    
#>  8 -1.50    4.12 0.367  0.004   1001663    -11  4.88 H    
#>  9 -0.721   7    0.323  0.008    999725     -4  1    <NA> 
#> 10 -0.533   6    0.375  0.00825 1001373    -13  1    <NA>

Created on 2023-04-13 with reprex v2.0.2

Session info ``` r sessioninfo::session_info() #> ─ Session info ─────────────────────────────────────────────────────────────── #> setting value #> version R version 4.2.3 (2023-03-15) #> os macOS Ventura 13.2 #> system aarch64, darwin20 #> ui X11 #> language (EN) #> collate en_US.UTF-8 #> ctype en_US.UTF-8 #> tz Australia/Sydney #> date 2023-04-13 #> pandoc 2.19.2 @ /Applications/RStudio.app/Contents/Resources/app/quarto/bin/tools/ (via rmarkdown) #> #> ─ Packages ─────────────────────────────────────────────────────────────────── #> package * version date (UTC) lib source #> cli 3.6.1 2023-03-23 [1] CRAN (R 4.2.0) #> colorspace 2.1-0 2023-01-23 [1] CRAN (R 4.2.0) #> digest 0.6.31 2022-12-11 [1] CRAN (R 4.2.0) #> dplyr * 1.1.1 2023-03-22 [1] CRAN (R 4.2.0) #> evaluate 0.20 2023-01-17 [1] CRAN (R 4.2.0) #> fansi 1.0.4 2023-01-22 [1] CRAN (R 4.2.0) #> fastmap 1.1.1 2023-02-24 [1] CRAN (R 4.2.0) #> forcats 1.0.0 2023-01-29 [1] CRAN (R 4.2.0) #> fs 1.6.1 2023-02-06 [1] CRAN (R 4.2.0) #> generics 0.1.3 2022-07-05 [1] CRAN (R 4.2.0) #> ggplot2 3.4.2 2023-04-03 [1] CRAN (R 4.2.0) #> glue 1.6.2 2022-02-24 [1] CRAN (R 4.2.0) #> gtable 0.3.3 2023-03-21 [1] CRAN (R 4.2.0) #> htmltools 0.5.5 2023-03-23 [1] CRAN (R 4.2.0) #> knitr 1.42 2023-01-25 [1] CRAN (R 4.2.0) #> lifecycle 1.0.3 2022-10-07 [1] CRAN (R 4.2.0) #> magrittr 2.0.3 2022-03-30 [1] CRAN (R 4.2.0) #> munsell 0.5.0 2018-06-12 [1] CRAN (R 4.2.0) #> naniar * 1.0.0.9000 2023-04-10 [1] local #> pillar 1.9.0 2023-03-22 [1] CRAN (R 4.2.0) #> pkgconfig 2.0.3 2019-09-22 [1] CRAN (R 4.2.0) #> purrr 1.0.1 2023-01-10 [1] CRAN (R 4.2.0) #> R.cache 0.16.0 2022-07-21 [1] CRAN (R 4.2.0) #> R.methodsS3 1.8.2 2022-06-13 [1] CRAN (R 4.2.0) #> R.oo 1.25.0 2022-06-12 [1] CRAN (R 4.2.0) #> R.utils 2.12.2 2022-11-11 [1] CRAN (R 4.2.0) #> R6 2.5.1 2021-08-19 [1] CRAN (R 4.2.0) #> reprex 2.0.2 2022-08-17 [1] CRAN (R 4.2.0) #> rlang 1.1.0 2023-03-14 [1] CRAN (R 4.2.0) #> rmarkdown 2.21 2023-03-26 [1] CRAN (R 4.2.0) #> rstudioapi 0.14 
2022-08-22 [1] CRAN (R 4.2.0) #> scales 1.2.1 2022-08-20 [1] CRAN (R 4.2.0) #> sessioninfo 1.2.2 2021-12-06 [1] CRAN (R 4.2.0) #> styler 1.9.1 2023-03-04 [1] CRAN (R 4.2.0) #> tibble 3.2.1 2023-03-20 [1] CRAN (R 4.2.0) #> tidyselect 1.2.0 2022-10-10 [1] CRAN (R 4.2.0) #> utf8 1.2.3 2023-01-31 [1] CRAN (R 4.2.0) #> vctrs 0.6.1 2023-03-22 [1] CRAN (R 4.2.0) #> visdat 0.6.0 2023-02-02 [1] local #> withr 2.5.0 2022-03-03 [1] CRAN (R 4.2.0) #> xfun 0.38 2023-03-24 [1] CRAN (R 4.2.0) #> yaml 2.3.7 2023-01-23 [1] CRAN (R 4.2.0) #> #> [1] /Library/Frameworks/R.framework/Versions/4.2-arm64/Resources/library #> #> ────────────────────────────────────────────────────────────────────────────── ```

Tests

Yes

NEWS + DESCRIPTION

Yes