Open njtierney opened 1 year ago
As discussed with @dicook , these helpers could be used to identify which rows and variables need imputation/dropping/exploring.
Perhaps something like this, with an example workflow
library(tidyverse) # which row ID has > X% missing # which variables have > x% missing which_prop_miss_row <- function(data, prop){ naniar::prop_miss_row(data) > prop } prop_miss_cols <- function(data){ colMeans(is.na(data)) } which_prop_miss_var <- function(data, prop){ prop_miss_cols(data) > prop } which_prop_miss_row(airquality, 0.1) #> [1] FALSE FALSE FALSE FALSE TRUE TRUE FALSE FALSE FALSE TRUE TRUE FALSE #> [13] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE #> [25] TRUE TRUE TRUE FALSE FALSE FALSE FALSE TRUE TRUE TRUE TRUE TRUE #> [37] TRUE FALSE TRUE FALSE FALSE TRUE TRUE FALSE TRUE TRUE FALSE FALSE #> [49] FALSE FALSE FALSE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE #> [61] TRUE FALSE FALSE FALSE TRUE FALSE FALSE FALSE FALSE FALSE FALSE TRUE #> [73] FALSE FALSE TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE TRUE TRUE #> [85] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE TRUE #> [97] TRUE TRUE FALSE FALSE FALSE TRUE TRUE FALSE FALSE FALSE TRUE FALSE #> [109] FALSE FALSE FALSE FALSE FALSE FALSE TRUE FALSE FALSE FALSE TRUE FALSE #> [121] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE #> [133] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE #> [145] FALSE FALSE FALSE FALSE FALSE TRUE FALSE FALSE FALSE which_prop_miss_var(airquality, 0.1) #> Ozone Solar.R Wind Temp Month Day #> TRUE FALSE FALSE FALSE FALSE FALSE nrow(airquality) #> [1] 153 airquality %>% filter( which_prop_miss_row(., 0.1) ) %>% nrow() #> [1] 42 vars_over_10_pct <- which(which_prop_miss_var(airquality, 0.1)) vars_under_10_pct <- which(!which_prop_miss_var(airquality, 0.1)) airquality %>% select( all_of(vars_over_10_pct) ) %>% as_tibble() #> # A tibble: 153 × 1 #> Ozone #> <int> #> 1 41 #> 2 36 #> 3 12 #> 4 18 #> 5 NA #> 6 28 #> 7 23 #> 8 19 #> 9 8 #> 10 NA #> # … with 143 more rows airquality %>% select( all_of(vars_under_10_pct) ) %>% as_tibble() #> # A tibble: 153 × 5 #> Solar.R Wind Temp Month Day #> <int> <dbl> <int> <int> <int> #> 1 190 7.4 67 5 1 #> 2 118 8 72 5 2 #> 3 149 12.6 74 5 3 #> 4 313 11.5 62 5 4 #> 5 NA 14.3 56 5 5 #> 6 NA 14.9 66 5 6 #> 7 299 8.6 65 5 7 #> 8 99 13.8 59 5 8 #> 9 19 20.1 61 5 9 #> 10 194 8.6 69 5 10 #> # … with 143 more rows
Created on 2023-04-06 with reprex v2.0.2
There could also be equivalent which_n_miss_row/var functions
which_n_miss_row/var
As discussed with @dicook , these helpers could be used to identify which rows and variables need imputation/dropping/exploring.
Perhaps something like this, with an example workflow
Created on 2023-04-06 with reprex v2.0.2
Session info
``` r sessioninfo::session_info() #> ─ Session info ─────────────────────────────────────────────────────────────── #> setting value #> version R version 4.2.3 (2023-03-15) #> os macOS Ventura 13.2 #> system aarch64, darwin20 #> ui X11 #> language (EN) #> collate en_US.UTF-8 #> ctype en_US.UTF-8 #> tz Australia/Hobart #> date 2023-04-06 #> pandoc 2.19.2 @ /Applications/RStudio.app/Contents/Resources/app/quarto/bin/tools/ (via rmarkdown) #> #> ─ Packages ─────────────────────────────────────────────────────────────────── #> package * version date (UTC) lib source #> assertthat 0.2.1 2019-03-21 [1] CRAN (R 4.2.0) #> backports 1.4.1 2021-12-13 [1] CRAN (R 4.2.0) #> broom 1.0.3 2023-01-25 [1] CRAN (R 4.2.0) #> cellranger 1.1.0 2016-07-27 [1] CRAN (R 4.2.0) #> cli 3.6.0 2023-01-09 [1] CRAN (R 4.2.0) #> colorspace 2.1-0 2023-01-23 [1] CRAN (R 4.2.0) #> crayon 1.5.2 2022-09-29 [1] CRAN (R 4.2.0) #> DBI 1.1.3 2022-06-18 [1] CRAN (R 4.2.0) #> dbplyr 2.3.0 2023-01-16 [1] CRAN (R 4.2.0) #> digest 0.6.31 2022-12-11 [1] CRAN (R 4.2.0) #> dplyr * 1.1.0 2023-01-29 [1] CRAN (R 4.2.1) #> ellipsis 0.3.2 2021-04-29 [1] CRAN (R 4.2.0) #> evaluate 0.20 2023-01-17 [1] CRAN (R 4.2.0) #> fansi 1.0.4 2023-01-22 [1] CRAN (R 4.2.0) #> fastmap 1.1.0 2021-01-25 [1] CRAN (R 4.2.0) #> forcats * 1.0.0 2023-01-29 [1] CRAN (R 4.2.0) #> fs 1.6.1 2023-02-06 [1] CRAN (R 4.2.0) #> gargle 1.3.0 2023-01-30 [1] CRAN (R 4.2.0) #> generics 0.1.3 2022-07-05 [1] CRAN (R 4.2.0) #> ggplot2 * 3.4.1 2023-02-10 [1] CRAN (R 4.2.0) #> glue 1.6.2 2022-02-24 [1] CRAN (R 4.2.0) #> googledrive 2.0.0 2021-07-08 [1] CRAN (R 4.2.0) #> googlesheets4 1.0.1 2022-08-13 [1] CRAN (R 4.2.0) #> gtable 0.3.1 2022-09-01 [1] CRAN (R 4.2.0) #> haven 2.5.1 2022-08-22 [1] CRAN (R 4.2.0) #> hms 1.1.2 2022-08-19 [1] CRAN (R 4.2.0) #> htmltools 0.5.4 2022-12-07 [1] CRAN (R 4.2.0) #> httr 1.4.4 2022-08-17 [1] CRAN (R 4.2.0) #> jsonlite 1.8.4 2022-12-06 [1] CRAN (R 4.2.0) #> knitr 1.42 2023-01-25 [1] CRAN (R 4.2.0) #> lifecycle 1.0.3 2022-10-07 [1] CRAN (R 4.2.0) #> lubridate 1.9.1 2023-01-24 [1] CRAN (R 4.2.0) #> magrittr 2.0.3 2022-03-30 [1] CRAN (R 4.2.0) #> modelr 0.1.10 2022-11-11 [1] CRAN (R 4.2.0) #> munsell 0.5.0 2018-06-12 [1] CRAN (R 4.2.0) #> naniar 1.0.0 2023-02-02 [1] CRAN (R 4.2.0) #> pillar 1.8.1 2022-08-19 [1] CRAN (R 4.2.0) #> pkgconfig 2.0.3 2019-09-22 [1] CRAN (R 4.2.0) #> purrr * 1.0.1 2023-01-10 [1] CRAN (R 4.2.0) #> R.cache 0.16.0 2022-07-21 [1] CRAN (R 4.2.0) #> R.methodsS3 1.8.2 2022-06-13 [1] CRAN (R 4.2.0) #> R.oo 1.25.0 2022-06-12 [1] CRAN (R 4.2.0) #> R.utils 2.12.2 2022-11-11 [1] CRAN (R 4.2.0) #> R6 2.5.1 2021-08-19 [1] CRAN (R 4.2.0) #> readr * 2.1.3 2022-10-01 [1] CRAN (R 4.2.0) #> readxl 1.4.1 2022-08-17 [1] CRAN (R 4.2.0) #> reprex 2.0.2 2022-08-17 [1] CRAN (R 4.2.0) #> rlang 1.0.6 2022-09-24 [1] CRAN (R 4.2.0) #> rmarkdown 2.20 2023-01-19 [1] CRAN (R 4.2.0) #> rstudioapi 0.14 2022-08-22 [1] CRAN (R 4.2.0) #> rvest 1.0.3 2022-08-19 [1] CRAN (R 4.2.0) #> scales 1.2.1 2022-08-20 [1] CRAN (R 4.2.0) #> sessioninfo 1.2.2 2021-12-06 [1] CRAN (R 4.2.0) #> stringi 1.7.12 2023-01-11 [1] CRAN (R 4.2.0) #> stringr * 1.5.0 2022-12-02 [1] CRAN (R 4.2.0) #> styler 1.9.0 2023-01-15 [1] CRAN (R 4.2.0) #> tibble * 3.1.8 2022-07-22 [1] CRAN (R 4.2.0) #> tidyr * 1.3.0 2023-01-24 [1] CRAN (R 4.2.0) #> tidyselect 1.2.0 2022-10-10 [1] CRAN (R 4.2.0) #> tidyverse * 1.3.2 2022-07-18 [1] CRAN (R 4.2.0) #> timechange 0.2.0 2023-01-11 [1] CRAN (R 4.2.0) #> tzdb 0.3.0 2022-03-28 [1] CRAN (R 4.2.0) #> utf8 1.2.3 2023-01-31 [1] CRAN (R 4.2.0) #> vctrs 0.5.2 2023-01-23 [1] CRAN (R 4.2.0) #> visdat 0.6.0 2023-02-02 [1] local #> withr 2.5.0 2022-03-03 [1] CRAN (R 4.2.0) #> xfun 0.37 2023-01-31 [1] CRAN (R 4.2.0) #> xml2 1.3.3 2021-11-30 [1] CRAN (R 4.2.0) #> yaml 2.3.7 2023-01-23 [1] CRAN (R 4.2.0) #> #> [1] /Library/Frameworks/R.framework/Versions/4.2-arm64/Resources/library #> #> ────────────────────────────────────────────────────────────────────────────── ```There could also be equivalent
which_n_miss_row/var
functions