njtierney / naniar

Tidy data structures, summaries, and visualisations for missing data
http://naniar.njtierney.com/
Other
649 stars 54 forks source link

Helpers to identify which rows and variables have over a certain % or number of missingns #317

Open njtierney opened 1 year ago

njtierney commented 1 year ago

As discussed with @dicook , these helpers could be used to identify which rows and variables need imputation/dropping/exploring.

Perhaps something like this, with an example workflow

library(tidyverse)
# which row ID has > X% missing
# which variables have > x% missing

which_prop_miss_row <- function(data, prop){
  naniar::prop_miss_row(data) > prop
}

prop_miss_cols <- function(data){
  colMeans(is.na(data))
}

which_prop_miss_var <- function(data, prop){
  prop_miss_cols(data) > prop
}

which_prop_miss_row(airquality, 0.1)
#>   [1] FALSE FALSE FALSE FALSE  TRUE  TRUE FALSE FALSE FALSE  TRUE  TRUE FALSE
#>  [13] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
#>  [25]  TRUE  TRUE  TRUE FALSE FALSE FALSE FALSE  TRUE  TRUE  TRUE  TRUE  TRUE
#>  [37]  TRUE FALSE  TRUE FALSE FALSE  TRUE  TRUE FALSE  TRUE  TRUE FALSE FALSE
#>  [49] FALSE FALSE FALSE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
#>  [61]  TRUE FALSE FALSE FALSE  TRUE FALSE FALSE FALSE FALSE FALSE FALSE  TRUE
#>  [73] FALSE FALSE  TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE  TRUE  TRUE
#>  [85] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE  TRUE
#>  [97]  TRUE  TRUE FALSE FALSE FALSE  TRUE  TRUE FALSE FALSE FALSE  TRUE FALSE
#> [109] FALSE FALSE FALSE FALSE FALSE FALSE  TRUE FALSE FALSE FALSE  TRUE FALSE
#> [121] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
#> [133] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
#> [145] FALSE FALSE FALSE FALSE FALSE  TRUE FALSE FALSE FALSE
which_prop_miss_var(airquality, 0.1)
#>   Ozone Solar.R    Wind    Temp   Month     Day 
#>    TRUE   FALSE   FALSE   FALSE   FALSE   FALSE

nrow(airquality)
#> [1] 153

airquality %>% 
  filter(
    which_prop_miss_row(., 0.1)
  ) %>% 
  nrow()
#> [1] 42

vars_over_10_pct <- which(which_prop_miss_var(airquality, 0.1))
vars_under_10_pct <- which(!which_prop_miss_var(airquality, 0.1))
airquality %>% 
  select(
    all_of(vars_over_10_pct)
  ) %>% 
  as_tibble()
#> # A tibble: 153 × 1
#>    Ozone
#>    <int>
#>  1    41
#>  2    36
#>  3    12
#>  4    18
#>  5    NA
#>  6    28
#>  7    23
#>  8    19
#>  9     8
#> 10    NA
#> # … with 143 more rows

airquality %>% 
  select(
    all_of(vars_under_10_pct)
  ) %>% 
  as_tibble()
#> # A tibble: 153 × 5
#>    Solar.R  Wind  Temp Month   Day
#>      <int> <dbl> <int> <int> <int>
#>  1     190   7.4    67     5     1
#>  2     118   8      72     5     2
#>  3     149  12.6    74     5     3
#>  4     313  11.5    62     5     4
#>  5      NA  14.3    56     5     5
#>  6      NA  14.9    66     5     6
#>  7     299   8.6    65     5     7
#>  8      99  13.8    59     5     8
#>  9      19  20.1    61     5     9
#> 10     194   8.6    69     5    10
#> # … with 143 more rows

Created on 2023-04-06 with reprex v2.0.2

Session info ``` r sessioninfo::session_info() #> ─ Session info ─────────────────────────────────────────────────────────────── #> setting value #> version R version 4.2.3 (2023-03-15) #> os macOS Ventura 13.2 #> system aarch64, darwin20 #> ui X11 #> language (EN) #> collate en_US.UTF-8 #> ctype en_US.UTF-8 #> tz Australia/Hobart #> date 2023-04-06 #> pandoc 2.19.2 @ /Applications/RStudio.app/Contents/Resources/app/quarto/bin/tools/ (via rmarkdown) #> #> ─ Packages ─────────────────────────────────────────────────────────────────── #> package * version date (UTC) lib source #> assertthat 0.2.1 2019-03-21 [1] CRAN (R 4.2.0) #> backports 1.4.1 2021-12-13 [1] CRAN (R 4.2.0) #> broom 1.0.3 2023-01-25 [1] CRAN (R 4.2.0) #> cellranger 1.1.0 2016-07-27 [1] CRAN (R 4.2.0) #> cli 3.6.0 2023-01-09 [1] CRAN (R 4.2.0) #> colorspace 2.1-0 2023-01-23 [1] CRAN (R 4.2.0) #> crayon 1.5.2 2022-09-29 [1] CRAN (R 4.2.0) #> DBI 1.1.3 2022-06-18 [1] CRAN (R 4.2.0) #> dbplyr 2.3.0 2023-01-16 [1] CRAN (R 4.2.0) #> digest 0.6.31 2022-12-11 [1] CRAN (R 4.2.0) #> dplyr * 1.1.0 2023-01-29 [1] CRAN (R 4.2.1) #> ellipsis 0.3.2 2021-04-29 [1] CRAN (R 4.2.0) #> evaluate 0.20 2023-01-17 [1] CRAN (R 4.2.0) #> fansi 1.0.4 2023-01-22 [1] CRAN (R 4.2.0) #> fastmap 1.1.0 2021-01-25 [1] CRAN (R 4.2.0) #> forcats * 1.0.0 2023-01-29 [1] CRAN (R 4.2.0) #> fs 1.6.1 2023-02-06 [1] CRAN (R 4.2.0) #> gargle 1.3.0 2023-01-30 [1] CRAN (R 4.2.0) #> generics 0.1.3 2022-07-05 [1] CRAN (R 4.2.0) #> ggplot2 * 3.4.1 2023-02-10 [1] CRAN (R 4.2.0) #> glue 1.6.2 2022-02-24 [1] CRAN (R 4.2.0) #> googledrive 2.0.0 2021-07-08 [1] CRAN (R 4.2.0) #> googlesheets4 1.0.1 2022-08-13 [1] CRAN (R 4.2.0) #> gtable 0.3.1 2022-09-01 [1] CRAN (R 4.2.0) #> haven 2.5.1 2022-08-22 [1] CRAN (R 4.2.0) #> hms 1.1.2 2022-08-19 [1] CRAN (R 4.2.0) #> htmltools 0.5.4 2022-12-07 [1] CRAN (R 4.2.0) #> httr 1.4.4 2022-08-17 [1] CRAN (R 4.2.0) #> jsonlite 1.8.4 2022-12-06 [1] CRAN (R 4.2.0) #> knitr 1.42 2023-01-25 [1] CRAN (R 4.2.0) #> lifecycle 1.0.3 2022-10-07 [1] CRAN (R 4.2.0) #> lubridate 1.9.1 2023-01-24 [1] CRAN (R 4.2.0) #> magrittr 2.0.3 2022-03-30 [1] CRAN (R 4.2.0) #> modelr 0.1.10 2022-11-11 [1] CRAN (R 4.2.0) #> munsell 0.5.0 2018-06-12 [1] CRAN (R 4.2.0) #> naniar 1.0.0 2023-02-02 [1] CRAN (R 4.2.0) #> pillar 1.8.1 2022-08-19 [1] CRAN (R 4.2.0) #> pkgconfig 2.0.3 2019-09-22 [1] CRAN (R 4.2.0) #> purrr * 1.0.1 2023-01-10 [1] CRAN (R 4.2.0) #> R.cache 0.16.0 2022-07-21 [1] CRAN (R 4.2.0) #> R.methodsS3 1.8.2 2022-06-13 [1] CRAN (R 4.2.0) #> R.oo 1.25.0 2022-06-12 [1] CRAN (R 4.2.0) #> R.utils 2.12.2 2022-11-11 [1] CRAN (R 4.2.0) #> R6 2.5.1 2021-08-19 [1] CRAN (R 4.2.0) #> readr * 2.1.3 2022-10-01 [1] CRAN (R 4.2.0) #> readxl 1.4.1 2022-08-17 [1] CRAN (R 4.2.0) #> reprex 2.0.2 2022-08-17 [1] CRAN (R 4.2.0) #> rlang 1.0.6 2022-09-24 [1] CRAN (R 4.2.0) #> rmarkdown 2.20 2023-01-19 [1] CRAN (R 4.2.0) #> rstudioapi 0.14 2022-08-22 [1] CRAN (R 4.2.0) #> rvest 1.0.3 2022-08-19 [1] CRAN (R 4.2.0) #> scales 1.2.1 2022-08-20 [1] CRAN (R 4.2.0) #> sessioninfo 1.2.2 2021-12-06 [1] CRAN (R 4.2.0) #> stringi 1.7.12 2023-01-11 [1] CRAN (R 4.2.0) #> stringr * 1.5.0 2022-12-02 [1] CRAN (R 4.2.0) #> styler 1.9.0 2023-01-15 [1] CRAN (R 4.2.0) #> tibble * 3.1.8 2022-07-22 [1] CRAN (R 4.2.0) #> tidyr * 1.3.0 2023-01-24 [1] CRAN (R 4.2.0) #> tidyselect 1.2.0 2022-10-10 [1] CRAN (R 4.2.0) #> tidyverse * 1.3.2 2022-07-18 [1] CRAN (R 4.2.0) #> timechange 0.2.0 2023-01-11 [1] CRAN (R 4.2.0) #> tzdb 0.3.0 2022-03-28 [1] CRAN (R 4.2.0) #> utf8 1.2.3 2023-01-31 [1] CRAN (R 4.2.0) #> vctrs 0.5.2 2023-01-23 [1] CRAN (R 4.2.0) #> visdat 0.6.0 2023-02-02 [1] local #> withr 2.5.0 2022-03-03 [1] CRAN (R 4.2.0) #> xfun 0.37 2023-01-31 [1] CRAN (R 4.2.0) #> xml2 1.3.3 2021-11-30 [1] CRAN (R 4.2.0) #> yaml 2.3.7 2023-01-23 [1] CRAN (R 4.2.0) #> #> [1] /Library/Frameworks/R.framework/Versions/4.2-arm64/Resources/library #> #> ────────────────────────────────────────────────────────────────────────────── ```

There could also be equivalent which_n_miss_row/var functions