group_by() with miss_var_summary() is slow

njtierney / naniar

Tidy data structures, summaries, and visualisations for missing data

Other

651 stars 53 forks source link

Using summarise_all instead of the dispatch to tidyr::nest() approach is about 10x faster:

library(dplyr)
#> 
#> Attaching package: 'dplyr'
#> The following objects are masked from 'package:stats':
#> 
#>     filter, lag
#> The following objects are masked from 'package:base':
#> 
#>     intersect, setdiff, setequal, union
library(naniar)
library(tidyr)

# new approach
# S3 generic for the prototype missingness summary: dispatches on the class
# of `data` (a `.default` and a `.grouped_df` method are defined for it).
miss_var_summary2 <- function(data) UseMethod("miss_var_summary2")

# Default method: one row per column of `data`, giving the number of missing
# values (`n_miss`) and that count as a percentage of the rows in `data`
# (`pct_miss`), ordered from most to fewest missing values.
miss_var_summary2.default <- function(data) {
  total_rows <- nrow(data)

  # One-row table: each column holds the count of missings in that column.
  miss_counts_wide <- dplyr::summarise_all(data, n_miss)

  # Reshape to one row per variable.
  miss_counts_long <- tidyr::pivot_longer(
    miss_counts_wide,
    cols = dplyr::everything(),
    names_to = "variable",
    values_to = "n_miss"
  )

  with_pct <- dplyr::mutate(
    miss_counts_long,
    pct_miss = (n_miss / total_rows * 100)
  )

  dplyr::arrange(with_pct, dplyr::desc(n_miss))
}

# Grouped method: one row per (group, variable) combination, giving the
# number of missing values in that variable within the group (`n_miss`) and
# that count as a percentage of the rows in the same group (`pct_miss`),
# ordered from most to fewest missing values.
#
# `data` is a grouped data frame (as produced by `dplyr::group_by()`).
miss_var_summary2.grouped_df <- function(data) {
  group_names <- dplyr::group_vars(data)

  # Rows per group. This is the denominator for pct_miss: dividing by
  # nrow(data) would express each group's count relative to the whole
  # data set rather than to the group it was counted in.
  group_sizes <- dplyr::summarise(data, .rows_in_group = dplyr::n())

  data %>%
    summarise_all(~ n_miss(.x)) %>%
    pivot_longer(
      # all_of() makes it explicit that group_names is a character vector
      # of column names, so the grouping columns are left out of the pivot.
      cols = -dplyr::all_of(group_names),
      names_to = "variable",
      values_to = "n_miss"
    ) %>%
    dplyr::left_join(group_sizes, by = group_names) %>%
    dplyr::mutate(pct_miss = (n_miss / .rows_in_group * 100)) %>%
    # The group size was only needed to compute pct_miss.
    dplyr::select(-.rows_in_group) %>%
    dplyr::arrange(-n_miss)
}

# Ungrouped benchmark: the existing miss_var_summary() against the
# summarise_all() + pivot_longer() prototype, on the `who` data.
bm1 <- bench::mark(
  original = miss_var_summary(who),
  new      = miss_var_summary2(who)
)

# Report timings relative to the fastest expression.
bm1 %>% summary(relative = TRUE)
#> # A tibble: 2 x 6
#>   expression   min median `itr/sec` mem_alloc `gc/sec`
#>   <bch:expr> <dbl>  <dbl>     <dbl>     <dbl>    <dbl>
#> 1 original    1.37   1.26      1         1.19     3.13
#> 2 new         1      1         1.28      1        1

# Grouped benchmark: the same comparison with `who` grouped by year.
who_group <- group_by(who, year)

bm_group <- bench::mark(
  original = miss_var_summary(who_group),
  new      = miss_var_summary2(who_group),
  # check = FALSE: the two results are not compared for equality, so this
  # benchmark only measures speed.
  check = FALSE
)
#> Warning: Some expressions had a GC in every iteration; so filtering is disabled.

# Report timings relative to the fastest expression.
bm_group %>% summary(relative = TRUE)
#> Warning: Some expressions had a GC in every iteration; so filtering is disabled.
#> # A tibble: 2 x 6
#>   expression   min median `itr/sec` mem_alloc `gc/sec`
#>   <bch:expr> <dbl>  <dbl>     <dbl>     <dbl>    <dbl>
#> 1 original    10.7   10.4       1        1.67     1   
#> 2 new          1      1        10.2      1        1.27

<sup>Created on 2020-05-13 by the reprex package (v0.3.0)</sup>

The current dispatch to tidyr::nest() was really nice, since it meant that the grouped_df methods were really simplified.

The main challenge here is that I need to work out a way to program over pivot_longer, so that the group is ignored in the pivot.

library(dplyr)
#> 
#> Attaching package: 'dplyr'
#> The following objects are masked from 'package:stats':
#> 
#>     filter, lag
#> The following objects are masked from 'package:base':
#> 
#>     intersect, setdiff, setequal, union
library(naniar)
library(tidyr)

# new approach
# S3 generic: dispatches on the class of `data`.
miss_var_summary2 <- function(data){
  UseMethod("miss_var_summary2")
}

# Default method: one row per column, with the count of missing values and
# that count as a percentage of nrow(data), most-missing first.
miss_var_summary2.default <- function(data){
  data %>% 
    summarise_all(~ n_miss(.x)) %>% 
    pivot_longer(cols = everything(),
                 names_to = "variable", 
                 values_to = "n_miss"
    ) %>% 
    dplyr::mutate(pct_miss = (n_miss / nrow(data) * 100)) %>% 
    dplyr::arrange(-n_miss)
}

# Grouped method: one row per (group, variable), with the grouping columns
# excluded from the pivot.
# NOTE(review): nrow(data) is the row count of the whole data set, so
# pct_miss here is relative to all rows rather than to the rows in each
# group. Left unchanged so the timings printed below match this code.
miss_var_summary2.grouped_df <- function(data){
  data %>% 
    summarise_all(~ n_miss(.x)) %>% 
    pivot_longer(cols = - dplyr::group_vars(data),
                 names_to = "variable", 
                 values_to = "n_miss"
    ) %>% 
    dplyr::mutate(pct_miss = (n_miss / nrow(data) * 100)) %>% 
    dplyr::arrange(-n_miss)
}

# Ungrouped benchmark.
bm1 <- bench::mark(
  original = miss_var_summary(who),
  new = miss_var_summary2(who)
)

summary(bm1, relative = TRUE)
#> # A tibble: 2 x 6
#>   expression   min median `itr/sec` mem_alloc `gc/sec`
#>   <bch:expr> <dbl>  <dbl>     <dbl>     <dbl>    <dbl>
#> 1 original    1      1         3.53      2.82     1.70
#> 2 new         4.10   3.75      1         1        1

# Grouped benchmark; check = FALSE means the two results are not compared
# for equality.
who_group <- who %>% group_by(year)

bm_group <- bench::mark(
  original = miss_var_summary(who_group),
  new = miss_var_summary2(who_group),
  check = FALSE
)
#> Warning: Some expressions had a GC in every iteration; so filtering is disabled.

summary(bm_group)
#> Warning: Some expressions had a GC in every iteration; so filtering is disabled.
#> # A tibble: 2 x 6
#>   expression      min   median `itr/sec` mem_alloc `gc/sec`
#>   <bch:expr> <bch:tm> <bch:tm>     <dbl> <bch:byt>    <dbl>
#> 1 original     79.5ms   81.1ms     12.2    10.88MB     15.7
#> 2 new         152.3ms  153.4ms      6.41    3.87MB     25.6

plot(bm_group)

# Same grouped benchmark, with timings relative to the fastest expression.
summary(bm_group, relative = TRUE)
#> Warning: Some expressions had a GC in every iteration; so filtering is disabled.
#> # A tibble: 2 x 6
#>   expression   min median `itr/sec` mem_alloc `gc/sec`
#>   <bch:expr> <dbl>  <dbl>     <dbl>     <dbl>    <dbl>
#> 1 original    1      1         1.91      2.81     1
#> 2 new         1.92   1.89      1         1        1.63

njtierney / naniar

group_by() with miss_var_summary() is slow #256