njtierney / naniar

Tidy data structures, summaries, and visualisations for missing data
http://naniar.njtierney.com/
Other
651 stars 53 forks source link

group_by() with miss_var_summary() is slow #256

Closed njtierney closed 4 years ago

njtierney commented 4 years ago

Using summarise_all instead of the dispatch to tidyr::nest() approach is about 10x faster on grouped data (and slightly faster on ungrouped data):

library(dplyr)
#> 
#> Attaching package: 'dplyr'
#> The following objects are masked from 'package:stats':
#> 
#>     filter, lag
#> The following objects are masked from 'package:base':
#> 
#>     intersect, setdiff, setequal, union
library(naniar)
library(tidyr)

# new approach
#' Summarise the missingness of each variable (S3 generic)
#'
#' Dispatches on the class of `data` to a `miss_var_summary2.*` method.
#'
#' @param data Object to summarise; the methods defined alongside this
#'   generic handle data frames and grouped data frames.
#' @return Whatever the selected method returns.
miss_var_summary2 <- function(data) UseMethod("miss_var_summary2")

#' Summarise the missingness of each variable in an ungrouped data frame
#'
#' @param data A data frame.
#' @return A data frame with one row per column of `data`: `variable` (the
#'   column name), `n_miss` (count returned by `n_miss()` for that column) and
#'   `pct_miss` (`n_miss` as a percentage of `nrow(data)`), sorted by
#'   decreasing `n_miss`.
miss_var_summary2.default <- function(data) {
  total_rows <- nrow(data)

  # One-row summary: each column replaced by its number of missing values.
  missing_counts <- dplyr::summarise_all(data, naniar::n_miss)

  # Reshape to one row per original column.
  long_counts <- tidyr::pivot_longer(
    missing_counts,
    cols = dplyr::everything(),
    names_to = "variable",
    values_to = "n_miss"
  )

  with_pct <- dplyr::mutate(long_counts, pct_miss = n_miss / total_rows * 100)
  dplyr::arrange(with_pct, dplyr::desc(n_miss))
}

#' Summarise the missingness of each variable within each group
#'
#' @param data A grouped data frame (as produced by `dplyr::group_by()`).
#' @return A data frame with one row per group and non-grouping column: the
#'   grouping columns, `variable` (the column name), `n_miss` (count returned
#'   by `n_miss()` for that column within the group) and `pct_miss` (`n_miss`
#'   as a percentage of the number of rows in that group), sorted by
#'   decreasing `n_miss`.
miss_var_summary2.grouped_df <- function(data){
  group_names <- dplyr::group_vars(data)

  # Rows per group. pct_miss must be relative to the size of the group the
  # count came from; nrow(data) is the row count of the whole data frame and
  # would understate every group's percentage.
  group_rows <- dplyr::ungroup(dplyr::tally(data, name = "n_rows_in_group"))

  data %>%
    summarise_all(~ n_miss(.x)) %>%
    # Keep the grouping columns as identifiers; pivot everything else.
    pivot_longer(cols = - dplyr::group_vars(data),
                 names_to = "variable",
                 values_to = "n_miss"
    ) %>%
    dplyr::left_join(group_rows, by = group_names) %>%
    dplyr::mutate(pct_miss = (n_miss / n_rows_in_group * 100)) %>%
    # The group size was only needed as the denominator.
    dplyr::select(-n_rows_in_group) %>%
    dplyr::arrange(-n_miss)
}

# Time the existing miss_var_summary() against miss_var_summary2() on the
# ungrouped `who` data. No `check` argument is given, so bench::mark()'s
# default result comparison applies.
bm1 <- bench::mark(
  original = miss_var_summary(who),
  new = miss_var_summary2(who)
)

# relative = TRUE reports each column relative to the smallest value in it.
summary(bm1, relative = TRUE)
#> # A tibble: 2 x 6
#>   expression   min median `itr/sec` mem_alloc `gc/sec`
#>   <bch:expr> <dbl>  <dbl>     <dbl>     <dbl>    <dbl>
#> 1 original    1.37   1.26      1         1.19     3.13
#> 2 new         1      1         1.28      1        1

# Group by year so the grouped_df methods are the ones being timed.
who_group <- who %>% group_by(year)

# check = FALSE turns off bench::mark()'s comparison of the two results.
bm_group <- bench::mark(
  original = miss_var_summary(who_group),
  new = miss_var_summary2(who_group),
  check = FALSE
)
#> Warning: Some expressions had a GC in every iteration; so filtering is disabled.

# relative = TRUE reports each column relative to the smallest value in it.
summary(bm_group, relative = TRUE)
#> Warning: Some expressions had a GC in every iteration; so filtering is disabled.
#> # A tibble: 2 x 6
#>   expression   min median `itr/sec` mem_alloc `gc/sec`
#>   <bch:expr> <dbl>  <dbl>     <dbl>     <dbl>    <dbl>
#> 1 original    10.7   10.4       1        1.67     1   
#> 2 new          1      1        10.2      1        1.27

Created on 2020-05-13 by the reprex package (v0.3.0)

The current dispatch to tidyr::nest() was really nice, since it meant that the grouped_df methods were really simplified.

The main challenge here is that I need to work out a way to program over pivot_longer, so that the grouping variables are ignored in the pivot.

njtierney commented 4 years ago

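After the changes in #258, it seems that this is no longer needed. The existing miss_var_summary() is now faster than the summarise_all approach on both ungrouped and grouped data: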

library(dplyr)
#> 
#> Attaching package: 'dplyr'
#> The following objects are masked from 'package:stats':
#> 
#>     filter, lag
#> The following objects are masked from 'package:base':
#> 
#>     intersect, setdiff, setequal, union
library(naniar)
library(tidyr)

# new approach
#' Summarise the missingness of each variable (S3 generic)
#'
#' Dispatches on the class of `data` to a `miss_var_summary2.*` method.
#'
#' @param data Object to summarise; the methods defined alongside this
#'   generic handle data frames and grouped data frames.
#' @return Whatever the selected method returns.
miss_var_summary2 <- function(data) UseMethod("miss_var_summary2")

#' Summarise the missingness of each variable in an ungrouped data frame
#'
#' @param data A data frame.
#' @return A data frame with one row per column of `data`: `variable` (the
#'   column name), `n_miss` (count returned by `n_miss()` for that column) and
#'   `pct_miss` (`n_miss` as a percentage of `nrow(data)`), sorted by
#'   decreasing `n_miss`.
miss_var_summary2.default <- function(data) {
  total_rows <- nrow(data)

  # One-row summary: each column replaced by its number of missing values.
  missing_counts <- dplyr::summarise_all(data, naniar::n_miss)

  # Reshape to one row per original column.
  long_counts <- tidyr::pivot_longer(
    missing_counts,
    cols = dplyr::everything(),
    names_to = "variable",
    values_to = "n_miss"
  )

  with_pct <- dplyr::mutate(long_counts, pct_miss = n_miss / total_rows * 100)
  dplyr::arrange(with_pct, dplyr::desc(n_miss))
}

#' Summarise the missingness of each variable within each group
#'
#' @param data A grouped data frame (as produced by `dplyr::group_by()`).
#' @return A data frame with one row per group and non-grouping column: the
#'   grouping columns, `variable` (the column name), `n_miss` (count returned
#'   by `n_miss()` for that column within the group) and `pct_miss` (`n_miss`
#'   as a percentage of the number of rows in that group), sorted by
#'   decreasing `n_miss`.
miss_var_summary2.grouped_df <- function(data){
  group_names <- dplyr::group_vars(data)

  # Rows per group. pct_miss must be relative to the size of the group the
  # count came from; nrow(data) is the row count of the whole data frame and
  # would understate every group's percentage.
  group_rows <- dplyr::ungroup(dplyr::tally(data, name = "n_rows_in_group"))

  data %>%
    summarise_all(~ n_miss(.x)) %>%
    # Keep the grouping columns as identifiers; pivot everything else.
    pivot_longer(cols = - dplyr::group_vars(data),
                 names_to = "variable",
                 values_to = "n_miss"
    ) %>%
    dplyr::left_join(group_rows, by = group_names) %>%
    dplyr::mutate(pct_miss = (n_miss / n_rows_in_group * 100)) %>%
    # The group size was only needed as the denominator.
    dplyr::select(-n_rows_in_group) %>%
    dplyr::arrange(-n_miss)
}

# Time the existing miss_var_summary() against miss_var_summary2() on the
# ungrouped `who` data. No `check` argument is given, so bench::mark()'s
# default result comparison applies.
bm1 <- bench::mark(
  original = miss_var_summary(who),
  new = miss_var_summary2(who)
)

# relative = TRUE reports each column relative to the smallest value in it.
summary(bm1, relative = TRUE)
#> # A tibble: 2 x 6
#>   expression   min median `itr/sec` mem_alloc `gc/sec`
#>   <bch:expr> <dbl>  <dbl>     <dbl>     <dbl>    <dbl>
#> 1 original    1      1         3.53      2.82     1.70
#> 2 new         4.10   3.75      1         1        1

# Group by year so the grouped_df methods are the ones being timed.
who_group <- who %>% group_by(year)

# check = FALSE turns off bench::mark()'s comparison of the two results.
bm_group <- bench::mark(
  original = miss_var_summary(who_group),
  new = miss_var_summary2(who_group),
  check = FALSE
)
#> Warning: Some expressions had a GC in every iteration; so filtering is disabled.

# Absolute timings and memory allocation.
summary(bm_group)
#> Warning: Some expressions had a GC in every iteration; so filtering is disabled.
#> # A tibble: 2 x 6
#>   expression      min   median `itr/sec` mem_alloc `gc/sec`
#>   <bch:expr> <bch:tm> <bch:tm>     <dbl> <bch:byt>    <dbl>
#> 1 original     79.5ms   81.1ms     12.2    10.88MB     15.7
#> 2 new         152.3ms  153.4ms      6.41    3.87MB     25.6
plot(bm_group)

# relative = TRUE reports each column relative to the smallest value in it.
summary(bm_group, relative = TRUE)
#> Warning: Some expressions had a GC in every iteration; so filtering is disabled.
#> # A tibble: 2 x 6
#>   expression   min median `itr/sec` mem_alloc `gc/sec`
#>   <bch:expr> <dbl>  <dbl>     <dbl>     <dbl>    <dbl>
#> 1 original    1      1         1.91      2.81     1   
#> 2 new         1.92   1.89      1         1        1.63

Created on 2020-05-19 by the reprex package (v0.3.0)