Closed njtierney closed 4 years ago
After the changes in #258,
it seems that this is no longer needed:
library(dplyr)
#>
#> Attaching package: 'dplyr'
#> The following objects are masked from 'package:stats':
#>
#> filter, lag
#> The following objects are masked from 'package:base':
#>
#> intersect, setdiff, setequal, union
library(naniar)
library(tidyr)
# new approach
# S3 generic for the experimental missingness summary.
#
# Dispatches on the class of `data`; the `default` and `grouped_df`
# methods supply the actual implementations.
#
# data: the object to summarise (its class selects the method).
miss_var_summary2 <- function(data) UseMethod("miss_var_summary2")
# Default method: one row per column of `data`, giving the number and
# percentage of missing values in that column.
#
# data: a data frame (ungrouped).
# Returns a tibble with columns `variable`, `n_miss` and `pct_miss`,
# ordered by `n_miss`, largest first.
miss_var_summary2.default <- function(data){
  # Denominator for the percentage: every row of the input.
  total_rows <- nrow(data)

  # One-row frame holding n_miss() of each column.
  missing_counts <- summarise_all(data, ~ n_miss(.x))

  # Reshape to one row per original column.
  long_counts <- pivot_longer(
    missing_counts,
    cols = everything(),
    names_to = "variable",
    values_to = "n_miss"
  )

  with_pct <- dplyr::mutate(
    long_counts,
    pct_miss = (n_miss / total_rows * 100)
  )

  # Negating the count sorts in descending order.
  dplyr::arrange(with_pct, -n_miss)
}
# Grouped method: for every group and every non-grouping column, the
# number and percentage of missing values within that group.
#
# data: a grouped data frame (class `grouped_df`).
# Returns a tibble with the grouping columns plus `variable`, `n_miss`
# and `pct_miss`, ordered by `n_miss`, largest first.
#
# Fix: `pct_miss` previously divided each per-group count by
# `nrow(data)`, the row count of the whole data set, so the value was
# not a within-group percentage. It now divides by the number of rows
# in the group the count belongs to.
miss_var_summary2.grouped_df <- function(data){
  group_names <- dplyr::group_vars(data)

  # Rows per group: the correct denominator for a per-group percentage.
  # A dotted helper name avoids clashing with an ordinary data column.
  group_sizes <- dplyr::tally(data, name = ".n_in_group")

  data %>%
    summarise_all(~ n_miss(.x)) %>%
    # Keep the grouping columns as identifiers; pivot everything else.
    pivot_longer(cols = - dplyr::group_vars(data),
                 names_to = "variable",
                 values_to = "n_miss"
    ) %>%
    # Attach each group's size to all of its rows.
    dplyr::left_join(group_sizes, by = group_names) %>%
    dplyr::mutate(pct_miss = (n_miss / .n_in_group * 100)) %>%
    # Drop the helper column so the output columns are unchanged.
    dplyr::select(-.n_in_group) %>%
    dplyr::arrange(-n_miss)
}
# Benchmark the existing miss_var_summary() against the proposed
# miss_var_summary2() on the ungrouped `who` data
# (presumably tidyr::who, since tidyr is attached above -- confirm).
bm1 <- bench::mark(
original = miss_var_summary(who),
new = miss_var_summary2(who)
)
# relative = TRUE prints each metric as a ratio rather than in absolute
# units, as the table printed below shows.
summary(bm1, relative = TRUE)
#> # A tibble: 2 x 6
#> expression min median `itr/sec` mem_alloc `gc/sec`
#> <bch:expr> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 original 1 1 3.53 2.82 1.70
#> 2 new 4.10 3.75 1 1 1
# Same comparison, this time on `who` grouped by `year`, so the
# grouped_df method of miss_var_summary2() is the one dispatched.
who_group <- who %>% group_by(year)
# check = FALSE turns off bench::mark()'s comparison of the results of
# the benchmarked expressions.
# NOTE(review): presumably needed because the two implementations do not
# return identical output for grouped data -- confirm before relying on
# these timings as a like-for-like comparison.
bm_group <- bench::mark(
original = miss_var_summary(who_group),
new = miss_var_summary2(who_group),
check = FALSE
)
#> Warning: Some expressions had a GC in every iteration; so filtering is disabled.
# Absolute timings and memory allocation for the grouped benchmark.
summary(bm_group)
#> Warning: Some expressions had a GC in every iteration; so filtering is disabled.
#> # A tibble: 2 x 6
#> expression min median `itr/sec` mem_alloc `gc/sec`
#> <bch:expr> <bch:tm> <bch:tm> <dbl> <bch:byt> <dbl>
#> 1 original 79.5ms 81.1ms 12.2 10.88MB 15.7
#> 2 new 152.3ms 153.4ms 6.41 3.87MB 25.6
# Plot the grouped benchmark results.
plot(bm_group)
# Print the same benchmark with each metric expressed as a ratio
# instead of in absolute units.
summary(bm_group, relative = TRUE)
#> Warning: Some expressions had a GC in every iteration; so filtering is disabled.
#> # A tibble: 2 x 6
#> expression min median `itr/sec` mem_alloc `gc/sec`
#> <bch:expr> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 original 1 1 1.91 2.81 1
#> 2 new 1.92 1.89 1 1 1.63
Created on 2020-05-19 by the reprex package (v0.3.0)
Using
summarise_all
instead of the dispatch to tidyr::nest()
approach is about 10x faster:
Created on 2020-05-13 by the reprex package (v0.3.0)
The current dispatch to
tidyr::nest()
was really nice, since it meant that the grouped_df
methods were really simplified. The main challenge here is that I need to work out a way to program over
pivot_longer
, so that the group is ignored in the pivot.