hadley / grouperise

Explore the idea of "grouperised" functions
Other
13 stars 3 forks source link

Rle update #1

Open DavisVaughan opened 4 years ago

DavisVaughan commented 4 years ago

Note that this merges directly into the rle branch, not master

Updated to use vctrs::vec_group_rle()

This also gets the n2 > 1, n2 > 2, and n2 > 3 cases working again, even though I think those were just experiments.

Really nice benefits with vec_group_rle() and a sorted input.

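I'm still seeing the strange problem where a sorted input makes group_sum1() slower (median of roughly 275ms unsorted vs. 354ms sorted in the benchmarks below), and I don't yet know why.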

library(grouperise)
library(dplyr, warn.conflicts = FALSE)
library(data.table, warn.conflicts = FALSE)
library(vctrs)

# `:::` reaches into grouperise's namespace; presumably group_sum_rle1() is
# not exported on this branch, so alias it locally for the benchmarks below.
group_sum_rle1 <- grouperise:::group_sum_rle1

# Benchmark data: 1e7 uniform random values, with group ids cycling through
# 1..100 so every group gets the same number of (interleaved) elements.
x <- runif(1e7)
# `length.out` is spelled out in full (no partial argument matching).
g <- rep(1:100, length.out = length(x))

# Unsorted (interleaved) group ids: compare group_sum1() with the rle-based
# group_sum_rle1(), using base R's tapply() as a reference point.
bench::mark(
  group_sum1(x, g),
  group_sum_rle1(x, g),
  tapply(x, g, sum),
  # Skip bench::mark()'s check that every expression returns an equal result.
  check = FALSE,
  iterations = 20
)
#> Warning: Some expressions had a GC in every iteration; so filtering is
#> disabled.
#> # A tibble: 3 x 6
#>   expression                min   median `itr/sec` mem_alloc `gc/sec`
#>   <bch:expr>           <bch:tm> <bch:tm>     <dbl> <bch:byt>    <dbl>
#> 1 group_sum1(x, g)        252ms    275ms      3.63     217MB     6.72
#> 2 group_sum_rle1(x, g)    215ms    246ms      3.99     242MB     5.79
#> 3 tapply(x, g, sum)       371ms    418ms      2.39     395MB     4.18

# Sorted copy of the group ids, so each group forms one contiguous run.
g_sort <- sort(g)
# Pre-built group; 100 is presumably the group count -- TODO confirm.
g_sort_group <- new_group(g_sort, 100)

# Run-length encode the sorted ids, then repackage the pieces (the "group" and
# "length" fields plus the "n" attribute) with grouperise's new_group_rle().
rle_sort <- vec_group_rle(g_sort)
rle_sort_group <- grouperise:::new_group_rle(
  field(rle_sort, "group"),
  field(rle_sort, "length"),
  attr(rle_sort, "n")
)

# Sorted group ids: each function is timed on the raw sorted ids and on the
# group object built above for it; tapply() is again the base R reference.
bench::mark(
  group_sum1(x, g_sort),
  group_sum_rle1(x, g_sort),
  group_sum1(x, g_sort_group),
  group_sum_rle1(x, rle_sort_group),
  tapply(x, g_sort, sum),
  # Skip bench::mark()'s check that every expression returns an equal result.
  check = FALSE,
  iterations = 20
)
#> # A tibble: 5 x 6
#>   expression                            min  median `itr/sec` mem_alloc
#>   <bch:expr>                        <bch:t> <bch:t>     <dbl> <bch:byt>
#> 1 group_sum1(x, g_sort)             344.7ms 354.3ms      2.79   216.6MB
#> 2 group_sum_rle1(x, g_sort)         169.6ms 171.6ms      5.79   242.4MB
#> 3 group_sum1(x, g_sort_group)        21.8ms  22.5ms     44.4     38.1MB
#> 4 group_sum_rle1(x, rle_sort_group)  20.4ms  21.1ms     46.8       848B
#> 5 tapply(x, g_sort, sum)            388.9ms 388.9ms      2.57     395MB
#> # … with 1 more variable: `gc/sec` <dbl>

# The benefit on sorted input shows up in the grouping step itself:
# time vec_group_rle() against vec_group() directly.
bench::mark(vec_group_rle(g_sort), vec_group(g_sort), check = FALSE)
#> Warning: Some expressions had a GC in every iteration; so filtering is
#> disabled.
#> # A tibble: 2 x 6
#>   expression                 min   median `itr/sec` mem_alloc `gc/sec`
#>   <bch:expr>            <bch:tm> <bch:tm>     <dbl> <bch:byt>    <dbl>
#> 1 vec_group_rle(g_sort)    163ms    166ms      5.50     242MB     5.50
#> 2 vec_group(g_sort)        327ms    338ms      2.95     178MB     1.48

Created on 2019-10-25 by the reprex package (v0.3.0.9000)