Open richfitz opened 7 years ago
For storr
s populated with a lot of tiny objects, key mangling accounts for about half of the overhead. The slowdown severely affects drake
workflows with large numbers of tiny targets with lots of dependencies. Here are some benchmarks for storr
.
library(microbenchmark)
library(storr)
n <- 1000
cache_mangle <- storr::storr_rds(tempfile(), mangle_key = TRUE)
cache_no_mangle <- storr::storr_rds(tempfile(), mangle_key = FALSE)
for (i in seq_len(n)){
cache_mangle$set(key = as.character(i), value = i)
cache_no_mangle$set(key = as.character(i), value = i)
}
overhead <- function(cache, n){
for (i in seq_len(n)){
cache$get(key = as.character(i))
}
}
benchmarks <- microbenchmark(
overhead(cache_mangle, n),
overhead(cache_no_mangle, n)
)
print(benchmarks)
#> Unit: milliseconds
#> expr min lq mean median
#> overhead(cache_mangle, n) 98.03836 104.64194 108.22292 107.92887
#> overhead(cache_no_mangle, n) 62.32374 65.64005 67.20944 66.88925
#> uq max neval cld
#> 111.33038 136.31826 100 b
#> 68.23469 95.06846 100 a
Created on 2018-11-03 by the reprex package (v0.2.1)
And for drake
(modified from this example, downloadable with drake::drake_example("overhead")
).
{
library(drake)
library(microbenchmark)
library(storr)
plan <- function(n) {
plan <- drake_plan(target_1 = 1)
for (i in seq_len(n - 1) + 1) {
target <- paste0("target_", i)
dependencies <- paste0("target_", seq_len(i - 1))
command <- paste0("max(", paste0(dependencies, collapse = ", "),
")")
plan <- rbind(plan, data.frame(target = target, command = command))
}
plan
}
test_plan <- plan(n = 1000)
overhead <- function(config) {
make(config = config)
clean(cache = config$cache)
}
cache_mangle <- storr::storr_rds(tempfile(), mangle_key = TRUE)
cache_no_mangle <- storr::storr_rds(tempfile(), mangle_key = FALSE)
config_mangle <- drake_config(test_plan, cache = cache_mangle,
verbose = FALSE)
config_no_mangle <- drake_config(test_plan, cache = cache_no_mangle,
verbose = FALSE)
benchmarks <- microbenchmark(overhead(config_mangle), overhead(config_no_mangle),
times = 10)
print(benchmarks)
}
#> Unit: seconds
#> expr min lq mean median
#> overhead(config_mangle) 99.51634 107.42162 124.07111 126.44001
#> overhead(config_no_mangle) 59.67658 61.17046 68.31911 66.98202
#> uq max neval cld
#> 140.46350 147.2752 10 b
#> 74.20764 80.7402 10 a
Created on 2018-11-03 by the reprex package (v0.2.1)
With a naive insertion of base64url::base64_urlencode()
, i.e.
encode64 <- function(x, char62 = "-", char63 = "_", pad = TRUE) {
return(base64url::base64_urlencode(x))
}
there is a speedup.
library(microbenchmark)
library(storr)
n <- 1000
cache_mangle <- storr::storr_rds(tempfile(), mangle_key = TRUE)
cache_no_mangle <- storr::storr_rds(tempfile(), mangle_key = FALSE)
for (i in seq_len(n)){
cache_mangle$set(key = as.character(i), value = i)
cache_no_mangle$set(key = as.character(i), value = i)
}
overhead <- function(cache, n){
for (i in seq_len(n)){
cache$get(key = as.character(i))
}
}
benchmarks <- microbenchmark(
overhead(cache_mangle, n),
overhead(cache_no_mangle, n)
)
print(benchmarks)
#> Unit: milliseconds
#> expr min lq mean median uq
#> overhead(cache_mangle, n) 76.86742 82.28243 84.44084 84.30136 86.61499
#> overhead(cache_no_mangle, n) 63.01152 66.57724 69.32338 68.83174 70.57834
#> max neval cld
#> 93.08793 100 b
#> 106.44841 100 a
Created on 2018-11-03 by the reprex package (v0.2.1)
We still retain the speed improvement even with compatibility measures
encode64 <- function(x, char62 = "-", char63 = "_", pad = TRUE) {
if (length(x) != 1L) {
return(vcapply(x, encode64, char62, char63, pad, USE.NAMES = FALSE))
}
out <- base64url::base64_urlencode(x)
if (!identical(char62, "-")) {
gsub(pattern = "-", replacement = char62, x = out, fixed = TRUE)
}
if (!identical(char63, "-")) {
gsub(pattern = "-", replacement = char62, x = out, fixed = TRUE)
}
if (pad) {
x <- as.integer(charToRaw(x))
n_bytes <- length(x)
n_blocks <- ceiling(n_bytes / 3L)
n_pad <- 3L * n_blocks - n_bytes
char_pad <- replicate(n_pad, "=")
out <- paste(c(out, char_pad), collapse = "")
}
out
}
New times:
#> Unit: milliseconds
#> expr min lq mean median uq
#> overhead(cache_mangle, n) 83.93953 90.30987 94.06254 94.10564 97.07974
#> overhead(cache_no_mangle, n) 62.71232 65.66816 68.68043 68.64221 70.61832
#> max neval cld
#> 123.58915 100 b
#> 95.26984 100 a
Created on 2018-11-03 by the reprex package (v0.2.1)
Practically char62 and char63 are always -
and _
for us, so drop the conditional and use chartr
rather than gsub
perhaps?
If we just remove the conditionals, we are about as good as base64url
. If we switching to chartr()
, overhead does not seem to change.
encode64 <- function(x, char62 = "-", char63 = "_", pad = TRUE) {
if (length(x) != 1L) {
return(vcapply(x, encode64, char62, char63, pad, USE.NAMES = FALSE))
}
out <- base64url::base64_urlencode(x)
if (pad) {
x <- as.integer(charToRaw(x))
n_bytes <- length(x)
n_blocks <- ceiling(n_bytes / 3L)
n_pad <- 3L * n_blocks - n_bytes
char_pad <- replicate(n_pad, "=")
out <- paste(c(out, char_pad), collapse = "")
}
out
}
#> Unit: milliseconds
#> expr min lq mean median uq
#> overhead(cache_mangle, n) 79.57784 83.43125 86.72935 85.56367 88.61894
#> overhead(cache_no_mangle, n) 61.83847 65.08615 67.55589 67.25665 69.41735
#> max neval cld
#> 123.61545 100 b
#> 77.66569 100 a
encode64 <- function(x, char62 = "-", char63 = "_", pad = TRUE) {
if (length(x) != 1L) {
return(vcapply(x, encode64, char62, char63, pad, USE.NAMES = FALSE))
}
out <- base64url::base64_urlencode(x)
out <- chartr(old = "-", new = char62, x = out)
out <- chartr(old = "_", new = char63, x = out)
if (pad) {
x <- as.integer(charToRaw(x))
n_bytes <- length(x)
n_blocks <- ceiling(n_bytes / 3L)
n_pad <- 3L * n_blocks - n_bytes
char_pad <- replicate(n_pad, "=")
out <- paste(c(out, char_pad), collapse = "")
}
out
}
#> Unit: milliseconds
#> expr min lq mean median uq
#> overhead(cache_mangle, n) 86.07183 91.17242 95.41442 94.65800 98.49878
#> overhead(cache_no_mangle, n) 63.39793 67.09762 69.27319 68.52905 71.13439
#> max neval cld
#> 123.23109 100 b
#> 97.93058 100 a
See #43, and the timings in this comment