Open Bisaloo opened 1 year ago
Great idea! I find myself fixing this issue in code quite often.
Small aside: for efficiency benchmarks, I would blow up the sample size substantially. At microsecond scale, it can be hard to discern true differences vs. overheads.
Here's a comparison at length(l) = 1e7
:
microbenchmark::microbenchmark(
l == "a" | l == "e" | l == "i" | l == "o" | l == "u" | l == "y",
l %in% c("a", "e", "i", "o", "u", "y"),
times = 30L
)
# Unit: milliseconds
# expr min lq mean median uq max neval
# l == "a" | l == "e" | l == "i" | l == "o" | l == "u" | l == "y" 653.6565 724.2831 832.0146 771.3028 955.4526 1221.4486 30
# l %in% c("a", "e", "i", "o", "u", "y") 347.1431 362.0270 431.4648 403.1225 486.3908 650.0158 30
microbenchmark::microbenchmark(
l == "a" | l == "e",
l %in% c("a", "e"),
times = 30L
)
# Unit: milliseconds
# expr min lq mean median uq max neval
# l == "a" | l == "e" 170.2372 186.2501 209.7021 201.9745 209.7413 319.8492 30
# l %in% c("a", "e") 305.9194 341.7036 379.8110 365.9685 405.7257 470.1919 30
Note that efficiency is not actually better for random input when there's only two or three comparisons (somewhat surprisingly... my guess is that %in%
is not actually optimized because it's a wrapper to match(x, tbl, 0L) > 0L
)
Here is another context which will fall under the purview of this linter (with an example inspired by R Inferno):
x <- 1:6
x == c(1, 3) # wrong answer, and no warning!
#> [1] TRUE FALSE FALSE FALSE FALSE FALSE
x %in% c(1, 3)
#> [1] TRUE FALSE TRUE FALSE FALSE FALSE
x <- 1:5
x == c(1, 3) # wrong answer
#> Warning in x == c(1, 3): longer object length is not a multiple of shorter
#> object length
#> [1] TRUE FALSE FALSE FALSE FALSE
x %in% c(1, 3)
#> [1] TRUE FALSE TRUE FALSE FALSE
Created on 2023-01-08 with reprex v2.0.2
Note that == c(...)
is not equivalent to %in% c(...)
, so we should be careful to only lint code where we have an equivalent but better way.
e.g. all(x == c(1, 42))
could be identical(x, c(1, 42))
unless x
has attributes.
Some more data on the performance aspect suggests that performance is better for OR up to 4 comparisons
size <- 1e7
seed <- 1304L
max_cmp <- 5L
l <- withr::with_seed(seed, sample(letters, size = size, replace = TRUE))
for (n_compare in seq_len(max_cmp - 1L) + 1L) {
cmps <- head(letters, n_compare)
code_or <- parse(text = paste0("l == '", cmps, "'", collapse = " | "))
code_in <- parse(text = paste0("l %in% ", deparse(cmps)))
cat("## ", n_compare, " comparisons:\n", sep = "")
print(microbenchmark::microbenchmark(
eval(code_or),
eval(code_in),
times = 30L
))
}
Output on my machine:
## 2 comparisons:
Unit: milliseconds
expr min lq mean median uq max neval
eval(code_or) 82.4855 86.00741 100.1685 87.53687 91.87537 213.0834 30
eval(code_in) 188.8862 189.71899 206.4853 197.26287 205.77282 346.6441 30
## 3 comparisons:
Unit: milliseconds
expr min lq mean median uq max neval
eval(code_or) 137.3645 141.0620 158.0298 146.7602 152.1800 272.5264 30
eval(code_in) 162.9575 167.5618 179.9364 172.1156 175.7268 300.1927 30
## 4 comparisons:
Unit: milliseconds
expr min lq mean median uq max neval
eval(code_or) 195.8201 202.1508 234.6512 209.3182 222.9131 344.1707 30
eval(code_in) 219.4884 228.0814 244.5046 231.6262 237.4547 356.3105 30
## 5 comparisons:
Unit: milliseconds
expr min lq mean median uq max neval
eval(code_or) 254.9612 260.1862 301.2312 264.9692 377.2042 391.6648 30
eval(code_in) 156.6183 160.8161 168.9001 162.6207 167.6647 302.5179 30
The same seems to be true also for their memory footprints, but only for 2 comparisons:
size <- 1e7
seed <- 1304L
max_cmp <- 5L
l <- withr::with_seed(seed, sample(letters, size = size, replace = TRUE))
for (n_compare in seq_len(max_cmp - 1L) + 1L) {
cmps <- head(letters, n_compare)
code_or <- parse(text = paste0("l == '", cmps, "'", collapse = " | "))
code_in <- parse(text = paste0("l %in% ", deparse(cmps)))
cat("## ", n_compare, " comparisons: -------------------\n", sep = "")
print(bench::mark(
"or" = eval(code_or),
"in" = eval(code_in),
min_iterations = 30L,
filter_gc = FALSE,
)[1:5])
cat("\n")
}
#> ## 2 comparisons: -------------------
#> # A tibble: 2 × 5
#> expression min median `itr/sec` mem_alloc
#> <bch:expr> <bch:tm> <bch:tm> <dbl> <bch:byt>
#> 1 or 81.5ms 84.1ms 11.6 114MB
#> 2 in 156ms 162.8ms 6.01 153MB
#>
#> ## 3 comparisons: -------------------
#> # A tibble: 2 × 5
#> expression min median `itr/sec` mem_alloc
#> <bch:expr> <bch:tm> <bch:tm> <dbl> <bch:byt>
#> 1 or 137ms 139ms 7.09 191MB
#> 2 in 143ms 146ms 6.72 153MB
#>
#> ## 4 comparisons: -------------------
#> # A tibble: 2 × 5
#> expression min median `itr/sec` mem_alloc
#> <bch:expr> <bch:tm> <bch:tm> <dbl> <bch:byt>
#> 1 or 194ms 201ms 4.66 267MB
#> 2 in 167ms 174ms 5.59 153MB
#>
#> ## 5 comparisons: -------------------
#> # A tibble: 2 × 5
#> expression min median `itr/sec` mem_alloc
#> <bch:expr> <bch:tm> <bch:tm> <dbl> <bch:byt>
#> 1 or 251ms 259ms 3.77 343MB
#> 2 in 148ms 156ms 6.20 153MB
Created on 2024-06-25 with reprex v2.1.0
Just to have a better view of this:
library(ggplot2)
size <- 1e7
seed <- 1304
max_cmp <- 10
l <- withr::with_seed(seed, sample(letters, size = size, replace = TRUE))
res <- bench::press(
n_compare = 1:10,
{
cmps <- head(letters, n_compare)
code_or <- parse(text = paste0("l == '", cmps, "'", collapse = " | "))
code_in <- parse(text = paste0("l %in% ", deparse(cmps)))
bench::mark(
"or" = eval(code_or),
"in" = eval(code_in),
min_iterations = 20,
filter_gc = FALSE,
)
}
)
#> Running with:
#> n_compare
#> 1 1
#> 2 2
#> 3 3
#> 4 4
#> 5 5
#> 6 6
#> 7 7
#> 8 8
#> 9 9
#> 10 10
ggplot(res, aes(n_compare, median, color = as.character(expression))) +
geom_line() +
geom_point() +
labs(y = "Seconds", color = "")
ggplot(res, aes(n_compare, mem_alloc/1e6, color = as.character(expression))) +
geom_line() +
geom_point() +
labs(y = "Memory (MB)", color = "")
I sometimes see code like the following:
It would be good to recommend using
%in%
in this situation.%in%
is more readable and more efficient.Created on 2022-12-23 with reprex v2.0.2.9000