Open nathaneastwood opened 1 year ago
As a result of 3cc0a99:
r$> size <- 10000
r$> t1 <- system.time({
d <- data.frame(
g1 = sample(LETTERS, size, TRUE),
g2 = sample(LETTERS, size, TRUE),
g3 = sample(LETTERS, size, TRUE),
x1 = runif(size),
x2 = runif(size),
x3 = runif(size)
)
d %>% group_by(g1, g2, g3) %>% summarise(x1 = mean(x1), x2 = max(x2), x3 = min(x3))
})
`summarise()` has grouped output by 'g1', 'g2'. You can override using the `.groups` argument.
r$> devtools::load_all()
ℹ Loading poorman
I'd seen my father. He was a poor man, and I watched him do astonishing things.
- Sidney Poitier
r$> t2 <- system.time({
d <- data.frame(
g1 = sample(LETTERS, size, TRUE),
g2 = sample(LETTERS, size, TRUE),
g3 = sample(LETTERS, size, TRUE),
x1 = runif(size),
x2 = runif(size),
x3 = runif(size)
)
d %>% group_by(g1, g2, g3) %>% summarise(x1 = mean(x1), x2 = max(x2), x3 = min(x3))
})
`summarise()` has grouped output by 'g1', 'g2'. You can override using the `.groups` argument.
r$> t3 <- system.time({
d <- data.frame(
g1 = sample(LETTERS, size, TRUE),
g2 = sample(LETTERS, size, TRUE),
g3 = sample(LETTERS, size, TRUE),
x1 = runif(size),
x2 = runif(size),
x3 = runif(size)
)
d %>% dplyr::group_by(g1, g2, g3) %>% dplyr::summarise(x1 = mean(x1), x2 = max(x2), x3 = min(x3))
})
`summarise()` has grouped output by 'g1', 'g2'. You can override using the `.groups` argument.
r$> t1
user system elapsed
110.639 9.898 122.772
r$> t2
user system elapsed
19.394 1.020 22.994
r$> t3
user system elapsed
0.492 0.014 0.562
Improvements can still be made.
The following produces the results in ~9 seconds:
calculate_groups <- function(data, groups, drop = group_by_drop_default(data)) {
data <- ungroup(data)
unknown <- setdiff(groups, colnames(data))
if (length(unknown) > 0L) {
stop(sprintf("`groups` missing from `data`: %s.", paste0(groups, collapse = ", ")))
}
data[, ".rows"] <- seq_len(nrow(data))
form <- as.formula(paste(".rows", paste0(groups, collapse = " + "), sep = " ~ "))
res <- aggregate(form, data, function(x) x, simplify = FALSE, na.action = na.pass)
unique_groups <- unique(data[, groups, drop = FALSE])
is_factor <- do.call(c, lapply(unique_groups, function(x) is.factor(x)))
if (!isTRUE(drop) && any(is_factor)) {
na_lvls <- do.call(
expand.grid,
lapply(unique_groups, function(x) if (is.factor(x)) levels(x)[!(levels(x) %in% x)] else NA)
)
if (nrow(na_lvls) > 0L) {
rows <- res$.rows
res <- bind_rows(res, na_lvls)
res$.rows <- c(rows, rep(list(integer(0)), nrow(na_lvls)))
}
}
res <- res[do.call(order, lapply(groups, function(x) res[, x])), , drop = FALSE]
rownames(res) <- NULL
attr(res, ".drop") <- drop
res
}
So still a lot slower than dplyr
. It also suffers from the problem that aggregate()
doesn't keep NA
values so the following wouldn't work since the NA
group level would be removed.
data.frame(x = c("apple", NA, "banana"), y = 1:3, stringsAsFactors = FALSE) %>%
group_by(x) %>%
group_data()
It also suffers from the problem that aggregate() doesn't keep NA values so the following wouldn't work since the NA group level would be removed.
Related to this, in the current version NA
are a group, as in dplyr
, but their group is not taken into account in summarize()
:
set.seed(2023)
d <- data.frame(
g1 = sample(c(LETTERS[1:3], NA), 10, replace = TRUE),
g2 = sample(c(LETTERS[1:3], NA), 10, replace = TRUE),
v1 = sample(1:10)
)
d
#> g1 g2 v1
#> 1 A A 4
#> 2 <NA> A 5
#> 3 C A 8
#> 4 A A 1
#> 5 <NA> A 7
#> 6 C A 2
#> 7 B <NA> 10
#> 8 <NA> C 9
#> 9 B B 3
#> 10 A C 6
d |>
poorman::group_by(g1) |>
poorman::group_data()
#> g1 .rows
#> 1 A 1, 4, 10
#> 2 B 7, 9
#> 3 C 3, 6
#> 4 <NA> 2, 5, 8
d |>
poorman::group_by(g1) |>
poorman::summarize(mean = mean(v1))
#> g1 mean
#> 1 A 3.666667
#> 2 B 6.500000
#> 3 C 5.000000
d |>
dplyr::group_by(g1) |>
dplyr::summarize(mean = mean(v1))
#> # A tibble: 4 × 2
#> g1 mean
#> <chr> <dbl>
#> 1 A 3.67
#> 2 B 6.5
#> 3 C 5
#> 4 <NA> 7
Created on 2023-01-05 with reprex v2.0.2
Good catch. I think at some point I will need to pin poorman
's ambition to a specific version of dplyr
. It's entirely possible that at the time of developing poorman
, this wasn't a feature of dplyr
, but then it's also possible it was and I just didn't capture this.
Right, it would make a lot of sense given how much dplyr
evolves (e.g the future .by
argument)
That's the main incentive, yeah. It would require a lot of work to keep up and probably a lot of refactoring. My suggestion would therefore be dplyr 1.0.0
since I believe .by
was implemented in 1.1.0
?
Yes it's in the upcoming 1.1.0
Coming back to the slowness issue, I can gain ~30% speed by replacing:
for (i in seq_len(n_comb)) {
rows[[i]] <- which(data_groups %in% interaction(unique_groups[i, groups]))
}
by
pasted_groups <- do.call(paste, c(unique_groups[, groups, drop = FALSE], sep = "."))
pasted_groups[is.na(unique_groups)] <- NA
for (i in seq_len(n_comb)) {
rows[[i]] <- which(data_groups %in% pasted_groups[i])
}
Basically, at each iteration, interaction()
pastes all values of a given row together, which can be done outside the loop and quicker than interaction()
.
This change passes all the tests (which are not really complete given the NA issue).
Benchmark setup:
d <- data.frame(
g1 = sample(LETTERS, 4000, TRUE),
g2 = sample(LETTERS, 4000, TRUE),
g3 = sample(LETTERS, 4000, TRUE),
x1 = runif(4000),
x2 = runif(4000),
x3 = runif(4000)
)
# return a list of results so that both functions return the same output (without
# all the class problem, tibble vs data.frame)
poor = function() {
foo <- d %>%
group_by(g1, g2, g3) |>
summarise(x1 = mean(x1), x2 = max(x2), x3 = min(x3))
list(foo$x1, foo$x2, foo$x3)
}
dpl = function() {
foo <- d %>%
dplyr::group_by(g1, g2, g3) |>
dplyr::summarise(x1 = mean(x1), x2 = max(x2), x3 = min(x3))
list(foo$x1, foo$x2, foo$x3)
}
bench::mark(
poor(),
dpl()
)
Oh that's really nice. Did you want to submit a PR? I probably won't have time to implement it myself until Sunday.
I can make a PR but I think it shouldn't be implemented before the behavior of NA
in groups is fixed (or clarified). Currently there's only one test in test_group_by.R
that deals with NA
, and it only checks when grouping by one variable, so I'm not super confident this change won't break existing code.
There should probably be some tests for more complex grouping with NA
.