tidyverse / funs

Collection of low-level functions for working with vctrs
Other
34 stars 7 forks source link

Implement lead and lag #34

Open hadley opened 4 years ago

hadley commented 4 years ago

Start from @DavisVaughan

#' Shift a vector forwards (lag) or backwards (lead)
#'
#' `lag()` pads the front of `x` with `n` missing values and drops the last
#' `n` elements, so the result has the same size as `x`.
#'
#' @param x A vector.
#' @param n A single non-negative integer: the number of positions to shift
#'   by. Values larger than the size of `x` are capped at that size.
#' @return A vector with the same size as `x`.
lag <- function(x, n = 1L) {
  vec_assert(x)
  n <- check_n(n)

  # Nothing to shift.
  if (n == 0L) {
    return(x)
  }

  size <- vec_size(x)
  # `n` and `size` are both scalars, so `min()` is sufficient and avoids the
  # overhead of the vectorised `pmin()`.
  n <- min(n, size)

  new <- vec_init(x, n)
  old <- vec_slice(x, seq_len(size - n))

  vec_c(new, old)
}

#' @export
#' @rdname lag
lead <- function(x, n = 1L) {
  # Drops the first `n` elements of `x` and pads the end with `n` missing
  # values, so the result has the same size as `x`.
  vec_assert(x)
  n <- check_n(n)

  # Nothing to shift.
  if (n == 0L) {
    return(x)
  }

  size <- vec_size(x)
  # `n` and `size` are both scalars, so `min()` is sufficient and avoids the
  # overhead of the vectorised `pmin()`.
  n <- min(n, size)

  new <- vec_init(x, n)
  # `n` is at least 1 here, so the negative index is never empty.
  old <- vec_slice(x, -seq_len(n))

  vec_c(old, new)
}

# Validate and normalise a shift size.
#
# Casts `n` to integer, requires it to be a single value, and aborts when it
# is missing or negative. Zero is accepted. Returns the validated integer.
check_n <- function(n) {
  n <- vec_cast(n, integer())
  vec_assert(n, size = 1L)

  # Check for `NA` first: `NA < 0L` is `NA`, which would make `if ()` fail
  # with an uninformative base R error.
  if (is.na(n)) {
    abort("`n` must not be missing.")
  }
  if (n < 0L) {
    abort("`n` must be positive.")
  }

  n
}
hadley commented 4 years ago

This can be implemented more efficiently (for the default case, where `default` is a missing value) by using `vec_slice(x, c(NA, idx))`, etc.

DavisVaughan commented 4 years ago

Adding a note that pmin() is much slower than min(). I don't think we need it here

# Benchmark the scalar case: `pmin()` vs `min()` on two length-1 inputs.
bench::mark(pmin(1, 2), min(1, 2))
#> # A tibble: 2 x 6
#>   expression      min   median `itr/sec` mem_alloc `gc/sec`
#>   <bch:expr> <bch:tm> <bch:tm>     <dbl> <bch:byt>    <dbl>
#> 1 pmin(1, 2)   4.71µs   5.49µs   178269.    80.8KB     107.
#> 2 min(1, 2)     232ns    341ns  2773155.        0B       0

Created on 2020-02-11 by the reprex package (v0.3.0)

hadley commented 4 years ago

And consider unifying them into a single function, as in https://github.com/tidyverse/dplyr/issues/5260

DavisVaughan commented 4 years ago

shift()

DavisVaughan commented 3 years ago

Potential full implementation using vec_shift() as the base for vec_lag() and vec_lead() (we can drop the vec_)

library(vctrs)
library(rlang)

#' Lag a vector
#'
#' Validates `n` and delegates to `vec_shift()` with a non-negative shift.
#'
#' @param x A vector.
#' @param n A single non-negative value, cast to integer: the number of
#'   positions to lag by.
#' @param default Passed on to `vec_shift()`; `NULL` by default.
#' @param order_by Passed on to `vec_shift()`; `NULL` by default.
#' @return The result of `vec_shift(x, n, default, order_by)`.
vec_lag <- function(x, n = 1L, default = NULL, order_by = NULL) {
  vec_assert(n, size = 1L, arg = "n")
  n <- vec_cast(n, integer(), x_arg = "n")

  # Check for `NA` first: `NA < 0L` is `NA`, which would make `if ()` fail
  # with an uninformative base R error.
  if (is.na(n)) {
    abort("`n` must not be missing.")
  }
  if (n < 0L) {
    abort("`n` must be positive.")
  }

  vec_shift(x, n, default, order_by)
}

#' Lead a vector
#'
#' Validates `n` and delegates to `vec_shift()` with the sign of `n` flipped,
#' because `vec_shift()` treats a negative shift as a lead.
#'
#' @param x A vector.
#' @param n A single non-negative value, cast to integer: the number of
#'   positions to lead by.
#' @param default Passed on to `vec_shift()`; `NULL` by default.
#' @param order_by Passed on to `vec_shift()`; `NULL` by default.
#' @return The result of `vec_shift(x, -n, default, order_by)`.
vec_lead <- function(x, n = 1L, default = NULL, order_by = NULL) {
  vec_assert(n, size = 1L, arg = "n")
  n <- vec_cast(n, integer(), x_arg = "n")

  # Check for `NA` first: `NA < 0L` is `NA`, which would make `if ()` fail
  # with an uninformative base R error.
  if (is.na(n)) {
    abort("`n` must not be missing.")
  }
  if (n < 0L) {
    abort("`n` must be positive.")
  }

  vec_shift(x, -n, default, order_by)
}

#' Shift a vector by a signed number of positions
#'
#' A positive `n` lags `x`, a negative `n` leads it, and zero returns `x`
#' unchanged. Shifts larger than the size of `x` are capped at that size.
#'
#' @param x A vector.
#' @param n A single value, cast to integer: the signed shift.
#' @param default `NULL`, or a size-1 value (cast to the type of `x`) used to
#'   fill the vacated positions. With `NULL`, the fill comes from slicing
#'   with `NA` indices.
#' @param order_by `NULL`, or a vector with the same size as `x`. When
#'   supplied, the shift is applied to `x` arranged by `order_by` and the
#'   result is put back in the original arrangement.
#' @return A vector with the same size as `x`.
vec_shift <- function(x, n = 1L, default = NULL, order_by = NULL) {
  size <- vec_size(x)

  # Ordered case: re-enter with `order_by = NULL` on the reordered input.
  # `n` and `default` are validated in that inner call.
  if (!is.null(order_by)) {
    out <- with_order(x, order_by, size, vec_shift, n = n, default = default)
    return(out)
  }

  vec_assert(n, size = 1L, arg = "n")
  n <- vec_cast(n, integer(), x_arg = "n")

  # A missing `n` would otherwise surface as an uninformative base R error
  # from the comparisons below.
  if (is.na(n)) {
    abort("`n` must not be missing.")
  }

  # `==` rather than `identical()` so a zero carrying attributes (e.g. names)
  # still takes the fast path.
  if (n == 0L) {
    return(x)
  }

  lag <- n > 0L
  n <- min(abs(n), size)

  if (is.null(default)) {
    vec_shift_slice(x, n, size, lag)
  } else {
    vec_shift_c(x, n, size, lag, default)
  }
}

# Shift `x` using a single slice.
#
# Builds an integer index made of `n` `NA`s plus the positions of the
# elements that survive the shift, then slices `x` once with it. The `NA`s go
# in front for a lag and at the end for a lead.
vec_shift_slice <- function(x, n, size, lag) {
  pad <- vec_rep(NA_integer_, n)

  if (lag) {
    # Keep the first `size - n` elements, pushed back by `n`.
    loc <- c(pad, seq2(1L, size - n))
  } else {
    # Keep the last `size - n` elements, pulled forward by `n`.
    loc <- c(seq2(1L + n, size), pad)
  }

  vec_slice(x, loc)
}

# Shift `x` and fill the vacated positions with an explicit `default`.
#
# `default` must have size 1 and is cast to the type of `x` before being
# repeated `n` times. The surviving elements of `x` are sliced out and
# combined with the fill: fill first for a lag, fill last for a lead.
vec_shift_c <- function(x, n, size, lag, default) {
  vec_assert(default, size = 1L, arg = "default")
  default <- vec_cast(default, x, x_arg = "default")

  fill <- vec_rep(default, n)

  if (lag) {
    kept <- vec_slice(x, seq2(1L, size - n))
    vec_c(fill, kept)
  } else {
    kept <- vec_slice(x, seq2(1L + n, size))
    vec_c(kept, fill)
  }
}

# Apply `.fn` to `.x` arranged by `.order_by`, then undo the arrangement.
#
# `.order_by` must have size `.size`. `.x` is sliced into the order given by
# `.order_by`, passed to `.fn` together with `...`, and the result is sliced
# back with the ordering of that ordering (its inverse permutation).
with_order <- function(.x, .order_by, .size, .fn, ...) {
  vec_assert(.order_by, size = .size)

  ord <- vec_order(.order_by)
  result <- .fn(vec_slice(.x, ord), ...)

  inverse <- vec_order(ord)
  vec_slice(result, inverse)
}

Also fixes two issues with current dplyr version:

# A size-1 `order_by` should be rejected, not silently produce a size-1 result.
dplyr::lag(1:5, order_by = 1)
#> [1] NA

vec_lag(1:5, order_by = 1)
#> Error: `.order_by` must have size 5, not size 1.

# `default` should be cast to the type of `x`, not combined via their common type.
class(dplyr::lag(1:5, default = NA_real_))
#> [1] "numeric"

class(vec_lag(1:5, default = NA_real_))
#> [1] "integer"