Rdatatable / data.table

R's data.table package extends data.frame:
http://r-datatable.com
Mozilla Public License 2.0
3.56k stars 974 forks source link

trunc.IDate/round.IDate behaviour #4335

Open artemklevtsov opened 4 years ago

artemklevtsov commented 4 years ago

Suggestions

Add trunc.IDate (may be replace round.IDate) to consists with lubridate::floor_date. trunc/floor for the week should return first day of the week and not relate with the day of year.

For the "weeks" unit add additional option for the start.on.monday (like cut.Date) or something like this.

Possible implementation:

trunc_week <- function(x, start.on.monday = TRUE) {
  if (isTRUE(start.on.monday)) {
    # lubridate::floor_date(x, "week", week_start = 1)
    as.IDate(7 * ((unclass(x) - 4L) %/% 7) + 4L)
  } else {
    # lubridate::floor_date(x, "week", week_start = 0)
    as.IDate(7 * ((unclass(x) - 3L) %/% 7) + 3L)
  }
}

Improvements

For the existing round.IDate implementation.

Utility function to print bench results:

print.bench = function(x, ...) {
  cols = c("expression", "n", "min", "median", "mem_alloc", "n_itr", "n_gc")
  r = as.data.table(x)[, ..cols]
  r[, expression :=  sapply(expression, deparse)]
  r[, relative := round(as.numeric(median / min(median)), 3), by = c("n")]
  r[]
}

Data to process.

set.seed(42)
x = seq.Date(as.Date("1900-01-01"), as.Date("2020-03-01"), by = "day")
x = as.IDate(x)

Weeks

trunc_week_old = function(x) {
  # IDateTime.R#81
  # bug: first week of year have 6 days instead 7
  # fix: should be yday(x) - 1L
  round(x, "year") + 7L * ((yday(x) - 1L) %/% 7L)
}

trunc_week_new = function(x) {
  l = as.POSIXlt(x)
  w = (l$yday %/% 7L) * 7L
  x - l$yday + w
}

r = bench::press(
  n = c(100, 1000, 10000, 100000, 1000000),
  {
    d = sample(x, size = n, replace = TRUE)
    bench::mark(trunc_week_old(d), trunc_week_new(d))
  }
)

print.bench(r)
#>            expression     n          min       median     mem_alloc n_itr  n_gc relative
#>                <char> <num> <bench_time> <bench_time> <bench_bytes> <int> <num>    <num>
#>  1: trunc_week_old(d) 1e+02     225.62µs     232.11µs      146.73KB  1950     2    7.050
#>  2: trunc_week_new(d) 1e+02      30.57µs      32.92µs       40.53KB  9996     4    1.000
#>  3: trunc_week_old(d) 1e+03       1.39ms       1.47ms         213KB   318     2   13.605
#>  4: trunc_week_new(d) 1e+03     106.56µs     108.29µs        59.2KB  4151     4    1.000
#>  5: trunc_week_old(d) 1e+04      13.44ms      13.88ms        2.06MB    33     1   15.994
#>  6: trunc_week_new(d) 1e+04     859.88µs     867.72µs      586.55KB   534     5    1.000
#>  7: trunc_week_old(d) 1e+05      134.9ms     137.59ms        20.6MB     2     2   15.515
#>  8: trunc_week_new(d) 1e+05       8.39ms       8.87ms        5.72MB    41     9    1.000
#>  9: trunc_week_old(d) 1e+06        1.45s        1.45s         206MB     1     6   15.516
#> 10: trunc_week_new(d) 1e+06      88.27ms      93.29ms       57.22MB     6     6    1.000

Months

trunc_month_old = function(x) {
  # IDateTime.R#81
  as.IDate(ISOdate(year(x), month(x), 1L))
}
trunc_month_new = function(x) {
  x - as.POSIXlt(x)$mday + 1L
}

r = bench::press(
  n = c(100, 1000, 10000, 100000, 1000000),
  {
    d = sample(x, size = n, replace = TRUE)
    bench::mark(trunc_month_old(d), trunc_month_new(d))
  }
)

print.bench(r)
#>             expression     n          min       median     mem_alloc n_itr  n_gc relative
#>                 <char> <num> <bench_time> <bench_time> <bench_bytes> <int> <num>    <num>
#>  1: trunc_month_old(d) 1e+02     198.49µs     219.99µs        27.5KB  1947     2    7.332
#>  2: trunc_month_new(d) 1e+02      28.26µs         30µs        6.03KB  9996     4    1.000
#>  3: trunc_month_old(d) 1e+03       1.47ms        1.5ms      216.91KB   329     1   15.449
#>  4: trunc_month_new(d) 1e+03      94.05µs      96.94µs       55.25KB  4683     3    1.000
#>  5: trunc_month_old(d) 1e+04      13.93ms      14.39ms         2.1MB    35     0   17.223
#>  6: trunc_month_new(d) 1e+04     794.89µs     835.49µs      547.44KB   571     4    1.000
#>  7: trunc_month_old(d) 1e+05     139.84ms     141.22ms       20.98MB     3     1   17.648
#>  8: trunc_month_new(d) 1e+05       7.93ms          8ms        5.34MB    58     4    1.000
#>  9: trunc_month_old(d) 1e+06        1.42s        1.42s      209.81MB     1     4   17.221
#> 10: trunc_month_new(d) 1e+06      80.22ms      82.72ms       53.41MB     6     7    1.000

Qurters

trunc_quarter_old = function(x) {
  # IDateTime.R#82
  as.IDate(ISOdate(year(x), 3L * (quarter(x) - 1L) + 1L, 1L))
}
trunc_quarter_new = function(x) {
  l = as.POSIXlt(x)
  l$mon = (l$mon %/% 3L) * 3L
  l$mday = 1L
  as.IDate(l)
}

r = bench::press(
  n = c(100, 1000, 10000, 100000, 1000000),
  {
    d = sample(x, size = n, replace = TRUE)
    bench::mark(trunc_quarter_old(d), trunc_quarter_new(d))
  }
)

print.bench(r)
#>               expression     n          min       median     mem_alloc n_itr  n_gc relative
#>                   <char> <num> <bench_time> <bench_time> <bench_bytes> <int> <num>    <num>
#>  1: trunc_quarter_old(d) 1e+02        199µs     217.63µs       28.12KB  2213     2    6.289
#>  2: trunc_quarter_new(d) 1e+02      32.26µs      34.61µs       18.62KB  9995     5    1.000
#>  3: trunc_quarter_old(d) 1e+03       1.55ms       1.64ms      216.91KB   294     1    8.413
#>  4: trunc_quarter_new(d) 1e+03     177.79µs     194.43µs       98.64KB  2308     3    1.000
#>  5: trunc_quarter_old(d) 1e+04      15.15ms      15.46ms         2.1MB    31     1    8.443
#>  6: trunc_quarter_new(d) 1e+04       1.73ms       1.83ms      977.55KB   258     4    1.000
#>  7: trunc_quarter_old(d) 1e+05     153.45ms     153.83ms       20.98MB     2     2    8.313
#>  8: trunc_quarter_new(d) 1e+05      18.02ms       18.5ms        9.54MB    19     6    1.000
#>  9: trunc_quarter_old(d) 1e+06        1.59s        1.59s      209.81MB     1     5    6.672
#> 10: trunc_quarter_new(d) 1e+06     187.06ms     238.51ms       95.37MB     3     6    1.000

Years

trunc_year_old = function(x) {
  # IDateTime.R#83
  as.IDate(ISOdate(year(x), 1L, 1L))
}
trunc_year_new = function(x) {
  x - as.POSIXlt(x)$yday
}

r = bench::press(
  n = c(100, 1000, 10000, 100000, 1000000),
  {
    d = sample(x, size = n, replace = TRUE)
    bench::mark(trunc_year_old(d), trunc_year_new(d))
  }
)

print.bench(r)
#>            expression     n          min       median     mem_alloc n_itr  n_gc relative
#>                <char> <num> <bench_time> <bench_time> <bench_bytes> <int> <num>    <num>
#>  1: trunc_year_old(d) 1e+02     178.84µs     184.22µs       17.56KB  2405     2    8.662
#>  2: trunc_year_new(d) 1e+02      19.81µs      21.27µs        5.59KB  9997     3    1.000
#>  3: trunc_year_old(d) 1e+03       1.26ms        1.3ms       161.7KB   375     0   14.864
#>  4: trunc_year_new(d) 1e+03      85.66µs      87.45µs        51.3KB  5431     3    1.000
#>  5: trunc_year_old(d) 1e+04      12.01ms      12.62ms        1.57MB    40     0   16.732
#>  6: trunc_year_new(d) 1e+04     743.28µs     754.02µs      508.33KB   578     3    1.000
#>  7: trunc_year_old(d) 1e+05     125.14ms     125.57ms       15.64MB     3     1   16.846
#>  8: trunc_year_new(d) 1e+05       7.38ms       7.45ms        4.96MB    64     3    1.000
#>  9: trunc_year_old(d) 1e+06        1.27s        1.27s       156.4MB     1     4   17.129
#> 10: trunc_year_new(d) 1e+06      73.92ms      74.31ms       49.59MB     7     3    1.000

Final trunc.IDate implementation:

trunc.IDate = function (x, units = c("weeks", "months", "quarters", "years"), start.on.monday = TRUE, ...) {
  units = match.arg(units)
  offset = if (isTRUE(start.on.monday)) 4L else 3L
  as.IDate(switch(units,
                  weeks = 7L * ((unclass(x) - offset) %/% 7L) + offset,
                  months = unclass(x) - as.POSIXlt(x)$mday + 1L,
                  quarters = {
                    l = as.POSIXlt(x)
                    l$mon = (l$mon %/% 3L) * 3L # trunc to quarter start month
                    l$mday = 1L # trunc month day
                    l
                  },
                  years = unclass(x) - as.POSIXlt(x)$yday))
}

Can I make the PR with this changes?

jangorecki commented 3 years ago

@artemklevtsov Sorry for late response. Yes, you are welcome to make PR, although I think it could be improved even further to avoid using POSIXlt and operate only on integer, not even POSIXct. Change will be of course more complex. I think @MichaelChirico used these kid of arithmetic recently when writing datetime parser for fread.

Also when pasting benchmark timings, please align units they are being presented in. Otherwise it is not easy to compare values across rows.

artemklevtsov commented 3 years ago

Helpful links for the date math: