Open jangorecki opened 8 years ago
Is this FR now resolved with CJ(X$a, Y$b)
?
@SymbolixAU my example might have been simplified too much, consider two data.tables having multiple columns, CJ
cannot address it.
I think I would do this in SQL as SELECT * FROM t1, t2
, is that different?
No, it is the same. Good practice is to use JOIN
keyword instead of FROM t1, t2
.
suggestion:
CJ.dft <- function(...) {
Reduce(f=function(x, y) cbind(x[rep(1:nrow(x), times=nrow(y)),], y[rep(1:nrow(y), each=nrow(x)),]),
x=list(...)[-1],
init=..1)
} #CJ.dft
#examples
CJ.dt(data.table(A1=1:2, A2=LETTERS[1:2]), data.table(B1=3:5, B2=LETTERS[3:5]), data.table(D1=6:9, D2=LETTERS[6:9]))
though i am not sure if this will be optimized.
Had a colleague looking for this today
shorthand (almost definitely not optimized) version can be:
DT1 = data.table(id = 1:3, v = rnorm(3))
DT2 = data.table(grp = 1:10, z = rnorm(10))
DT1[ , c(.SD, DT2), by = seq_len(nrow(DT1))]
a Reduce
approach could generalize IINM
@MichaelChirico That's a nice, intuitive solution. It's worth cautioning, though, that it will result in duplicate column names if any are shared between the data.tables. Based on some (very nonexhaustive, probably memory-dependent) benchmarking, your solution (CJ1
) is actually slightly faster than jangorecki's solution in the StackOverflow response (CJ2
, modified to be more comparable) but has similar performance if the temporary joining column is assigned with :=
. Of course, this is without manually taking into account keying and indexing behavior.
library(data.table)
DT1 <- data.table(V1 = rnorm(8E3))
DT2 <- data.table(V1 = rnorm(8E3))
# joining by grouping
CJ1 <- function(DT1, DT2){
DT1[ , c(.SD, DT2), by = seq_len(nrow(DT1))]
}
# joining by allow.cartesian with temporary column
CJ2 <- function(DT1, DT2){
DT1[, c(.SD, .tempCol = 1)][DT2[, c(.SD, .tempCol = 1)], on = ".tempCol", , allow.cartesian = T]
}
# joining by allow.cartesian with assigned column
CJ3 <- function(DT1, DT2){
DT1[, .tempCol := 1]
DT2[, .tempCol := 1]
DT1[DT2, on = ".tempCol", , allow.cartesian = T]
}
join1 <- CJ1(DT1, DT2)
print(join1)
# seq_len V1 V1
# 1: 1 -0.6752308 -1.19897585
# 2: 1 -0.6752308 -2.14232417
# 3: 1 -0.6752308 -0.01960594
# 4: 1 -0.6752308 0.92697292
# 5: 1 -0.6752308 1.26429973
# ---
# 63999996: 8000 1.1786346 -0.44214904
# 63999997: 8000 1.1786346 0.52729396
# 63999998: 8000 1.1786346 0.60022166
# 63999999: 8000 1.1786346 0.30667155
# 64000000: 8000 1.1786346 -0.77470274
join2 <- CJ2(DT1, DT2)
print(join2)
# V1 .tempCol i.V1
# 1: -0.67523085 1 -1.1989758
# 2: -0.92701043 1 -1.1989758
# 3: -0.99014335 1 -1.1989758
# 4: 0.53156215 1 -1.1989758
# 5: -0.15562568 1 -1.1989758
# ---
# 63999996: 0.39169500 1 -0.7747027
# 63999997: 1.94071323 1 -0.7747027
# 63999998: -0.08114057 1 -0.7747027
# 63999999: -0.31676249 1 -0.7747027
# 64000000: 1.17863463 1 -0.7747027
join3 <- CJ3(DT1, DT2)
print(join3)
# V1 .tempCol i.V1
# 1: -0.67523085 1 -1.1989758
# 2: -0.92701043 1 -1.1989758
# 3: -0.99014335 1 -1.1989758
# 4: 0.53156215 1 -1.1989758
# 5: -0.15562568 1 -1.1989758
# ---
# 63999996: 0.39169500 1 -0.7747027
# 63999997: 1.94071323 1 -0.7747027
# 63999998: -0.08114057 1 -0.7747027
# 63999999: -0.31676249 1 -0.7747027
# 64000000: 1.17863463 1 -0.7747027
# check results are the same
setnames(join1, which(names(join1) == "V1"), c("V1", "i.V1"))
join1 <- join1[, "seq_len" := NULL]
join2 <- join2[, ".tempCol" := NULL]
join3 <- join3[, ".tempCol" := NULL]
setorderv(join1, c("V1", "i.V1"))
setorderv(join2, c("V1", "i.V1"))
setorderv(join3, c("V1", "i.V1"))
identical(join1, join2)
# [1] TRUE
identical(join2, join3)
# [1] TRUE
# benchmark
microbenchmark::microbenchmark(
CJ1(DT1, DT2),
CJ2(DT1, DT2),
CJ3(DT1, DT2),
times = 5
)
# Unit: seconds
# expr min lq mean median uq max neval cld
# CJ1(DT1, DT2) 2.486950 2.638036 2.767990 2.822850 2.842497 3.049616 5 a
# CJ2(DT1, DT2) 1.743497 3.531119 3.751018 3.943120 4.090093 5.447262 5 a
# CJ3(DT1, DT2) 1.848768 2.728135 2.737173 2.822515 3.094846 3.191602 5 a
Another option:
CJ.dt_2 <- function(...) {
DTls <- list(...)
rows <- do.call(CJ, lapply(DTls, function(x) x[, seq_len(.N)]))
res <- DTls[[1L]][rows[[1L]]]
for (n in seq_along(DTls)[-1L])
res <- res[, c(.SD, DTls[[n]][rows[[n]]])]
res
}
timing code:
library(data.table) #data.table_1.12.4 Win-4 R-3.6.1 x64 4 threads
nr <- 3e2
DT1 <- data.table(A1=1:nr, A1=1:nr)
DT2 <- data.table(B1=1:nr, B2=1:nr)
DT3 <- data.table(C1=1:nr, C2=1:nr)
#https://github.com/Rdatatable/data.table/pull/814#issuecomment-55807497
CJ.dt = function(...) {
rows = do.call(CJ, lapply(list(...), function(x) if(is.data.frame(x)) seq_len(nrow(x)) else seq_along(x)));
do.call(data.table, Map(function(x, y) x[y], list(...), rows))
}
#https://github.com/Rdatatable/data.table/issues/1717#issuecomment-355499952
CJ.dt_1 <- function(...) {
Reduce(f=function(x, y) cbind(x[rep(1:nrow(x), times=nrow(y)),], y[rep(1:nrow(y), each=nrow(x)),]),
x=list(...))
} #CJ.dft
CJ.dt_2 <- function(...) {
DTls <- list(...)
rows <- do.call(CJ, lapply(DTls, function(x) x[, seq_len(.N)]))
res <- DTls[[1L]][rows[[1L]]]
for (n in seq_along(DTls)[-1L])
res <- res[, c(.SD, DTls[[n]][rows[[n]]])]
res
}
#https://github.com/Rdatatable/data.table/issues/2343#issuecomment-328156867
CJDT <- function(...)
Reduce(function(DT1, DT2) cbind(DT1, DT2[rep(1:.N, each=nrow(DT1))]), list(...))
a1 <- CJ.dt(DT1, DT2, DT3)
setorderv(a1, names(a1))
a2 <- CJ.dt_1(DT1, DT2, DT3)
setorderv(a2, names(a2))
a3 <- CJ.dt_2(DT1, DT2, DT3)
setorderv(a3, names(a3))
a4 <- CJDT(DT1, DT2, DT3)
setorderv(a4, names(a4))
identical(a1, a2)
identical(a1, a3)
identical(a1, a4)
bench::mark(CJ.dt(DT1, DT2, DT3),
CJ.dt_1(DT1, DT2, DT3),
CJ.dt_2(DT1, DT2, DT3),
CJDT(DT1, DT2, DT3),
check=FALSE)
timings:
# A tibble: 4 x 13
expression min median `itr/sec` mem_alloc `gc/sec` n_itr n_gc total_time result memory time gc
<bch:expr> <bch:tm> <bch:tm> <dbl> <bch:byt> <dbl> <int> <dbl> <bch:tm> <list> <list> <list> <list>
1 CJ.dt(DT1, DT2, DT3) 1.52m 1.52m 0.0109 1.61GB 0.0328 1 3 1.52m <df[,6] [27,000,000 x 6]> <df[,3] [341 x 3]> <bch:tm> <tibble [1 x 3]>
2 CJ.dt_1(DT1, DT2, DT3) 1.53s 1.53s 0.653 1.41GB 1.31 1 2 1.53s <df[,6] [27,000,000 x 6]> <df[,3] [1,467 x 3]> <bch:tm> <tibble [1 x 3]>
3 CJ.dt_2(DT1, DT2, DT3) 1.27s 1.27s 0.787 2.35GB 0.787 1 1 1.27s <df[,6] [27,000,000 x 6]> <df[,3] [23,236 x 3]> <bch:tm> <tibble [1 x 3]>
4 CJDT(DT1, DT2, DT3) 1.07s 1.07s 0.930 929.44MB 0.930 1 1 1.07s <df[,6] [27,000,000 x 6]> <df[,3] [28 x 3]> <bch:tm> <tibble [1 x 3]>
Another test:
DT_A <- data.table(A=1:8e3)
DT_B <- data.table(B=1:8e3)
bench::mark(CJ.dt(DT_A, DT_B),
CJ.dt_1(DT_A, DT_B), CJ.dt_2(DT_A, DT_B), CJDT(DT_A, DT_B), check=FALSE)
# A tibble: 4 x 13
expression min median `itr/sec` mem_alloc `gc/sec` n_itr n_gc total_time result memory time gc
<bch:expr> <bch:tm> <bch:tm> <dbl> <bch:byt> <dbl> <int> <dbl> <bch:tm> <list> <list> <list> <list>
1 CJ.dt(DT_A, DT_B) 1.32m 1.32m 0.0126 1.53GB 0.0126 1 1 1.32m <df[,2] [64,000,000 x 2]> <df[,3] [217 x 3]> <bch:tm> <tibble [1 x 3]>
2 CJ.dt_1(DT_A, DT_B) 2.77s 2.77s 0.362 1.43GB 0.362 1 1 2.77s <df[,2] [64,000,000 x 2]> <df[,3] [19 x 3]> <bch:tm> <tibble [1 x 3]>
3 CJ.dt_2(DT_A, DT_B) 919.28ms 919.28ms 1.09 1.67GB 0 1 0 919.28ms <df[,2] [64,000,000 x 2]> <df[,3] [38 x 3]> <bch:tm> <tibble [1 x 3]>
4 CJDT(DT_A, DT_B) 2.24s 2.24s 0.446 976.63MB 0.446 1 1 2.24s <df[,2] [64,000,000 x 2]> <df[,3] [19 x 3]> <bch:tm> <tibble [1 x 3]>
This is a low-overhead version built on top of #4370, not really tested
crossjoin = function(x, i) {
stopifnot(is.data.table(x), is.data.table(i))
ni = nrow(i)
nx = nrow(x)
## bmerge ans for cross join
ans = list(starts = rep.int(1L, ni), lens = rep.int(nx, ni))
## dtmerge ans for cross join
ans = list(xrows = vecseq(ans$starts, ans$lens, NULL), irows = seqexp(ans$lens))
out.i = .Call(CsubsetDT, i, ans$irows, colnamesInt(i, NULL))
out.x = .Call(CsubsetDT, x, ans$xrows, colnamesInt(x, NULL))
out = .Call(Ccbindlist, list(out.i, out.x), FALSE)
setDT(out)
out
}
@chinsoon12 benchmark you presented suffers badly in terms of readability
expression min
<bch:expr> <bch:tm>
1 CJ.dt(DT1, DT2, DT3) 1.52m
2 CJ.dt_1(DT1, DT2, DT3) 1.53s
3 CJ.dt_2(DT1, DT2, DT3) 1.27s
4 CJDT(DT1, DT2, DT3) 1.07s
how many people will notice that first expression has timing in minutes while all remaning ones in seconds?
Are there practical uses cases for a cross join on more than 2 tables?
Above crossjoin
could be tweaked to apply Reduce
when producing irows
and xrows
, then call CsubsetDT
just single time for i
and x
, and not for every pair of tables.
I re-run benchmark
library(data.table)
nr <- 3e2
DT1 <- data.table(A1=1:nr, A1=1:nr)
DT2 <- data.table(B1=1:nr, B2=1:nr)
DT3 <- data.table(C1=1:nr, C2=1:nr)
#https://github.com/Rdatatable/data.table/pull/814#issuecomment-55807497
CJ.dt = function(...) {
rows = do.call(CJ, lapply(list(...), function(x) if(is.data.frame(x)) seq_len(nrow(x)) else seq_along(x)));
do.call(data.table, Map(function(x, y) x[y], list(...), rows))
}
#https://github.com/Rdatatable/data.table/issues/1717#issuecomment-355499952
CJ.dt_1 <- function(...) {
Reduce(f=function(x, y) cbind(x[rep(1:nrow(x), times=nrow(y)),], y[rep(1:nrow(y), each=nrow(x)),]),
x=list(...))
} #CJ.dft
CJ.dt_2 <- function(...) {
DTls <- list(...)
rows <- do.call(CJ, lapply(DTls, function(x) x[, seq_len(.N)]))
res <- DTls[[1L]][rows[[1L]]]
for (n in seq_along(DTls)[-1L])
res <- res[, c(.SD, DTls[[n]][rows[[n]]])]
res
}
#https://github.com/Rdatatable/data.table/issues/2343#issuecomment-328156867
CJDT <- function(...)
Reduce(function(DT1, DT2) cbind(DT1, DT2[rep(1:.N, each=nrow(DT1))]), list(...))
a1 <- CJ.dt(DT1, DT2, DT3)
setorderv(a1, names(a1))
a2 <- CJ.dt_1(DT1, DT2, DT3)
setorderv(a2, names(a2))
a3 <- CJ.dt_2(DT1, DT2, DT3)
setorderv(a3, names(a3))
a4 <- CJDT(DT1, DT2, DT3)
setorderv(a4, names(a4))
identical(a1, a2)
identical(a1, a3)
identical(a1, a4)
crossjoin = data.table:::crossjoin
a5 = crossjoin(crossjoin(DT1, DT2), DT3)
setcolorder(a5, c(5:6,3:4,1:2))
setorderv(a5, names(a5))
identical(a1, a5)
DT_A <- data.table(A=1:8e3)
DT_B <- data.table(B=1:8e3)
mark1 = function(..., check=NULL) {
l = as.list(substitute(list(...)))[-1L]
txt = vapply(l, deparse, "")
env = parent.frame()
sec = vapply(l, function(expr) system.time(eval(expr, env=env))[[3L]], 0.0)
data.frame(expr = txt, sec = sec)
}
cat("# run 1\n")
mark1(CJ.dt(DT_A, DT_B), CJ.dt_1(DT_A, DT_B), CJ.dt_2(DT_A, DT_B), CJDT(DT_A, DT_B), crossjoin(DT_A, DT_B), check=FALSE)
cat("\n# run 2\n")
mark1(CJ.dt(DT_A, DT_B), CJ.dt_1(DT_A, DT_B), CJ.dt_2(DT_A, DT_B), CJDT(DT_A, DT_B), crossjoin(DT_A, DT_B), check=FALSE)
timings
# run 1
expr sec
1 CJ.dt(DT_A, DT_B) 65.542
2 CJ.dt_1(DT_A, DT_B) 2.668
3 CJ.dt_2(DT_A, DT_B) 0.743
4 CJDT(DT_A, DT_B) 2.121
5 crossjoin(DT_A, DT_B) 0.459
# run 2
expr sec
1 CJ.dt(DT_A, DT_B) 65.424
2 CJ.dt_1(DT_A, DT_B) 2.664
3 CJ.dt_2(DT_A, DT_B) 0.723
4 CJDT(DT_A, DT_B) 2.121
5 crossjoin(DT_A, DT_B) 0.458
@jangorecki sorry my bad. have rerun with time units:
timing code:
library(data.table) #data.table 1.12.8 Win 10 R-3.6.1 x64 4 threads
nr <- 3e2
DT1 <- data.table(A1=1:nr, A1=1:nr)
DT2 <- data.table(B1=1:nr, B2=1:nr)
DT3 <- data.table(C1=1:nr, C2=1:nr)
#https://github.com/Rdatatable/data.table/pull/814#issuecomment-55807497
CJ.dt = function(...) {
rows = do.call(CJ, lapply(list(...), function(x) if(is.data.frame(x)) seq_len(nrow(x)) else seq_along(x)));
do.call(data.table, Map(function(x, y) x[y], list(...), rows))
}
#https://github.com/Rdatatable/data.table/issues/1717#issuecomment-355499952
CJ.dt_1 <- function(...) {
Reduce(f=function(x, y) cbind(x[rep(1:nrow(x), times=nrow(y)),], y[rep(1:nrow(y), each=nrow(x)),]),
x=list(...))
} #CJ.dft
CJ.dt_2 <- function(...) {
DTls <- list(...)
rows <- do.call(CJ, lapply(DTls, function(x) x[, seq_len(.N)]))
res <- DTls[[1L]][rows[[1L]]]
for (n in seq_along(DTls)[-1L])
res <- res[, c(.SD, DTls[[n]][rows[[n]]])]
res
}
#https://github.com/Rdatatable/data.table/issues/2343#issuecomment-328156867
CJDT <- function(...)
Reduce(function(DT1, DT2) cbind(DT1, DT2[rep(1:.N, each=nrow(DT1))]), list(...))
bench::mark(CJ.dt(DT1, DT2, DT3),
CJ.dt_1(DT1, DT2, DT3),
CJ.dt_2(DT1, DT2, DT3),
CJDT(DT1, DT2, DT3),
time_unit='s',
check=FALSE)
DT_A <- data.table(A=1:8e3)
DT_B <- data.table(B=1:8e3)
bench::mark(CJ.dt(DT_A, DT_B),
CJ.dt_1(DT_A, DT_B), CJ.dt_2(DT_A, DT_B), CJDT(DT_A, DT_B),
time_unit='s',
check=FALSE)
timings in seconds:
expression min median `itr/sec` mem_alloc `gc/sec` n_itr n_gc total_time result
<bch:expr> <dbl> <dbl> <dbl> <bch:byt> <dbl> <int> <dbl> <dbl> <list>
1 CJ.dt(DT1, DT2, DT3) 100. 100. 0.00995 1.62GB 0.0398 1 4 100. <df[,~
2 CJ.dt_1(DT1, DT2, DT3) 1.18 1.18 0.845 1.41GB 1.69 1 2 1.18 <df[,~
3 CJ.dt_2(DT1, DT2, DT3) 0.797 0.797 1.26 2.31GB 1.26 1 1 0.797 <df[,~
4 CJDT(DT1, DT2, DT3) 1.03 1.03 0.973 929.44MB 0.973 1 1 1.03 <df[,~
Another test:
DT_A <- data.table(A=1:8e3)
DT_B <- data.table(B=1:8e3)
bench::mark(CJ.dt(DT_A, DT_B),
CJ.dt_1(DT_A, DT_B), CJ.dt_2(DT_A, DT_B), CJDT(DT_A, DT_B),
time_unit='s',
check=FALSE)
timings in seconds:
expression min median `itr/sec` mem_alloc `gc/sec` n_itr n_gc total_time
<bch:expr> <dbl> <dbl> <dbl> <bch:byt> <dbl> <int> <dbl> <dbl>
1 CJ.dt(DT_A, DT_B) 78.7 78.7 0.0127 1.53GB 0 1 0 78.7
2 CJ.dt_1(DT_A, DT_B) 2.10 2.10 0.477 1.43GB 0.477 1 1 2.10
3 CJ.dt_2(DT_A, DT_B) 0.782 0.782 1.28 1.67GB 1.28 1 1 0.782
4 CJDT(DT_A, DT_B) 1.67 1.67 0.598 976.91MB 0.598 1 1 1.67
Are there practical uses cases for a cross join on more than 2 tables?
mark1 = function(..., check=NULL) { l = as.list(substitute(list(...)))[-1L] txt = vapply(l, deparse, "") env = parent.frame() sec = vapply(l, function(expr) system.time(eval(expr, env=env))[[3L]], 0.0) data.frame(expr = txt, sec = sec) }
The user can use something like your mark1 function if there are more than 2 data.tables to cross join.
By practical I mean real life use cases :)
I have just missed this feature.
My opinion on this:
1- I think that this feature nicely fits into CJ
function in the sense that CJ could be expanded to support data.frame (or list) too. I find something like CJ(DT1, DT2, etc.)
very nice and natural. I don't really see the benefit of implementing this feature in a new function.
2- cross join could also be supported in [.data.table
function. X[Y, on=TRUE]
is nice but I suggest to also consider using the keyword .CROSSJOIN
in on=
, like X[Y, on=.CROSSJOIN]
. The latter one is likely more intuitive (specially for users not used to SQL) and I feel that it would be somehow consistent with the way the natural join looks like in data.table (X[Y, on=.NATURAL]
)
An important remark: The first point (implementation in CJ
allows to cross join more than two datasets at once while the second point is limited to only two.
You are welcome to review cross join functionality proposed in https://github.com/Rdatatable/data.table/pull/4370
There is not straight way to do cross join currently. User need to add a column with a constant value in both datasets and join on that column. This could be nicely addressed in a similar way as it is in SQL where you can use
SELECT ... FROM t1 JOIN t2 ON 1=1
, where1=1
is used to evaluate to TRUE for every row. Eventuallyallow.cartesian
could be set toTRUE
whenon=TRUE
detected so the use case would look like:And corresponding SQLite
Update https://stackoverflow.com/questions/25888706/r-data-table-cross-join-not-working when solved.