Closed Anirban166 closed 7 months ago
Added the remaining one (cross join) to my script:
library(data.table)
library(microbenchmark)
#' Time a fixed set of data.table operations for one table shape and thread
#' count, and print the mean timing of each operation.
#'
#' A `rowCount` x `colCount` data.table of uniform random doubles is built,
#' every operation is evaluated 10 times under `microbenchmark()`, and the
#' `mean` column of the benchmark summary is printed, named by benchmark label.
#'
#' @param rowCount Number of rows of the generated table.
#' @param colCount Number of columns of the generated table.
#' @param threadCount Thread count handed to `setDTthreads()` for the run.
#' @return The named numeric vector of mean timings, invisibly.
run_benchmarks <- function(rowCount, colCount, threadCount) {
  # setDTthreads() changes a session-wide setting and returns the previous
  # value, so put that value back when this function exits.
  previous_threads <- setDTthreads(threadCount)
  on.exit(setDTthreads(previous_threads), add = TRUE)

  dt <- data.table(
    matrix(runif(rowCount * colCount), nrow = rowCount, ncol = colCount)
  )

  # threadCount is a scalar, so a plain if/else is enough (no ifelse()).
  threadLabel <- if (threadCount == 1) "thread" else "threads"
  # The header reports getDTthreads(), i.e. the thread count actually in
  # effect, which is not necessarily the requested threadCount.
  cat(sprintf(
    "\nRunning benchmarks with %d %s, %d rows, and %d columns:\n",
    getDTthreads(), threadLabel, rowCount, colCount
  ))

  benchmarks <- microbenchmark(
    # NOTE(review): setorder() sorts dt by reference, so once it has run dt
    # stays sorted by V1 for every later evaluation of any expression here.
    forder = setorder(dt, V1),
    # NOTE(review): there is no `by` in this call - confirm that GForce is
    # really what gets exercised.
    GForce_sum = dt[, .(sum(V1))],
    subsetting = dt[dt[[1]] > 0.5, ],
    frollmean = frollmean(dt[[1]], 10),
    # The columns come from runif(), so fcoalesce()/nafill() see no NA values.
    fcoalesce = fcoalesce(dt[[1]], dt[[2]]),
    fifelse = fifelse(dt[[1]] > 0.5, dt[[1]], 0),
    between = dt[dt[[1]] %between% c(0.4, 0.6)],
    nafill = nafill(dt[[1]], type = "const", fill = 0),
    # Cross join of at most 5 sampled row indices with at most 5 sampled
    # column indices.
    CJ = CJ(
      sample(rowCount, size = min(rowCount, 5)),
      sample(colCount, size = min(colCount, 5))
    ),
    times = 10
  )

  benchmark_summary <- summary(benchmarks)
  meanTime <- benchmark_summary$mean
  names(meanTime) <- benchmark_summary$expr
  print(meanTime)
  invisible(meanTime)
}
# For each thread count, benchmark a tall table (1e6 rows x 10 columns) and
# then a wide table (10 rows x 1e6 columns).
for (threadCount in c(1, 2)) {
  run_benchmarks(rowCount = 1e6, colCount = 10, threadCount = threadCount)
  run_benchmarks(rowCount = 10, colCount = 1e6, threadCount = threadCount)
}
> source("script.r")
Running benchmarks with 1 thread, 1000000 rows, and 10 columns:
forder GForce_sum subsetting frollmean fcoalesce fifelse between
43531.4184 2456.6052 10675.1655 4447.9154 1287.4682 2910.9895 5561.7506
nafill CJ
933.5532 688.1890
Running benchmarks with 1 thread, 10 rows, and 1000000 columns:
forder GForce_sum subsetting frollmean fcoalesce fifelse
72665.2754 76368.2697 1492410.9045 77.5700 42.7188 38.0600
between nafill CJ
1399204.1769 55.4876 910.6031
Running benchmarks with 2 threads, 1000000 rows, and 10 columns:
forder GForce_sum subsetting frollmean fcoalesce fifelse between
40045.5511 2851.9353 7059.6484 4389.8029 7384.8115 3230.8931 6061.1689
nafill CJ
967.9503 13262.2634
Running benchmarks with 2 threads, 10 rows, and 1000000 columns:
forder GForce_sum subsetting frollmean fcoalesce fifelse
74896.4874 86001.0177 1400344.0136 102.1589 36.1362 40.1211
between nafill CJ
1411831.0238 76.1031 1367.5597
Based on the results I'm observing, it seems that better speedups can be expected when the input data has a larger number of [the two group labels are missing from this list - presumably "rows" for the first group and "columns" for the second; confirm against the original post]:
- forder(), GForce functions (such as the mean), subset(), between() (also fread() and fwrite() - not tested here since it's already done)
- frollmean(), fcoalesce(), fifelse(), nafill(), CJ()
Since the test code I wrote for fifelse and subset was based on row conditions and catered more towards row-intensive operations, I tried some column-intensive ops. fifelse was still significantly faster for a large number of rows vs a large number of columns. subset, though, produced nearly the same results in both cases, although having more rows was still slightly faster than having more columns. For reference:
library(data.table)
library(microbenchmark)
#' Time a fixed set of data.table operations for one table shape and thread
#' count, and print the mean timing of each operation.
#'
#' A `rowCount` x `colCount` data.table of uniform random doubles is built,
#' every operation is evaluated 10 times under `microbenchmark()`, and the
#' `mean` column of the benchmark summary is printed, named by benchmark label.
#' Besides the row-condition benchmarks, this variant also times a
#' column-selection subset (`subsetting_column_intensive`).
#'
#' @param rowCount Number of rows of the generated table.
#' @param colCount Number of columns of the generated table.
#' @param threadCount Thread count handed to `setDTthreads()` for the run.
#' @return The named numeric vector of mean timings, invisibly.
run_benchmarks <- function(rowCount, colCount, threadCount) {
  # setDTthreads() changes a session-wide setting and returns the previous
  # value, so put that value back when this function exits.
  previous_threads <- setDTthreads(threadCount)
  on.exit(setDTthreads(previous_threads), add = TRUE)

  dt <- data.table(
    matrix(runif(rowCount * colCount), nrow = rowCount, ncol = colCount)
  )

  # threadCount is a scalar, so a plain if/else is enough (no ifelse()).
  threadLabel <- if (threadCount == 1) "thread" else "threads"
  # The header reports getDTthreads(), i.e. the thread count actually in
  # effect, which is not necessarily the requested threadCount.
  cat(sprintf(
    "\nRunning benchmarks with %d %s, %d rows, and %d columns:\n",
    getDTthreads(), threadLabel, rowCount, colCount
  ))

  benchmarks <- microbenchmark(
    # NOTE(review): setorder() sorts dt by reference, so once it has run dt
    # stays sorted by V1 for every later evaluation of any expression here.
    forder = setorder(dt, V1),
    # NOTE(review): there is no `by` in this call - confirm that GForce is
    # really what gets exercised.
    GForce_sum = dt[, .(sum(V1))],
    subsetting = dt[dt[[1]] > 0.5, ],
    frollmean = frollmean(dt[[1]], 10),
    # The columns come from runif(), so fcoalesce()/nafill() see no NA values.
    fcoalesce = fcoalesce(dt[[1]], dt[[2]]),
    fifelse = fifelse(dt[[1]] > 0.5, dt[[1]], 0),
    between = dt[dt[[1]] %between% c(0.4, 0.6)],
    nafill = nafill(dt[[1]], type = "const", fill = 0),
    # Select the first min(1000, colCount) columns; seq_len() avoids the
    # descending 1:0 sequence that `1:n` would give for n == 0.
    subsetting_column_intensive =
      dt[, .SD, .SDcols = seq_len(min(1000, colCount))],
    # Cross join of at most 5 sampled row indices with at most 5 sampled
    # column indices.
    CJ = CJ(
      sample(rowCount, size = min(rowCount, 5)),
      sample(colCount, size = min(colCount, 5))
    ),
    times = 10
  )

  benchmark_summary <- summary(benchmarks)
  meanTime <- benchmark_summary$mean
  names(meanTime) <- benchmark_summary$expr
  print(meanTime)
  invisible(meanTime)
}
# For each thread count, benchmark a tall table (1e6 rows x 10 columns) and
# then a wide table (10 rows x 1e6 columns).
for (threadCount in c(1, 2)) {
  run_benchmarks(rowCount = 1e6, colCount = 10, threadCount = threadCount)
  run_benchmarks(rowCount = 10, colCount = 1e6, threadCount = threadCount)
}
> source("script.r")
Running benchmarks with 1 thread, 1000000 rows, and 10 columns:
forder GForce_sum
46112.2772 2625.1087
subsetting frollmean
12797.6722 5135.8454
fcoalesce fifelse
1863.6185 4253.7439
between nafill
14006.8108 1623.8215
subsetting_column_intensive CJ
12330.3476 830.7561
Running benchmarks with 1 thread, 10 rows, and 1000000 columns:
forder GForce_sum
68828.3094 71770.7628
subsetting frollmean
1544882.5921 99.4936
fcoalesce fifelse
29.5189 32.2261
between nafill
1585512.0459 68.5196
subsetting_column_intensive CJ
39634.0007 919.7032
Running benchmarks with 2 threads, 1000000 rows, and 10 columns:
forder GForce_sum
39905.245 3047.815
subsetting frollmean
6981.574 10511.764
fcoalesce fifelse
13535.083 2902.409
between nafill
5513.657 1069.926
subsetting_column_intensive CJ
32410.043 1396.480
Running benchmarks with 2 threads, 10 rows, and 1000000 columns:
forder GForce_sum
69553.1633 76141.1931
subsetting frollmean
1421410.7526 108.5046
fcoalesce fifelse
31.3867 50.6045
between nafill
1400176.1321 99.5726
subsetting_column_intensive CJ
38302.6997 770.0899
GitHub Action:
R script:
Notes:
(getDTthreads() always returned 2 when threadCount was greater than 1)
Output:
Output screenshot