Closed Anirban166 closed 5 months ago
library(ggplot2)
library(data.table)
library(microbenchmark)
#' Benchmark a set of data.table operations at a given thread count
#'
#' Builds a `rowCount` x `colCount` data.table of uniform random numbers,
#' sets the data.table thread count to `threadCount`, and times a fixed set
#' of data.table operations with microbenchmark.
#'
#' @param rowCount Number of rows in the generated table.
#' @param colCount Number of columns in the generated table. Must be at least
#'   2 because the `fcoalesce` benchmark reads the first two columns.
#' @param threadCount Thread count passed to `setDTthreads()`.
#' @param times Number of evaluations per expression (passed on to
#'   `microbenchmark()`). Defaults to 100, the previously hard-coded value.
#' @return A list with `meanTime` (named numeric vector of mean runtimes in
#'   microseconds, one entry per benchmarked expression) and `threadCount`
#'   (the requested thread count).
run_benchmarks <- function(rowCount, colCount, threadCount, times = 100L) {
  if (colCount < 2) {
    stop("`colCount` must be at least 2 (fcoalesce uses two columns).",
         call. = FALSE)
  }

  # setDTthreads() returns the previous setting; restore it on exit so the
  # benchmark does not leak a thread configuration into the caller's session.
  previousThreads <- setDTthreads(threadCount)
  on.exit(setDTthreads(previousThreads), add = TRUE)

  dt <- data.table(
    matrix(runif(rowCount * colCount), nrow = rowCount, ncol = colCount)
  )

  # Pluralise on the thread count actually in effect, which is also the
  # number that gets printed.
  activeThreads <- getDTthreads()
  threadLabel <- if (activeThreads == 1L) "thread" else "threads"
  cat(sprintf(
    "Running benchmarks with %d %s, %d rows, and %d columns.\n",
    activeThreads, threadLabel, rowCount, colCount
  ))

  firstCol <- seq_len(min(1000, colCount))

  benchmarks <- microbenchmark(
    # `order()` inside `[.data.table` is optimised to data.table's forder and
    # returns a new table. The previous `setorder(dt, V1)` sorted `dt` in
    # place, so every evaluation after the first timed already-sorted data and
    # the other expressions saw a table whose state depended on run order.
    forder = dt[order(V1)],
    GForce_sum = dt[, .(sum(V1))],
    subsetting = dt[dt[[1]] > 0.5, ],
    frollmean = frollmean(dt[[1]], 10),
    fcoalesce = fcoalesce(dt[[1]], dt[[2]]),
    fifelse = fifelse(dt[[1]] > 0.5, dt[[1]], 0),
    between = dt[dt[[1]] %between% c(0.4, 0.6)],
    nafill = nafill(dt[[1]], type = "const", fill = 0),
    subsetting_column_intensive = dt[, .SD, .SDcols = firstCol],
    CJ = CJ(
      sample(rowCount, size = min(rowCount, 5)),
      sample(colCount, size = min(colCount, 5))
    ),
    times = times
  )

  # Pin the unit: by default summary() picks a unit per call, which would make
  # means from different thread counts incomparable.
  benchmark_summary <- summary(benchmarks, unit = "us")
  meanTime <- benchmark_summary$mean
  names(meanTime) <- as.character(benchmark_summary$expr)

  list(meanTime = meanTime, threadCount = threadCount)
}
#' Run the benchmark suite once for every available thread count
#'
#' Asks data.table for its maximum thread count via `setDTthreads(0)` and then
#' calls `run_benchmarks()` for each thread count from 1 up to that maximum.
#'
#' @param rowCount Number of rows of the benchmark table.
#' @param colCount Number of columns of the benchmark table.
#' @return An unnamed list with one `run_benchmarks()` result per thread
#'   count; element `i` holds the result for `i` threads.
find_optimal_threads <- function(rowCount, colCount) {
  # setDTthreads() returns the previous setting; put it back when done so the
  # caller's thread configuration is not silently replaced.
  previousThreads <- setDTthreads(0)
  on.exit(setDTthreads(previousThreads), add = TRUE)

  maxThreads <- getDTthreads()

  lapply(seq_len(maxThreads), function(threadCount) {
    run_benchmarks(rowCount, colCount, threadCount)
  })
}
# Run the suite for every thread count on a 1000 x 10 table.
benchmarkData <- find_optimal_threads(1000, 10)

# Stack the per-thread-count results into one long data frame with columns
# `meanTime`, `threadCount` and `expr`. The expression name is taken directly
# from the names of `meanTime` rather than recovered from the de-duplicated
# row names that rbind() generates, so labels ending in a digit stay intact.
benchmark_df <- do.call(rbind, lapply(benchmarkData, function(result) {
  data.frame(
    meanTime = unname(result$meanTime),
    threadCount = result$threadCount,
    expr = names(result$meanTime),
    stringsAsFactors = FALSE
  )
}))
rownames(benchmark_df) <- NULL
Just pasting another version from yesterday here, which includes the mean runtime of each function for each thread count (a step towards calculating speedup, to be posted soon).