Laurae2 / ml-perf

MIT License
4 stars 0 forks source link

LightGBM pre-performance #5

Open Laurae2 opened 5 years ago

Laurae2 commented 5 years ago

I am currently testing a LightGBM script which can run 40 LightGBM on one 4GB GPU (only 2690 MB used).

Throughput: 1.330s / model, insane speed! Imagine on multiple GPUs...

image

You need 67.25 MB per thread for LightGBM on airline 0.1m data. It's about 10x lower than xgboost. Maybe @RAMitchell can get some ideas to optimize xgboost for very sparse data on GPU, as it could be a common case (xgboost does not support categoricals). LightGBM handles 16 bins, 64 bins, and 256 bins as shown here: https://github.com/microsoft/LightGBM/tree/master/src/treelearner/ocl

For the script, you may have to toy with gpu_platform_id and gpu_device_id. Unfortunately, it's due to OpenCL way of working with devices... (it will crash if it is wrong platform/device combo).

Test script:

# Sets OpenMP to 1 thread by default
Sys.setenv(OMP_NUM_THREADS = 1)

suppressMessages({
  library(optparse)
  library(data.table)
  library(parallel)
  library(lightgbm)
  library(Matrix)
})

args_list <- list( 
  optparse::make_option("--parallel_threads", type = "integer", default = 1, metavar = "Parallel CPU Threads",
                        help="Number of threads for parallel training for CPU (automatically changed if using GPU), should be greater than or equal to parallel_gpus * gpus_threads [default: %default]"),
  optparse::make_option("--model_threads", type = "integer", default = 1, metavar = "Model CPU Threads",
                        help = "Number of threads for training a single model, total number of threads is parallel_threads * model_threads [default: %default]"),
  optparse::make_option("--parallel_gpus", type = "integer", default = 0, metavar = "Parallel GPU Threads",
                        help = "Number of GPUs to use for parallel training, use 0 for no GPU [default: %default]"),
  optparse::make_option("--gpus_threads", type = "integer", default = 0, metavar = "Model GPU Threads",
                        help = "Number of parallel models to train per GPU (uses linearly more RAM), use 0 for no GPU [default: %default]"),
  optparse::make_option("--number_of_models", type = "integer", default = 1, metavar = "Number of Models",
                        help = "Number of models to train in total [default: %default]"),
  optparse::make_option("--wkdir", type = "character", default = "", metavar = "Working Directory",
                        help = "The working directory, do NOT forget it! [default: \"%default\"]"),
  optparse::make_option("--train_file", type = "character", default = "", metavar = "Training File",
                        help = "The training file to use relative to the working directory (or an absolute path), do NOT forget it! [default: \"%default\"]"),
  optparse::make_option("--test_file", type = "character", default = "", metavar = "Testing file",
                        help = "The testing file to use relative to the working directory (or an absolute path), do NOT forget it! [default: \"%default\"]"),
  optparse::make_option("--output_dir", type = "character", default = "", metavar = "Output Directory",
                        help = "The output directory for files (or an absolute path), do NOT forget it! [default: \"%default\"]"),
  optparse::make_option("--output_csv", type = "logical", default = TRUE, metavar = "Output CSV File",
                        help = "Outputs results as a CSV file [default: %default]"),
  optparse::make_option("--output_chart", type = "character", default = "jpeg", metavar = "Plot File Format",
                        help = "Outputs results as a chart using the desired format, can be any of: \"none\" (for no chart), \"eps\", \"ps\", \"tex\" (pictex), \"pdf\", \"jpeg\", \"tiff\", \"png\", \"bmp\", \"svg\", \"wmf\" (Windows only) [default: \"%default\"]"),
  optparse::make_option("--args", type = "logical", default = FALSE, metavar = "Argument Check",
                        help = "Prints the arguments passed to the R script and exits immediately [default: %default]")
)

# Force data.table as 1 thread in case you are using Fork instead of Sockets (gcc: fork X in process Y when process Y used OpenMP once, fork X cannot use OpenMP otherwise it hangs forever)
data.table::setDTthreads(1)

if (interactive()) {

  # Put some parameters if you wish to test once...
  my_gpus <- 1L
  my_gpus_threads <- 40L
  my_threads <- 1L # parallel::detectCores() - 1L
  my_threads_in_threads <- 1L
  my_runs <- 40L
  my_train <- "train-0.1m.csv"
  my_test <- "test.csv"
  my_output <- "./output"
  my_csv <- TRUE
  my_chart <- "jpeg"
  # my_cpu <- system("lscpu | sed -nr '/Model name/ s/.*:\\s*(.*) @ .*/\\1/p' | sed ':a;s/  / /;ta'")

  # CHANGE: 0.1M = GPU about 958 MB at peak... choose wisely (here, we are putting 4 models per GPU)
  if (my_gpus > 0L) {
    # my_threads <- min(my_gpus * my_gpus_threads, my_threads)
    my_threads <- my_gpus * my_gpus_threads
  }

} else {

  # Old school method... obsolete
  # DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
  # Rscript bench_file.R 1 1 0 0 25 ${DIR} ../train-0.1m.csv ../test.csv
  # args <- commandArgs(trailingOnly = TRUE)
  # 
  # setwd(args[6])
  # my_gpus <- args[3]
  # my_gpus_threads <- args[4]
  # my_threads <- args[1]
  # my_threads_in_threads <- args[2]
  # my_runs <- args[5]
  # my_train <- args[7]
  # my_test <- args[8]

  # DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
  # Rscript bench_lgb_test.R --parallel_threads=1 --model_threads=1 --parallel_gpus=0 --gpus_threads=0 --number_of_models=25 --wkdir=${DIR} --train_file=../train-0.1m.csv --test_file=../test.csv --output_dir=./output --output_csv=TRUE --output_chart=jpeg --args=TRUE
  # Rscript bench_lgb_test.R --parallel_threads=1 --model_threads=1 --parallel_gpus=0 --gpus_threads=0 --number_of_models=25 --wkdir=${DIR} --train_file=../train-0.1m.csv --test_file=../test.csv --output_dir=./output --output_csv=TRUE --output_chart=jpeg
  args <- optparse::parse_args(optparse::OptionParser(option_list = args_list))
  setwd(args$wkdir)
  my_gpus <- args$parallel_gpus
  my_gpus_threads <- args$gpus_threads
  my_threads <- args$parallel_threads
  my_threads_in_threads <- args$model_threads
  my_runs <- args$number_of_models
  my_train <- args$train_file
  my_test <- args$test_file
  my_output <- args$output_dir
  my_csv <- args$output_csv
  my_chart <- args$output_chart

  if (my_gpus > 0L) {
    # my_threads <- min(my_gpus * my_gpus_threads, my_threads)
    my_threads <- my_gpus * my_gpus_threads
    args$parallel_threads <- my_threads
  }

  if (args$args) {
    print(args)
    stop("\rArgument check done.")
  }

}

# Load data and do preprocessing

cat("[", format(Sys.time(), "%a %b %d %Y %X"), "] [Data] Loading data.\n", sep = "")
d_train <- fread(my_train, showProgress = FALSE)
d_test <- fread(my_test, showProgress = FALSE)
invisible(gc(verbose = FALSE))

cat("[", format(Sys.time(), "%a %b %d %Y %X"), "] [Data] Transforming data.\n", sep = "")
X_train_test <- sparse.model.matrix(dep_delayed_15min ~ . -1, data = rbindlist(list(d_train, d_test))); invisible(gc(verbose = FALSE))
n1 <- nrow(d_train)
n2 <- nrow(d_test)
X_train <- X_train_test[1L:n1,]; invisible(gc(verbose = FALSE))
X_test <- X_train_test[(n1 + 1L):(n1 + n2),]; invisible(gc(verbose = FALSE))

labels_train <- as.numeric(d_train$dep_delayed_15min == "Y")
labels_test <- as.numeric(d_test$dep_delayed_15min == "Y")

rm(d_train, d_test, X_train_test, n1, n2); invisible(gc(verbose = FALSE))

# CHANGE: metric function
metric <- function(preds, labels) {
  x1 <- as.numeric(preds[labels == 1])
  n1 <- as.numeric(length(x1))
  x2 <- as.numeric(preds[labels == 0])
  n2 <- as.numeric(length(x2))
  r <- rank(c(x1,x2))
  return((sum(r[1:n1]) - n1 * (n1 + 1) / 2) / (n1 * n2))
}

# CHANGE: trainer function
trainer <- function(x, row_sampling, col_sampling, max_depth, n_iter, learning_rate, nbins, nthread, n_gpus, gpu_choice, objective) {

  if (n_gpus > 0) {

    matrix_train_time <- system.time({
      dlgb_train <- lgb.Dataset(data = X_train, label = labels_train, nthread = nthread, device = "gpu")
      lgb.Dataset.construct(dlgb_train)
    })[[3]]
    matrix_test_time <- system.time({
      dlgb_test <- lgb.Dataset(data = X_test, label = labels_test, nthread = nthread, device = "gpu") # Completely useless in practice
      lgb.Dataset.construct(dlgb_test) # Completely useless in practice
    })[[3]]

  } else {

    matrix_train_time <- system.time({
      dlgb_train <- lgb.Dataset(data = X_train, label = labels_train, nthread = nthread)
      lgb.Dataset.construct(dlgb_train)
    })[[3]]
    matrix_test_time <- system.time({
      dlgb_test <- lgb.Dataset(data = X_test, label = labels_test, nthread = nthread) # Completely useless in practice
      lgb.Dataset.construct(dlgb_test) # Completely useless in practice
    })[[3]]

  }

  if (n_gpus > 0) {

    model_time <- system.time({
      set.seed(x) # Useless
      model_train <- lightgbm::lgb.train(data = dlgb_train, 
                                         objective = objective,
                                         nrounds = n_iter,
                                         num_leaves = (2 ^ max_depth) - 1,
                                         max_depth = max_depth,
                                         learning_rate = learning_rate,
                                         bagging_fraction = row_sampling,
                                         bagging_seed = x,
                                         bagging_freq = 1,
                                         feature_fraction = col_sampling,
                                         feature_fraction_seed = x,
                                         num_threads = nthread,
                                         device = "gpu",
                                         gpu_platform_id = 0,
                                         gpu_device_id = gpu_choice,
                                         max_bin = nbins,
                                         verbose = -1)
    })[[3]]

  } else {

    model_time <- system.time({
      set.seed(x)
      model_train <- lightgbm::lgb.train(data = dlgb_train, 
                                         objective = objective,
                                         nrounds = n_iter,
                                         num_leaves = (2 ^ max_depth) - 1,
                                         max_depth = max_depth,
                                         learning_rate = learning_rate,
                                         bagging_fraction = row_sampling,
                                         bagging_seed = x,
                                         bagging_freq = 1,
                                         feature_fraction = col_sampling,
                                         feature_fraction_seed = x,
                                         num_threads = nthread,
                                         device = "cpu",
                                         max_bin = nbins,
                                         verbose = -1)
    })[[3]]

  }

  pred_time <- system.time({
    model_predictions <- predict(model_train, data = X_test)
  })[[3]]

  perf <- metric(preds = model_predictions, labels = labels_test)

  rm(model_train, model_predictions, dlgb_train, dlgb_test)

  gc_time <- system.time({
    invisible(gc(verbose = FALSE))
  })[[3]]

  return(list(matrix_train_time = matrix_train_time, matrix_test_time = matrix_test_time, model_time = model_time, pred_time = pred_time, gc_time = gc_time, perf = perf))

}

# Parallel Section

cat("[", format(Sys.time(), "%a %b %d %Y %X"), "]", " [Parallel] ", my_threads, " Process(es) Creation Time: ", sprintf("%04.03f", system.time({cl <- makeCluster(my_threads)})[[3]]), "s\n", sep = "")
cat("[", format(Sys.time(), "%a %b %d %Y %X"), "]", " [Parallel] Sending Hardware Specifications Time: ", sprintf("%04.03f", system.time({clusterExport(cl = cl, c("my_threads", "my_gpus", "my_threads_in_threads"))})[[3]]), "s\n", sep = "")
invisible(parallel::parLapply(cl = cl, X = seq_len(my_threads), function(x) {
  Sys.sleep(time = my_threads / 20) # Prevent file clash on many core systems (typically 50+ threads might attempt to read exactly at the same time the same file, especially if the disk is slow)
  suppressPackageStartupMessages(library(lightgbm))
  suppressPackageStartupMessages(library(Matrix))
  suppressPackageStartupMessages(library(data.table))
  id <<- x
}))
cat("[", format(Sys.time(), "%a %b %d %Y %X"), "]", " [Parallel] Sending Data Time: ", sprintf("%04.03f", system.time({clusterExport(cl = cl, c("trainer", "metric", "X_train", "X_test", "labels_train", "labels_test", "my_threads"))})[[3]]), "s\n", sep = "")

# Having issues? In a CLI: sudo pkill R
time_finish <- system.time({
  time_all <- parallel::parLapplyLB(cl = cl, X = seq_len(my_runs), function(x) {

    if (my_gpus == 0L) {
      gpus_to_use <- 0
      gpus_allowed <- 0
    } else {
      gpus_to_use <- (id - 1) %% my_gpus
      gpus_allowed <- 1
    }

    speed_out <- system.time({
      speed_in <- trainer(x = x,
                          row_sampling = 0.9,
                          col_sampling = 0.9,
                          max_depth = 6,
                          n_iter = 500,
                          learning_rate = 0.05,
                          nbins = 255,
                          nthread = my_threads_in_threads,
                          n_gpus = gpus_allowed,
                          gpu_choice = gpus_to_use,
                          objective = "binary")
    })[[3]]

    rm(gpus_to_use)

    return(list(total = speed_out, matrix_train_time = speed_in$matrix_train_time, matrix_test_time = speed_in$matrix_test_time, model_time = speed_in$model_time, pred_time = speed_in$pred_time, gc_time = speed_in$gc_time, perf = speed_in$perf))

  })
})[[3]]

# Clearup all R sessions from this process, except the master
stopCluster(cl)
closeAllConnections()

rm(cl, metric, trainer, X_train, X_test, labels_train, labels_test); invisible(gc(verbose = FALSE))

cat("[", format(Sys.time(), "%a %b %d %Y %X"), "]", " [Parallel] Total Time: ", sprintf("%04.03f", time_finish), "s\n", sep = "")

# Gather Data

# Get data
time_total <- unlist(lapply(time_all, function(x) {round(x[[1]], digits = 3)}))
matrix_train_time <- unlist(lapply(time_all, function(x) {round(x[[2]], digits = 3)}))
matrix_test_time <- unlist(lapply(time_all, function(x) {round(x[[3]], digits = 3)}))
model_time <- unlist(lapply(time_all, function(x) {round(x[[4]], digits = 3)}))
pred_time <- unlist(lapply(time_all, function(x) {round(x[[5]], digits = 3)}))
gc_time <- unlist(lapply(time_all, function(x) {round(x[[6]], digits = 3)}))
perf <- unlist(lapply(time_all, function(x) {round(x[[7]], digits = 6)}))

# Put all data together
time_table <- data.table(Run = seq_len(my_runs),
                         time_total = time_total,
                         matrix_train_time = matrix_train_time,
                         matrix_test_time = matrix_test_time,
                         model_time = model_time,
                         pred_time = pred_time,
                         gc_time = gc_time,
                         perf = perf)

if (my_csv) {

  fwrite(time_table, paste0(my_output, "/ml-perf_lgb_gbdt_", substr(my_train, 1, nchar(my_train) - 4), "_", my_threads, "Tx", my_threads_in_threads, "T_", my_gpus, "GPU_", my_runs, "m_", sprintf("%04.03f", time_finish), "s.csv"))

}

# Analyze Data

if (my_chart != "none") {

  suppressMessages({
    library(ggplot2)
    library(ClusterR)
  })

  # Create time series matrix
  time_table_matrix <- apply(as.matrix(time_table[, 2:8, with = FALSE]), MARGIN = 2, function(x) {
    y <- cumsum(x)
    y / max(y)
  })

  # Compute optimal number of non-parametric clusters
  clusters <- Optimal_Clusters_Medoids(data = time_table_matrix,
                                       max_clusters = 2:10,
                                       distance_metric = "manhattan",
                                       criterion = "silhouette",
                                       threads = 1,
                                       swap_phase = TRUE,
                                       verbose = FALSE,
                                       plot_clusters = FALSE,
                                       seed = 1)

  # Compute clusters
  clusters_selected <- Cluster_Medoids(data = time_table_matrix,
                                       clusters = 1 + which.max(unlist(lapply(clusters, function(x) {x[[3]]}))),
                                       distance_metric = "manhattan",
                                       threads = 1,
                                       swap_phase = TRUE,
                                       verbose = FALSE,
                                       seed = 1)
  time_table[, Cluster := as.character(clusters_selected$clusters)]

  # Melt data
  time_table_vertical <- melt(time_table, id.vars = c("Run", "Cluster"), measure.vars = c("time_total", "matrix_train_time", "matrix_test_time", "model_time", "pred_time", "gc_time", "perf"), variable.name = "Variable", value.name = "Value", variable.factor = FALSE, value.factor = FALSE)

  # Rename melted variables to have details in chart
  time_table_vertical[Variable == "time_total", Variable := paste0("1. Total Time (Σ=", sprintf("%04.03f", sum(Value)), ", μ=", sprintf("%04.03f", mean(Value)), ", σ=", sprintf("%04.03f", sd(Value)), ")")]
  time_table_vertical[Variable == "model_time", Variable := paste0("2. Model Time (Σ=", sprintf("%04.03f", sum(Value)), ", μ=", sprintf("%04.03f", mean(Value)), ", σ=", sprintf("%04.03f", sd(Value)), ")")]
  time_table_vertical[Variable == "matrix_train_time", Variable := paste0("3. Matrix Train Build Time (Σ=", sprintf("%04.03f", sum(Value)), ", μ=", sprintf("%04.03f", mean(Value)), ", σ=", sprintf("%04.03f", sd(Value)), ")")]
  time_table_vertical[Variable == "matrix_test_time", Variable := paste0("4. Matrix Test Build Time (Σ=", sprintf("%04.03f", sum(Value)), ", μ=", sprintf("%04.03f", mean(Value)), ", σ=", sprintf("%04.03f", sd(Value)), ")")]
  time_table_vertical[Variable == "pred_time", Variable := paste0("5. Predict Time (Σ=", sprintf("%04.03f", sum(Value)), ", μ=", sprintf("%04.03f", mean(Value)), ", σ=", sprintf("%04.03f", sd(Value)), ")")]
  time_table_vertical[Variable == "gc_time", Variable := paste0("6. Garbage Collector Time (Σ=", sprintf("%04.03f", sum(Value)), ", μ=", sprintf("%04.03f", mean(Value)), ", σ=", sprintf("%04.03f", sd(Value)), ")")]
  time_table_vertical[Variable == "perf", Variable := paste0("7. Metric (Σ=", sprintf("%07.06f", sum(Value)), ", μ=", sprintf("%07.06f", mean(Value)), ", σ=", sprintf("%07.06f", sd(Value)), ")")]
  cat(sort(unique(time_table_vertical$Variable)), sep = "\n")

  # Plot a nice chart
  my_plot <- ggplot(data = time_table_vertical, aes(x = Run, y = Value, group = Cluster, color = Cluster)) + geom_point() + facet_wrap(facets = Variable ~ ., nrow = 4, ncol = 2, scales = "free_y") + labs(title = "'Performance' over Models, LightGBM GBDT", subtitle = paste0(my_runs, " Models over ", sprintf("%04.03f", time_finish), " seconds using ", my_threads, " parallel threads, ", my_threads_in_threads, " model threads, and ", my_gpus, " GPUs (Throughput: ", sprintf("%04.03f", time_finish / my_runs), "s / Model", ")"), x = "Model", y = "Value or Time (s)") + theme_bw() + theme(legend.position = "none")
  ggsave(filename = paste0(my_output, "/ml-perf_lgb_gbdt_", substr(my_train, 1, nchar(my_train) - 4), "_", my_threads, "Tx", my_threads_in_threads, "T_", my_gpus, "GPU_", my_runs, "m_", sprintf("%04.03f", time_finish), "s.jpg"),
         plot = my_plot,
         device = my_chart,
         width = 24,
         height = 16,
         units = "cm",
         dpi = "print")

  if (interactive()) {
    print(my_plot)
  }

}

cat("[", format(Sys.time(), "%a %b %d %Y %X"), "] Done computations. Quitting R.\n", sep = "")
Laurae2 commented 5 years ago

Also LightGBM GPU RAM usage is very predictable once you used it once: no need to try guessing the peak RAM used, because the peak RAM used seems to be one that stays during the training.

xgboost GPU usage is less predictable, as the peak is (usually) very short and can't be always be caught using nvidia-smi or other monitoring tools, without fine grained logging (without going to <100ms). ping @RAMitchell about this in case you are interested.

Here is an example of extrapolating 67.25 MB RAM estimated peak usage for maxing out my 4GB GPU with processes, without crashing on 68 processes (with still small headroom for 1 more):

image

Laurae2 commented 5 years ago

Obviously, we cannot run 4x GPU with 68 R threads each (maxes out each GPU to nearly 4GB RAM) as R doesn't have enough available connections (limited to 125 at the moment), see the following: https://github.com/HenrikBengtsson/Wishlist-for-R/issues/28 - this could maximize the performance of LightGBM for the data. A solution to this would be to nest parallelization, however this might hit a limit once 1000 connections are reached.

The 125 connection limit also limits how this idea could be used, as I already also hit the limit on large servers (8x Intel Platinum 8180/8280, it has 224 cores / 448 threads!). Perhaps R will do something in the future about it (I do use nested parallelization workaround to scale R with 10,000+ threads on many machines using SSH key, however the initialization performance for such amount of servers is poor).

Running overnight this with LightGBM, and re-running xgboost overnight also:

CPU:

Run Parallel Threads Model Threads Parallel GPUs GPU Threads Models Seconds / Model Boost vs Baseline
20 1 1 0 0 100 ~1x
21 9 1 0 0 250 x
22 18 1 0 0 500 x
23 35 1 0 0 1000 x
24 70 1 0 0 2500 x
25 1 1 0 0 250 ~1x
26 1 9 0 0 250 x
27 1 18 0 0 250 x
28 1 35 0 0 250 x
29 1 70 0 0 250 x

GPU:

Run Parallel Threads Model Threads Parallel GPUs GPU Threads Models Seconds / Model Boost vs Baseline
1 1 1 1 1 50 ~1x
2 2 1 2 1 100 x
3 3 1 3 1 250 x
4 4 1 4 1 500 x
5 4 1 1 4 100 x
6 8 1 2 4 250 x
7 12 1 3 4 500 x
8 16 1 4 4 1000 x
9 9 1 1 9 250 x
10 18 1 2 9 500 x
11 27 1 3 9 1000 x
12 36 1 4 9 2500 x
13 18 1 1 18 500 x
14 36 1 2 18 1000 x
15 54 1 3 18 2500 x
16 72 1 4 18 5000 x
17 35 1 1 35 1000 x
18 35 1 2 35 2500 x
19 58 1 1 58 2500 x

Script:

DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
env -u GOMP_CPU_AFFINITY Rscript bench_lgb_test.R --parallel_threads=1 --model_threads=1 --parallel_gpus=1 --gpus_threads=1 --number_of_models=50 --wkdir=${DIR} --train_file=train-0.1m.csv --test_file=test.csv --output_dir=./output --output_csv=TRUE --output_chart=jpeg
env -u GOMP_CPU_AFFINITY Rscript bench_lgb_test.R --parallel_threads=2 --model_threads=1 --parallel_gpus=2 --gpus_threads=1 --number_of_models=100 --wkdir=${DIR} --train_file=train-0.1m.csv --test_file=test.csv --output_dir=./output --output_csv=TRUE --output_chart=jpeg
env -u GOMP_CPU_AFFINITY Rscript bench_lgb_test.R --parallel_threads=3 --model_threads=1 --parallel_gpus=3 --gpus_threads=1 --number_of_models=250 --wkdir=${DIR} --train_file=train-0.1m.csv --test_file=test.csv --output_dir=./output --output_csv=TRUE --output_chart=jpeg
env -u GOMP_CPU_AFFINITY Rscript bench_lgb_test.R --parallel_threads=4 --model_threads=1 --parallel_gpus=4 --gpus_threads=1 --number_of_models=500 --wkdir=${DIR} --train_file=train-0.1m.csv --test_file=test.csv --output_dir=./output --output_csv=TRUE --output_chart=jpeg
env -u GOMP_CPU_AFFINITY Rscript bench_lgb_test.R --parallel_threads=4 --model_threads=1 --parallel_gpus=1 --gpus_threads=4 --number_of_models=100 --wkdir=${DIR} --train_file=train-0.1m.csv --test_file=test.csv --output_dir=./output --output_csv=TRUE --output_chart=jpeg
env -u GOMP_CPU_AFFINITY Rscript bench_lgb_test.R --parallel_threads=8 --model_threads=1 --parallel_gpus=2 --gpus_threads=4 --number_of_models=250 --wkdir=${DIR} --train_file=train-0.1m.csv --test_file=test.csv --output_dir=./output --output_csv=TRUE --output_chart=jpeg
env -u GOMP_CPU_AFFINITY Rscript bench_lgb_test.R --parallel_threads=12 --model_threads=1 --parallel_gpus=3 --gpus_threads=4 --number_of_models=500 --wkdir=${DIR} --train_file=train-0.1m.csv --test_file=test.csv --output_dir=./output --output_csv=TRUE --output_chart=jpeg
env -u GOMP_CPU_AFFINITY Rscript bench_lgb_test.R --parallel_threads=16 --model_threads=1 --parallel_gpus=4 --gpus_threads=4 --number_of_models=1000 --wkdir=${DIR} --train_file=train-0.1m.csv --test_file=test.csv --output_dir=./output --output_csv=TRUE --output_chart=jpeg
env -u GOMP_CPU_AFFINITY Rscript bench_lgb_test.R --parallel_threads=9 --model_threads=1 --parallel_gpus=1 --gpus_threads=9 --number_of_models=250 --wkdir=${DIR} --train_file=train-0.1m.csv --test_file=test.csv --output_dir=./output --output_csv=TRUE --output_chart=jpeg
env -u GOMP_CPU_AFFINITY Rscript bench_lgb_test.R --parallel_threads=18 --model_threads=1 --parallel_gpus=2 --gpus_threads=9 --number_of_models=500 --wkdir=${DIR} --train_file=train-0.1m.csv --test_file=test.csv --output_dir=./output --output_csv=TRUE --output_chart=jpeg
env -u GOMP_CPU_AFFINITY Rscript bench_lgb_test.R --parallel_threads=27 --model_threads=1 --parallel_gpus=3 --gpus_threads=9 --number_of_models=1000 --wkdir=${DIR} --train_file=train-0.1m.csv --test_file=test.csv --output_dir=./output --output_csv=TRUE --output_chart=jpeg
env -u GOMP_CPU_AFFINITY Rscript bench_lgb_test.R --parallel_threads=36 --model_threads=1 --parallel_gpus=4 --gpus_threads=9 --number_of_models=2500 --wkdir=${DIR} --train_file=train-0.1m.csv --test_file=test.csv --output_dir=./output --output_csv=TRUE --output_chart=jpeg
env -u GOMP_CPU_AFFINITY Rscript bench_lgb_test.R --parallel_threads=18 --model_threads=1 --parallel_gpus=1 --gpus_threads=18 --number_of_models=500 --wkdir=${DIR} --train_file=train-0.1m.csv --test_file=test.csv --output_dir=./output --output_csv=TRUE --output_chart=jpeg
env -u GOMP_CPU_AFFINITY Rscript bench_lgb_test.R --parallel_threads=36 --model_threads=1 --parallel_gpus=2 --gpus_threads=18 --number_of_models=1000 --wkdir=${DIR} --train_file=train-0.1m.csv --test_file=test.csv --output_dir=./output --output_csv=TRUE --output_chart=jpeg
env -u GOMP_CPU_AFFINITY Rscript bench_lgb_test.R --parallel_threads=54 --model_threads=1 --parallel_gpus=3 --gpus_threads=18 --number_of_models=2500 --wkdir=${DIR} --train_file=train-0.1m.csv --test_file=test.csv --output_dir=./output --output_csv=TRUE --output_chart=jpeg
env -u GOMP_CPU_AFFINITY Rscript bench_lgb_test.R --parallel_threads=72 --model_threads=1 --parallel_gpus=4 --gpus_threads=18 --number_of_models=5000 --wkdir=${DIR} --train_file=train-0.1m.csv --test_file=test.csv --output_dir=./output --output_csv=TRUE --output_chart=jpeg
env -u GOMP_CPU_AFFINITY Rscript bench_lgb_test.R --parallel_threads=35 --model_threads=1 --parallel_gpus=1 --gpus_threads=35 --number_of_models=1000 --wkdir=${DIR} --train_file=train-0.1m.csv --test_file=test.csv --output_dir=./output --output_csv=TRUE --output_chart=jpeg
env -u GOMP_CPU_AFFINITY Rscript bench_lgb_test.R --parallel_threads=70 --model_threads=1 --parallel_gpus=2 --gpus_threads=35 --number_of_models=2500 --wkdir=${DIR} --train_file=train-0.1m.csv --test_file=test.csv --output_dir=./output --output_csv=TRUE --output_chart=jpeg
env -u GOMP_CPU_AFFINITY Rscript bench_lgb_test.R --parallel_threads=58 --model_threads=1 --parallel_gpus=1 --gpus_threads=58 --number_of_models=2500 --wkdir=${DIR} --train_file=train-0.1m.csv --test_file=test.csv --output_dir=./output --output_csv=TRUE --output_chart=jpeg
env -u GOMP_CPU_AFFINITY Rscript bench_lgb_test.R --parallel_threads=1 --model_threads=1 --parallel_gpus=0 --gpus_threads=0 --number_of_models=100 --wkdir=${DIR} --train_file=train-0.1m.csv --test_file=test.csv --output_dir=./output --output_csv=TRUE --output_chart=jpeg
env -u GOMP_CPU_AFFINITY Rscript bench_lgb_test.R --parallel_threads=9 --model_threads=1 --parallel_gpus=0 --gpus_threads=0 --number_of_models=250 --wkdir=${DIR} --train_file=train-0.1m.csv --test_file=test.csv --output_dir=./output --output_csv=TRUE --output_chart=jpeg
env -u GOMP_CPU_AFFINITY Rscript bench_lgb_test.R --parallel_threads=18 --model_threads=1 --parallel_gpus=0 --gpus_threads=0 --number_of_models=500 --wkdir=${DIR} --train_file=train-0.1m.csv --test_file=test.csv --output_dir=./output --output_csv=TRUE --output_chart=jpeg
env -u GOMP_CPU_AFFINITY Rscript bench_lgb_test.R --parallel_threads=35 --model_threads=1 --parallel_gpus=0 --gpus_threads=0 --number_of_models=1000 --wkdir=${DIR} --train_file=train-0.1m.csv --test_file=test.csv --output_dir=./output --output_csv=TRUE --output_chart=jpeg
env -u GOMP_CPU_AFFINITY Rscript bench_lgb_test.R --parallel_threads=70 --model_threads=1 --parallel_gpus=0 --gpus_threads=0 --number_of_models=2500 --wkdir=${DIR} --train_file=train-0.1m.csv --test_file=test.csv --output_dir=./output --output_csv=TRUE --output_chart=jpeg
env -u GOMP_CPU_AFFINITY Rscript bench_lgb_test.R --parallel_threads=1 --model_threads=1 --parallel_gpus=0 --gpus_threads=0 --number_of_models=250 --wkdir=${DIR} --train_file=train-0.1m.csv --test_file=test.csv --output_dir=./output --output_csv=TRUE --output_chart=jpeg
env -u GOMP_CPU_AFFINITY Rscript bench_lgb_test.R --parallel_threads=1 --model_threads=9 --parallel_gpus=0 --gpus_threads=0 --number_of_models=250 --wkdir=${DIR} --train_file=train-0.1m.csv --test_file=test.csv --output_dir=./output --output_csv=TRUE --output_chart=jpeg
env -u GOMP_CPU_AFFINITY Rscript bench_lgb_test.R --parallel_threads=1 --model_threads=18 --parallel_gpus=0 --gpus_threads=0 --number_of_models=250 --wkdir=${DIR} --train_file=train-0.1m.csv --test_file=test.csv --output_dir=./output --output_csv=TRUE --output_chart=jpeg
env -u GOMP_CPU_AFFINITY Rscript bench_lgb_test.R --parallel_threads=1 --model_threads=35 --parallel_gpus=0 --gpus_threads=0 --number_of_models=250 --wkdir=${DIR} --train_file=train-0.1m.csv --test_file=test.csv --output_dir=./output --output_csv=TRUE --output_chart=jpeg
env -u GOMP_CPU_AFFINITY Rscript bench_lgb_test.R --parallel_threads=1 --model_threads=70 --parallel_gpus=0 --gpus_threads=0 --number_of_models=250 --wkdir=${DIR} --train_file=train-0.1m.csv --test_file=test.csv --output_dir=./output --output_csv=TRUE --output_chart=jpeg