szilard / GBM-perf

Performance of various open source GBM implementations

Random Forests #44

Open szilard opened 3 years ago

szilard commented 3 years ago

c5.9xlarge (18 cores, HT off):

1M rows:

lightgbm:

suppressMessages({
library(data.table)
library(ROCR)
library(lightgbm)
library(Matrix)
})

set.seed(123)

d_train <- fread("train-1m.csv", showProgress=FALSE)
d_test <- fread("test.csv", showProgress=FALSE)

d_all <- rbind(d_train, d_test)
d_all$dep_delayed_15min <- ifelse(d_all$dep_delayed_15min=="Y", 1, 0)

## encode the categorical columns as integers; lgb.convert_with_rules returns
## the encoded data plus the encoding rules (whose names identify those columns)
d_all_wrules <- lgb.convert_with_rules(d_all)
d_all <- d_all_wrules$data
cols_cats <- names(d_all_wrules$rules)

d_train <- d_all[1:nrow(d_train)]
d_test <- d_all[(nrow(d_train)+1):(nrow(d_train)+nrow(d_test))]

p <- ncol(d_all)-1
dlgb_train <- lgb.Dataset(data = as.matrix(d_train[,1:p]), label = d_train$dep_delayed_15min)

auc <- function() {
  phat <- predict(md, data = as.matrix(d_test[,1:p]))
  rocr_pred <- prediction(phat, d_test$dep_delayed_15min)
  cat(performance(rocr_pred, "auc")@y.values[[1]],"\n")
}

## GBM baseline
system.time({
  md <- lgb.train(data = dlgb_train, 
            objective = "binary", 
            nrounds = 100, num_leaves = 512, learning_rate = 0.1, 
            categorical_feature = cols_cats,
            verbose = 2)
})
auc()

## random forest mode: row bagging every iteration + per-tree column sampling, no shrinkage
system.time({
  md <- lgb.train(data = dlgb_train, 
            objective = "binary", 
            nrounds = 100, max_depth = 10, num_leaves = 2**17, 
            boosting_type = "rf", bagging_freq = 1, bagging_fraction = 0.632, feature_fraction = 1/sqrt(p),
            categorical_feature = cols_cats,
            verbose = 2)
})
auc()
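
For reference, how the RF-mode settings above map onto classic random forest conventions (my mapping, not something LightGBM enforces): boosting_type = "rf" with bagging_freq = 1 resamples the rows on every iteration, bagging_fraction = 0.632 ≈ 1 - 1/e approximates the share of distinct rows in a bootstrap sample (LightGBM bags without replacement), and feature_fraction = 1/sqrt(p) mimics mtry = sqrt(p) column sampling. Quick arithmetic for this dataset (p = 8 predictors):

## sanity check of the RF-mode settings (8 predictors in this dataset)
1/sqrt(8)       # feature_fraction ~ 0.354, i.e. mtry = sqrt(p) expressed as a fraction of p
1 - exp(-1)     # ~ 0.632, expected share of distinct rows in a bootstrap sample
2^17            # num_leaves cap (131072), set high so max_depth is the effective limit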

Results:

GBM:

> system.time({
+   md <- lgb.train(data = dlgb_train,
+             objective = "binary",
+             nrounds = 100, num_leaves = 512, learning_rate = 0.1,
+             categorical_feature = cols_cats,
+             verbose = 2)
+ })
[LightGBM] [Info] Number of positive: 192982, number of negative: 807018
[LightGBM] [Debug] Dataset::GetMultiBinFromAllFeatures: sparse rate 0.000818
[LightGBM] [Debug] init for col-wise cost 0.000006 seconds, init for row-wise cost 0.004295 seconds
[LightGBM] [Debug] col-wise cost 0.006771 seconds, row-wise cost 0.000792 seconds
[LightGBM] [Warning] Auto-choosing row-wise multi-threading, the overhead of testing was 0.007568 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Debug] Using Dense Multi-Val Bin
[LightGBM] [Info] Total Bins 1095
[LightGBM] [Info] Number of data points in the train set: 1000000, number of used features: 8
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.192982 -> initscore=-1.430749
[LightGBM] [Info] Start training from score -1.430749
[LightGBM] [Debug] Trained a tree with leaves = 512 and max_depth = 16
[LightGBM] [Debug] Trained a tree with leaves = 512 and max_depth = 16
[LightGBM] [Debug] Trained a tree with leaves = 512 and max_depth = 17
[LightGBM] [Debug] Trained a tree with leaves = 512 and max_depth = 15
[LightGBM] [Debug] Trained a tree with leaves = 512 and max_depth = 16
...
[LightGBM] [Debug] Trained a tree with leaves = 512 and max_depth = 23
[LightGBM] [Debug] Trained a tree with leaves = 512 and max_depth = 25
[LightGBM] [Debug] Trained a tree with leaves = 512 and max_depth = 22
[LightGBM] [Debug] Trained a tree with leaves = 512 and max_depth = 21
[LightGBM] [Debug] Trained a tree with leaves = 512 and max_depth = 21
[LightGBM] [Debug] Trained a tree with leaves = 512 and max_depth = 24
[LightGBM] [Debug] Trained a tree with leaves = 512 and max_depth = 21
   user  system elapsed
 57.506   0.191   3.258
> auc()
0.7650181
> system.time({
+   md <- lgb.train(data = dlgb_train,
+             objective = "binary",
+             nrounds = 100, max_depth = 10, num_leaves = 2**17, learning_rate = 0.1,
+             categorical_feature = cols_cats,
+             verbose = 2)
+ })
[LightGBM] [Info] Number of positive: 192982, number of negative: 807018
[LightGBM] [Debug] Dataset::GetMultiBinFromAllFeatures: sparse rate 0.000818
[LightGBM] [Debug] init for col-wise cost 0.000008 seconds, init for row-wise cost 0.004492 seconds
[LightGBM] [Debug] col-wise cost 0.007450 seconds, row-wise cost 0.000537 seconds
[LightGBM] [Warning] Auto-choosing row-wise multi-threading, the overhead of testing was 0.007995 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Debug] Using Dense Multi-Val Bin
[LightGBM] [Info] Total Bins 1095
[LightGBM] [Info] Number of data points in the train set: 1000000, number of used features: 8
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.192982 -> initscore=-1.430749
[LightGBM] [Info] Start training from score -1.430749
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Debug] Trained a tree with leaves = 876 and max_depth = 10
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Debug] Trained a tree with leaves = 895 and max_depth = 10
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Debug] Trained a tree with leaves = 910 and max_depth = 10
...
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Debug] Trained a tree with leaves = 650 and max_depth = 10
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Debug] Trained a tree with leaves = 745 and max_depth = 10
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Debug] Trained a tree with leaves = 672 and max_depth = 10
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Debug] Trained a tree with leaves = 759 and max_depth = 10
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Debug] Trained a tree with leaves = 649 and max_depth = 10
   user  system elapsed
 53.896   0.227   3.058
> auc()
0.7614953

RF:

> system.time({
+   md <- lgb.train(data = dlgb_train,
+             objective = "binary",
+             nrounds = 100, max_depth = 10, num_leaves = 2**17,
+             boosting_type = "rf", bagging_freq = 1, bagging_fraction = 0.632, feature_fraction = 1/sqrt(p),
+             categorical_feature = cols_cats,
+             verbose = 2)
+ })
[LightGBM] [Info] Number of positive: 192982, number of negative: 807018
[LightGBM] [Debug] Dataset::GetMultiBinFromAllFeatures: sparse rate 0.000818
[LightGBM] [Debug] init for col-wise cost 0.000008 seconds, init for row-wise cost 0.004629 seconds
[LightGBM] [Debug] col-wise cost 0.001792 seconds, row-wise cost 0.000410 seconds
[LightGBM] [Warning] Auto-choosing row-wise multi-threading, the overhead of testing was 0.002210 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Debug] Using Dense Multi-Val Bin
[LightGBM] [Info] Total Bins 1095
[LightGBM] [Info] Number of data points in the train set: 1000000, number of used features: 8
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.192982 -> initscore=-1.430749
[LightGBM] [Info] Start training from score -1.430749
[LightGBM] [Debug] Re-bagging, using 632548 data to train
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Debug] Trained a tree with leaves = 405 and max_depth = 10
[LightGBM] [Debug] Re-bagging, using 631955 data to train
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Debug] Trained a tree with leaves = 331 and max_depth = 10
...
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Debug] Trained a tree with leaves = 677 and max_depth = 10
[LightGBM] [Debug] Re-bagging, using 632031 data to train
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Debug] Trained a tree with leaves = 736 and max_depth = 10
[LightGBM] [Debug] Re-bagging, using 631444 data to train
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Debug] Trained a tree with leaves = 377 and max_depth = 10
   user  system elapsed
 42.253   0.191   2.364
> auc()
0.7314994
> system.time({
+   md <- lgb.train(data = dlgb_train,
+             objective = "binary",
+             nrounds = 100, max_depth = 15, num_leaves = 2**17,
+             boosting_type = "rf", bagging_freq = 1, bagging_fraction = 0.632, feature_fraction = 1/sqrt(p),
+             categorical_feature = cols_cats,
+             verbose = 2)
+ })
[LightGBM] [Info] Number of positive: 192982, number of negative: 807018
[LightGBM] [Debug] Dataset::GetMultiBinFromAllFeatures: sparse rate 0.000818
[LightGBM] [Debug] init for col-wise cost 0.000005 seconds, init for row-wise cost 0.004689 seconds
[LightGBM] [Debug] col-wise cost 0.001777 seconds, row-wise cost 0.000313 seconds
[LightGBM] [Warning] Auto-choosing row-wise multi-threading, the overhead of testing was 0.002095 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Debug] Using Dense Multi-Val Bin
[LightGBM] [Info] Total Bins 1095
[LightGBM] [Info] Number of data points in the train set: 1000000, number of used features: 8
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.192982 -> initscore=-1.430749
[LightGBM] [Info] Start training from score -1.430749
[LightGBM] [Debug] Re-bagging, using 632548 data to train
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Debug] Trained a tree with leaves = 933 and max_depth = 15
[LightGBM] [Debug] Re-bagging, using 631955 data to train
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Debug] Trained a tree with leaves = 635 and max_depth = 15
[LightGBM] [Debug] Re-bagging, using 632394 data to train
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Debug] Trained a tree with leaves = 4024 and max_depth = 15
[LightGBM] [Debug] Re-bagging, using 631446 data to train
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Debug] Trained a tree with leaves = 767 and max_depth = 15
[LightGBM] [Debug] Re-bagging, using 631800 data to train
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Debug] Trained a tree with leaves = 3809 and max_depth = 15
...
[LightGBM] [Debug] Re-bagging, using 632325 data to train
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Debug] Trained a tree with leaves = 3902 and max_depth = 15
[LightGBM] [Debug] Re-bagging, using 632031 data to train
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Debug] Trained a tree with leaves = 4475 and max_depth = 15
[LightGBM] [Debug] Re-bagging, using 631444 data to train
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Debug] Trained a tree with leaves = 754 and max_depth = 15
   user  system elapsed
217.521   2.950  12.288
> auc()
0.7392125
> system.time({
+   md <- lgb.train(data = dlgb_train,
+             objective = "binary",
+             nrounds = 100, max_depth = 20, num_leaves = 2**17,
+             boosting_type = "rf", bagging_freq = 1, bagging_fraction = 0.632, feature_fraction = 1/sqrt(p),
+             categorical_feature = cols_cats,
+             verbose = 2)
+ })
[LightGBM] [Info] Number of positive: 192982, number of negative: 807018
[LightGBM] [Debug] Dataset::GetMultiBinFromAllFeatures: sparse rate 0.000818
[LightGBM] [Debug] init for col-wise cost 0.000006 seconds, init for row-wise cost 0.004546 seconds
[LightGBM] [Debug] col-wise cost 0.001789 seconds, row-wise cost 0.000315 seconds
[LightGBM] [Warning] Auto-choosing row-wise multi-threading, the overhead of testing was 0.002110 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Debug] Using Dense Multi-Val Bin
[LightGBM] [Info] Total Bins 1095
[LightGBM] [Info] Number of data points in the train set: 1000000, number of used features: 8
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.192982 -> initscore=-1.430749
[LightGBM] [Info] Start training from score -1.430749
[LightGBM] [Debug] Re-bagging, using 632548 data to train
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Debug] Trained a tree with leaves = 960 and max_depth = 17
[LightGBM] [Debug] Re-bagging, using 631955 data to train
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Debug] Trained a tree with leaves = 675 and max_depth = 18
[LightGBM] [Debug] Re-bagging, using 632394 data to train
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Debug] Trained a tree with leaves = 5528 and max_depth = 20
[LightGBM] [Debug] Re-bagging, using 631446 data to train
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Debug] Trained a tree with leaves = 842 and max_depth = 19
...
[LightGBM] [Debug] Re-bagging, using 632325 data to train
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Debug] Trained a tree with leaves = 7083 and max_depth = 20
[LightGBM] [Debug] Re-bagging, using 632031 data to train
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Debug] Trained a tree with leaves = 7377 and max_depth = 20
[LightGBM] [Debug] Re-bagging, using 631444 data to train
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Debug] Trained a tree with leaves = 763 and max_depth = 17
   user  system elapsed
484.497   9.613  27.724
> auc()
0.7415699

GBM deep:

> system.time({
+   md <- lgb.train(data = dlgb_train,
+             objective = "binary",
+             nrounds = 100, max_depth = 20, num_leaves = 2**17, learning_rate = 0.1,
+             categorical_feature = cols_cats,
+             verbose = 0)
+ })
[LightGBM] [Warning] Auto-choosing row-wise multi-threading, the overhead of testing was 0.007312 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
...
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
    user   system  elapsed
1846.680   16.217  103.627
> auc()
0.7704145
szilard commented 3 years ago

xgboost:

suppressMessages({
library(data.table)
library(ROCR)
library(xgboost)
library(Matrix)
})

set.seed(123)

d_train <- fread("train-1m.csv", showProgress=FALSE)
d_test <- fread("test.csv", showProgress=FALSE)

X_train_test <- sparse.model.matrix(dep_delayed_15min ~ .-1, data = rbind(d_train, d_test))
n1 <- nrow(d_train)
n2 <- nrow(d_test)
X_train <- X_train_test[1:n1,]
X_test <- X_train_test[(n1+1):(n1+n2),]

dxgb_train <- xgb.DMatrix(data = X_train, label = ifelse(d_train$dep_delayed_15min=='Y',1,0))

auc <- function() {
  phat <- predict(md, newdata = X_test)
  rocr_pred <- prediction(phat, d_test$dep_delayed_15min)
  cat(performance(rocr_pred, "auc")@y.values[[1]],"\n")
}

length(X_train@x)/nrow(X_train)

system.time({
  md <- xgb.train(data = dxgb_train, 
            objective = "binary:logistic", 
            nround = 100, max_depth = 10, eta = 0.1, 
            tree_method = "hist")
})
auc()

system.time({
  md <- xgb.train(data = dxgb_train, 
            objective = "binary:logistic", 
            nround = 1, num_parallel_tree = 100, max_depth = 10, 
            subsample = 0.632, colsample_bytree = 1/sqrt(length(X_train@x)/nrow(X_train)),
            tree_method = "hist")
})
auc()
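
One remark on the colsample_bytree expression above (my reasoning, not something from the xgboost docs): length(X_train@x)/nrow(X_train) is the average number of non-zeros per row of the sparse one-hot matrix, which is roughly the 8 original predictors (about one indicator per categorical plus the two numeric columns), so the expression works out to roughly 1/sqrt(8), i.e. the usual mtry = sqrt(p) rule. A quick check, assuming the objects built above:

## average non-zeros per row of the sparse design matrix (~ number of raw predictors)
nnz_per_row <- length(X_train@x)/nrow(X_train)
nnz_per_row
1/sqrt(nnz_per_row)   # the value passed to colsample_bytree, roughly 1/sqrt(8)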

GBM:

> system.time({
+   md <- xgb.train(data = dxgb_train,
+             objective = "binary:logistic",
+             nround = 100, max_depth = 10, eta = 0.1,
+             verbosity = 2,
+             tree_method = "hist")
+ })
[07:25:50] INFO: ../..//amalgamation/../src/gbm/gbtree.cc:177: Tree method is selected to be 'hist', which uses a single updater grow_quantile_histmaker.
[07:25:50] INFO: ../..//amalgamation/../src/gbm/gbtree.cc:177: Tree method is selected to be 'hist', which uses a single updater grow_quantile_histmaker.
[07:25:50] INFO: ../..//amalgamation/../src/tree/updater_prune.cc:101: tree pruning end, 1314 extra nodes, 0 pruned nodes, max_depth=10
[07:25:50] INFO: ../..//amalgamation/../src/gbm/gbtree.cc:177: Tree method is selected to be 'hist', which uses a single updater grow_quantile_histmaker.
[07:25:50] INFO: ../..//amalgamation/../src/tree/updater_prune.cc:101: tree pruning end, 1324 extra nodes, 0 pruned nodes, max_depth=10
[07:25:50] INFO: ../..//amalgamation/../src/gbm/gbtree.cc:177: Tree method is selected to be 'hist', which uses a single updater grow_quantile_histmaker.
...
[07:25:53] INFO: ../..//amalgamation/../src/gbm/gbtree.cc:177: Tree method is selected to be 'hist', which uses a single updater grow_quantile_histmaker.
[07:25:53] INFO: ../..//amalgamation/../src/tree/updater_prune.cc:101: tree pruning end, 550 extra nodes, 0 pruned nodes, max_depth=10
[07:25:53] INFO: ../..//amalgamation/../src/gbm/gbtree.cc:177: Tree method is selected to be 'hist', which uses a single updater grow_quantile_histmaker.
[07:25:53] INFO: ../..//amalgamation/../src/tree/updater_prune.cc:101: tree pruning end, 762 extra nodes, 0 pruned nodes, max_depth=10
   user  system elapsed
 63.519   0.056   3.833
> auc()
0.7478858

RF:

> system.time({
+   md <- xgb.train(data = dxgb_train,
+             objective = "binary:logistic",
+             nround = 1, num_parallel_tree = 100, max_depth = 10,
+             subsample = 0.632, colsample_bytree = 1/sqrt(length(X_train@x)/nrow(X_train)),
+             verbosity = 2,
+             tree_method = "hist")
+ })
[07:28:02] INFO: ../..//amalgamation/../src/gbm/gbtree.cc:177: Tree method is selected to be 'hist', which uses a single updater grow_quantile_histmaker.
[07:28:02] INFO: ../..//amalgamation/../src/gbm/gbtree.cc:177: Tree method is selected to be 'hist', which uses a single updater grow_quantile_histmaker.
[07:28:02] INFO: ../..//amalgamation/../src/tree/updater_prune.cc:101: tree pruning end, 572 extra nodes, 0 pruned nodes, max_depth=10
[07:28:02] INFO: ../..//amalgamation/../src/tree/updater_prune.cc:101: tree pruning end, 1188 extra nodes, 0 pruned nodes, max_depth=10
[07:28:03] INFO: ../..//amalgamation/../src/tree/updater_prune.cc:101: tree pruning end, 630 extra nodes, 0 pruned nodes, 
...
[07:28:08] INFO: ../..//amalgamation/../src/tree/updater_prune.cc:101: tree pruning end, 388 extra nodes, 0 pruned nodes, max_depth=10
[07:28:08] INFO: ../..//amalgamation/../src/tree/updater_prune.cc:101: tree pruning end, 482 extra nodes, 0 pruned nodes, max_depth=10
[07:28:08] INFO: ../..//amalgamation/../src/tree/updater_prune.cc:101: tree pruning end, 674 extra nodes, 0 pruned nodes, max_depth=10
[07:28:08] INFO: ../..//amalgamation/../src/tree/updater_prune.cc:101: tree pruning end, 766 extra nodes, 0 pruned nodes, max_depth=10
   user  system elapsed
 65.832   0.077   5.856
> auc()
0.730241
> system.time({
+   md <- xgb.train(data = dxgb_train,
+             objective = "binary:logistic",
+             nround = 1, num_parallel_tree = 100, max_depth = 15,
+             subsample = 0.632, colsample_bytree = 1/sqrt(length(X_train@x)/nrow(X_train)),
+             verbosity = 2,
+             tree_method = "hist")
+ })
[07:29:39] INFO: ../..//amalgamation/../src/gbm/gbtree.cc:177: Tree method is selected to be 'hist', which uses a single updater grow_quantile_histmaker.
[07:29:39] INFO: ../..//amalgamation/../src/gbm/gbtree.cc:177: Tree method is selected to be 'hist', which uses a single updater grow_quantile_histmaker.
[07:29:39] INFO: ../..//amalgamation/../src/tree/updater_prune.cc:101: tree pruning end, 2856 extra nodes, 0 pruned nodes, max_depth=15
[07:29:40] INFO: ../..//amalgamation/../src/tree/updater_prune.cc:101: tree pruning end, 1232 extra nodes, 0 pruned nodes, max_depth=15
...
[07:29:47] INFO: ../..//amalgamation/../src/tree/updater_prune.cc:101: tree pruning end, 3292 extra nodes, 0 pruned nodes, max_depth=15
[07:29:47] INFO: ../..//amalgamation/../src/tree/updater_prune.cc:101: tree pruning end, 1312 extra nodes, 0 pruned nodes, max_depth=15
[07:29:47] INFO: ../..//amalgamation/../src/tree/updater_prune.cc:101: tree pruning end, 2982 extra nodes, 0 pruned nodes, max_depth=15
[07:29:47] INFO: ../..//amalgamation/../src/tree/updater_prune.cc:101: tree pruning end, 6452 extra nodes, 0 pruned nodes, max_depth=15
   user  system elapsed
104.609   0.241   8.579
> auc()
0.7410314
> system.time({
+   md <- xgb.train(data = dxgb_train,
+             objective = "binary:logistic",
+             nround = 1, num_parallel_tree = 100, max_depth = 20,
+             subsample = 0.632, colsample_bytree = 1/sqrt(length(X_train@x)/nrow(X_train)),
+             verbosity = 2,
+             tree_method = "hist")
+ })
[07:30:24] INFO: ../..//amalgamation/../src/gbm/gbtree.cc:177: Tree method is selected to be 'hist', which uses a single updater grow_quantile_histmaker.
[07:30:24] INFO: ../..//amalgamation/../src/gbm/gbtree.cc:177: Tree method is selected to be 'hist', which uses a single updater grow_quantile_histmaker.
[07:30:24] INFO: ../..//amalgamation/../src/tree/updater_prune.cc:101: tree pruning end, 2326 extra nodes, 0 pruned nodes, max_depth=20
[07:30:24] INFO: ../..//amalgamation/../src/tree/updater_prune.cc:101: tree pruning end, 2184 extra nodes, 0 pruned nodes, max_depth=20
[07:30:25] INFO: ../..//amalgamation/../src/tree/updater_prune.cc:101: tree pruning end, 10138 extra nodes, 0 pruned nodes, max_depth=20
...
[07:30:35] INFO: ../..//amalgamation/../src/tree/updater_prune.cc:101: tree pruning end, 13074 extra nodes, 0 pruned nodes, max_depth=20
[07:30:35] INFO: ../..//amalgamation/../src/tree/updater_prune.cc:101: tree pruning end, 2148 extra nodes, 0 pruned nodes, max_depth=20
[07:30:35] INFO: ../..//amalgamation/../src/tree/updater_prune.cc:101: tree pruning end, 2186 extra nodes, 0 pruned nodes, max_depth=20
[07:30:35] INFO: ../..//amalgamation/../src/tree/updater_prune.cc:101: tree pruning end, 2664 extra nodes, 0 pruned nodes, max_depth=20
[07:30:35] INFO: ../..//amalgamation/../src/tree/updater_prune.cc:101: tree pruning end, 3496 extra nodes, 0 pruned nodes, max_depth=20
   user  system elapsed
156.004   0.675  12.655
> auc()
0.7482527
szilard commented 3 years ago

xgboost with lambda=0 to better match lightgbm and build deeper trees (as per @laurae2)
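
For reference, the depth-10 run from the transcript below written as a plain script (verbosity flag omitted). lambda = 0 removes xgboost's default L2 penalty on leaf weights (lambda = 1) and matches lightgbm's default lambda_l2 = 0, so splits stay worthwhile deeper down the tree.

## same depth-10 RF call as in the transcript below, with the L2 penalty switched off
system.time({
  md <- xgb.train(data = dxgb_train,
            objective = "binary:logistic",
            nround = 1, num_parallel_tree = 100, max_depth = 10,
            subsample = 0.632, colsample_bytree = 1/sqrt(length(X_train@x)/nrow(X_train)),
            lambda = 0,
            tree_method = "hist")
})
auc()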

> system.time({
+   md <- xgb.train(data = dxgb_train,
+             objective = "binary:logistic",
+             nround = 1, num_parallel_tree = 100, max_depth = 10,
+             subsample = 0.632, colsample_bytree = 1/sqrt(length(X_train@x)/nrow(X_train)),
+             lambda = 0,
+             verbosity = 2,
+             tree_method = "hist")
+ })
[07:33:50] INFO: ../..//amalgamation/../src/gbm/gbtree.cc:177: Tree method is selected to be 'hist', which uses a single updater grow_quantile_histmaker.
[07:33:50] INFO: ../..//amalgamation/../src/gbm/gbtree.cc:177: Tree method is selected to be 'hist', which uses a single updater grow_quantile_histmaker.
[07:33:50] INFO: ../..//amalgamation/../src/tree/updater_prune.cc:101: tree pruning end, 478 extra nodes, 0 pruned nodes, max_depth=10
[07:33:50] INFO: ../..//amalgamation/../src/tree/updater_prune.cc:101: tree pruning end, 778 extra nodes, 0 pruned nodes, max_depth=10
[07:33:50] INFO: ../..//amalgamation/../src/tree/updater_prune.cc:101: tree pruning end, 828 extra nodes, 0 pruned nodes, max_depth=10
...
[07:33:56] INFO: ../..//amalgamation/../src/tree/updater_prune.cc:101: tree pruning end, 1086 extra nodes, 0 pruned nodes, max_depth=10
[07:33:56] INFO: ../..//amalgamation/../src/tree/updater_prune.cc:101: tree pruning end, 566 extra nodes, 0 pruned nodes, max_depth=10
[07:33:56] INFO: ../..//amalgamation/../src/tree/updater_prune.cc:101: tree pruning end, 474 extra nodes, 0 pruned nodes, max_depth=10
   user  system elapsed
 70.681   0.007   6.126
> auc()
0.7305753
> system.time({
+   md <- xgb.train(data = dxgb_train,
+             objective = "binary:logistic",
+             nround = 1, num_parallel_tree = 100, max_depth = 15,
+             subsample = 0.632, colsample_bytree = 1/sqrt(length(X_train@x)/nrow(X_train)),
+             lambda = 0,
+             verbosity = 2,
+             tree_method = "hist")
+ })
[07:35:11] INFO: ../..//amalgamation/../src/gbm/gbtree.cc:177: Tree method is selected to be 'hist', which uses a single updater grow_quantile_histmaker.
[07:35:11] INFO: ../..//amalgamation/../src/gbm/gbtree.cc:177: Tree method is selected to be 'hist', which uses a single updater grow_quantile_histmaker.
[07:35:11] INFO: ../..//amalgamation/../src/tree/updater_prune.cc:101: tree pruning end, 1824 extra nodes, 0 pruned nodes, max_depth=15
[07:35:12] INFO: ../..//amalgamation/../src/tree/updater_prune.cc:101: tree pruning end, 7730 extra nodes, 0 pruned nodes, max_depth=15
...
[07:35:20] INFO: ../..//amalgamation/../src/tree/updater_prune.cc:101: tree pruning end, 5070 extra nodes, 0 pruned nodes, max_depth=15
[07:35:20] INFO: ../..//amalgamation/../src/tree/updater_prune.cc:101: tree pruning end, 6534 extra nodes, 0 pruned nodes, max_depth=15
[07:35:20] INFO: ../..//amalgamation/../src/tree/updater_prune.cc:101: tree pruning end, 3456 extra nodes, 0 pruned nodes, max_depth=15
[07:35:20] INFO: ../..//amalgamation/../src/tree/updater_prune.cc:101: tree pruning end, 5598 extra nodes, 0 pruned nodes, max_depth=15
   user  system elapsed
126.601   0.104   9.989
> auc()
0.7406097
> system.time({
+   md <- xgb.train(data = dxgb_train,
+             objective = "binary:logistic",
+             nround = 1, num_parallel_tree = 100, max_depth = 20,
+             subsample = 0.632, colsample_bytree = 1/sqrt(length(X_train@x)/nrow(X_train)),
+             lambda = 0,
+             verbosity = 2,
+             tree_method = "hist")
+ })
[07:36:17] INFO: ../..//amalgamation/../src/gbm/gbtree.cc:177: Tree method is selected to be 'hist', which uses a single updater grow_quantile_histmaker.
[07:36:17] INFO: ../..//amalgamation/../src/gbm/gbtree.cc:177: Tree method is selected to be 'hist', which uses a single updater grow_quantile_histmaker.
[07:36:17] INFO: ../..//amalgamation/../src/tree/updater_prune.cc:101: tree pruning end, 15550 extra nodes, 0 pruned nodes, max_depth=20
[07:36:18] INFO: ../..//amalgamation/../src/tree/updater_prune.cc:101: tree pruning end, 4658 extra nodes, 0 pruned nodes, max_depth=20
[07:36:18] INFO: ../..//amalgamation/../src/tree/updater_prune.cc:101: tree pruning end, 3644 extra nodes, 0 pruned nodes, max_depth=20
...
[07:36:31] INFO: ../..//amalgamation/../src/tree/updater_prune.cc:101: tree pruning end, 5448 extra nodes, 0 pruned nodes, max_depth=20
[07:36:31] INFO: ../..//amalgamation/../src/tree/updater_prune.cc:101: tree pruning end, 5550 extra nodes, 0 pruned nodes, max_depth=20
[07:36:31] INFO: ../..//amalgamation/../src/tree/updater_prune.cc:101: tree pruning end, 9408 extra nodes, 0 pruned nodes, max_depth=20
[07:36:31] INFO: ../..//amalgamation/../src/tree/updater_prune.cc:101: tree pruning end, 13178 extra nodes, 0 pruned nodes, max_depth=20
[07:36:31] INFO: ../..//amalgamation/../src/tree/updater_prune.cc:101: tree pruning end, 4156 extra nodes, 0 pruned nodes, max_depth=20
   user  system elapsed
215.592   0.725  16.913
> auc()
0.7503208
szilard commented 3 years ago

Summary:

1M rows:

| Tool | Depth | Time [s] | AUC |
|------|-------|----------|--------|
| lightgbm | 10 | 2.3 | 0.7315 |
| lightgbm | 15 | 12.3 | 0.7392 |
| lightgbm | 20 | 27 | 0.7416 |
| xgboost | 10 | 5.8 | 0.7302 |
| xgboost | 15 | 8.6 | 0.7410 |
| xgboost | 20 | 12 | 0.7482 |
| xgboost lambda=0 | 10 | 6.1 | 0.7306 |
| xgboost lambda=0 | 15 | 10 | 0.7406 |
| xgboost lambda=0 | 20 | 17 | 0.7503 |
szilard commented 3 years ago

h2o:

library(h2o)

h2o.init()

dx_train <- h2o.importFile("train-1m.csv")
dx_test <- h2o.importFile("test.csv")

Xnames <- names(dx_train)[which(names(dx_train)!="dep_delayed_15min")]

system.time({
  md <- h2o.randomForest(x = Xnames, y = "dep_delayed_15min", training_frame = dx_train, 
          ntrees = 100, max_depth = 10, 
          nbins = 100)
})
cat(h2o.auc(h2o.performance(md, dx_test)),"\n")
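
The results below also include max_depth = 15 and 20. A minimal sketch of that sweep (same call as above, just looping over the depth):

## depth sweep behind the results below; ntrees and nbins as in the single call above
for (d in c(10, 15, 20)) {
  print(system.time({
    md <- h2o.randomForest(x = Xnames, y = "dep_delayed_15min", training_frame = dx_train,
            ntrees = 100, max_depth = d,
            nbins = 100)
  }))
  cat("max_depth =", d, " AUC =", h2o.auc(h2o.performance(md, dx_test)), "\n")
}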

Results:

> system.time({
+   md <- h2o.randomForest(x = Xnames, y = "dep_delayed_15min", training_frame = dx_train,
+           ntrees = 100, max_depth = 10,
+           nbins = 100)
+ })
  |======================================================================| 100%
   user  system elapsed
  0.168   0.004   9.215
> cat(h2o.auc(h2o.performance(md, dx_test)),"\n")
0.7372074

> system.time({
+   md <- h2o.randomForest(x = Xnames, y = "dep_delayed_15min", training_frame = dx_train,
+           ntrees = 100, max_depth = 15,
+           nbins = 100)
+ })
  |======================================================================| 100%
   user  system elapsed
  0.279   0.007  33.379
> cat(h2o.auc(h2o.performance(md, dx_test)),"\n")
0.7499753

> system.time({
+   md <- h2o.randomForest(x = Xnames, y = "dep_delayed_15min", training_frame = dx_train,
+           ntrees = 100, max_depth = 20,
+           nbins = 100)
+ })
  |======================================================================| 100%
   user  system elapsed
  0.648   0.048 110.038
> cat(h2o.auc(h2o.performance(md, dx_test)),"\n")
0.7543568
szilard commented 3 years ago

Rborist:

library(data.table)
library(ROCR)
library(Matrix)
library(Rborist)

set.seed(123)

d_train <- fread("train-1m.csv")
d_test <- fread("test.csv")

X_train_test <- sparse.model.matrix(dep_delayed_15min ~ .-1, data = rbind(d_train, d_test))
X_train <- X_train_test[1:nrow(d_train),]
X_test <- X_train_test[(nrow(d_train)+1):(nrow(d_train)+nrow(d_test)),]

auc <- function() {
  phat <- predict(md, newdata = X_test, ctgCensus="prob")$prob[,"Y"]
  rocr_pred <- prediction(phat, d_test$dep_delayed_15min == "Y")
  performance(rocr_pred, "auc")@y.values[[1]]
}

system.time({
    md <- Rborist(X_train, as.factor(d_train$dep_delayed_15min), nLevel=10, nTree=100, predProb = 1/sqrt(length(X_train@x)/nrow(X_train)), thinLeaves=TRUE)
})
auc()
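
In Rborist the depth cap is nLevel and predProb is the per-split probability of trying a predictor; deriving it from the average non-zeros per row puts it near 1/sqrt(p), analogous to mtry = sqrt(p). The results below sweep nLevel over 10/15/20; a minimal sketch of that sweep:

## depth sweep behind the results below; predProb as in the single call above
pprob <- 1/sqrt(length(X_train@x)/nrow(X_train))
for (nl in c(10, 15, 20)) {
  print(system.time({
    md <- Rborist(X_train, as.factor(d_train$dep_delayed_15min),
                  nLevel = nl, nTree = 100, predProb = pprob, thinLeaves = TRUE)
  }))
  cat("nLevel =", nl, " AUC =", auc(), "\n")
}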

Results:

> system.time({
+     md <- Rborist(X_train, as.factor(d_train$dep_delayed_15min), nLevel=10, nTree=100, predProb = 1/sqrt(length(X_train@x)/nrow(X_train)), thinLeaves=TRUE)
+ })
   user  system elapsed
240.358   9.202  25.243
> auc()
[1] 0.7198579
> system.time({
+     md <- Rborist(X_train, as.factor(d_train$dep_delayed_15min), nLevel=15, nTree=100, predProb = 1/sqrt(length(X_train@x)/nrow(X_train)), thinLeaves=TRUE)
+ })
   user  system elapsed
417.049   8.086  35.107
> auc()
[1] 0.7309561
> system.time({
+     md <- Rborist(X_train, as.factor(d_train$dep_delayed_15min), nLevel=20, nTree=100, predProb = 1/sqrt(length(X_train@x)/nrow(X_train)), thinLeaves=TRUE)
+ })
   user  system elapsed
716.884   7.195  62.628
> auc()
[1] 0.7433575
szilard commented 3 years ago

ranger:

library(data.table)
library(ranger)
library(ROCR)

d_train <- fread("train-1m.csv")
d_test <- fread("test.csv")

d_train$dep_delayed_15min <- as.factor(d_train$dep_delayed_15min)
d_test$dep_delayed_15min  <- as.factor(d_test$dep_delayed_15min)

auc <- function() {
  phat <- predictions(predict(md, data = d_test))[,"Y"]
  rocr_pred <- prediction(phat, d_test$dep_delayed_15min)
  performance(rocr_pred, "auc")@y.values[[1]]
}

system.time({
  md <- ranger(dep_delayed_15min ~ ., d_train, 
          num.trees = 100, max.depth = 10, probability = TRUE, write.forest = TRUE)
})
auc()
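
ranger defaults to mtry = floor(sqrt(p)) for classification, so no explicit column-sampling parameter is set. The results below also cover max.depth = 15 and 20; a minimal sketch of the sweep:

## depth sweep behind the results below; default mtry = floor(sqrt(p))
for (d in c(10, 15, 20)) {
  print(system.time({
    md <- ranger(dep_delayed_15min ~ ., d_train,
            num.trees = 100, max.depth = d, probability = TRUE, write.forest = TRUE)
  }))
  cat("max.depth =", d, " AUC =", auc(), "\n")
}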

Results:

> system.time({
+   md <- ranger(dep_delayed_15min ~ ., d_train,
+           num.trees = 100, max.depth = 10, probability = TRUE, write.forest = TRUE)
+ })
   user  system elapsed
143.398   0.024  10.850
> auc()
[1] 0.7116554
>
>
> system.time({
+   md <- ranger(dep_delayed_15min ~ ., d_train,
+           num.trees = 100, max.depth = 15, probability = TRUE, write.forest = TRUE)
+ })
   user  system elapsed
216.044   0.080  16.971
> auc()
[1] 0.7191445
>
> system.time({
+   md <- ranger(dep_delayed_15min ~ ., d_train,
+           num.trees = 100, max.depth = 20, probability = TRUE, write.forest = TRUE)
+ })
   user  system elapsed
295.522   0.516  24.133
> auc()
[1] 0.72058
szilard commented 3 years ago

So far (1M rows, c5.9xlarge, 18 cores, HT off):

Time [sec]:

| Tool | depth=10 | depth=15 | depth=20 |
|------|----------|----------|----------|
| xgboost | 5.8 | 8.6 | 12 |
| xgboost lambda=0 | 6.1 | 10 | 17 |
| ranger | 11 | 17 | 24 |
| lightgbm | 2.3 | 12 | 27 |
| Rborist | 25 | 35 | 62 |
| h2o | 9.2 | 33 | 110 |
szilard commented 3 years ago

sklearn RF:


import pandas as pd
import numpy as np
from sklearn import preprocessing 
from scipy import sparse
from sklearn import metrics, ensemble

d_train = pd.read_csv("https://s3.amazonaws.com/benchm-ml--main/train-1m.csv")
d_test = pd.read_csv("https://s3.amazonaws.com/benchm-ml--main/test.csv")

d_all = pd.concat([d_train,d_test])

vars_cat = ["Month","DayofMonth","DayOfWeek","UniqueCarrier", "Origin", "Dest"]
vars_num = ["DepTime","Distance"]
for col in vars_cat:
  d_all[col] = preprocessing.LabelEncoder().fit_transform(d_all[col])

X_all_cat = preprocessing.OneHotEncoder(categories="auto").fit_transform(d_all[vars_cat])   
X_all = sparse.hstack((X_all_cat, d_all[vars_num])).tocsr()                               
y_all = np.where(d_all["dep_delayed_15min"]=="Y",1,0)         

X_train = X_all[0:d_train.shape[0],]
y_train = y_all[0:d_train.shape[0]]
X_test = X_all[d_train.shape[0]:(d_train.shape[0]+d_test.shape[0]),]
y_test = y_all[d_train.shape[0]:(d_train.shape[0]+d_test.shape[0])]

md = ensemble.RandomForestClassifier(max_depth = 10, n_estimators = 100, n_jobs = -1)
%time md.fit(X_train, y_train)

y_pred = md.predict_proba(X_test)[:,1]

print(metrics.roc_auc_score(y_test, y_pred))

Results:

md = ensemble.RandomForestClassifier(max_depth = 10, n_estimators = 100, n_jobs = -1)
Wall time: 9.79 s
0.703149121562214

md = ensemble.RandomForestClassifier(max_depth = 15, n_estimators = 100, n_jobs = -1)
Wall time: 20.6 s
0.7085553315997604

md = ensemble.RandomForestClassifier(max_depth = 20, n_estimators = 100, n_jobs = -1)
Wall time: 41.8 s
0.7144237796242365
szilard commented 3 years ago

So far (1M rows, c5.9xlarge, 18 cores, HT off):

Time [sec]:

| Tool | depth=10 | depth=15 | depth=20 |
|------|----------|----------|----------|
| xgboost | 5.8 | 8.6 | 12 |
| xgboost lambda=0 | 6.1 | 10 | 17 |
| ranger | 11 | 17 | 24 |
| lightgbm | 2.3 | 12 | 27 |
| sklearn | 10 | 21 | 42 |
| Rborist | 25 | 35 | 62 |
| h2o | 9.2 | 33 | 110 |
RAMitchell commented 3 years ago

You might be interested in cuml: https://medium.com/rapids-ai/accelerating-random-forests-up-to-45x-using-cuml-dfb782a31bea

szilard commented 3 years ago

Rforestry (via @laurae2):

library(Rforestry)
library(data.table)
library(ROCR)

d_train <- fread("https://s3.amazonaws.com/benchm-ml--main/train-1m.csv", stringsAsFactors=TRUE)
d_test_char <- fread("https://s3.amazonaws.com/benchm-ml--main/test.csv")
p <- 8

d_all <- rbind(d_train, d_test_char)
d_test <- d_all[(nrow(d_train)+1):(nrow(d_train)+nrow(d_test_char))]

system.time({
md <- forestry(x = d_train[,1:p], y = d_train$dep_delayed_15min, ntree = 100, maxDepth = 10)
})

phat <- predict(md, d_test[,1:p]) 
rocr_pred <- prediction(phat, d_test$dep_delayed_15min)
cat(performance(rocr_pred, "auc")@y.values[[1]],"\n")

Run (1M rows, depth=10, c5.9xlarge, 18 cores, HT off):

   user  system elapsed
654.304   8.341  38.285
>

> cat(performance(rocr_pred, "auc")@y.values[[1]],"\n")
0.719672