szilard / GBM-perf

Performance of various open source GBM implementations
MIT License
213 stars 28 forks source link

catboost ordered vs plain #16

Closed szilard closed 5 years ago

szilard commented 5 years ago

Screen Shot 2019-05-02 at 10 58 03 AM

Start R interactively in the Docker container, which has all dependencies installed:

docker run -it gbmperf_cpu taskset -c 0-15 R

# Benchmark script: train a CatBoost model on the training CSV, print the
# elapsed training time, then print the AUC on the test CSV (both on one line).
suppressMessages({
  library(data.table)
  library(ROCR)
  library(catboost)
})

set.seed(123)

# Train is read with strings as factors; test is read as plain character so
# that the rbind() below can line its categorical values up with the train
# factors (intent per the original author's note).
d_train <- fread(
  "train-0.1m.csv",
  showProgress = FALSE,
  stringsAsFactors = TRUE
)
d_test <- fread(
  "test.csv",
  showProgress = FALSE,
  stringsAsFactors = FALSE
)

# Stack train on top of test so both share one set of column types.
d_train_test <- rbind(d_train, d_test)

# Number of feature columns: everything except one column.
# NOTE(review): presumably the label is the last column -- verify.
p <- ncol(d_train_test) - 1

# CatBoost needs a numeric label: "Y" becomes 1, anything else becomes 0.
d_train_test$dep_delayed_15min <-
  as.numeric(d_train_test$dep_delayed_15min == "Y")

# Split the stacked table back into its train and test parts. The test row
# range is computed after d_train has been reassigned, while d_test still
# holds the table read from disk.
d_train <- d_train_test[1:nrow(d_train), ]
test_rows <- (nrow(d_train) + 1):(nrow(d_train) + nrow(d_test))
d_test <- d_train_test[test_rows, ]

# Build CatBoost pools from the first p columns; only train carries a label.
dx_train <- catboost.load_pool(
  d_train[, 1:p],
  label = d_train$dep_delayed_15min
)
dx_test <- catboost.load_pool(d_test[, 1:p])

params <- list(
  iterations = 100,
  depth = 10,
  learning_rate = 0.1,
  verbose = 0
)

# Time the training call; `md` is assigned in the calling environment.
train_time <- system.time({
  md <- catboost.train(learn_pool = dx_train, test_pool = NULL, params = params)
})
# Print the elapsed (wall-clock) seconds followed by a space, no newline.
cat(train_time[["elapsed"]], " ", sep = "")

# Score the test pool and report AUC against the numeric test labels.
phat <- catboost.predict(md, dx_test)
rocr_pred <- prediction(phat, d_test$dep_delayed_15min)
auc_perf <- performance(rocr_pred, "auc")
cat(auc_perf@y.values[[1]], "\n")

Run with defaults:

> params <- list(iterations = 100, depth = 10, learning_rate = 0.1,
+    verbose = 0)
> cat(system.time({
+   md <- catboost.train(learn_pool = dx_train, test_pool = NULL, params = params)
+ })[[3]]," ",sep="")
Dataset is provided, but PredictionValuesChange feature importance don't use it, since non-empty LeafWeights in model.
5.367 >
>
> phat <- catboost.predict(md, dx_test)
> rocr_pred <- prediction(phat, d_test$dep_delayed_15min)
> cat(performance(rocr_pred, "auc")@y.values[[1]],"\n")
0.7225903

Run "plain":

>
> params <- list(iterations = 100, depth = 10, learning_rate = 0.1,
+    boosting_type = "Plain",
+    verbose = 0)
> cat(system.time({
+   md <- catboost.train(learn_pool = dx_train, test_pool = NULL, params = params)
+ })[[3]]," ",sep="")
Dataset is provided, but PredictionValuesChange feature importance don't use it, since non-empty LeafWeights in model.
5.231 >
>
> phat <- catboost.predict(md, dx_test)
> rocr_pred <- prediction(phat, d_test$dep_delayed_15min)
> cat(performance(rocr_pred, "auc")@y.values[[1]],"\n")
0.7225903

Run "ordered":

> params <- list(iterations = 100, depth = 10, learning_rate = 0.1,
+    boosting_type = "Ordered",
+    verbose = 0)
> cat(system.time({
+   md <- catboost.train(learn_pool = dx_train, test_pool = NULL, params = params)
+ })[[3]]," ",sep="")
Dataset is provided, but PredictionValuesChange feature importance don't use it, since non-empty LeafWeights in model.
5.106 >
>
> phat <- catboost.predict(md, dx_test)
> rocr_pred <- prediction(phat, d_test$dep_delayed_15min)
> cat(performance(rocr_pred, "auc")@y.values[[1]],"\n")
0.7193254

For the 1M dataset (same three runs in order: defaults, "Plain", "Ordered"):

> params <- list(iterations = 100, depth = 10, learning_rate = 0.1,
+    verbose = 0)
> cat(system.time({
+   md <- catboost.train(learn_pool = dx_train, test_pool = NULL, params = params)
+ })[[3]]," ",sep="")
Dataset is provided, but PredictionValuesChange feature importance don't use it, since non-empty LeafWeights in model.
50.586 >
>
> phat <- catboost.predict(md, dx_test)
> rocr_pred <- prediction(phat, d_test$dep_delayed_15min)
> cat(performance(rocr_pred, "auc")@y.values[[1]],"\n")
0.7402029
>
>
>
> params <- list(iterations = 100, depth = 10, learning_rate = 0.1,
+    boosting_type = "Plain",
+    verbose = 0)
> cat(system.time({
+   md <- catboost.train(learn_pool = dx_train, test_pool = NULL, params = params)
+ })[[3]]," ",sep="")
Dataset is provided, but PredictionValuesChange feature importance don't use it, since non-empty LeafWeights in model.
50.202 >
>
> phat <- catboost.predict(md, dx_test)
> rocr_pred <- prediction(phat, d_test$dep_delayed_15min)
> cat(performance(rocr_pred, "auc")@y.values[[1]],"\n")
0.7402029
>
>
>
>
> params <- list(iterations = 100, depth = 10, learning_rate = 0.1,
+    boosting_type = "Ordered",
+    verbose = 0)
> cat(system.time({
+   md <- catboost.train(learn_pool = dx_train, test_pool = NULL, params = params)
+ })[[3]]," ",sep="")
Dataset is provided, but PredictionValuesChange feature importance don't use it, since non-empty LeafWeights in model.
40.7 >
>
> phat <- catboost.predict(md, dx_test)
> rocr_pred <- prediction(phat, d_test$dep_delayed_15min)
> cat(performance(rocr_pred, "auc")@y.values[[1]],"\n")
0.7335985