mlr-org / mlr3

mlr3: Machine Learning in R - next generation
https://mlr3.mlr-org.com
GNU Lesser General Public License v3.0
929 stars 86 forks source link

Compare predictive performance of individual models and classification average in trained graph #784

Closed MislavSag closed 2 weeks ago

MislavSag commented 2 years ago

My goal is to compare predictive performance of individual models (say ranger, kknn and log_reg) with classif.average of all models.

I have created following graph with individual learners and classif average at the end:

# Reprex: build one GraphLearner that averages the probability predictions of
# three individual learners, then tune the whole graph with auto_tuner.
library(mlr3verse)
task = tsk("german_credit")

# learners — all with predict_type = "prob" so po("classifavg") can average
# their class probabilities
learners = list(
  ranger = lrn("classif.ranger", predict_type = "prob", id = "ranger"),
  log_reg = lrn("classif.log_reg", predict_type = "prob", id = "log_reg"),
  kknn = lrn("classif.kknn", predict_type = "prob", id = "kknn")
)
# create complete graph: remove near-constant features, branch into one of
# four preprocessing paths (no-op / Yeo-Johnson / PCA / ICA), unbranch, run
# the three learners in parallel, and average their predictions
graph = po("removeconstants", ratio = 0.05) %>>%
  po("branch", options = c("nop_prep", "yeojohnson", "pca", "ica"), id = "prep_branch") %>>%
  gunion(list(po("nop", id = "nop_prep"), po("yeojohnson"), po("pca", scale. = TRUE), po("ica"))) %>>%
  po("unbranch", id = "prep_unbranch") %>>%
  learners %>>% 
  po("classifavg", innum = length(learners))

plot(graph)
graph_learner = as_learner(graph)
# inspect the flattened parameter set of the whole graph to find tunable ids
as.data.table(graph_learner$param_set)[1:70, .(id, class, lower, upper)]
search_space = ps(
  # preprocessing — branch choice plus branch-specific params, activated via
  # `depends` only when the matching branch is selected
  # interaction_branch.selection = p_fct(levels = c("nop_filter", "modelmatrix")),
  prep_branch.selection = p_fct(levels = c("nop_prep", "yeojohnson", "pca", "ica")),
  pca.rank. = p_int(2, 6, depends = prep_branch.selection == "pca"),
  ica.n.comp = p_int(2, 6, depends = prep_branch.selection == "ica"),
  yeojohnson.standardize = p_lgl(depends = prep_branch.selection == "yeojohnson"),
  # models
  ranger.ranger.mtry.ratio = p_dbl(0.2, 1),
  ranger.ranger.max.depth = p_int(2, 6),
  kknn.kknn.k = p_int(5, 20)
)
# plan("multisession", workers = 4L)
at = auto_tuner(
  method = "random_search",
  learner = graph_learner,
  resampling = rsmp("cv", folds = 3),
  measure = msr("classif.acc"),
  search_space = search_space,
  term_evals = 15
)
at$train(task)

How can I now compare the predictive performance of the average with the predictive performance of the individual models?

I know I can train those models separately (inside the graph or in a new learner list), but then I am training the same model multiple times.

sebffischer commented 2 years ago

Sorry for the late response, does the following help?

# Reprex answer: after tuning, set keep_results = TRUE on the trained graph so
# each PipeOp stores its output; the individual learners' predictions can then
# be read from pipeop$.result and compared against the classifavg output.
library(mlr3verse)
#> Loading required package: mlr3
task = tsk("german_credit")

# learners — predict_type = "prob" so classifavg can average probabilities
learners = list(
  ranger = lrn("classif.ranger", predict_type = "prob", id = "ranger"),
  log_reg = lrn("classif.log_reg", predict_type = "prob", id = "log_reg"),
  kknn = lrn("classif.kknn", predict_type = "prob", id = "kknn")
)
# create complete graph (same as in the question)
graph = po("removeconstants", ratio = 0.05) %>>%
  po("branch", options = c("nop_prep", "yeojohnson", "pca", "ica"), id = "prep_branch") %>>%
  gunion(list(po("nop", id = "nop_prep"), po("yeojohnson"), po("pca", scale. = TRUE), po("ica"))) %>>%
  po("unbranch", id = "prep_unbranch") %>>%
  learners %>>%
  po("classifavg", innum = length(learners))

plot(graph)

graph_learner = as_learner(graph)
as.data.table(graph_learner$param_set)[1:70, .(id, class, lower, upper)]
#>                                             id    class lower upper
#>  1:                      removeconstants.ratio ParamDbl     0     1
#>  2:                    removeconstants.rel_tol ParamDbl     0   Inf
#>  3:                    removeconstants.abs_tol ParamDbl     0   Inf
#>  4:                  removeconstants.na_ignore ParamLgl    NA    NA
#>  5:             removeconstants.affect_columns ParamUty    NA    NA
#>  6:                      prep_branch.selection ParamFct    NA    NA
#>  7:                             yeojohnson.eps ParamDbl     0   Inf
#>  8:                     yeojohnson.standardize ParamLgl    NA    NA
#>  9:                           yeojohnson.lower ParamDbl  -Inf   Inf
#> 10:                           yeojohnson.upper ParamDbl  -Inf   Inf
#> 11:                  yeojohnson.affect_columns ParamUty    NA    NA
#> 12:                                 pca.center ParamLgl    NA    NA
#> 13:                                 pca.scale. ParamLgl    NA    NA
#> 14:                                  pca.rank. ParamInt     1   Inf
#> 15:                         pca.affect_columns ParamUty    NA    NA
#> 16:                                 ica.n.comp ParamInt     1   Inf
#> 17:                                ica.alg.typ ParamFct    NA    NA
#> 18:                                    ica.fun ParamFct    NA    NA
#> 19:                                  ica.alpha ParamDbl     1     2
#> 20:                                 ica.method ParamFct    NA    NA
#> 21:                               ica.row.norm ParamLgl    NA    NA
#> 22:                                  ica.maxit ParamInt     1   Inf
#> 23:                                    ica.tol ParamDbl     0   Inf
#> 24:                                ica.verbose ParamLgl    NA    NA
#> 25:                                 ica.w.init ParamUty    NA    NA
#> 26:                         ica.affect_columns ParamUty    NA    NA
#> 27:                        ranger.ranger.alpha ParamDbl  -Inf   Inf
#> 28:       ranger.ranger.always.split.variables ParamUty    NA    NA
#> 29:                ranger.ranger.class.weights ParamUty    NA    NA
#> 30:                      ranger.ranger.holdout ParamLgl    NA    NA
#> 31:                   ranger.ranger.importance ParamFct    NA    NA
#> 32:                   ranger.ranger.keep.inbag ParamLgl    NA    NA
#> 33:                    ranger.ranger.max.depth ParamInt     0   Inf
#> 34:                ranger.ranger.min.node.size ParamInt     1   Inf
#> 35:                     ranger.ranger.min.prop ParamDbl  -Inf   Inf
#> 36:                      ranger.ranger.minprop ParamDbl  -Inf   Inf
#> 37:                         ranger.ranger.mtry ParamInt     1   Inf
#> 38:                   ranger.ranger.mtry.ratio ParamDbl     0     1
#> 39:            ranger.ranger.num.random.splits ParamInt     1   Inf
#> 40:                  ranger.ranger.num.threads ParamInt     1   Inf
#> 41:                    ranger.ranger.num.trees ParamInt     1   Inf
#> 42:                    ranger.ranger.oob.error ParamLgl    NA    NA
#> 43:        ranger.ranger.regularization.factor ParamUty    NA    NA
#> 44:      ranger.ranger.regularization.usedepth ParamLgl    NA    NA
#> 45:                      ranger.ranger.replace ParamLgl    NA    NA
#> 46:    ranger.ranger.respect.unordered.factors ParamFct    NA    NA
#> 47:              ranger.ranger.sample.fraction ParamDbl     0     1
#> 48:                  ranger.ranger.save.memory ParamLgl    NA    NA
#> 49: ranger.ranger.scale.permutation.importance ParamLgl    NA    NA
#> 50:                    ranger.ranger.se.method ParamFct    NA    NA
#> 51:                         ranger.ranger.seed ParamInt  -Inf   Inf
#> 52:         ranger.ranger.split.select.weights ParamUty    NA    NA
#> 53:                    ranger.ranger.splitrule ParamFct    NA    NA
#> 54:                      ranger.ranger.verbose ParamLgl    NA    NA
#> 55:                 ranger.ranger.write.forest ParamLgl    NA    NA
#> 56:                 log_reg.log_reg.dispersion ParamUty    NA    NA
#> 57:                    log_reg.log_reg.epsilon ParamDbl  -Inf   Inf
#> 58:                   log_reg.log_reg.etastart ParamUty    NA    NA
#> 59:                      log_reg.log_reg.maxit ParamDbl  -Inf   Inf
#> 60:                      log_reg.log_reg.model ParamLgl    NA    NA
#> 61:                    log_reg.log_reg.mustart ParamUty    NA    NA
#> 62:                     log_reg.log_reg.offset ParamUty    NA    NA
#> 63:                log_reg.log_reg.singular.ok ParamLgl    NA    NA
#> 64:                      log_reg.log_reg.start ParamUty    NA    NA
#> 65:                      log_reg.log_reg.trace ParamLgl    NA    NA
#> 66:                          log_reg.log_reg.x ParamLgl    NA    NA
#> 67:                          log_reg.log_reg.y ParamLgl    NA    NA
#> 68:                                kknn.kknn.k ParamInt     1   Inf
#> 69:                         kknn.kknn.distance ParamDbl     0   Inf
#> 70:                           kknn.kknn.kernel ParamFct    NA    NA
#>                                             id    class lower upper
search_space = ps(
  # preprocessing — branch-specific params are only active when their branch
  # is selected, via `depends`
  # interaction_branch.selection = p_fct(levels = c("nop_filter", "modelmatrix")),
  prep_branch.selection = p_fct(levels = c("nop_prep", "yeojohnson", "pca", "ica")),
  pca.rank. = p_int(2, 6, depends = prep_branch.selection == "pca"),
  ica.n.comp = p_int(2, 6, depends = prep_branch.selection == "ica"),
  yeojohnson.standardize = p_lgl(depends = prep_branch.selection == "yeojohnson"),
  # models
  ranger.ranger.mtry.ratio = p_dbl(0.2, 1),
  ranger.ranger.max.depth = p_int(2, 6),
  kknn.kknn.k = p_int(5, 20)
)
# plan("multisession", workers = 4L)
# term_evals lowered to 2 and store_models = TRUE so the fitted graph (and
# its per-PipeOp models) survives tuning for inspection below
at = auto_tuner(
  method = "random_search",
  learner = graph_learner,
  resampling = rsmp("cv", folds = 3),
  measure = msr("classif.acc"),
  search_space = search_space,
  term_evals = 2L,
  store_models = TRUE
)
at$train(task)
#> INFO  [12:03:59.591] [bbotk] Starting to optimize 7 parameter(s) with '<OptimizerRandomSearch>' and '<TerminatorEvals> [n_evals=2, k=0]' 
#> INFO  [12:03:59.894] [bbotk] Evaluating 1 configuration(s) 
#> INFO  [12:03:59.964] [mlr3] Running benchmark with 3 resampling iterations 
#> INFO  [12:04:00.010] [mlr3] Applying learner 'removeconstants.prep_branch.nop_prep.yeojohnson.pca.ica.prep_unbranch.ranger.ranger.log_reg.log_reg.kknn.kknn.classifavg' on task 'german_credit' (iter 3/3) 
#> INFO  [12:04:00.434] [mlr3] Applying learner 'removeconstants.prep_branch.nop_prep.yeojohnson.pca.ica.prep_unbranch.ranger.ranger.log_reg.log_reg.kknn.kknn.classifavg' on task 'german_credit' (iter 1/3) 
#> INFO  [12:04:00.834] [mlr3] Applying learner 'removeconstants.prep_branch.nop_prep.yeojohnson.pca.ica.prep_unbranch.ranger.ranger.log_reg.log_reg.kknn.kknn.classifavg' on task 'german_credit' (iter 2/3) 
#> INFO  [12:04:01.206] [mlr3] Finished benchmark 
#> INFO  [12:04:01.274] [bbotk] Result of batch 1: 
#> INFO  [12:04:01.276] [bbotk]  prep_branch.selection pca.rank. ica.n.comp yeojohnson.standardize 
#> INFO  [12:04:01.276] [bbotk]               nop_prep        NA         NA                     NA 
#> INFO  [12:04:01.276] [bbotk]  ranger.ranger.mtry.ratio ranger.ranger.max.depth kknn.kknn.k classif.acc 
#> INFO  [12:04:01.276] [bbotk]                 0.2590138                       6           5   0.7419725 
#> INFO  [12:04:01.276] [bbotk]  warnings errors runtime_learners                                uhash 
#> INFO  [12:04:01.276] [bbotk]         0      0            1.169 bdd13c00-6492-4705-9b48-cf97aab78e8d 
#> INFO  [12:04:01.286] [bbotk] Evaluating 1 configuration(s) 
#> INFO  [12:04:01.635] [mlr3] Running benchmark with 3 resampling iterations 
#> INFO  [12:04:01.640] [mlr3] Applying learner 'removeconstants.prep_branch.nop_prep.yeojohnson.pca.ica.prep_unbranch.ranger.ranger.log_reg.log_reg.kknn.kknn.classifavg' on task 'german_credit' (iter 3/3) 
#> INFO  [12:04:02.087] [mlr3] Applying learner 'removeconstants.prep_branch.nop_prep.yeojohnson.pca.ica.prep_unbranch.ranger.ranger.log_reg.log_reg.kknn.kknn.classifavg' on task 'german_credit' (iter 1/3) 
#> INFO  [12:04:02.539] [mlr3] Applying learner 'removeconstants.prep_branch.nop_prep.yeojohnson.pca.ica.prep_unbranch.ranger.ranger.log_reg.log_reg.kknn.kknn.classifavg' on task 'german_credit' (iter 2/3) 
#> INFO  [12:04:02.986] [mlr3] Finished benchmark 
#> INFO  [12:04:03.035] [bbotk] Result of batch 2: 
#> INFO  [12:04:03.036] [bbotk]  prep_branch.selection pca.rank. ica.n.comp yeojohnson.standardize 
#> INFO  [12:04:03.036] [bbotk]             yeojohnson        NA         NA                  FALSE 
#> INFO  [12:04:03.036] [bbotk]  ranger.ranger.mtry.ratio ranger.ranger.max.depth kknn.kknn.k classif.acc 
#> INFO  [12:04:03.036] [bbotk]                 0.3456153                       5          12   0.7509575 
#> INFO  [12:04:03.036] [bbotk]  warnings errors runtime_learners                                uhash 
#> INFO  [12:04:03.036] [bbotk]         0      0            1.322 079a7fb2-f87a-4623-ba62-b5d10a296696 
#> INFO  [12:04:03.060] [bbotk] Finished optimizing after 2 evaluation(s) 
#> INFO  [12:04:03.060] [bbotk] Result: 
#> INFO  [12:04:03.061] [bbotk]  prep_branch.selection pca.rank. ica.n.comp yeojohnson.standardize 
#> INFO  [12:04:03.061] [bbotk]             yeojohnson        NA         NA                  FALSE 
#> INFO  [12:04:03.061] [bbotk]  ranger.ranger.mtry.ratio ranger.ranger.max.depth kknn.kknn.k 
#> INFO  [12:04:03.061] [bbotk]                 0.3456153                       5          12 
#> INFO  [12:04:03.061] [bbotk]  learner_param_vals  x_domain classif.acc 
#> INFO  [12:04:03.061] [bbotk]          <list[13]> <list[5]>   0.7509575
# key step: grab the trained graph and ask each PipeOp to keep its output,
# so the individual learners' predictions are retained on the next predict
graph = at$learner$graph_model
graph$keep_results = TRUE
# the final node's output is the averaged ensemble prediction
graph$predict(task)
#> $classifavg.output
#> <PredictionClassif> for 1000 observations:
#>     row_ids truth response prob.good   prob.bad
#>           1  good     good 0.9426428 0.05735724
#>           2   bad      bad 0.3555267 0.64447326
#>           3  good     good 0.9582165 0.04178351
#> ---                                            
#>         998  good     good 0.9369601 0.06303990
#>         999   bad      bad 0.3653883 0.63461169
#>        1000  good     good 0.7739578 0.22604221
# an individual model's prediction, read from its PipeOp's stored $.result —
# compare this (e.g. via $score()) against the classifavg output above
graph$pipeops$kknn.kknn$.result$output
#> <PredictionClassif> for 1000 observations:
#>     row_ids truth response prob.good   prob.bad
#>           1  good     good 1.0000000 0.00000000
#>           2   bad      bad 0.3840454 0.61595460
#>           3  good     good 0.9877564 0.01224364
#> ---                                            
#>         998  good     good 0.9687620 0.03123795
#>         999   bad      bad 0.4657351 0.53426487
#>        1000  good     good 0.9269728 0.07302720
res = graph$predict(task)

Created on 2022-04-03 by the reprex package (v2.0.1)