mlr-org / mlr3pipelines

Dataflow Programming for Machine Learning in R
https://mlr3pipelines.mlr-org.com/
GNU Lesser General Public License v3.0
137 stars 25 forks source link

Error in !convert_categorical : invalid argument type. #715

Closed melonki closed 1 year ago

melonki commented 1 year ago

Dear Contributors and Members,

Thank you for your great efforts in developing mlr3pipelines! It accelerated my work a lot. However, when I try to rerun the code which I wrote 4 months ago, it doesn't work. The error was "Error in !convert_categorical : invalid argument type This happened PipeOp lgb's $train()", but there was no categorical variable in my dataset at all. I guess the error was caused by the change in the R environment since I have updated several packages. But I don't know how to peek into the graph learner and find the exact cause. Below please find the reproducible example and the environment where I ran the code. PS: the lightgbm package works fine when not incorporated in an mlr3 workflow in my workstation.

I would appreciate it if you would provide any suggestions! Thank you in advance for your help!

Best, Melonki

library(mlr3)
library(mlr3verse)
library(mlr3learners)
library(mlr3extralearners)
library(mlr3misc)
library(paradox)
library(mlr3fselect)
library(mlr3tuning)
library(mlr3pipelines)
library(mlr3filters)
library(praznik)
library(mlbench)
library(tidyverse)

library(lightgbm)

# Reproduction script: build a scale -> LightGBM regression pipeline on the
# BostonHousing data and train it as a GraphLearner.

# Load the data and drop the `chas` column.
data("BostonHousing")
BostonHousing <- BostonHousing %>% 
  dplyr::select(-all_of(c("chas")))
# Regression task with `medv` as the target.
task_bh <- TaskRegr$new(id = "bh", backend = BostonHousing, target = "medv")
# <TaskRegr:bh> (506 x 13)
# * Target: medv
# * Properties: -
#   * Features (12):
#   - dbl (12): age, b, crim, dis, indus, lstat, nox, ptratio, rad, rm, tax, zn

# Two-step pipeline: feature scaling followed by the LightGBM regression
# learner. The learner gets the id "lgb", so its parameters are addressed
# with the `lgb.` prefix below.
graph =
  po("scale") %>>%
  lrn("regr.lightgbm",id = "lgb",predict_type = "response")

# graph$plot()
# List the parameter ids exposed by the combined graph.
graph$param_set$ids()

# Hyperparameter values for both pipeline steps (prefixed by PipeOp id).
ps = list(
  scale.robust = TRUE,
  lgb.learning_rate = 0.1,
  lgb.num_leaves = 63,
  lgb.max_depth = 6,
  lgb.min_data_in_leaf = 10,
  lgb.bagging_fraction = 0.7,
  lgb.bagging_freq = 4,
  lgb.feature_fraction = 0.8,
  lgb.lambda_l1 = 0.5,
  lgb.lambda_l2 = 0.5,
  lgb.max_bin = 63
)
# NOTE(review): this assignment replaces the whole set of values instead of
# updating it. Per the replies in this thread, values not listed in `ps`
# (e.g. `lgb.convert_categorical`) are dropped, which appears to be the cause
# of the error reported below. The suggested alternative is
# `graph$param_set$set_values(.values = ps)`.
graph$param_set$values = ps

# Wrap the graph so it can be trained like an ordinary learner.
glrn = GraphLearner$new(graph)
# <GraphLearner:scale.lgb>
#   * Model: -
#   * Parameters: lgb.learning_rate=0.1, lgb.num_leaves=63, lgb.max_depth=6, lgb.min_data_in_leaf=10, lgb.bagging_fraction=0.7,
# lgb.bagging_freq=4, lgb.feature_fraction=0.8, lgb.lambda_l1=0.5, lgb.lambda_l2=0.5, lgb.max_bin=63
# * Packages: mlr3, mlr3pipelines, mlr3extralearners, lightgbm
# * Predict Types:  [response], se, distr
# * Feature Types: logical, integer, numeric, character, factor, ordered, POSIXct
# * Properties: featureless, hotstart_backward, hotstart_forward, importance, loglik, missings, oob_error, selected_features, weights

# Training fails here with the error reported in this issue.
glrn$train(task_bh)
# Error in !convert_categorical : invalid argument type
# This happened PipeOp lgb's $train()

traceback()
# 34: stop(e)
# 33: h(simpleError(msg, call))
# 32: .handleSimpleError(function (e) 
# {
#   e$message = sprintf("%s\nThis happened PipeOp %s's $train()", 
#                       e$message, self$id)
#   stop(e)
# }, "invalid argument type", base::quote(!convert_categorical))
# 31: train_lightgbm(self, task, "regr", pars)
# 30: .__LearnerRegrLightGBM__.train(self = self, private = private, 
#                                    super = super, task = task)
# 29: get_private(learner)$.train(task)
# 28: .f(learner = <environment>, task = <environment>)
# 27: eval(expr, p)
# 26: eval(expr, p)
# 25: eval.parent(expr, n = 1L)
# 24: invoke(.f, .args = .args, .opts = .opts, .seed = .seed, .timeout = .timeout)
# 23: encapsulate(learner$encapsulate["train"], .f = train_wrapper, 
#                 .args = list(learner = learner, task = task), .pkgs = learner$packages, 
#                 .seed = NA_integer_, .timeout = learner$timeout["train"])
# 22: learner_train(learner, task, train_row_ids = row_ids, test_row_ids = test_row_ids, 
#                   mode = mode)
# 21: .__Learner__train(self = self, private = private, super = super, 
#                       task = task, row_ids = row_ids)
# 20: private$.learner$train(task)
# 19: .__PipeOpLearner__.train(self = self, private = private, super = super, 
#                              inputs = inputs)
# 18: private$.train(input)
# 17: withCallingHandlers({
#   output = private$.train(input)
# }, error = function(e) {
#   e$message = sprintf("%s\nThis happened PipeOp %s's $train()", 
#                       e$message, self$id)
#   stop(e)
# }, warning = function(w) {
#   w$message = sprintf("%s\nThis happened PipeOp %s's $train()", 
#                       w$message, self$id)
#   warning(w)
#   invokeRestart("muffleWarning")
# })
# 16: .__PipeOp__train(self = self, private = private, super = super, 
#                      input = input)
# 15: op[[fun]](input)
# 14: graph_reduce(self, input, "train", single_input)
# 13: .__Graph__train(self = self, private = private, super = super, 
#                     input = input, single_input = single_input)
# 12: self$graph$train(task)
# 11: .__GraphLearner__.train(self = self, private = private, super = super, 
#                             task = task)
# 10: get_private(learner)$.train(task)
# 9: .f(learner = <environment>, task = <environment>)
# 8: eval(expr, p)
# 7: eval(expr, p)
# 6: eval.parent(expr, n = 1L)
# 5: invoke(.f, .args = .args, .opts = .opts, .seed = .seed, .timeout = .timeout)
# 4: encapsulate(learner$encapsulate["train"], .f = train_wrapper, 
#                .args = list(learner = learner, task = task), .pkgs = learner$packages, 
#                .seed = NA_integer_, .timeout = learner$timeout["train"])
# 3: learner_train(learner, task, train_row_ids = row_ids, test_row_ids = test_row_ids, 
#                  mode = mode)
# 2: .__Learner__train(self = self, private = private, super = super, 
#                      task = task, row_ids = row_ids)
# 1: glrn$train(task_bh)

sessionInfo()
# R version 4.2.2 (2022-10-31 ucrt)
# Platform: x86_64-w64-mingw32/x64 (64-bit)
# Running under: Windows 10 x64 (build 19044)
# 
# Matrix products: default
# 
# locale:
#   [1] LC_COLLATE=English_United States.utf8  LC_CTYPE=English_United States.utf8    LC_MONETARY=English_United States.utf8
# [4] LC_NUMERIC=C                           LC_TIME=English_United States.utf8    
# 
# attached base packages:
#   [1] stats     graphics  grDevices utils     datasets  methods   base     
# 
# other attached packages:
#   [1] lubridate_1.9.2         forcats_1.0.0           stringr_1.5.0           dplyr_1.1.0             purrr_1.0.1             readr_2.1.4            
# [7] tidyr_1.3.0             tibble_3.1.8            ggplot2_3.4.1           tidyverse_2.0.0         lightgbm_3.3.5          R6_2.5.1               
# [13] mlbench_2.1-3           praznik_11.0.0          mlr3filters_0.7.1       mlr3pipelines_0.4.2     mlr3tuning_0.18.0       mlr3fselect_0.11.0     
# [19] paradox_0.11.0          mlr3misc_0.11.0         mlr3extralearners_0.6.1 mlr3learners_0.5.6      mlr3verse_0.2.7         mlr3_0.14.1            
# 
# loaded via a namespace (and not attached):
#   [1] lattice_0.20-45        listenv_0.9.0          palmerpenguins_0.1.1   digest_0.6.31          utf8_1.2.3             parallelly_1.34.0     
# [7] backports_1.4.1        pillar_1.8.1           rlang_1.0.6            uuid_1.1-0             rstudioapi_0.14        data.table_1.14.8     
# [13] Matrix_1.5-1           checkmate_2.1.0        munsell_0.5.0          compiler_4.2.2         pkgconfig_2.0.3        globals_0.16.2        
# [19] tidyselect_1.2.0       gridExtra_2.3          mlr3data_0.6.1         lgr_0.4.4              mlr3cluster_0.1.6      mlr3tuningspaces_0.3.5
# [25] codetools_0.2-18       clusterCrit_1.2.8      fansi_1.0.4            future_1.32.0          crayon_1.5.2           tzdb_0.3.0            
# [31] withr_2.5.0            grid_4.2.2             jsonlite_1.8.4         gtable_0.3.1           lifecycle_1.0.3        magrittr_2.0.3        
# [37] scales_1.2.1           stringi_1.7.12         cli_3.6.0              mlr3viz_0.6.1          ellipsis_0.3.2         bbotk_0.7.2           
# [43] generics_0.1.3         vctrs_0.5.2            xgboost_1.7.3.1        tools_4.2.2            glue_1.6.2             hms_1.1.2             
# [49] parallel_4.2.2         timechange_0.2.0       clue_0.3-64            colorspace_2.1-0       cluster_2.1.4
melonki commented 1 year ago

I have found a solution to this problem. It seems that the default values of some hyperparameters (A) can be dropped when values are assigned to other hyperparameters (B). In this case, I need to assign values to both A and B explicitly:

ps = list(
  scale.robust = TRUE,
  lgb.convert_categorical = TRUE,
  lgb.objective = "regression",
  lgb.learning_rate = 0.1,
  lgb.num_leaves = 63,
  lgb.max_depth = 6,
  lgb.min_data_in_leaf = 10,
  lgb.bagging_fraction = 0.7,
  lgb.bagging_freq = 4,
  lgb.feature_fraction = 0.8,
  lgb.lambda_l1 = 0.5,
  lgb.lambda_l2 = 0.5,
  lgb.max_bin = 63
)

mb706 commented 1 year ago

Yes, assigning hyperparameter values using ...$values = list(...) is not recommended because of these kinds of problems.

To set multiple hyperparameter values at once without having to type a lot, paradox now has the $set_values() function. You could do

graph$param_set$set_values(.values = ps)

or just list all the parameters in the function call directly

graph$param_set$set_values(
  scale.robust = TRUE,
  lgb.learning_rate = 0.1,
  lgb.num_leaves = 63,
   ...
)