a-hanf / mlr3automl

Automated machine learning in mlr3
GNU Lesser General Public License v3.0

Reproducibility Issue With Parallel Processing? #22

Open dcbarnard opened 3 years ago

dcbarnard commented 3 years ago

Here is code where I would have expected the aggregate results at the end of two identical benchmarks to be identical, but they are not. Since I am only an intermediate-level R coder, perhaps there is something wrong with my code. In any event, I am passing this along for your consideration as a possible issue in mlr3automl. As you can imagine, this code takes a while to execute, roughly 10 minutes on my iMac Pro.

#############################################################
# Cross-validating the regression learners
#############################################################

library("doFuture")
library("doRNG")
library("future")
library("future.apply")
library("mlr3verse")
library("mlr3automl")
library("mlr3hyperband")

# set logger thresholds

lgr::get_logger("mlr3")$set_threshold("error")
lgr::get_logger("bbotk")$set_threshold("error")

# specify regression learners

learners = list(
  lrn(
    "regr.featureless",
    id = "fl"
  ),
  lrn(
    "regr.lm",
    id = "lm"
  ),
  lrn(
    "regr.cv_glmnet",
    id = "glm"
  ),
  lrn(
    "regr.ranger",
    id = "rf"
  ),
  lrn(
    "regr.xgboost",
    id = "xgb"
  ),
  lrn(
    "regr.svm",
    id = "svm"
  )
)

learner_ids = sapply(
  learners,
  function(x) x$id
)

# define regression task

task = tsk("boston_housing")

# select small subset of features

task$select(c("age", "crim", "lat", "lon"))

# specify resampling

resampling = rsmp("cv")

# specify measure

measure = msr("regr.mse")

# autotuners for models with hyperparameters

learners[[3]] = create_autotuner(
  learner = lrn("regr.cv_glmnet"),
  tuner = tnr("hyperband")
)

learners[[4]] = create_autotuner(
  learner = lrn("regr.ranger"),
  tuner = tnr("hyperband"),
  num_effective_vars = length(
    task$feature_names
  )
)

learners[[5]] = create_autotuner(
  learner = lrn("regr.xgboost"),
  tuner = tnr("hyperband")
)

learners[[6]] = create_autotuner(
  learner = lrn("regr.svm"),
  tuner = tnr("hyperband")
)

# create benchmark grid

design = benchmark_grid(
  tasks = task,
  learners = learners,
  resamplings = resampling
)

# start parallel processing

registerDoFuture()
plan(multisession, workers = availableCores() - 1)
registerDoRNG(123456)

# execute benchmark

bmr1 = mlr3::benchmark(design)

# terminate parallel processing

plan(sequential)

# start parallel processing

registerDoFuture()
plan(multisession, workers = availableCores() - 1)
registerDoRNG(123456)

# execute benchmark

bmr2 = mlr3::benchmark(design)

# terminate parallel processing

plan(sequential)

# test for reproducibility

bmr1$aggregate()$regr.mse == bmr2$aggregate()$regr.mse

Here are a couple of interesting clues. If I run this code several times, the end result is the same each time (i.e., the same mix of TRUE and FALSE results across the stochastic learners). But if I run the code in R and then run the same code in RStudio, I get a different mix of TRUE and FALSE results depending on the platform. Finally, if I substitute a different dataset, I again get a different mix of TRUE and FALSE results at the end.
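
For reference, here is a stripped-down sketch of the same comparison (not part of my original run) that seeds the session directly before each benchmark instead of relying on registerDoRNG(). As far as I understand, mlr3 parallelizes through the future framework rather than foreach, so the foreach-level RNG registration may not reach the benchmark workers at all; whether direct seeding restores reproducibility is exactly the open question. The task, feature subset, and learner are taken from the code above.

library("mlr3verse")
library("future")

# same task and feature subset as above
task = tsk("boston_housing")
task$select(c("age", "crim", "lat", "lon"))

# single stochastic learner; folds are instantiated once by benchmark_grid,
# so both runs see identical splits
design = benchmark_grid(
  tasks = task,
  learners = lrn("regr.ranger", id = "rf"),
  resamplings = rsmp("cv")
)

# run 1: seed the parallel-safe RNG stream, then benchmark
plan(multisession, workers = 2)
set.seed(123456, kind = "L'Ecuyer-CMRG")
bmr_a = mlr3::benchmark(design)
plan(sequential)

# run 2: identical seeding and plan
plan(multisession, workers = 2)
set.seed(123456, kind = "L'Ecuyer-CMRG")
bmr_b = mlr3::benchmark(design)
plan(sequential)

# compare aggregated scores
all.equal(bmr_a$aggregate()$regr.mse, bmr_b$aggregate()$regr.mse)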

a-hanf commented 3 years ago

Thanks for the report!

I am not sure this has anything to do with mlr3automl, but I am looking into it. I'll give you an update when I know more.

dcbarnard commented 3 years ago

As an experiment, I removed the create_autotuner() calls, substituted a simple random-search AutoTuner for the ranger learner, and left the other learners untuned. This modified code gives reproducible results for the tuned ranger learner (as well as for the untuned learners).

#############################################################
# Cross-validating the regression learners
#############################################################

library("doFuture")
library("doRNG")
library("future")
library("future.apply")
library("mlr3verse")
library("mlr3automl")
library("mlr3hyperband")

# set logger thresholds

lgr::get_logger("mlr3")$set_threshold("error")
lgr::get_logger("bbotk")$set_threshold("error")

# specify regression learners

learners = list(
  lrn(
    "regr.featureless",
    id = "fl"
  ),
  lrn(
    "regr.lm",
    id = "lm"
  ),
  lrn(
    "regr.cv_glmnet",
    id = "glm"
  ),
  lrn(
    "regr.ranger",
    id = "rf"
  ),
  lrn(
    "regr.xgboost",
    id = "xgb"
  ),
  lrn(
    "regr.svm",
    id = "svm"
  )
)

learner_ids = sapply(
  learners,
  function(x) x$id
)

# define regression task

task = tsk("boston_housing")

# select small subset of features

task$select(c("age", "crim", "lat", "lon"))

# specify resampling

resampling = rsmp("cv")

# specify measure

measure = msr("regr.mse")

# specify random tuning of ranger learner

search_space = ParamSet$new(
  params = list(
    ParamInt$new("mtry", lower = 1L, upper = 4L),
    ParamDbl$new("sample.fraction", lower = 0.2, upper = 0.9),
    ParamInt$new("min.node.size", lower = 1L, upper = 10L)
  )
)

terminator = trm("evals", n_evals = 10)

tuner = tnr("random_search")

learners[[4]] = AutoTuner$new(
  learner = learners[[4]],
  resampling = rsmp("holdout"),
  measure = measure,
  search_space = search_space,
  terminator = terminator,
  tuner = tuner
)

# create benchmark grid

design = benchmark_grid(
  tasks = task,
  learners = learners,
  resamplings = resampling
)

# start parallel processing

registerDoFuture()
plan(multisession, workers = availableCores() - 1)
registerDoRNG(123456)

# execute benchmark

bmr1 = mlr3::benchmark(design)

# terminate parallel processing

plan(sequential)

# start parallel processing

registerDoFuture()
plan(multisession, workers = availableCores() - 1)
registerDoRNG(123456)

# execute benchmark

bmr2 = mlr3::benchmark(design)

# terminate parallel processing

plan(sequential)

# test for reproducibility

bmr1$aggregate()$regr.mse == bmr2$aggregate()$regr.mse

# clean up the work space

rm(design, learners, bmr1, bmr2, learner_ids,
   measure, resampling, search_space, task,
   terminator, tuner)

This result makes me think the choice of tuner accounts for the difference in reproducibility.
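
A possible next step (just a sketch, I have not run this) would be to take the parallel benchmark out of the picture entirely and check whether the hyperband autotuner alone is deterministic: train the same create_autotuner() learner twice, sequentially, with identical seeds, and compare the tuning results. The helper run_hyperband() is only for illustration, and I am assuming the returned object is a regular mlr3tuning AutoTuner so that $tuning_result is available.

library("mlr3verse")
library("mlr3automl")
library("mlr3hyperband")

task = tsk("boston_housing")
task$select(c("age", "crim", "lat", "lon"))

run_hyperband = function(seed) {
  # fresh autotuner each time, constructed exactly as in the first code block
  set.seed(seed)
  at = create_autotuner(
    learner = lrn("regr.ranger"),
    tuner = tnr("hyperband"),
    num_effective_vars = length(task$feature_names)
  )
  at$train(task)
  at$tuning_result
}

# if these differ even without any parallel backend, the tuner itself
# (not the future/doRNG setup) is the source of the nondeterminism
all.equal(run_hyperband(123456), run_hyperband(123456))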