tidymodels / stacks

An R package for tidy stacked ensemble modeling
https://stacks.tidymodels.org
Other
295 stars 27 forks source link

Parallelization fails in stacks::control_stack_grid() #186

Closed gundalav closed 1 year ago

gundalav commented 1 year ago

I am attempting to use the tidymodels stacks package to perform ensemble modeling. Following the instructions provided in their article, I was able to reproduce the example successfully.

However, when I added parallelization during hyperparameter tuning for the "knn_res" section of the code:

library(doParallel)
library(parallel)
set.seed(2020)
cls <- makePSOCKcluster(parallelly::availableCores())
registerDoParallel(cls)
knn_res <- 
  tune_grid(
    knn_wflow,
    resamples = folds,
    metrics = metric,
    grid = 4,
    control = ctrl_grid
  )
stopCluster(cls)

I encountered an error when running the "tree_frogs_model_st" section of the code:

tree_frogs_model_st <-
  tree_frogs_data_st %>%
  blend_predictions()

The error message states:

Error in summary.connection(connection) : invalid connection

I believe this issue may be related to the stacks::control_stack_grid() function, but I am unsure of how to resolve it. Please advise.


UPDATE (full reprex)

I excluded the linear model for brevity.

library(tidymodels)
library(stacks)

data("tree_frogs")

# subset the data
tree_frogs <- tree_frogs %>%
  filter(!is.na(latency)) %>%
  select(-c(clutch, hatched))

# some setup: resampling and a basic recipe
set.seed(1)
tree_frogs_split <- initial_split(tree_frogs)
tree_frogs_train <- training(tree_frogs_split)
tree_frogs_test  <- testing(tree_frogs_split)

set.seed(1)
folds <- rsample::vfold_cv(tree_frogs_train, v = 5)

tree_frogs_rec <- 
  recipe(latency ~ ., data = tree_frogs_train)

metric <- metric_set(rmse)

ctrl_grid <- control_stack_grid()
ctrl_res <- control_stack_resamples()

# create a model definition
knn_spec <-
  nearest_neighbor(
    mode = "regression", 
    neighbors = tune("k")
  ) %>%
  set_engine("kknn")

knn_spec
#> K-Nearest Neighbor Model Specification (regression)
#> 
#> Main Arguments:
#>   neighbors = tune("k")
#> 
#> Computational engine: kknn

knn_rec <-
  tree_frogs_rec %>%
  step_dummy(all_nominal_predictors()) %>%
  step_zv(all_predictors()) %>%
  step_impute_mean(all_numeric_predictors()) %>%
  step_normalize(all_numeric_predictors())

knn_rec
#> Recipe
#> 
#> Inputs:
#> 
#>       role #variables
#>    outcome          1
#>  predictor          4
#> 
#> Operations:
#> 
#> Dummy variables from all_nominal_predictors()
#> Zero variance filter on all_predictors()
#> Mean imputation for all_numeric_predictors()
#> Centering and scaling for all_numeric_predictors()

knn_wflow <- 
  workflow() %>% 
  add_model(knn_spec) %>%
  add_recipe(knn_rec)

knn_wflow
#> ══ Workflow ════════════════════════════════════════════════════════════════════
#> Preprocessor: Recipe
#> Model: nearest_neighbor()
#> 
#> ── Preprocessor ────────────────────────────────────────────────────────────────
#> 4 Recipe Steps
#> 
#> • step_dummy()
#> • step_zv()
#> • step_impute_mean()
#> • step_normalize()
#> 
#> ── Model ───────────────────────────────────────────────────────────────────────
#> K-Nearest Neighbor Model Specification (regression)
#> 
#> Main Arguments:
#>   neighbors = tune("k")
#> 
#> Computational engine: kknn

library(doParallel)
#> Loading required package: foreach
#> 
#> Attaching package: 'foreach'
#> The following objects are masked from 'package:purrr':
#> 
#>     accumulate, when
#> Loading required package: iterators
#> Loading required package: parallel
library(parallel)
set.seed(2020)
cls <- makePSOCKcluster(parallelly::availableCores())
registerDoParallel(cls)
knn_res <- 
  tune_grid(
    knn_wflow,
    resamples = folds,
    metrics = metric,
    grid = 4,
    control = ctrl_grid
  )
stopCluster(cls)

knn_res
#> # Tuning results
#> # 5-fold cross-validation 
#> # A tibble: 5 × 5
#>   splits           id    .metrics         .notes           .predictions      
#>   <list>           <chr> <list>           <list>           <list>            
#> 1 <split [343/86]> Fold1 <tibble [4 × 5]> <tibble [0 × 3]> <tibble [344 × 5]>
#> 2 <split [343/86]> Fold2 <tibble [4 × 5]> <tibble [0 × 3]> <tibble [344 × 5]>
#> 3 <split [343/86]> Fold3 <tibble [4 × 5]> <tibble [0 × 3]> <tibble [344 × 5]>
#> 4 <split [343/86]> Fold4 <tibble [4 × 5]> <tibble [0 × 3]> <tibble [344 × 5]>
#> 5 <split [344/85]> Fold5 <tibble [4 × 5]> <tibble [0 × 3]> <tibble [340 × 5]>

# create a model definition -----
svm_spec <- 
  svm_rbf(
    cost = tune("cost"), 
    rbf_sigma = tune("sigma")
  ) %>%
  set_engine("kernlab") %>%
  set_mode("regression")

# extend the recipe
svm_rec <-
  tree_frogs_rec %>%
  step_dummy(all_nominal_predictors()) %>%
  step_zv(all_predictors()) %>%
  step_impute_mean(all_numeric_predictors()) %>%
  step_corr(all_predictors()) %>%
  step_normalize(all_numeric_predictors())

# add both to a workflow
svm_wflow <- 
  workflow() %>% 
  add_model(svm_spec) %>%
  add_recipe(svm_rec)

# tune cost and sigma and fit to the 5-fold cv
set.seed(2020)
cls <- makePSOCKcluster(parallelly::availableCores())
registerDoParallel(cls)
svm_res <- 
  tune_grid(
    svm_wflow, 
    resamples = folds, 
    grid = 6,
    metrics = metric,
    control = ctrl_grid
  )
stopCluster(cls)
svm_res
#> # Tuning results
#> # 5-fold cross-validation 
#> # A tibble: 5 × 5
#>   splits           id    .metrics         .notes           .predictions      
#>   <list>           <chr> <list>           <list>           <list>            
#> 1 <split [343/86]> Fold1 <tibble [6 × 6]> <tibble [0 × 3]> <tibble [516 × 6]>
#> 2 <split [343/86]> Fold2 <tibble [6 × 6]> <tibble [0 × 3]> <tibble [516 × 6]>
#> 3 <split [343/86]> Fold3 <tibble [6 × 6]> <tibble [0 × 3]> <tibble [516 × 6]>
#> 4 <split [343/86]> Fold4 <tibble [6 × 6]> <tibble [0 × 3]> <tibble [516 × 6]>
#> 5 <split [344/85]> Fold5 <tibble [6 × 6]> <tibble [0 × 3]> <tibble [510 × 6]>

tree_frogs_data_st <- 
  stacks() %>%
  add_candidates(knn_res) %>%
  add_candidates(svm_res)

tree_frogs_data_st
#> # A data stack with 2 model definitions and 10 candidate members:
#> #   knn_res: 4 model configurations
#> #   svm_res: 6 model configurations
#> # Outcome: latency (numeric)

tree_frogs_model_st <-
  tree_frogs_data_st %>%
  blend_predictions()
#> Error in summary.connection(connection): invalid connection

tree_frogs_model_st
#> Error in eval(expr, envir, enclos): object 'tree_frogs_model_st' not found

Created on 2023-01-27 by the reprex package (v2.0.1)

Session info

``` r
sessioninfo::session_info()
#> ─ Session info ───────────────────────────────────────────────────────────────
#>  setting value
#>  version R version 4.1.2 (2021-11-01)
#>  os Ubuntu 18.04.6 LTS
#>  system x86_64, linux-gnu
#>  ui X11
#>  language (EN)
#>  collate C.UTF-8
#>  ctype C.UTF-8
#>  tz Asia/Tokyo
#>  date 2023-01-27
#>  pandoc 2.14.0.3 @ /usr/lib/rstudio-server/bin/pandoc/ (via rmarkdown)
#>
#> ─ Packages ───────────────────────────────────────────────────────────────────
#>  package * version date (UTC) lib source
#>  assertthat 0.2.1 2019-03-21 [1] CRAN (R 4.1.2)
#>  backports 1.4.1 2021-12-13 [1] CRAN (R 4.1.2)
#>  broom * 1.0.1 2022-08-29 [1] CRAN (R 4.1.2)
#>  butcher 0.1.5 2021-06-28 [1] CRAN (R 4.1.2)
#>  class 7.3-19 2021-05-03 [4] CRAN (R 4.0.5)
#>  cli 3.6.0 2023-01-09 [1] CRAN (R 4.1.2)
#>  codetools 0.2-18 2020-11-04 [4] CRAN (R 4.0.3)
#>  colorspace 2.0-3 2022-02-21 [1] CRAN (R 4.1.2)
#>  crayon 1.5.1 2022-03-26 [1] CRAN (R 4.1.2)
#>  DBI 1.1.2 2021-12-20 [1] CRAN (R 4.1.2)
#>  dials * 1.1.0 2022-11-04 [1] CRAN (R 4.1.2)
#>  DiceDesign 1.9 2021-02-13 [1] CRAN (R 4.1.2)
#>  digest 0.6.29 2021-12-01 [2] CRAN (R 4.1.2)
#>  doParallel * 1.0.17 2022-02-07 [1] CRAN (R 4.1.2)
#>  dplyr * 1.0.9 2022-04-28 [1] CRAN (R 4.1.2)
#>  ellipsis 0.3.2 2021-04-29 [1] CRAN (R 4.1.2)
#>  evaluate 0.15 2022-02-18 [1] CRAN (R 4.1.2)
#>  fansi 1.0.3 2022-03-24 [1] CRAN (R 4.1.2)
#>  fastmap 1.1.0 2021-01-25 [2] CRAN (R 4.1.2)
#>  foreach * 1.5.2 2022-02-02 [1] CRAN (R 4.1.2)
#>  fs 1.5.2 2021-12-08 [1] CRAN (R 4.1.2)
#>  furrr 0.3.1 2022-08-15 [1] CRAN (R 4.1.2)
#>  future 1.25.0 2022-04-24 [1] CRAN (R 4.1.2)
#>  future.apply 1.9.0 2022-04-25 [1] CRAN (R 4.1.2)
#>  generics 0.1.3 2022-07-05 [1] CRAN (R 4.1.2)
#>  ggplot2 * 3.4.0 2022-11-04 [1] CRAN (R 4.1.2)
#>  glmnet 4.1-4 2022-04-15 [1] CRAN (R 4.1.2)
#>  globals 0.15.0 2022-05-09 [1] CRAN (R 4.1.2)
#>  glue 1.6.2 2022-02-24 [1] CRAN (R 4.1.2)
#>  gower 1.0.0 2022-02-03 [1] CRAN (R 4.1.2)
#>  GPfit 1.0-8 2019-02-08 [1] CRAN (R 4.1.2)
#>  gtable 0.3.0 2019-03-25 [1] CRAN (R 4.1.2)
#>  hardhat 1.2.0 2022-06-30 [1] CRAN (R 4.1.2)
#>  highr 0.9 2021-04-16 [1] CRAN (R 4.1.2)
#>  htmltools 0.5.2 2021-08-25 [2] CRAN (R 4.1.2)
#>  igraph 1.3.1 2022-04-20 [1] CRAN (R 4.1.2)
#>  infer * 1.0.0 2021-08-13 [1] CRAN (R 4.1.2)
#>  ipred 0.9-12 2021-09-15 [1] CRAN (R 4.1.2)
#>  iterators * 1.0.14 2022-02-05 [1] CRAN (R 4.1.2)
#>  kernlab 0.9-30 2022-04-02 [1] CRAN (R 4.1.2)
#>  kknn 1.3.1 2016-03-26 [1] CRAN (R 4.1.2)
#>  knitr 1.38 2022-03-25 [1] CRAN (R 4.1.2)
#>  lattice 0.20-45 2021-09-22 [4] CRAN (R 4.1.1)
#>  lava 1.6.10 2021-09-02 [1] CRAN (R 4.1.2)
#>  lhs 1.1.5 2022-03-22 [1] CRAN (R 4.1.2)
#>  lifecycle 1.0.3 2022-10-07 [1] CRAN (R 4.1.2)
#>  listenv 0.8.0 2019-12-05 [1] CRAN (R 4.1.2)
#>  lubridate 1.8.0 2021-10-07 [1] CRAN (R 4.1.2)
#>  magrittr 2.0.3 2022-03-30 [1] CRAN (R 4.1.2)
#>  MASS 7.3-54 2021-05-03 [4] CRAN (R 4.0.5)
#>  Matrix 1.3-4 2021-06-01 [4] CRAN (R 4.1.0)
#>  modeldata * 0.1.1 2021-07-14 [1] CRAN (R 4.1.2)
#>  munsell 0.5.0 2018-06-12 [1] CRAN (R 4.1.2)
#>  nnet 7.3-16 2021-05-03 [4] CRAN (R 4.0.5)
#>  parallelly 1.31.1 2022-04-22 [1] CRAN (R 4.1.2)
#>  parsnip * 1.0.3 2022-11-11 [1] CRAN (R 4.1.2)
#>  pillar 1.7.0 2022-02-01 [1] CRAN (R 4.1.2)
#>  pkgconfig 2.0.3 2019-09-22 [1] CRAN (R 4.1.2)
#>  prodlim 2019.11.13 2019-11-17 [1] CRAN (R 4.1.2)
#>  purrr * 0.3.4 2020-04-17 [1] CRAN (R 4.1.2)
#>  R.cache 0.15.0 2021-04-30 [1] CRAN (R 4.1.2)
#>  R.methodsS3 1.8.1 2020-08-26 [1] CRAN (R 4.1.2)
#>  R.oo 1.24.0 2020-08-26 [1] CRAN (R 4.1.2)
#>  R.utils 2.11.0 2021-09-26 [1] CRAN (R 4.1.2)
#>  R6 2.5.1 2021-08-19 [1] CRAN (R 4.1.2)
#>  Rcpp 1.0.10 2023-01-22 [1] CRAN (R 4.1.2)
#>  recipes * 1.0.3 2022-11-09 [1] CRAN (R 4.1.2)
#>  reprex 2.0.1 2021-08-05 [1] CRAN (R 4.1.2)
#>  rlang 1.0.6 2022-09-24 [1] CRAN (R 4.1.2)
#>  rmarkdown 2.13 2022-03-10 [1] CRAN (R 4.1.2)
#>  rpart 4.1-15 2019-04-12 [4] CRAN (R 4.0.0)
#>  rsample * 1.1.1 2022-12-07 [1] CRAN (R 4.1.2)
#>  rstudioapi 0.13 2020-11-12 [1] CRAN (R 4.1.2)
#>  scales * 1.2.0 2022-04-13 [1] CRAN (R 4.1.2)
#>  sessioninfo 1.2.2 2021-12-06 [1] CRAN (R 4.1.2)
#>  shape 1.4.6 2021-05-19 [1] CRAN (R 4.1.2)
#>  stacks * 1.0.1 2022-12-14 [1] CRAN (R 4.1.2)
#>  stringi 1.7.6 2021-11-29 [1] CRAN (R 4.1.2)
#>  stringr 1.4.0 2019-02-10 [1] CRAN (R 4.1.2)
#>  styler 1.7.0 2022-03-13 [1] CRAN (R 4.1.2)
#>  survival 3.2-13 2021-08-24 [4] CRAN (R 4.1.1)
#>  tibble * 3.1.7 2022-05-03 [1] CRAN (R 4.1.2)
#>  tidymodels * 0.2.0 2022-03-19 [1] CRAN (R 4.1.2)
#>  tidyr * 1.2.0 2022-02-01 [1] CRAN (R 4.1.2)
#>  tidyselect 1.2.0 2022-10-10 [1] CRAN (R 4.1.2)
#>  timeDate 3043.102 2018-02-21 [1] CRAN (R 4.1.2)
#>  tune * 1.0.1 2022-10-09 [1] CRAN (R 4.1.2)
#>  usethis 2.1.5 2021-12-09 [1] CRAN (R 4.1.2)
#>  utf8 1.2.2 2021-07-24 [1] CRAN (R 4.1.2)
#>  vctrs 0.5.1 2022-11-16 [1] CRAN (R 4.1.2)
#>  withr 2.5.0 2022-03-03 [1] CRAN (R 4.1.2)
#>  workflows * 1.1.2 2022-11-16 [1] CRAN (R 4.1.2)
#>  workflowsets * 0.2.1 2022-03-15 [1] CRAN (R 4.1.2)
#>  xfun 0.31 2022-05-10 [1] CRAN (R 4.1.2)
#>  yaml 2.3.5 2022-02-21 [1] CRAN (R 4.1.2)
#>  yardstick * 1.1.0 2022-09-07 [1] CRAN (R 4.1.2)
#>
#>  [1] /home/ubuntu/R/x86_64-pc-linux-gnu-library/4.1
#>  [2] /usr/local/lib/R/site-library
#>  [3] /usr/lib/R/site-library
#>  [4] /usr/lib/R/library
#>
#> ──────────────────────────────────────────────────────────────────────────────
```

simonpcouch commented 1 year ago

Thanks for the issue! There is some discussion on the analogous Stack Overflow post. I'll close this issue in favor of https://github.com/tidymodels/tune/issues/606.

github-actions[bot] commented 1 year ago

This issue has been automatically locked. If you believe you have found a related problem, please file a new issue (with a reprex: https://reprex.tidyverse.org) and link to this issue.