Closed · topepo closed this 2 years ago

h2o parallelizes internally by multithreading the training of an individual model. We could also use R's external parallelization (via foreach or futures) to send more models to the h2o server at the same time, or use both approaches together.

Right now, when using multicore, this just works. With PSOCK clusters it does not: tuning fails with an error that the workers cannot find the h2o server.

Can we create a helper that sets up PSOCK clusters so that we can use them? We would need to experiment to find out what the worker processes are missing. It might be as simple as loading the h2o package in each worker; a sketch of that idea is below.
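As a rough illustration only, here is a minimal sketch of what such a helper could look like. The name setup_h2o_cluster() is hypothetical, not an existing agua function, and it assumes the h2o server has already been started in the main session (e.g., via agua::h2o_start()):

# Hypothetical helper (not part of agua): create a PSOCK cluster whose
# workers have h2o loaded and are connected to the running h2o server.
setup_h2o_cluster <- function(cores = 4) {
  cl <- parallel::makePSOCKcluster(cores)
  ok <- parallel::clusterCall(cl, function() {
    library(h2o)
    h2o.init()          # connects to the server already running locally
    h2o.clusterIsUp()
  })
  if (!all(unlist(ok))) {
    parallel::stopCluster(cl)
    stop("Some workers could not connect to the h2o server.")
  }
  doParallel::registerDoParallel(cl)
  cl
}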
Example for testing:
library(tidymodels)
library(agua)
# ------------------------------------------------------------------------------
tidymodels_prefer()
theme_set(theme_bw())
options(pillar.advice = FALSE)
agua::h2o_start()
# ------------------------------------------------------------------------------
n_train <- 200
set.seed(1)
sim_dat <- sim_regression(n_train)
set.seed(2)
sim_rs <- vfold_cv(sim_dat)
# ------------------------------------------------------------------------------
boost_spec <-
  boost_tree(
    trees = tune(),
    min_n = tune(),
    tree_depth = tune(),
    learn_rate = tune(),
    loss_reduction = tune()
  ) %>%
  set_engine("h2o") %>%
  set_mode("regression")
# ------------------------------------------------------------------------------
# Optionally register an R-level parallel backend, e.g. multicore:
# library(doMC)
# registerDoMC(cores = 4)
# or a PSOCK cluster:
# library(doParallel)
# cl <- makePSOCKcluster(4)
# registerDoParallel(cl)

# h2o-side parallelism: let the server work on several models at once
h2o_thread_spec <- agua_backend_options(parallelism = 10)
grid_ctrl <- control_grid(backend_options = h2o_thread_spec)
# ------------------------------------------------------------------------------
set.seed(3)
grid_res <-
  boost_spec %>%
  tune_grid(outcome ~ ., resamples = sim_rs, grid = 10, control = grid_ctrl)
I think that I have it. Running this in the workers before using them seems to stop the errors:
check_workers_h2o <- function() {
  # load h2o in the worker and connect it to the already-running server
  library(h2o)
  h2o.init()
  h2o.clusterIsUp()
}
for example:
library(tidymodels)
library(agua)
#> Registered S3 method overwritten by 'agua':
#>   method        from
#>   tidy.workflow workflows
#>
#> Attaching package: 'agua'
#> The following object is masked from 'package:workflowsets':
#>
#>     rank_results
tidymodels_prefer()
theme_set(theme_bw())
options(pillar.advice = FALSE)
n_train <- 200
set.seed(1)
sim_dat <- sim_regression(n_train)
set.seed(2)
sim_rs <- vfold_cv(sim_dat)
boost_spec <-
  boost_tree(
    trees = tune(),
    min_n = tune(),
    tree_depth = tune(),
    learn_rate = tune(),
    loss_reduction = tune()
  ) %>%
  set_engine("h2o") %>%
  set_mode("regression")
h2o::h2o.init()
#>
#> H2O is not running yet, starting it now...
#>
#> Note: In case of errors look at the following log files:
#> /var/folders/jv/240yf6_94xbcxw2vg7v060pw0000gn/T//RtmpCBt6Kr/filea0781d557226/h2o_max_started_from_r.out
#> /var/folders/jv/240yf6_94xbcxw2vg7v060pw0000gn/T//RtmpCBt6Kr/filea07854b92bd5/h2o_max_started_from_r.err
#>
#>
#> Starting H2O JVM and connecting: .. Connection successful!
#>
#> R is connected to the H2O cluster:
#>     H2O cluster uptime:         2 seconds 7 milliseconds
#>     H2O cluster timezone:       America/New_York
#>     H2O data parsing timezone:  UTC
#>     H2O cluster version:        3.38.0.1
#>     H2O cluster version age:    25 days
#>     H2O cluster name:           H2O_started_from_R_max_niq379
#>     H2O cluster total nodes:    1
#>     H2O cluster total memory:   14.20 GB
#>     H2O cluster total cores:    10
#>     H2O cluster allowed cores:  10
#>     H2O cluster healthy:        TRUE
#>     H2O Connection ip:          localhost
#>     H2O Connection port:        54321
#>     H2O Connection proxy:       NA
#>     H2O Internal Security:      FALSE
#>     R Version:                  R version 4.2.0 (2022-04-22)
library(doParallel)
#> Loading required package: foreach
#>
#> Attaching package: 'foreach'
#> The following objects are masked from 'package:purrr':
#>
#>     accumulate, when
#> Loading required package: iterators
#> Loading required package: parallel
cl <- makePSOCKcluster(4)
registerDoParallel(cl)
check_workers_h2o <- function() {
  library(h2o)
  h2o.init()
  h2o.clusterIsUp()
}
unlist(parallel::clusterCall(cl, check_workers_h2o))
#> [1] TRUE TRUE TRUE TRUE
h2o_thread_spec <- agua_backend_options(parallelism = 10)
grid_ctrl <- control_grid(backend_options = h2o_thread_spec)
set.seed(3)
grid_res <-
  boost_spec %>%
  tune_grid(outcome ~ ., resamples = sim_rs, grid = 10, control = grid_ctrl)
show_best(grid_res, metric = "rmse")
#> # A tibble: 5 × 11
#>   trees min_n tree_depth learn_rate loss_r…¹ .metric .esti…²  mean     n std_err
#>   <int> <int>      <int>      <dbl>    <dbl> <chr>   <chr>   <dbl> <int>   <dbl>
#> 1   459    21          9   8.28e- 4 2.40e- 1 rmse    standa…  14.4    10   0.773
#> 2  1727    13         14   1.59e- 3 6.08e- 6 rmse    standa…  18.6    10   0.835
#> 3  1991    26          1   1.91e- 6 1.76e- 9 rmse    standa…  19.1    10   0.818
#> 4   800    33          6   1.03e- 8 1.95e- 4 rmse    standa…  19.3    10   0.808
#> 5   161    29          3   2.40e-10 1.57e-10 rmse    standa…  19.4    10   0.807
#> # … with 1 more variable: .config <chr>, and abbreviated variable names
#> #   ¹loss_reduction, ².estimator
stopCluster(cl)
h2o::h2o.shutdown(prompt = FALSE)
Created on 2022-10-14 with reprex v2.0.2
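The same check should presumably apply when using the future framework instead of an explicit PSOCK cluster, since multisession workers are also fresh R sessions. A hedged sketch (assumed, not verified in this thread) using doFuture:

# Assumed variant using doFuture; each task loads h2o and connects to the
# already-running server, mirroring check_workers_h2o() above.
library(doFuture)
registerDoFuture()
plan(multisession, workers = 4)

foreach(i = 1:4, .combine = c) %dopar% {
  library(h2o)
  h2o.init()
  h2o.clusterIsUp()
}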
Thanks @topepo! I'll add this to the tuning vignette.
I started writing a separate vignette as I was going through the work. I'll add a PR and you can adapt it or use it as a separate vignette.
This issue has been automatically locked. If you believe you have found a related problem, please file a new issue (with a reprex: https://reprex.tidyverse.org) and link to this issue.