tidymodels / tune

Tools for tidy parameter tuning
https://tune.tidymodels.org
Other
280 stars 42 forks source link

Using parallel processing results in error with tune_grid() #159

Closed pstraforelli closed 4 years ago

pstraforelli commented 4 years ago

As part of the Applied ML course, I have run into the following error when using parallel processing on a Windows machine.

library(tidymodels)
#> -- Attaching packages ---------------------------------------- tidymodels 0.0.3 --
#> v broom     0.5.4     v purrr     0.3.3
#> v dials     0.0.4     v recipes   0.1.9
#> v dplyr     0.8.3     v rsample   0.0.5
#> v ggplot2   3.2.1     v tibble    2.1.3
#> v infer     0.5.1     v yardstick 0.0.5
#> v parsnip   0.0.5
#> -- Conflicts ------------------------------------------- tidymodels_conflicts() --
#> x purrr::discard()    masks scales::discard()
#> x dplyr::filter()     masks stats::filter()
#> x dplyr::lag()        masks stats::lag()
#> x ggplot2::margin()   masks dials::margin()
#> x recipes::step()     masks stats::step()
#> x recipes::yj_trans() masks scales::yj_trans()
library(tune)
library(tidyverse)
library(lubridate)
#> 
#> Attaching package: 'lubridate'
#> The following object is masked from 'package:base':
#> 
#>     date
data(Chicago)
Chicago
#> # A tibble: 5,698 x 50
#>    ridership Austin Quincy_Wells Belmont Archer_35th Oak_Park Western Clark_Lake
#>        <dbl>  <dbl>        <dbl>   <dbl>       <dbl>    <dbl>   <dbl>      <dbl>
#>  1     15.7   1.46         8.37     4.60       2.01     1.42    3.32       15.6 
#>  2     15.8   1.50         8.35     4.72       2.09     1.43    3.34       15.7 
#>  3     15.9   1.52         8.36     4.68       2.11     1.49    3.36       15.6 
#>  4     15.9   1.49         7.85     4.77       2.17     1.44    3.36       15.7 
#>  5     15.4   1.50         7.62     4.72       2.06     1.42    3.27       15.6 
#>  6      2.42  0.693        0.911    2.27       0.624    0.426   1.11        2.41
#>  7      1.47  0.408        0.414    1.63       0.378    0.225   0.567       1.37
#>  8     15.5   0.987        4.81     3.52       1.34     0.879   1.94        9.02
#>  9     15.9   1.55         8.23     4.71       2.22     1.46    3.46       16.0 
#> 10     15.9   1.59         8.25     4.77       2.23     1.48    3.51       15.8 
#> # ... with 5,688 more rows, and 42 more variables: Clinton <dbl>,
#> #   Merchandise_Mart <dbl>, Irving_Park <dbl>, Washington_Wells <dbl>,
#> #   Harlem <dbl>, Monroe <dbl>, Polk <dbl>, Ashland <dbl>, Kedzie <dbl>,
#> #   Addison <dbl>, Jefferson_Park <dbl>, Montrose <dbl>, California <dbl>,
#> #   temp_min <dbl>, temp <dbl>, temp_max <dbl>, temp_change <dbl>, dew <dbl>,
#> #   humidity <dbl>, pressure <dbl>, pressure_change <dbl>, wind <dbl>,
#> #   wind_max <dbl>, gust <dbl>, gust_max <dbl>, percip <dbl>, percip_max <dbl>,
#> #   weather_rain <dbl>, weather_snow <dbl>, weather_cloud <dbl>,
#> #   weather_storm <dbl>, Blackhawks_Away <dbl>, Blackhawks_Home <dbl>,
#> #   Bulls_Away <dbl>, Bulls_Home <dbl>, Bears_Away <dbl>, Bears_Home <dbl>,
#> #   WhiteSox_Away <dbl>, WhiteSox_Home <dbl>, Cubs_Away <dbl>, Cubs_Home <dbl>,
#> #   date <date>
stations
#>  [1] "Austin"           "Quincy_Wells"     "Belmont"          "Archer_35th"     
#>  [5] "Oak_Park"         "Western"          "Clark_Lake"       "Clinton"         
#>  [9] "Merchandise_Mart" "Irving_Park"      "Washington_Wells" "Harlem"          
#> [13] "Monroe"           "Polk"             "Ashland"          "Kedzie"          
#> [17] "Addison"          "Jefferson_Park"   "Montrose"         "California"

# Holiday names from timeDate::listHolidays() that start with "US" or
# contain "Easter"; passed to step_holiday() when the recipe is built.
holidays <- timeDate::listHolidays() %>%
  str_subset("(^US|Easter)")

# Copy of Chicago with two columns derived from `date`: the calendar year
# and the full (unabbreviated) weekday label. Each column is defined once.
Chicago_copy <- mutate(
  Chicago,
  year = year(date),
  day = wday(date, label = TRUE, abbr = FALSE)
)

# Preprocessing recipe: ridership is the outcome, every other column in
# Chicago_copy is a predictor. Steps are applied in the order written.
chi_rec <- recipe(ridership ~ ., data = Chicago_copy)
chi_rec <- step_holiday(chi_rec, date, holidays = holidays)
chi_rec <- step_date(chi_rec, date)
chi_rec <- step_rm(chi_rec, date)
chi_rec <- step_dummy(chi_rec, all_nominal())
chi_rec <- step_zv(chi_rec, all_predictors())
chi_rec <- step_normalize(chi_rec, all_predictors())

# Rolling-origin resamples of Chicago_copy with a non-cumulative analysis
# window.
chi_folds <- Chicago_copy %>%
  rolling_origin(
    initial = 364 * 15,
    assess = 7 * 4,
    skip = 7 * 4,
    cumulative = FALSE
  )

# Tuning grid: every combination of 20 penalty values (10^-3 to 10^-1,
# evenly spaced on the log10 scale) and 6 mixture values (0, 1/5, ..., 1).
glmn_grid <- expand_grid(
  penalty = 10 ^ seq(-3, -1, length.out = 20),
  mixture = (0:5) / 5
)

# Linear regression fitted with the glmnet engine; both penalty and mixture
# are marked for tuning.
glmn_mod <- set_engine(
  linear_reg(penalty = tune(), mixture = tune()),
  "glmnet"
)

# Keep the out-of-sample predictions produced during grid tuning.
ctrl <- control_grid(save_pred = TRUE)

library(doParallel)
#> Loading required package: foreach
#> 
#> Attaching package: 'foreach'
#> The following objects are masked from 'package:purrr':
#> 
#>     accumulate, when
#> Loading required package: iterators
#> Loading required package: parallel

# Create a 4-worker cluster and register it with doParallel so that
# tune_grid() runs in parallel.
cl <- makeCluster(4)
registerDoParallel(cl)

# Grid-tune the glmnet model over glmn_grid on the rolling-origin resamples.
glmn_tune <- tune_grid(
  chi_rec,
  model = glmn_mod,
  resamples = chi_folds,
  grid = glmn_grid,
  control = ctrl
)
#> Warning: All models failed in tune_grid(). See the `.notes` column.

stopCluster(cl)
# Show the first element of the `.notes` column to see why the fits failed.
glmn_tune[[".notes"]][[1]]
#> # A tibble: 1 x 1
#>   .notes                                                  
#>   <chr>                                                   
#> 1 "recipe: Error: could not find function \"all_nominal\""
maxheld83 commented 4 years ago

I think this might be a duplicate of #157, or have the same root cause.

topepo commented 4 years ago

The issue is how doParallel loads packages in the workers (or not). I've added a few that ought to cover future issues. The tricky part is to load enough packages in the workers without causing more packages to be loaded (and attached) in the main R process. With the fixes I have, only the packages related to foreach are loaded:

library(tidymodels)
#> ── Attaching packages ───────────────────────────────────────────────────────────────────────────────────────── tidymodels 0.0.4 ──
#> ✓ broom     0.5.4          ✓ recipes   0.1.9     
#> ✓ dials     0.0.4          ✓ rsample   0.0.5     
#> ✓ dplyr     0.8.4          ✓ tibble    2.1.3     
#> ✓ ggplot2   3.2.1          ✓ tune      0.0.1.9000
#> ✓ infer     0.5.1          ✓ workflows 0.1.0     
#> ✓ parsnip   0.0.5          ✓ yardstick 0.0.5     
#> ✓ purrr     0.3.3
#> ── Conflicts ──────────────────────────────────────────────────────────────────────────────────────────── tidymodels_conflicts() ──
#> x purrr::discard()    masks scales::discard()
#> x dplyr::filter()     masks stats::filter()
#> x dplyr::lag()        masks stats::lag()
#> x ggplot2::margin()   masks dials::margin()
#> x recipes::step()     masks stats::step()
#> x recipes::yj_trans() masks scales::yj_trans()
library(tune)
library(tidyverse)
library(lubridate)
#> 
#> Attaching package: 'lubridate'
#> The following object is masked from 'package:base':
#> 
#>     date

data(Chicago, package = "modeldata")

# Holiday names from timeDate::listHolidays() that start with "US" or
# contain "Easter"; passed to step_holiday() when the recipe is built.
holidays <- timeDate::listHolidays() %>%
  str_subset("(^US|Easter)")

# Copy of Chicago with two columns derived from `date`: the calendar year
# and the full (unabbreviated) weekday label. Each column is defined once.
Chicago_copy <- mutate(
  Chicago,
  year = year(date),
  day = wday(date, label = TRUE, abbr = FALSE)
)

# Preprocessing recipe: ridership is the outcome, every other column in
# Chicago_copy is a predictor. The selectors are namespace-qualified with
# recipes::. Steps are applied in the order written.
chi_rec <- recipe(ridership ~ ., data = Chicago_copy)
chi_rec <- step_holiday(chi_rec, date, holidays = holidays)
chi_rec <- step_date(chi_rec, date)
chi_rec <- step_rm(chi_rec, date)
chi_rec <- step_dummy(chi_rec, recipes::all_nominal())
chi_rec <- step_zv(chi_rec, recipes::all_predictors())
chi_rec <- step_normalize(chi_rec, recipes::all_predictors())

# Rolling-origin resamples of Chicago_copy with a non-cumulative analysis
# window.
chi_folds <- Chicago_copy %>%
  rolling_origin(
    initial = 364 * 15,
    assess = 7 * 4,
    skip = 7 * 4,
    cumulative = FALSE
  )

# Tuning grid: every combination of 20 penalty values (10^-3 to 10^-1,
# evenly spaced on the log10 scale) and 6 mixture values (0, 1/5, ..., 1).
glmn_grid <- expand_grid(
  penalty = 10 ^ seq(-3, -1, length.out = 20),
  mixture = (0:5) / 5
)

# Linear regression fitted with the glmnet engine; both penalty and mixture
# are marked for tuning.
glmn_mod <- set_engine(
  linear_reg(penalty = tune(), mixture = tune()),
  "glmnet"
)

# Keep the out-of-sample predictions produced during grid tuning.
ctrl <- control_grid(save_pred = TRUE)

before_loaded <- names(sessionInfo()$otherPkgs)

library(doParallel)
#> Loading required package: foreach
#> 
#> Attaching package: 'foreach'
#> The following objects are masked from 'package:purrr':
#> 
#>     accumulate, when
#> Loading required package: iterators
#> Loading required package: parallel
# Create a 4-worker cluster and register it with doParallel so that
# tune_grid() runs in parallel.
cl <- makeCluster(4)
registerDoParallel(cl)

# Grid-tune the glmnet model over glmn_grid on the rolling-origin resamples.
glmn_tune <- tune_grid(
  chi_rec,
  model = glmn_mod,
  resamples = chi_folds,
  grid = glmn_grid,
  control = ctrl
)

# Shut down the worker processes once tuning is done so they do not linger.
stopCluster(cl)

# Packages attached in the main session since `before_loaded` was recorded.
after_loaded <- names(sessionInfo()$otherPkgs)
after_loaded[!(after_loaded %in% before_loaded)]
#> [1] "doParallel" "iterators"  "foreach"

Created on 2020-02-16 by the reprex package (v0.3.0)

github-actions[bot] commented 3 years ago

This issue has been automatically locked. If you believe you have found a related problem, please file a new issue (with a reprex: https://reprex.tidyverse.org) and link to this issue.