DavisVaughan / furrr

Apply Mapping Functions in Parallel using Futures
https://furrr.futureverse.org/
Other
699 stars 40 forks source link

future_map errors where mclapply does not #164

Closed andrjohns closed 4 years ago

andrjohns commented 4 years ago

Got a bit of an odd one. I'm having problems with a worker consistently crashing when the code is run via future_map (under both multisession and multicore), but no crashes when the code is run via mclapply.

I'm running on Kubuntu 20.04 (session info at end), with 64GB of RAM.

Here is the reprex for the multisession run through RStudio:

library(tidyverse)
library(tidymodels)
#> ── Attaching packages ───────────────────────────────────────────────────────────────────────────────────────────── tidymodels 0.1.1 ──
#> ✓ broom     0.7.0      ✓ recipes   0.1.13
#> ✓ dials     0.0.8      ✓ rsample   0.0.7 
#> ✓ infer     0.5.3      ✓ tune      0.1.1 
#> ✓ modeldata 0.0.2      ✓ workflows 0.1.3 
#> ✓ parsnip   0.1.3      ✓ yardstick 0.0.7
#> ── Conflicts ──────────────────────────────────────────────────────────────────────────────────────────────── tidymodels_conflicts() ──
#> x scales::discard() masks purrr::discard()
#> x dplyr::filter()   masks stats::filter()
#> x recipes::fixed()  masks stringr::fixed()
#> x dplyr::lag()      masks stats::lag()
#> x yardstick::spec() masks readr::spec()
#> x recipes::step()   masks stats::step()
library(furrr)
#> Loading required package: future

# Simulated data: 200 rows, 16 standard-normal columns followed by 14 binary
# (0/1) columns. The first column is the outcome `y`; the rest are x1..x29.
dat <- cbind(
  matrix(rnorm(200 * 16), nrow = 200, ncol = 16),
  matrix(sample(0:1, 200 * 14, replace = TRUE), nrow = 200, ncol = 14)
) %>%
  data.frame() %>%
  setNames(c("y", paste0("x", 1:29)))

# Nested resampling: both layers use repeated v-fold CV with 2 repeats; the
# number of folds is left at the vfold_cv() default.
nestcv_dat <- nested_cv(
  dat,
  outside = vfold_cv(repeats = 2),
  inside = vfold_cv(repeats = 2)
)

# Fit a glmnet-backed linear regression on the analysis portion of one
# resample and score it on the held-out assessment portion.
#
# penalty: value passed to linear_reg(penalty = ).
# mixture: value passed to linear_reg(mixture = ).
# object:  a single split; analysis() and assessment() are called on it.
#
# Returns the `.estimate` value of rmse() computed on the hold-out rows.
lasso_rmse <- function(penalty, mixture, object) {
  mod <-
    linear_reg(mode = "regression", penalty = penalty, mixture = mixture) %>%
    set_engine("glmnet") %>%
    fit(y ~ ., data = analysis(object))

  # Materialise the hold-out rows once and reuse them for both the
  # predictors and the true outcome.
  holdout <- assessment(object)

  holdout_pred <-
    predict(mod, new_data = dplyr::select(holdout, -y)) %>%
    bind_cols(dplyr::select(holdout, y))

  # Separate statement (not a pipe step): score predictions against truth.
  rmse(holdout_pred, truth = y, estimate = .pred)$.estimate
}

# Score one resample over a fixed penalty x mixture grid.
#
# object: a single split, forwarded unchanged to lasso_rmse().
#
# Returns a tibble with one row per grid point (20 penalty values times
# 6 mixture values) and columns penalty, mixture, and RMSE.
tune_over_penalty <- function(object) {
  # expand.grid() is kept so the row order (penalty varying fastest) is
  # unchanged; `length.out` is spelled out instead of relying on partial
  # argument matching.
  grid <- tibble(
    expand.grid(
      penalty = seq(0.01, 5, length.out = 20),
      mixture = seq(0, 1, by = 0.2)
    )
  )

  # map2_dbl() walks the two grid columns in parallel and guarantees a
  # double vector, unlike sapply() whose return type depends on its input.
  grid %>%
    mutate(RMSE = map2_dbl(penalty, mixture, lasso_rmse, object = object))
}

# Tune every split held in `object$splits` and average the RMSE for each
# penalty/mixture combination.
#
# Returns an ungrouped tibble with columns penalty, mixture, mean_RMSE
# (NA values ignored) and n (number of RMSE values per combination).
summarize_tune_results <- function(object) {
  per_split <- map_df(object$splits, tune_over_penalty)
  by_params <- group_by(per_split, penalty, mixture)

  summarize(
    by_params,
    mean_RMSE = mean(RMSE, na.rm = TRUE),
    n = length(RMSE),
    .groups = "drop"
  )
}

# Evaluate summarize_tune_results() once per element of
# nestcv_dat$inner_resamples with the multisession future plan active
# (no explicit worker count is given).
plan(multisession)
tuning_results <- future_map(nestcv_dat$inner_resamples, summarize_tune_results)
#> Error in unserialize(node$con): Failed to retrieve the value of MultisessionFuture (<none>) from cluster RichSOCKnode #5 (PID 30132 on localhost 'localhost'). The reason reported was 'error reading from connection'. Post-mortem diagnostic: No process exists with this PID, i.e. the localhost worker is no longer alive.

Created on 2020-08-24 by the reprex package (v0.3.0)

This is the reprex from the multicore run via the command line:

library(tidyverse)
library(tidymodels)
#> ── Attaching packages ────────────────────────────────────── tidymodels 0.1.1 ──
#> ✔ broom     0.7.0      ✔ recipes   0.1.13
#> ✔ dials     0.0.8      ✔ rsample   0.0.7 
#> ✔ infer     0.5.3      ✔ tune      0.1.1 
#> ✔ modeldata 0.0.2      ✔ workflows 0.1.3 
#> ✔ parsnip   0.1.3      ✔ yardstick 0.0.7
#> ── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
#> ✖ scales::discard() masks purrr::discard()
#> ✖ dplyr::filter()   masks stats::filter()
#> ✖ recipes::fixed()  masks stringr::fixed()
#> ✖ dplyr::lag()      masks stats::lag()
#> ✖ yardstick::spec() masks readr::spec()
#> ✖ recipes::step()   masks stats::step()
library(furrr)
#> Loading required package: future

# Simulated data: 200 rows, 16 standard-normal columns followed by 14 binary
# (0/1) columns. The first column is the outcome `y`; the rest are x1..x29.
dat <- cbind(
  matrix(rnorm(200 * 16), nrow = 200, ncol = 16),
  matrix(sample(0:1, 200 * 14, replace = TRUE), nrow = 200, ncol = 14)
) %>%
  data.frame() %>%
  setNames(c("y", paste0("x", 1:29)))

# Nested resampling: both layers use repeated v-fold CV with 2 repeats; the
# number of folds is left at the vfold_cv() default.
nestcv_dat <- nested_cv(
  dat,
  outside = vfold_cv(repeats = 2),
  inside = vfold_cv(repeats = 2)
)

# Fit a glmnet-backed linear regression on the analysis portion of one
# resample and score it on the held-out assessment portion.
#
# penalty: value passed to linear_reg(penalty = ).
# mixture: value passed to linear_reg(mixture = ).
# object:  a single split; analysis() and assessment() are called on it.
#
# Returns the `.estimate` value of rmse() computed on the hold-out rows.
lasso_rmse <- function(penalty, mixture, object) {
  mod <-
    linear_reg(mode = "regression", penalty = penalty, mixture = mixture) %>%
    set_engine("glmnet") %>%
    fit(y ~ ., data = analysis(object))

  # Materialise the hold-out rows once and reuse them for both the
  # predictors and the true outcome.
  holdout <- assessment(object)

  holdout_pred <-
    predict(mod, new_data = dplyr::select(holdout, -y)) %>%
    bind_cols(dplyr::select(holdout, y))

  # Separate statement (not a pipe step): score predictions against truth.
  rmse(holdout_pred, truth = y, estimate = .pred)$.estimate
}

# Score one resample over a fixed penalty x mixture grid.
#
# object: a single split, forwarded unchanged to lasso_rmse().
#
# Returns a tibble with one row per grid point (20 penalty values times
# 6 mixture values) and columns penalty, mixture, and RMSE.
tune_over_penalty <- function(object) {
  # expand.grid() is kept so the row order (penalty varying fastest) is
  # unchanged; `length.out` is spelled out instead of relying on partial
  # argument matching.
  grid <- tibble(
    expand.grid(
      penalty = seq(0.01, 5, length.out = 20),
      mixture = seq(0, 1, by = 0.2)
    )
  )

  # map2_dbl() walks the two grid columns in parallel and guarantees a
  # double vector, unlike sapply() whose return type depends on its input.
  grid %>%
    mutate(RMSE = map2_dbl(penalty, mixture, lasso_rmse, object = object))
}

# Tune every split held in `object$splits` and average the RMSE for each
# penalty/mixture combination.
#
# Returns an ungrouped tibble with columns penalty, mixture, mean_RMSE
# (NA values ignored) and n (number of RMSE values per combination).
summarize_tune_results <- function(object) {
  per_split <- map_df(object$splits, tune_over_penalty)
  by_params <- group_by(per_split, penalty, mixture)

  summarize(
    by_params,
    mean_RMSE = mean(RMSE, na.rm = TRUE),
    n = length(RMSE),
    .groups = "drop"
  )
}

# Evaluate summarize_tune_results() once per element of
# nestcv_dat$inner_resamples with the multicore future plan active
# (no explicit worker count is given).
plan(multicore)
tuning_results <- future_map(nestcv_dat$inner_resamples, summarize_tune_results)
#> Warning in mccollect(jobs = jobs, wait = TRUE): 1 parallel job did not deliver a
#> result
#> Error: Failed to retrieve the result of MulticoreFuture (<none>) from the forked worker (on localhost; PID 32542). Post-mortem diagnostic: No process exists with this PID, i.e. the forked localhost worker is no longer alive.

Created on 2020-08-24 by the reprex package (v0.3.0)

Finally, this is the reprex from the mclapply run (also via the command line), which completes without error:

library(tidyverse)
library(tidymodels)
#> ── Attaching packages ────────────────────────────────────── tidymodels 0.1.1 ──
#> ✔ broom     0.7.0      ✔ recipes   0.1.13
#> ✔ dials     0.0.8      ✔ rsample   0.0.7 
#> ✔ infer     0.5.3      ✔ tune      0.1.1 
#> ✔ modeldata 0.0.2      ✔ workflows 0.1.3 
#> ✔ parsnip   0.1.3      ✔ yardstick 0.0.7
#> ── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
#> ✖ scales::discard() masks purrr::discard()
#> ✖ dplyr::filter()   masks stats::filter()
#> ✖ recipes::fixed()  masks stringr::fixed()
#> ✖ dplyr::lag()      masks stats::lag()
#> ✖ yardstick::spec() masks readr::spec()
#> ✖ recipes::step()   masks stats::step()
library(parallel)

# Simulated data: 200 rows, 16 standard-normal columns followed by 14 binary
# (0/1) columns. The first column is the outcome `y`; the rest are x1..x29.
dat <- cbind(
  matrix(rnorm(200 * 16), nrow = 200, ncol = 16),
  matrix(sample(0:1, 200 * 14, replace = TRUE), nrow = 200, ncol = 14)
) %>%
  data.frame() %>%
  setNames(c("y", paste0("x", 1:29)))

# Nested resampling: both layers use repeated v-fold CV with 2 repeats; the
# number of folds is left at the vfold_cv() default.
nestcv_dat <- nested_cv(
  dat,
  outside = vfold_cv(repeats = 2),
  inside = vfold_cv(repeats = 2)
)

# Fit a glmnet-backed linear regression on the analysis portion of one
# resample and score it on the held-out assessment portion.
#
# penalty: value passed to linear_reg(penalty = ).
# mixture: value passed to linear_reg(mixture = ).
# object:  a single split; analysis() and assessment() are called on it.
#
# Returns the `.estimate` value of rmse() computed on the hold-out rows.
lasso_rmse <- function(penalty, mixture, object) {
  mod <-
    linear_reg(mode = "regression", penalty = penalty, mixture = mixture) %>%
    set_engine("glmnet") %>%
    fit(y ~ ., data = analysis(object))

  # Materialise the hold-out rows once and reuse them for both the
  # predictors and the true outcome.
  holdout <- assessment(object)

  holdout_pred <-
    predict(mod, new_data = dplyr::select(holdout, -y)) %>%
    bind_cols(dplyr::select(holdout, y))

  # Separate statement (not a pipe step): score predictions against truth.
  rmse(holdout_pred, truth = y, estimate = .pred)$.estimate
}

# Score one resample over a fixed penalty x mixture grid.
#
# object: a single split, forwarded unchanged to lasso_rmse().
#
# Returns a tibble with one row per grid point (20 penalty values times
# 6 mixture values) and columns penalty, mixture, and RMSE.
tune_over_penalty <- function(object) {
  # expand.grid() is kept so the row order (penalty varying fastest) is
  # unchanged; `length.out` is spelled out instead of relying on partial
  # argument matching.
  grid <- tibble(
    expand.grid(
      penalty = seq(0.01, 5, length.out = 20),
      mixture = seq(0, 1, by = 0.2)
    )
  )

  # map2_dbl() walks the two grid columns in parallel and guarantees a
  # double vector, unlike sapply() whose return type depends on its input.
  grid %>%
    mutate(RMSE = map2_dbl(penalty, mixture, lasso_rmse, object = object))
}

# Tune every split held in `object$splits` and average the RMSE for each
# penalty/mixture combination.
#
# Returns an ungrouped tibble with columns penalty, mixture, mean_RMSE
# (NA values ignored) and n (number of RMSE values per combination).
summarize_tune_results <- function(object) {
  per_split <- map_df(object$splits, tune_over_penalty)
  by_params <- group_by(per_split, penalty, mixture)

  summarize(
    by_params,
    mean_RMSE = mean(RMSE, na.rm = TRUE),
    n = length(RMSE),
    .groups = "drop"
  )
}

# Evaluate summarize_tune_results() once per element of
# nestcv_dat$inner_resamples via parallel::mclapply() with mc.cores = 16.
# NOTE(review): mclapply() is documented to report failed workers through
# warnings / try-error or NULL elements rather than by stopping -- confirm
# that every element of tuning_results is a tibble before treating this
# run as successful.
tuning_results <- mclapply(nestcv_dat$inner_resamples,summarize_tune_results,mc.cores=16)

Created on 2020-08-24 by the reprex package (v0.3.0)

Also for reference, my session info:

Session info ``` r devtools::session_info() #> ─ Session info ─────────────────────────────────────────────────────────────── #> setting value #> version R version 4.0.2 (2020-06-22) #> os Ubuntu 20.04.1 LTS #> system x86_64, linux-gnu #> ui X11 #> language en_AU:en #> collate en_AU.UTF-8 #> ctype en_AU.UTF-8 #> tz Australia/Perth #> date 2020-08-24 #> #> ─ Packages ─────────────────────────────────────────────────────────────────── #> package * version date lib source #> assertthat 0.2.1 2019-03-21 [1] CRAN (R 4.0.2) #> backports 1.1.8 2020-06-17 [1] CRAN (R 4.0.2) #> blob 1.2.1 2020-01-20 [1] CRAN (R 4.0.2) #> broom * 0.7.0 2020-07-09 [1] CRAN (R 4.0.2) #> callr 3.4.3 2020-03-28 [1] CRAN (R 4.0.2) #> cellranger 1.1.0 2016-07-27 [1] CRAN (R 4.0.2) #> class 7.3-17 2020-04-26 [4] CRAN (R 4.0.0) #> cli 2.0.2 2020-02-28 [1] CRAN (R 4.0.2) #> codetools 0.2-16 2018-12-24 [4] CRAN (R 4.0.0) #> colorspace 1.4-1 2019-03-18 [1] CRAN (R 4.0.2) #> crayon 1.3.4 2017-09-16 [1] CRAN (R 4.0.2) #> DBI 1.1.0 2019-12-15 [1] CRAN (R 4.0.2) #> dbplyr 1.4.4 2020-05-27 [1] CRAN (R 4.0.2) #> desc 1.2.0 2018-05-01 [1] CRAN (R 4.0.2) #> devtools 2.3.1 2020-07-21 [1] CRAN (R 4.0.2) #> dials * 0.0.8 2020-07-08 [1] CRAN (R 4.0.2) #> DiceDesign 1.8-1 2019-07-31 [1] CRAN (R 4.0.2) #> digest 0.6.25 2020-02-23 [1] CRAN (R 4.0.2) #> dplyr * 1.0.2 2020-08-18 [1] CRAN (R 4.0.2) #> ellipsis 0.3.1 2020-05-15 [1] CRAN (R 4.0.2) #> evaluate 0.14 2019-05-28 [1] CRAN (R 4.0.2) #> fansi 0.4.1 2020-01-08 [1] CRAN (R 4.0.2) #> forcats * 0.5.0 2020-03-01 [1] CRAN (R 4.0.2) #> foreach 1.5.0 2020-03-30 [1] CRAN (R 4.0.2) #> fs 1.5.0 2020-07-31 [1] CRAN (R 4.0.2) #> furrr * 0.1.0 2018-05-16 [1] CRAN (R 4.0.2) #> future * 1.18.0 2020-07-09 [1] CRAN (R 4.0.2) #> generics 0.0.2 2018-11-29 [1] CRAN (R 4.0.2) #> ggplot2 * 3.3.2 2020-06-19 [1] CRAN (R 4.0.2) #> globals 0.12.5 2019-12-07 [1] CRAN (R 4.0.2) #> glue 1.4.1 2020-05-13 [1] CRAN (R 4.0.2) #> gower 0.2.2 2020-06-23 [1] CRAN (R 4.0.2) #> GPfit 1.0-8 2019-02-08 
[1] CRAN (R 4.0.2) #> gtable 0.3.0 2019-03-25 [1] CRAN (R 4.0.2) #> haven 2.3.1 2020-06-01 [1] CRAN (R 4.0.2) #> highr 0.8 2019-03-20 [1] CRAN (R 4.0.2) #> hms 0.5.3 2020-01-08 [1] CRAN (R 4.0.2) #> htmltools 0.5.0 2020-06-16 [1] CRAN (R 4.0.2) #> httr 1.4.2 2020-07-20 [1] CRAN (R 4.0.2) #> infer * 0.5.3 2020-07-14 [1] CRAN (R 4.0.2) #> ipred 0.9-9 2019-04-28 [1] CRAN (R 4.0.2) #> iterators 1.0.12 2019-07-26 [1] CRAN (R 4.0.2) #> jsonlite 1.7.0 2020-06-25 [1] CRAN (R 4.0.2) #> knitr 1.29 2020-06-23 [1] CRAN (R 4.0.2) #> lattice 0.20-41 2020-04-02 [4] CRAN (R 4.0.0) #> lava 1.6.7 2020-03-05 [1] CRAN (R 4.0.2) #> lhs 1.0.2 2020-04-13 [1] CRAN (R 4.0.2) #> lifecycle 0.2.0 2020-03-06 [1] CRAN (R 4.0.2) #> listenv 0.8.0 2019-12-05 [1] CRAN (R 4.0.2) #> lubridate 1.7.9 2020-06-08 [1] CRAN (R 4.0.2) #> magrittr 1.5 2014-11-22 [1] CRAN (R 4.0.2) #> MASS 7.3-51.6 2020-04-26 [4] CRAN (R 4.0.0) #> Matrix 1.2-18 2019-11-27 [4] CRAN (R 4.0.0) #> memoise 1.1.0 2017-04-21 [1] CRAN (R 4.0.2) #> modeldata * 0.0.2 2020-06-22 [1] CRAN (R 4.0.2) #> modelr 0.1.8 2020-05-19 [1] CRAN (R 4.0.2) #> munsell 0.5.0 2018-06-12 [1] CRAN (R 4.0.2) #> nnet 7.3-14 2020-04-26 [4] CRAN (R 4.0.0) #> parsnip * 0.1.3 2020-08-04 [1] CRAN (R 4.0.2) #> pillar 1.4.6 2020-07-10 [1] CRAN (R 4.0.2) #> pkgbuild 1.1.0 2020-07-13 [1] CRAN (R 4.0.2) #> pkgconfig 2.0.3 2019-09-22 [1] CRAN (R 4.0.2) #> pkgload 1.1.0 2020-05-29 [1] CRAN (R 4.0.2) #> plyr 1.8.6 2020-03-03 [1] CRAN (R 4.0.2) #> prettyunits 1.1.1 2020-01-24 [1] CRAN (R 4.0.2) #> pROC 1.16.2 2020-03-19 [1] CRAN (R 4.0.2) #> processx 3.4.3 2020-07-05 [1] CRAN (R 4.0.2) #> prodlim 2019.11.13 2019-11-17 [1] CRAN (R 4.0.2) #> ps 1.3.4 2020-08-11 [1] CRAN (R 4.0.2) #> purrr * 0.3.4 2020-04-17 [1] CRAN (R 4.0.2) #> R6 2.4.1 2019-11-12 [1] CRAN (R 4.0.2) #> Rcpp 1.0.5 2020-07-06 [1] CRAN (R 4.0.2) #> readr * 1.3.1 2018-12-21 [1] CRAN (R 4.0.2) #> readxl 1.3.1 2019-03-13 [1] CRAN (R 4.0.2) #> recipes * 0.1.13 2020-06-23 [1] CRAN (R 4.0.2) #> remotes 2.2.0 
2020-07-21 [1] CRAN (R 4.0.2) #> reprex 0.3.0 2019-05-16 [1] CRAN (R 4.0.2) #> rlang 0.4.7 2020-07-09 [1] CRAN (R 4.0.2) #> rmarkdown 2.3 2020-06-18 [1] CRAN (R 4.0.2) #> rpart 4.1-15 2019-04-12 [4] CRAN (R 4.0.0) #> rprojroot 1.3-2 2018-01-03 [1] CRAN (R 4.0.2) #> rsample * 0.0.7 2020-06-04 [1] CRAN (R 4.0.2) #> rstudioapi 0.11 2020-02-07 [1] CRAN (R 4.0.2) #> rvest 0.3.6 2020-07-25 [1] CRAN (R 4.0.2) #> scales * 1.1.1 2020-05-11 [1] CRAN (R 4.0.2) #> sessioninfo 1.1.1 2018-11-05 [1] CRAN (R 4.0.2) #> stringi 1.4.6 2020-02-17 [1] CRAN (R 4.0.2) #> stringr * 1.4.0 2019-02-10 [1] CRAN (R 4.0.2) #> survival 3.1-12 2020-04-10 [4] CRAN (R 4.0.0) #> testthat 2.3.2 2020-03-02 [1] CRAN (R 4.0.2) #> tibble * 3.0.3 2020-07-10 [1] CRAN (R 4.0.2) #> tidymodels * 0.1.1 2020-07-14 [1] CRAN (R 4.0.2) #> tidyr * 1.1.1 2020-07-31 [1] CRAN (R 4.0.2) #> tidyselect 1.1.0 2020-05-11 [1] CRAN (R 4.0.2) #> tidyverse * 1.3.0 2019-11-21 [1] CRAN (R 4.0.2) #> timeDate 3043.102 2018-02-21 [1] CRAN (R 4.0.2) #> tune * 0.1.1 2020-07-08 [1] CRAN (R 4.0.2) #> usethis 1.6.1 2020-04-29 [1] CRAN (R 4.0.2) #> vctrs 0.3.2 2020-07-15 [1] CRAN (R 4.0.2) #> withr 2.2.0 2020-04-20 [1] CRAN (R 4.0.2) #> workflows * 0.1.3 2020-08-10 [1] CRAN (R 4.0.2) #> xfun 0.16 2020-07-24 [1] CRAN (R 4.0.2) #> xml2 1.3.2 2020-04-23 [1] CRAN (R 4.0.2) #> yaml 2.2.1 2020-02-01 [1] CRAN (R 4.0.2) #> yardstick * 0.0.7 2020-07-13 [1] CRAN (R 4.0.2) #> #> [1] /home/andrew/R/x86_64-pc-linux-gnu-library/4.0 #> [2] /usr/local/lib/R/site-library #> [3] /usr/lib/R/site-library #> [4] /usr/lib/R/library ```

Apologies for the reprex overload, but I hope it's helpful! Thanks!

DavisVaughan commented 4 years ago

Hmm, I'm not sure what is happening here.

Is there any way you can simplify this at all to help isolate the problem? It is a little hard to debug as is.

I tried it with fewer v-folds and with a scaled-back grid in tune_over_penalty(), and it works for me with multisession. It took too long without scaling it back some. Specifically, I tried the code in the collapsed section below. Does that work for you?

```r
library(tidyverse)
library(tidymodels)
library(furrr)

# Same simulated data as the original report: 200 rows, 16 normal columns
# followed by 14 binary columns, named y, x1..x29.
dat = cbind(structure(rnorm(200*16), dim = c(200, 16)),
            structure(sample(0:1, 200*14, replace = T), dim = c(200, 14))) %>%
  data.frame() %>%
  setNames(c("y", paste0("x", 1:29)))

# Scaled back: 4 outer folds (x2 repeats), 10 inner folds (x2 repeats).
nestcv_dat <- nested_cv(dat,
                        outside = vfold_cv(v = 4, repeats = 2),
                        inside = vfold_cv(v = 10, repeats = 2))

lasso_rmse <- function(penalty, mixture, object) {
  mod <-
    linear_reg(mode = "regression", penalty = penalty, mixture = mixture) %>%
    set_engine("glmnet") %>%
    fit(y ~ ., data = analysis(object))

  holdout_pred =
    predict(mod, assessment(object) %>% dplyr::select(-y)) %>%
    bind_cols(assessment(object) %>% dplyr::select(y), penalty = penalty)

  rmse(holdout_pred, truth = y, estimate = .pred)$.estimate
}

# Scaled back: 3 penalty values x 3 mixture values.
tune_over_penalty <- function(object) {
  tmp_fun = function(i, pen, mix, object) {
    lasso_rmse(pen[i], mix[i], object)
  }
  tibble(expand.grid(penalty = seq(0.01, 5, length = 3),
                     mixture = seq(0, 1, 0.5))) %>%
    mutate(RMSE = sapply(1:nrow(.), tmp_fun, penalty, mixture, object))
}

summarize_tune_results <- function(object) {
  map_df(object$splits, tune_over_penalty) %>%
    group_by(penalty, mixture) %>%
    summarize(mean_RMSE = mean(RMSE, na.rm = TRUE),
              n = length(RMSE),
              .groups = "drop")
}

plan(multisession, workers = 4)
tuning_results <- future_map(nestcv_dat$inner_resamples, summarize_tune_results)

# Restore the default sequential plan once the run has finished.
plan(sequential)
```

You might also try updating to the dev version of furrr. I made a number of changes there; however, none of them would be specific to this issue.

DavisVaughan commented 4 years ago

I'm going to close this issue, as I'm not currently sure how to advise without more information.