DavisVaughan / furrr

Apply Mapping Functions in Parallel using Futures
https://furrr.futureverse.org/
Other
695 stars 39 forks source link

Multiple calls of future_map() within a single plan() or script result in massive slowdown. #268

Open padpadpadpad opened 2 months ago

padpadpadpad commented 2 months ago

I have multiple datasets in a nested tibble, and I am using multiple future_map() calls to fit different non-linear model formulations to each dataset.

For a couple of models it is quicker to use future_map() than map(), but I have found that it becomes very slow (and essentially hangs) when I get to about 7 models.

I have attached a reproducible furrr vs. purrr comparison in which furrr outperforms purrr, but when I add more calls, furrr basically hangs by the 6th or 7th call (i.e. the 6th or 7th model).

I think it is a memory issue, but I was wondering if there are any recommendations for improving memory usage across multiple future_map() calls.

I tried to look at whether individual models were causing a bottleneck by doing each future_map() call individually within a plan() and using plan(sequential) to close it afterwards, but I still got a huge slowdown in future_map() performance after 6 or 7 calls.

I have also checked that the nested data frame is not grouped, as I know that is a known issue.

Many thanks, Dan


# ---------------------------
# Purpose of script: Compare future_map() and map()
#
# What this script does:
# 1. Compare furrr and purrr for a small dataset
#
# Author: Dr. Daniel Padfield
#
# Date Created: 2024-05-29
#
# Copyright (c) Daniel Padfield, 2024
#
# ---------------------------
#
# Notes:
#
# ---------------------------

# Bootstrap: the librarian package is needed for the shelf() call below,
# so install it first when it is not already available.
if (!requireNamespace("librarian", quietly = TRUE)) {
  install.packages("librarian")
}

# Load every package the script relies on through librarian.
librarian::shelf(
  tidyverse,
  rTPC,
  nls.multstart,
  furrr,
  progressr,
  microbenchmark
)

## ---------------------------

# Read in the Chlorella TPC data. The dataset is loaded exactly once (the
# original script called data() twice); naming the package makes it explicit
# where `chlorella_tpc` comes from.
data("chlorella_tpc", package = "rTPC")

# Working copy of the data; a few models are fitted to it below.
d <- chlorella_tpc

# compare future_map with map ####

# Sequential baseline: time purrr::map() fitting two nls_multstart models
# (beta_2012 and boatman_2017) to the first 10 curves, one list-column per
# model. Each fit is wrapped in possibly() so that a failing curve yields NA
# instead of aborting the whole benchmark. The expression is run once.
check_purrr <- microbenchmark(
  purrr = d %>%
    filter(curve_id <= 10) %>%
    nest(data = c(temp, rate)) %>%
    mutate(
      beta = map(
        data,
        possibly(
          ~ nls_multstart(
            rate ~ beta_2012(temp = temp, a, b, c, d, e),
            data = .x,
            iter = rep(6, 5),
            start_lower = get_start_vals(.x$temp, .x$rate, model_name = "beta_2012") * 0.5,
            start_upper = get_start_vals(.x$temp, .x$rate, model_name = "beta_2012") * 1.5,
            lower = get_lower_lims(.x$temp, .x$rate, model_name = "beta_2012"),
            upper = get_upper_lims(.x$temp, .x$rate, model_name = "beta_2012"),
            supp_errors = "Y",
            convergence_count = FALSE
          ),
          otherwise = NA
        )
      ),
      boatman = map(
        data,
        possibly(
          ~ nls_multstart(
            rate ~ boatman_2017(temp = temp, rmax, tmin, tmax, a, b),
            data = .x,
            iter = rep(5, 5),
            start_lower = get_start_vals(.x$temp, .x$rate, model_name = "boatman_2017") * 0.5,
            start_upper = get_start_vals(.x$temp, .x$rate, model_name = "boatman_2017") * 1.5,
            lower = get_lower_lims(.x$temp, .x$rate, model_name = "boatman_2017"),
            upper = get_upper_lims(.x$temp, .x$rate, model_name = "boatman_2017"),
            supp_errors = "Y",
            convergence_count = FALSE
          ),
          otherwise = NA
        )
      )
    ),
  times = 1
)

# show the purrr timing
check_purrr

# Parallel counterpart of the purrr benchmark: the same two nls_multstart
# fits on the first 10 curves, but mapped with furrr::future_map() on a
# multisession plan with 3 workers. The expression is run once.
#
# Fix: the original passed an extra `p = p` argument to both nls_multstart()
# calls. No object `p` is created anywhere in this script (it looks like a
# leftover progressr progressor), and the purrr version has no such argument,
# so the two benchmarks were not fitting the models with the same call. The
# argument is dropped here so both benchmarks are directly comparable.
#
# NOTE: plan() is deliberately left inside the timed expression, as in the
# original, so setting up the multisession plan is part of the measured time.
# No later plan() call is visible in this script, so the multisession plan
# remains the active plan after the benchmark finishes.
check_furrr <- microbenchmark(
  furrr = {
    plan(multisession, workers = 3)

    filter(d, curve_id <= 10) %>%
      nest(., data = c(temp, rate)) %>%
      mutate(
        beta = future_map(
          data,
          possibly(
            ~ nls_multstart(
              rate ~ beta_2012(temp = temp, a, b, c, d, e),
              data = .x,
              iter = c(6, 6, 6, 6, 6),
              start_lower = get_start_vals(.x$temp, .x$rate, model_name = "beta_2012") * 0.5,
              start_upper = get_start_vals(.x$temp, .x$rate, model_name = "beta_2012") * 1.5,
              lower = get_lower_lims(.x$temp, .x$rate, model_name = "beta_2012"),
              upper = get_upper_lims(.x$temp, .x$rate, model_name = "beta_2012"),
              supp_errors = "Y",
              convergence_count = FALSE
            ),
            NA
          )
        ),
        boatman = future_map(
          data,
          possibly(
            ~ nls_multstart(
              rate ~ boatman_2017(temp = temp, rmax, tmin, tmax, a, b),
              data = .x,
              iter = c(5, 5, 5, 5, 5),
              start_lower = get_start_vals(.x$temp, .x$rate, model_name = "boatman_2017") * 0.5,
              start_upper = get_start_vals(.x$temp, .x$rate, model_name = "boatman_2017") * 1.5,
              lower = get_lower_lims(.x$temp, .x$rate, model_name = "boatman_2017"),
              upper = get_upper_lims(.x$temp, .x$rate, model_name = "boatman_2017"),
              supp_errors = "Y",
              convergence_count = FALSE
            ),
            NA
          )
        )
      )
  },
  times = 1
)

# print both timings for a side-by-side comparison
check_furrr
check_purrr