tidymodels / tune

Tools for tidy parameter tuning
https://tune.tidymodels.org
Other
276 stars 42 forks source link

Inconsistent handling of nonlinear formula by parsnip::fit() and tune::fit_resamples() #534

Closed misken closed 11 months ago

misken commented 2 years ago

Reconcile handling of nonlinear formulas by parsnip::fit() and tune::fit_resamples()

Not sure if this is a feature or bug or neither. Created a custom parsnip model for nonlinear regression using nls as the engine (followed https://www.tidymodels.org/learn/develop/models/). Fit a simple power function $b1 x^{b2}$ with parsnip::fit(). Works. Wanted to use same model with k-crossfold validation. However, tune::fit_resamples() gives error saying formula contains invalid power (I imagine because it's wanting a constant power not a parameter to be estimated). Tried working around it with recipes but it also doesn't like the nonlinear formula and gives error saying that inline functions are not allowed (I'm assuming it's the ^ operator it is objecting to). If there's no way to bypass the formula checking in tune::fit_resamples(), seems like the workaround is to manually do the looping through the crossfold split object using parsnip::fit() to do model fitting and then add code for assessment of model fit. A reprex is below.

# libraries
library(tidymodels)

# Create synthetic data
x <- seq(1, 12)
epsilon <- rnorm(12)
a <- 5
b <- 1.5

y <- a * x ^ b + epsilon

df <- data.frame(x, y)

# 
# Set up resampling
# 
set.seed(57)
kfold_number <- 5
kfold_repeats <- 1

splits <- vfold_cv(df, v = kfold_number, repeats = kfold_repeats)

# 
# Register new parsnip model based on nls()
# 
set_new_model("nonlinear_reg")
set_model_mode(model = "nonlinear_reg", mode = "regression")
set_model_engine(
  "nonlinear_reg", 
  mode = "regression", 
  eng = "nls"
)
set_dependency("nonlinear_reg", eng = "nls", pkg = "stats")

# 
# Create the model function and add fit method and encoding details
# 
nonlinear_reg <-
  function(mode = "regression",  start = NULL) {
    # Check for correct mode
    if (mode  != "regression") {
      rlang::abort("`mode` should be 'regression'")
    }

    # Capture the arguments in quosures
    args <- list(start = rlang::enquo(start))

    # Save some empty slots for future parts of the specification
    new_model_spec(
      "nonlinear_reg",
      args = args,
      mode = mode,
      engine = NULL,
      eng_args = NULL,
      method = NULL
    )
  }

set_fit(
  model = "nonlinear_reg",
  eng = "nls",
  mode = "regression",
  value = list(
    interface = "formula",
    protect = c("formula", "data"),
    func = c(pkg = "stats", fun = "nls"),
    defaults = list()
  )
)

set_encoding(
  model = "nonlinear_reg",
  eng = "nls",
  mode = "regression",
  options = list(
    predictor_indicators = "none",
    compute_intercept = FALSE,
    remove_intercept = FALSE,
    allow_sparse_x = FALSE
  )
)

# 
# Add modules for prediction
# 
response_info <- 
  list(
    pre = NULL,
    post = NULL,
    func = c(fun = "predict"),
    args =
      # These lists should be of the form:
      # {predict.nls argument name} = {values provided from parsnip objects}
      list(
        # We don't want the first two arguments evaluated right now
        # since they don't exist yet. 
        object = quote(object$fit),
        newdata = quote(new_data)
      )
  )

set_pred(
  model = "nonlinear_reg",
  eng = "nls",
  mode = "regression",
  type = "numeric",
  value = response_info
)

# 
# Show that the new nls based parsnip model works if we call fit with 
# entire data set or with some subset generated by vfold_cv()

# Need starting values for nls()
init_nls <- c(b1=1, b2=2)
nls_mod <- nonlinear_reg() %>% set_engine(engine = "nls", start = init_nls)

# Call fit with a specific model formula - a power function

nls_mod_formula <- y ~ b1 * (x ^ b2) 

nls_mod_fit <- 
  nls_mod %>% 
  fit(formula = nls_mod_formula, data = df)

nls_mod_fit
#> parsnip model object
#> 
#> Nonlinear regression model
#>   model: y ~ b1 * (x^b2)
#>    data: data
#>    b1    b2 
#> 5.042 1.495 
#>  residual sum-of-squares: 8.048
#> 
#> Number of iterations to convergence: 6 
#> Achieved convergence tolerance: 3.56e-06

# Things also work with some arbitrary split

nls_mod_fit2 <- 
  nls_mod %>% 
  fit(formula = nls_mod_formula, data = as.data.frame(splits$splits[[1]]))

nls_mod_fit2
#> parsnip model object
#> 
#> Nonlinear regression model
#>   model: y ~ b1 * (x^b2)
#>    data: data
#>    b1    b2 
#> 5.052 1.495 
#>  residual sum-of-squares: 7.496
#> 
#> Number of iterations to convergence: 6 
#> Achieved convergence tolerance: 4.676e-06

# 
# Now try to use the new parsnip model with resampling. It seems that
# tune::fit_resamples() does some formula checking that parsnip::fit()
# does not and tune::fit_resamples() does not like nonlinear models such as power functions
# in which one of the parameters to estimate is a power. I thought
# I might be able to use a recipe to get around this but the recipes
# package ends up with a similar error in that it thinks the formula
# uses inline functions (it does, it uses ^).
# 
# I can't see a way to bypass this formula checking in fit_resamples(). 
# The other alternative is to manually do the looping through the 
# splits object to fit models and assess performance with resampling.
# 

# Pipe our model through fit_resamples to see the error
nls_mod %>% 
  fit_resamples(preprocessor = nls_mod_formula, resamples = splits)
#> x Fold1: preprocessor 1/1: Error in terms.formula(formula, data = data): invalid power in formula
#> x Fold2: preprocessor 1/1: Error in terms.formula(formula, data = data): invalid power in formula
#> x Fold3: preprocessor 1/1: Error in terms.formula(formula, data = data): invalid power in formula
#> x Fold4: preprocessor 1/1: Error in terms.formula(formula, data = data): invalid power in formula
#> x Fold5: preprocessor 1/1: Error in terms.formula(formula, data = data): invalid power in formula
#> Warning: All models failed. Run `show_notes(.Last.tune.result)` for more
#> information.
#> # Resampling results
#> # 5-fold cross-validation 
#> # A tibble: 5 × 4
#>   splits         id    .metrics .notes          
#>   <list>         <chr> <list>   <list>          
#> 1 <split [9/3]>  Fold1 <NULL>   <tibble [1 × 3]>
#> 2 <split [9/3]>  Fold2 <NULL>   <tibble [1 × 3]>
#> 3 <split [10/2]> Fold3 <NULL>   <tibble [1 × 3]>
#> 4 <split [10/2]> Fold4 <NULL>   <tibble [1 × 3]>
#> 5 <split [10/2]> Fold5 <NULL>   <tibble [1 × 3]>
#> 
#> There were issues with some computations:
#> 
#>   - Error(s) x5: Error in terms.formula(formula, data = data): invalid power in fo...
#> 
#> Run `show_notes(.Last.tune.result)` for more information.

show_notes(.Last.tune.result)
#> unique notes:
#> ──────────────────────────────────────────────────────────────────────
#> Error in terms.formula(formula, data = data): invalid power in formula

# We would get same error if we used a workflow object
# nls_wflow <- 
#   workflow() %>% 
#   add_model(nls_mod) %>% 
#   add_formula(nls_mod_formula)
# 
# # Pipe our workflow through fit_resamples
# nls_wflow %>% 
#   fit_resamples(resamples = splits)

Created on 2022-08-15 by the reprex package (v2.0.1)

misken commented 2 years ago

In addition, even using parsnip::fit() (i.e. no resampling) within a workflow containing add_formula also leads to the "invalid power in formula" error. But as shown in the reprex above, parsnip::fit(formula = some_nonlinear_formula, data = some_data) will work just fine.

simonpcouch commented 11 months ago

Thanks for the issue, @misken, and apologies for the delay!

The trick here is to use the formula argument of add_model() in workflows. From that package's docs:

formula: An optional formula override to specify the terms of the model. Typically, the terms are extracted from the formula or recipe preprocessing methods. However, some models (like survival and bayesian models) use the formula not to preprocess, but to specify the structure of the model. In those cases, a formula specifying the model structure must be passed unchanged into the model call itself. This argument is used for those purposes.

So, pass the formula that specifies the variables of interest (e.g. y ~ x) as the preprocessor formula, and the other as the "model formula." In your example:

# libraries
library(tidymodels)

# Create synthetic data
x <- seq(1, 12)
epsilon <- rnorm(12)
a <- 5
b <- 1.5

y <- a * x ^ b + epsilon

df <- data.frame(x, y)

# 
# Set up resampling
# 
set.seed(57)
kfold_number <- 5
kfold_repeats <- 1

splits <- vfold_cv(df, v = kfold_number, repeats = kfold_repeats)

# 
# Register new parsnip model based on nls()
# 
set_new_model("nonlinear_reg")
set_model_mode(model = "nonlinear_reg", mode = "regression")
set_model_engine(
  "nonlinear_reg", 
  mode = "regression", 
  eng = "nls"
)
set_dependency("nonlinear_reg", eng = "nls", pkg = "stats")

# 
# Create the model function and add fit method and encoding details
# 
nonlinear_reg <-
  function(mode = "regression",  start = NULL) {
    # Check for correct mode
    if (mode  != "regression") {
      rlang::abort("`mode` should be 'regression'")
    }

    # Capture the arguments in quosures
    args <- list(start = rlang::enquo(start))

    # Save some empty slots for future parts of the specification
    new_model_spec(
      "nonlinear_reg",
      args = args,
      mode = mode,
      engine = NULL,
      eng_args = NULL,
      method = NULL
    )
  }

set_fit(
  model = "nonlinear_reg",
  eng = "nls",
  mode = "regression",
  value = list(
    interface = "formula",
    protect = c("formula", "data"),
    func = c(pkg = "stats", fun = "nls"),
    defaults = list()
  )
)

set_encoding(
  model = "nonlinear_reg",
  eng = "nls",
  mode = "regression",
  options = list(
    predictor_indicators = "none",
    compute_intercept = FALSE,
    remove_intercept = FALSE,
    allow_sparse_x = FALSE
  )
)

# 
# Add modules for prediction
# 
response_info <- 
  list(
    pre = NULL,
    post = NULL,
    func = c(fun = "predict"),
    args =
      # These lists should be of the form:
      # {predict.nls argument name} = {values provided from parsnip objects}
      list(
        # We don't want the first two arguments evaluated right now
        # since they don't exist yet. 
        object = quote(object$fit),
        newdata = quote(new_data)
      )
  )

set_pred(
  model = "nonlinear_reg",
  eng = "nls",
  mode = "regression",
  type = "numeric",
  value = response_info
)

# 
# Show that the new nls based parsnip model works if we call fit with 
# entire data set or with some subset generated by vfold_cv()

# Need starting values for nls()
init_nls <- c(b1=1, b2=2)
nls_mod <- nonlinear_reg() %>% set_engine(engine = "nls", start = init_nls)

# Call fit with a specific model formula - a power function

nls_mod_formula <- y ~ b1 * (x ^ b2) 

nls_wf <-
  workflow() %>%
  add_formula(y ~ x) %>%
  add_model(nls_mod, formula = nls_mod_formula)

# Pipe our model through fit_resamples to see the error
fit_resamples(nls_wf, resamples = splits)
#> # Resampling results
#> # 5-fold cross-validation 
#> # A tibble: 5 × 4
#>   splits         id    .metrics         .notes          
#>   <list>         <chr> <list>           <list>          
#> 1 <split [9/3]>  Fold1 <tibble [2 × 4]> <tibble [0 × 3]>
#> 2 <split [9/3]>  Fold2 <tibble [2 × 4]> <tibble [0 × 3]>
#> 3 <split [10/2]> Fold3 <tibble [2 × 4]> <tibble [0 × 3]>
#> 4 <split [10/2]> Fold4 <tibble [2 × 4]> <tibble [0 × 3]>
#> 5 <split [10/2]> Fold5 <tibble [2 × 4]> <tibble [0 × 3]>

Created on 2023-10-31 with reprex v2.0.2

github-actions[bot] commented 10 months ago

This issue has been automatically locked. If you believe you have found a related problem, please file a new issue (with a reprex: https://reprex.tidyverse.org) and link to this issue.