tidymodels / finetune

Additional functions for model tuning
https://finetune.tidymodels.org/
Other
62 stars 8 forks source link

Error using tune_sim_anneal and tune_race_anova for custom metric function #73

Closed ruddnr closed 9 months ago

ruddnr commented 1 year ago

Hi. I encountered an error using finetune. It happens when I try to set options for certain metrics. I used the same function in the document for metric_set. The metric function works fine in tune_grid, but it fails when I try to use tune_sim_anneal and tune_race_anova. Thanks in advance!

library(tidymodels)
library(finetune)
data(ames)

ames <- mutate(ames, Sale_Price = log10(Sale_Price))

set.seed(502)
ames_split <- initial_split(ames, prop = 0.80, strata = Sale_Price)
ames_train <- training(ames_split)
ames_test  <-  testing(ames_split)
ames_folds <- vfold_cv(ames_train, v = 10)

ames_rec <-
  recipe(Sale_Price ~ Neighborhood + Gr_Liv_Area + Year_Built + Bldg_Type +
           Latitude + Longitude, data = ames_train) %>%
  step_log(Gr_Liv_Area, base = 10) %>%
  step_other(Neighborhood, threshold = 0.01) %>%
  step_dummy(all_nominal_predictors()) %>%
  step_interact( ~ Gr_Liv_Area:starts_with("Bldg_Type_") ) %>%
  step_ns(Latitude, Longitude, deg_free = 20)

rf_model <-
  rand_forest(trees = tune()) %>%
  # rand_forest(trees = 1000) %>%
  set_engine("ranger") %>%
  set_mode("regression")

rf_wflow <-
  workflow() %>%
  add_formula(
    Sale_Price ~ Neighborhood + Gr_Liv_Area + Year_Built + Bldg_Type +
      Latitude + Longitude) %>%
  add_model(rf_model)

grid <- parameters(trees(c(10, 100))) %>% 
  grid_max_entropy(size = 10)

ccc_with_bias <- function(data, truth, estimate, na_rm = TRUE, ...) {
  ccc(
    data = data,
    truth = !!rlang::enquo(truth),
    estimate = !!rlang::enquo(estimate),
    # set bias = TRUE
    bias = TRUE,
    na_rm = na_rm,
    ...
  )
}

# Use `new_numeric_metric()` to formalize this new metric function
ccc_with_bias <- new_numeric_metric(ccc_with_bias, "maximize")

model_metric <- metric_set(ccc_with_bias)

tune_res <- tune_grid(
  rf_wflow,
  ames_folds,
  grid = grid,
  metrics = model_metric
)

tune_res_anova <- tune_race_anova(
  rf_wflow,
  ames_folds,
  grid = grid,
  metrics = model_metric
)
#> Warning in max(best_config$B, na.rm = TRUE): no non-missing arguments to max;
#> returning -Inf
#> Error in `contrasts<-`(`*tmp*`, value = contr.funs[1 + isOF[nn]]): contrasts can be applied only to factors with 2 or more levels

tune_res_anneal <- tune_sim_anneal(
  rf_wflow,
  ames_folds,
  metrics = model_metric
)
#> Optimizing ccc_with_bias
#> Warning in max(which(x$global_best)): no non-missing arguments to max;
#> returning -Inf
#> Warning in max(x$.iter): no non-missing arguments to max; returning -Inf
#> Warning in max(x$mean[x$.iter == 0], na.rm = TRUE): no non-missing arguments to
#> max; returning -Inf
#> Initial best: -Inf
#> Error in 1:prev_ind: argument of length 0
#> ✖ Optimization stopped prematurely; returning current results.

Created on 2023-05-23 with reprex v2.0.2

EmilHvitfeldt commented 1 year ago

You are getting this error because your custom metric ccc_with_bias() returned a tibble with .metric value of ccc where it should have returned a value of ccc_with_bias().

# What it returns
ccc_with_bias(solubility_test, solubility, prediction)
#> # A tibble: 1 × 3
#>   .metric .estimator .estimate
#>   <chr>   <chr>          <dbl>
#> 1 ccc     standard       0.937
# What it should return
ccc_with_bias(solubility_test, solubility, prediction)
#> # A tibble: 1 × 3
#>   .metric       .estimator .estimate
#>   <chr>         <chr>          <dbl>
#> 1 ccc_with_bias standard       0.937

I modified ccc_with_bias() for you, and now it works as it should.

library(tidymodels)
library(finetune)
data(ames)

ames <- mutate(ames, Sale_Price = log10(Sale_Price))

set.seed(502)
ames_split <- initial_split(ames, prop = 0.80, strata = Sale_Price)
ames_train <- training(ames_split)
ames_test  <-  testing(ames_split)
ames_folds <- vfold_cv(ames_train, v = 10)

ames_rec <-
  recipe(Sale_Price ~ Neighborhood + Gr_Liv_Area + Year_Built + Bldg_Type +
           Latitude + Longitude, data = ames_train) %>%
  step_log(Gr_Liv_Area, base = 10) %>%
  step_other(Neighborhood, threshold = 0.01) %>%
  step_dummy(all_nominal_predictors()) %>%
  step_interact( ~ Gr_Liv_Area:starts_with("Bldg_Type_") ) %>%
  step_ns(Latitude, Longitude, deg_free = 20)

rf_model <-
  rand_forest(trees = tune()) %>%
  # rand_forest(trees = 1000) %>%
  set_engine("ranger") %>%
  set_mode("regression")

rf_wflow <-
  workflow() %>%
  add_formula(
    Sale_Price ~ Neighborhood + Gr_Liv_Area + Year_Built + Bldg_Type +
      Latitude + Longitude) %>%
  add_model(rf_model)

grid <- parameters(trees(c(10, 100))) %>% 
  grid_max_entropy(size = 10)

ccc_with_bias <- function(data, truth, estimate, na_rm = TRUE, case_weights = NULL, ...) {
  res <- ccc(
    data = data,
    truth = !!rlang::enquo(truth),
    estimate = !!rlang::enquo(estimate),
    # set bias = TRUE
    bias = FALSE,
    na_rm = na_rm,
    case_weights = !!rlang::enquo(case_weights),
    ...
  )
  res$.metric <- "ccc_with_bias"
  res
}

# Use `new_numeric_metric()` to formalize this new metric function
ccc_with_bias <- new_numeric_metric(ccc_with_bias, "maximize")

model_metric <- metric_set(ccc_with_bias)

tune_res_anova <- tune_race_anova(
  rf_wflow,
  ames_folds,
  grid = grid,
  metrics = model_metric
)

Created on 2023-05-26 with reprex v2.0.2

simonpcouch commented 9 months ago

As this issue hasn't seen any activity in a while, I'm going to go ahead and close. Thanks for the issue!