tidymodels / tune

Tools for tidy parameter tuning
https://tune.tidymodels.org
Other
285 stars 42 forks source link

`fit_resample()` works for KNN, LM, SVM, etc., but not glmnet #386

Closed PathosEthosLogos closed 3 years ago

PathosEthosLogos commented 3 years ago
library(tidyverse)
library(rsample)
library(lubridate)
library(tidymodels)
library(stacks)
library(baguette)

# Fix the RNG state so the simulated data below is reproducible.
set.seed(1)

# Simulated data: 1000 rows with a random response and random
# year / month / day parts (sampled in this order: y, yearr, monthh, dayy).
df <- data.frame(
  y = sample(5000000:120000000, 1000, replace = TRUE),
  yearr = sample(2015:2021, 1000, replace = TRUE),
  monthh = sample(1:12, 1000, replace = TRUE),
  dayy = sample(1:29, 1000, replace = TRUE)
)

# Derive the week number and the calendar date from the sampled parts.
df <- mutate(
  df,
  weekk = week(ymd(paste(yearr, monthh, dayy))),
  datee = ymd(paste(yearr, monthh, dayy))
)

# Drop rows whose date could not be parsed (with day <= 29 that can only be
# 29 February in a non-leap year), then sort by date, earliest first.
df <- filter(df, !is.na(datee))
df <- arrange(df, datee)

# Time-based resamples indexed by `datee`:
#   period = "week", every = 15 -> weeks are grouped into 15-week periods
#   lookback = Inf              -> each analysis set keeps all earlier periods
#                                  (the "chain" the author refers to)
#   assess_stop = 2             -> assessment runs through two periods ahead
folds <- sliding_period(
  df,
  index = datee,
  period = "week",
  every = 15,
  lookback = Inf,
  assess_stop = 2
)

# Base recipe: `y` is the outcome, every other column of `df` is a predictor.
rec_df <- recipe(y ~ ., data = df)

# Control objects built with the stacks helpers.
ctrl_grid <- control_stack_grid()
ctrl_res <- control_stack_resamples()

# Evaluation metric (mae is another option; accuracy and roc_auc would be
# used for classification).
metric <- metric_set(rmse)

# Elastic net specification: glmnet engine with a fixed penalty and mixture
# (no tuning placeholders).
spec_enet <- set_engine(
  linear_reg(mode = "regression", penalty = 1, mixture = 0.5),
  "glmnet"
)

# Preprocessing on top of the base recipe: zero-variance filter on the numeric
# columns and the outcome, then normalisation of the numeric predictors.
# `datee` is not touched here and therefore stays in as a predictor.
# Disabled alternatives kept from the original script:
#   step_dummy(all_nominal())                   (before step_zv)
#   step_zv(all_predictors(), skip = TRUE)      (after step_normalize)
#   step_normalize(all_numeric(), skip = TRUE)  (after step_normalize)
rec_enet <- step_zv(rec_df, all_numeric(), all_outcomes())
rec_enet <- step_normalize(rec_enet, all_numeric(), -all_outcomes())

# Bundle the model specification and the recipe into one workflow.
wf_enet <- add_recipe(add_model(workflow(), spec_enet), rec_enet)

# Option 1 (disabled): the same call with a grid and the grid control object.
# NOTE(review): `grid` is normally a tune_grid() argument -- confirm which
# function was intended before re-enabling.
# resample_enet <- fit_resamples(
#   wf_enet,
#   resamples = folds,
#   grid = 4,
#   metrics = metric,
#   control = ctrl_grid
# )

# Option 2 (active): fit the workflow on every sliding-period resample in
# `folds` and score each one with `metric`.
resample_enet <- wf_enet |>
  fit_resamples(
    resamples = folds,
    metrics = metric,
    control = ctrl_res
  )

Error output for first resample_enet:

x Slice01: preprocessor 1/1, model 1/1: Error in elnet(xd, is.sparse, ix, jx, y, weights, offset, type.gaussian, : NA/NaN/Inf ...
x Slice02: preprocessor 1/1, model 1/1: Error in elnet(xd, is.sparse, ix, jx, y, weights, offset, type.gaussian, : NA/NaN/Inf ...
x Slice03...
Warning message:
All models failed. See the `.notes` column. 

Very similar code is used for KNN, LM, SVM, etc., but it just seems to fail for glmnet. What gives?

juliasilge commented 3 years ago

The "glmnet" engine requires all predictors to be numeric, so the `datee` predictor (a Date column) caused the error. You can remove it (in the recipe with `step_rm()`, as I've shown here, or via another method) so that the model is able to fit.

library(tidyverse)
library(rsample)
library(lubridate)
#> 
#> Attaching package: 'lubridate'
#> The following objects are masked from 'package:base':
#> 
#>     date, intersect, setdiff, union
library(tidymodels)
#> Registered S3 method overwritten by 'tune':
#>   method                   from   
#>   required_pkgs.model_spec parsnip
library(stacks)
library(baguette)

# Fix the RNG state so the simulated data below is reproducible.
set.seed(1)

# Simulated data: 1000 rows with a random response and random
# year / month / day parts (sampled in this order: y, yearr, monthh, dayy).
df <- data.frame(
  y = sample(5000000:120000000, 1000, replace = TRUE),
  yearr = sample(2015:2021, 1000, replace = TRUE),
  monthh = sample(1:12, 1000, replace = TRUE),
  dayy = sample(1:29, 1000, replace = TRUE)
)

# Derive the week number and the calendar date from the sampled parts.
# The date string is parsed once per new column, so a parse warning is
# emitted for each of the two columns.
df <- mutate(
  df,
  weekk = week(ymd(paste(yearr, monthh, dayy))),
  datee = ymd(paste(yearr, monthh, dayy))
)

# Drop rows whose date could not be parsed, then sort by date, earliest first.
df <- filter(df, !is.na(datee))
df <- arrange(df, datee)
#> Warning: 2 failed to parse.

#> Warning: 2 failed to parse.

# Time-based resamples indexed by `datee`:
#   period = "week", every = 15 -> weeks are grouped into 15-week periods
#   lookback = Inf              -> each analysis set keeps all earlier periods
#                                  (the "chain" the author refers to)
#   assess_stop = 2             -> assessment runs through two periods ahead
folds <- sliding_period(
  df,
  index = datee,
  period = "week",
  every = 15,
  lookback = Inf,
  assess_stop = 2
)

# Base recipe: `y` is the outcome, every other column of `df` is a predictor.
rec_df <- recipe(y ~ ., data = df)

# Evaluation metric (mae is another option; accuracy and roc_auc would be
# used for classification).
metric <- metric_set(rmse)

# Elastic net specification: glmnet engine with a fixed penalty and mixture
# (no tuning placeholders).
spec_enet <-
  linear_reg(mode = "regression", penalty = 1, mixture = 0.5) |>
  set_engine("glmnet")

# Preprocessing recipe. The whole chain uses the native pipe consistently.
#   1. zero-variance filter on the numeric columns and the outcome
#   2. normalisation of the numeric predictors
#   3. removal of `datee`: glmnet requires numeric predictors, and leaving
#      this Date column in is what made every resample fail
rec_enet <- rec_df |>
  step_zv(all_numeric(), all_outcomes()) |>
  step_normalize(all_numeric(), -all_outcomes()) |>
  step_rm(datee)

# Bundle the model specification and the recipe into one workflow.
wf_enet <- workflow() |>
  add_model(spec_enet) |>
  add_recipe(rec_enet)

# Fit the workflow on every sliding-period resample in `folds` and score each
# one with `metric` (default control settings).
resample_enet <- wf_enet |>
  fit_resamples(
    resamples = folds,
    metrics = metric
  )

# Print the resampling results.
resample_enet
#> # Resampling results
#> # Sliding period resampling 
#> # A tibble: 23 x 4
#>    splits           id      .metrics         .notes          
#>    <list>           <chr>   <list>           <list>          
#>  1 <split [29/75]>  Slice01 <tibble [1 × 4]> <tibble [0 × 1]>
#>  2 <split [67/74]>  Slice02 <tibble [1 × 4]> <tibble [0 × 1]>
#>  3 <split [104/77]> Slice03 <tibble [1 × 4]> <tibble [0 × 1]>
#>  4 <split [141/88]> Slice04 <tibble [1 × 4]> <tibble [0 × 1]>
#>  5 <split [181/98]> Slice05 <tibble [1 × 4]> <tibble [0 × 1]>
#>  6 <split [229/89]> Slice06 <tibble [1 × 4]> <tibble [0 × 1]>
#>  7 <split [279/76]> Slice07 <tibble [1 × 4]> <tibble [0 × 1]>
#>  8 <split [318/75]> Slice08 <tibble [1 × 4]> <tibble [0 × 1]>
#>  9 <split [355/80]> Slice09 <tibble [1 × 4]> <tibble [0 × 1]>
#> 10 <split [393/87]> Slice10 <tibble [1 × 4]> <tibble [0 × 1]>
#> # … with 13 more rows

Created on 2021-06-24 by the reprex package (v2.0.0)

PathosEthosLogos commented 3 years ago

Oh I see, thanks -- seems like it also works with update_role(datee, new_role = 'ID') in recipe. Closed

github-actions[bot] commented 3 years ago

This issue has been automatically locked. If you believe you have found a related problem, please file a new issue (with a reprex: https://reprex.tidyverse.org) and link to this issue.