spsanderson / healthyR.ts

A time-series companion package to healthyR
https://www.spsanderson.com/healthyR.ts/
Other
19 stars 3 forks source link

svm_poly() #281

Closed spsanderson closed 2 years ago

spsanderson commented 2 years ago

Function:

#' Boilerplate Workflow
#'
#' @family Boiler_Plate
#' @family SVM
#' @family poly
#'
#' @author Steven P. Sanderson II, MPH
#'
#' @details This uses `parsnip::svm_poly()` and sets the `parsnip::engine`
#' to `kernlab`.
#'
#' @seealso \url{https://parsnip.tidymodels.org/reference/svm_poly.html}
#'
#' @description This is a boilerplate function to automatically create the following:
#' -  recipe
#' -  model specification
#' -  workflow
#' -  tuned model (grid ect)
#' -  calibration tibble and plot
#'
#' @param .data The data being passed to the function. The time-series object.
#' @param .date_col The column that holds the datetime.
#' @param .value_col The column that has the value
#' @param .formula The formula that is passed to the recipe like `value ~ .`
#' @param .rsamp_obj The rsample splits object
#' @param .prefix Default is `ts_smooth_es`
#' @param .tune Defaults to TRUE, this creates a tuning grid and tuned model.
#' @param .grid_size If `.tune` is TRUE then the `.grid_size` is the size of the
#' tuning grid.
#' @param .num_cores How many cores do you want to use. Default is 1
#' @param .cv_assess How many observations for assess. See [timetk::time_series_cv()]
#' @param .cv_skip How many observations to skip. See [timetk::time_series_cv()]
#' @param .cv_slice_limit How many slices to return. See [timetk::time_series_cv()]
#' @param .best_metric Default is "rmse". See [modeltime::default_forecast_accuracy_metric_set()]
#' @param .bootstrap_final Not yet implemented.
#'
#' @examples
#' \dontrun{
#' library(dplyr)
#'
#' data <- AirPassengers %>%
#'   ts_to_tbl() %>%
#'   select(-index)
#'
#' splits <- time_series_split(
#'   data
#'   , date_col
#'   , assess = 12
#'   , skip = 3
#'   , cumulative = TRUE
#' )
#'
#' ts_auto_poly <- ts_auto_svm_poly(
#'   .data = data,
#'   .num_cores = 5,
#'   .date_col = date_col,
#'   .value_col = value,
#'   .rsamp_obj = splits,
#'   .formula = value ~ .,
#'   .grid_size = 3
#' )
#'
#' ts_smooth_poly$recipe_info
#' }
#'
#' @return
#' A list
#'
#' @export
#'

ts_auto_svm_poly <- function(.data, .date_col, .value_col, .formula, .rsamp_obj,
                              .prefix = "ts_svm_poly", .tune = TRUE, .grid_size = 10,
                              .num_cores = 1, .cv_assess = 12, .cv_skip = 3,
                              .cv_slice_limit = 6, .best_metric = "rmse",
                              .bootstrap_final = FALSE){

    # Tidyeval ----
    date_col_var_expr <- rlang::enquo(.date_col)
    value_col_var_expr <- rlang::enquo(.value_col)
    sampling_object <- .rsamp_obj

    # Cross Validation
    cv_assess = as.numeric(.cv_assess)
    cv_skip   = as.numeric(.cv_skip)
    cv_slice  = as.numeric(.cv_slice_limit)

    # Tuning Grid
    grid_size <- as.numeric(.grid_size)
    num_cores <- as.numeric(.num_cores)
    best_metric <- as.character(.best_metric)

    # Data and splits
    splits <- .rsamp_obj
    data_tbl <- dplyr::as_tibble(.data)

    # Checks ----
    if (rlang::quo_is_missing(date_col_var_expr)){
        rlang::abort(
            message = "'.date_col' must be supplied.",
            use_cli_format = TRUE
        )
    }

    if (rlang::quo_is_missing(value_col_var_expr)){
        rlang::abort(
            message = "'.value_col' must be supplied.",
            use_cli_format = TRUE
        )
    }

    if (!inherits(x = splits, what = "rsplit")){
        rlang::abort(
            message = "'.rsamp_obj' must be have class rsplit, use the rsample package.",
            use_cli_format = TRUE
        )
    }

    # Recipe ----
    # Get the initial recipe call
    recipe_call <- get_recipe_call(match.call())

    rec_syntax <- paste0(.prefix, "_recipe") %>%
        assign_value(!!recipe_call)

    rec_obj <- recipes::recipe(formula = .formula, data = data_tbl)

    rec_obj <- rec_obj %>%
        timetk::step_timeseries_signature({{date_col_var_expr}}) %>%
        timetk::step_holiday_signature({{date_col_var_expr}}) %>%
        recipes::step_novel(recipes::all_nominal_predictors()) %>%
        recipes::step_mutate_at(tidyselect::vars_select_helpers$where(is.character)
                                , fn = ~ as.factor(.)) %>%
        recipes::step_dummy(recipes::all_nominal(), one_hot = TRUE) %>%
        recipes::step_normalize(recipes::all_numeric_predictors(), -date_col_index.num) %>%
        recipes::step_nzv(recipes::all_predictors(), -date_col_index.num) %>%
        recipes::step_corr(recipes::all_numeric_predictors(), threshold = 0)

    # Tune/Spec ----
    if (.tune){
        model_spec <- parsnip::svm_poly(
            cost = tune::tune(),
            degree = tune::tune(),
            scale_factor = tune::tune(),
            margin = tune::tune()
        )
    } else {
        model_spec <- parsnip::svm_poly()
    }

    model_spec <- model_spec %>%
        parsnip::set_mode(mode = "regression") %>%
        parsnip::set_engine("kernlab")

    # Workflow ----
    wflw <- workflows::workflow() %>%
        workflows::add_recipe(rec_obj) %>%
        workflows::add_model(model_spec)

    # Tuning Grid ----
    if (.tune){

        tuning_grid_spec <- dials::grid_latin_hypercube(
            hardhat::extract_parameter_set_dials(model_spec),
            size = grid_size
        )

        # Make TS CV ----
        tscv <- timetk::time_series_cv(
            data        = rsample::training(splits),
            date_var    = {{date_col_var_expr}},
            cumulative  = TRUE,
            assess      = cv_assess,
            skip        = cv_skip,
            slice_limit = cv_slice
        )

        # Start parallel backend
        modeltime::parallel_start(num_cores)

        # Tune the workflow
        tuned_results <- wflw %>%
            tune::tune_grid(
                resamples = tscv,
                grid      = tuning_grid_spec,
                metrics   = modeltime::default_forecast_accuracy_metric_set()
            )

        # Stop parallel backend
        modeltime::parallel_stop()

        # Get the best result set by a specified metric
        best_result_set <- tuned_results %>%
            tune::show_best(metric = best_metric, n = 1)

        # Plot results
        tune_results_plt <- tuned_results %>%
            tune::autoplot() +
            ggplot2::theme_minimal() +
            ggplot2::geom_smooth(se = FALSE)

        # Make final workflow
        wflw_fit <- wflw %>%
            tune::finalize_workflow(
                tuned_results %>%
                    tune::show_best(metric = best_metric, n = Inf) %>%
                    dplyr::slice(1)
            ) %>%
            parsnip::fit(rsample::training(splits))

    } else {
        wflw_fit <- wflw %>%
            parsnip::fit(rsample::training(splits))
    }

    # Calibrate and Plot ----
    cap <- healthyR.ts::calibrate_and_plot(
        wflw_fit,
        .splits_obj  = splits,
        .data        = data_tbl,
        .interactive = TRUE,
        .print_info = FALSE
    )

    # Return ----
    output <- list(
        recipe_info = list(
            recipe_call   = recipe_call,
            recipe_syntax = rec_syntax,
            rec_obj       = rec_obj
        ),
        model_info = list(
            model_spec  = model_spec,
            wflw        = wflw,
            fitted_wflw = wflw_fit,
            was_tuned   = ifelse(.tune, "tuned", "not_tuned")
        ),
        model_calibration = list(
            plot = cap$plot,
            calibration_tbl = cap$calibration_tbl,
            model_accuracy = cap$model_accuracy
        )
    )

    if (.tune){
        output$tuned_info = list(
            tuning_grid      = tuning_grid_spec,
            tscv             = tscv,
            tuned_results    = tuned_results,
            grid_size        = grid_size,
            best_metric      = best_metric,
            best_result_set  = best_result_set,
            tuning_grid_plot = tune_results_plt,
            plotly_grid_plot = plotly::ggplotly(tune_results_plt)
        )
    }

    return(invisible(output))
}

Example:

> ts_auto_poly
$recipe_info
$recipe_info$recipe_call
recipe(.data = data, .date_col = date_col, .value_col = value, 
    .formula = value ~ ., .rsamp_obj = splits, .grid_size = 3, 
    .num_cores = 5)

$recipe_info$recipe_syntax
[1] "ts_svm_poly_recipe <-"                                                                                                                              
[2] "\n  recipe(.data = data, .date_col = date_col, .value_col = value, .formula = value ~ \n    ., .rsamp_obj = splits, .grid_size = 3, .num_cores = 5)"

$recipe_info$rec_obj
Recipe

Inputs:

      role #variables
   outcome          1
 predictor          1

Operations:

Timeseries signature features from date_col
Holiday signature features from date_col
Novel factor level assignment for recipes::all_nominal_predictors()
Variable mutation for tidyselect::vars_select_helpers$where(is.character)
Dummy variables from recipes::all_nominal()
Centering and scaling for recipes::all_numeric_predictors(), -date_col_index.num
Sparse, unbalanced variable filter on recipes::all_predictors(), -date_col_index.num
Correlation filter on recipes::all_numeric_predictors()

$model_info
$model_info$model_spec
Polynomial Support Vector Machine Specification (regression)

Main Arguments:
  cost = tune::tune()
  degree = tune::tune()
  scale_factor = tune::tune()
  margin = tune::tune()

Computational engine: kernlab 

$model_info$wflw
== Workflow =========================================================================================
Preprocessor: Recipe
Model: svm_poly()

-- Preprocessor -------------------------------------------------------------------------------------
8 Recipe Steps

* step_timeseries_signature()
* step_holiday_signature()
* step_novel()
* step_mutate_at()
* step_dummy()
* step_normalize()
* step_nzv()
* step_corr()

-- Model --------------------------------------------------------------------------------------------
Polynomial Support Vector Machine Specification (regression)

Main Arguments:
  cost = tune::tune()
  degree = tune::tune()
  scale_factor = tune::tune()
  margin = tune::tune()

Computational engine: kernlab 

$model_info$fitted_wflw
== Workflow [trained] ===============================================================================
Preprocessor: Recipe
Model: svm_poly()

-- Preprocessor -------------------------------------------------------------------------------------
8 Recipe Steps

* step_timeseries_signature()
* step_holiday_signature()
* step_novel()
* step_mutate_at()
* step_dummy()
* step_normalize()
* step_nzv()
* step_corr()

-- Model --------------------------------------------------------------------------------------------
Support Vector Machine object of class "ksvm" 

SV type: eps-svr  (regression) 
 parameter : epsilon = 0.152480707721164  cost C = 16.5459568852013 

Polynomial kernel function. 
 Hyperparameters : degree =  2  scale =  0.00332334037030967  offset =  1 

Number of Support Vectors : 77 

Objective Function Value : -374.4418 
Training error : 0.161677 

$model_info$was_tuned
[1] "tuned"

$model_calibration
$model_calibration$plot

$model_calibration$calibration_tbl
# Modeltime Table
# A tibble: 1 x 5
  .model_id .model     .model_desc .type .calibration_data
      <int> <list>     <chr>       <chr> <list>           
1         1 <workflow> KERNLAB     Test  <tibble [12 x 4]>

$model_calibration$model_accuracy
# A tibble: 1 x 9
  .model_id .model_desc .type   mae  mape  mase smape  rmse    rsq
      <int> <chr>       <chr> <dbl> <dbl> <dbl> <dbl> <dbl>  <dbl>
1         1 KERNLAB     Test   62.6  11.7  1.30  12.9  88.2 0.0354

$tuned_info
$tuned_info$tuning_grid
# A tibble: 3 x 4
      cost degree scale_factor margin
     <dbl>  <int>        <dbl>  <dbl>
1  0.00121      3     1.95e- 5 0.0747
2  0.481        1     1.08e-10 0.0216
3 16.5          2     3.32e- 3 0.152 

$tuned_info$tscv
# Time Series Cross Validation Plan 
# A tibble: 6 x 2
  splits           id    
  <list>           <chr> 
1 <split [120/12]> Slice1
2 <split [117/12]> Slice2
3 <split [114/12]> Slice3
4 <split [111/12]> Slice4
5 <split [108/12]> Slice5
6 <split [105/12]> Slice6

$tuned_info$tuned_results
# Tuning results
# NA 
# A tibble: 6 x 4
  splits           id     .metrics          .notes          
  <list>           <chr>  <list>            <list>          
1 <split [120/12]> Slice1 <tibble [18 x 8]> <tibble [0 x 3]>
2 <split [117/12]> Slice2 <tibble [18 x 8]> <tibble [0 x 3]>
3 <split [114/12]> Slice3 <tibble [18 x 8]> <tibble [0 x 3]>
4 <split [111/12]> Slice4 <tibble [18 x 8]> <tibble [0 x 3]>
5 <split [108/12]> Slice5 <tibble [18 x 8]> <tibble [0 x 3]>
6 <split [105/12]> Slice6 <tibble [18 x 8]> <tibble [0 x 3]>

$tuned_info$grid_size
[1] 3

$tuned_info$best_metric
[1] "rmse"

$tuned_info$best_result_set
# A tibble: 1 x 10
   cost degree scale_factor margin .metric .estimator  mean     n std_err .config             
  <dbl>  <int>        <dbl>  <dbl> <chr>   <chr>      <dbl> <int>   <dbl> <chr>               
1  16.5      2      0.00332  0.152 rmse    standard    67.5     6    2.65 Preprocessor1_Model3

$tuned_info$tuning_grid_plot
`geom_smooth()` using method = 'loess' and formula 'y ~ x'

$tuned_info$plotly_grid_plot

There were 50 or more warnings (use warnings() to see the first 50)

image