Closed: spsanderson closed this issue 2 years ago
Function:
#' Boilerplate Workflow
#'
#' @family Boiler_Plate
#' @family SVM
#' @family poly
#'
#' @author Steven P. Sanderson II, MPH
#'
#' @details This uses `parsnip::svm_poly()` and sets the `parsnip::engine`
#' to `kernlab`.
#'
#' @seealso \url{https://parsnip.tidymodels.org/reference/svm_poly.html}
#'
#' @description This is a boilerplate function to automatically create the following:
#' -  recipe
#' -  model specification
#' -  workflow
#' -  tuned model (grid etc.)
#' -  calibration tibble and plot
#'
#' @param .data The data being passed to the function. The time-series object.
#' @param .date_col The column that holds the datetime.
#' @param .value_col The column that has the value.
#' @param .formula The formula that is passed to the recipe like `value ~ .`
#' @param .rsamp_obj The rsample splits object.
#' @param .prefix Default is `ts_svm_poly`
#' @param .tune Defaults to TRUE, this creates a tuning grid and tuned model.
#' @param .grid_size If `.tune` is TRUE then the `.grid_size` is the size of the
#' tuning grid.
#' @param .num_cores How many cores do you want to use. Default is 1.
#' @param .cv_assess How many observations for assess. See [timetk::time_series_cv()]
#' @param .cv_skip How many observations to skip. See [timetk::time_series_cv()]
#' @param .cv_slice_limit How many slices to return. See [timetk::time_series_cv()]
#' @param .best_metric Default is "rmse". See [modeltime::default_forecast_accuracy_metric_set()]
#' @param .bootstrap_final Not yet implemented.
#'
#' @examples
#' \dontrun{
#' library(dplyr)
#' library(timetk)
#'
#' data <- AirPassengers %>%
#'   ts_to_tbl() %>%
#'   select(-index)
#'
#' splits <- time_series_split(
#'   data
#'   , date_col
#'   , assess = 12
#'   , skip = 3
#'   , cumulative = TRUE
#' )
#'
#' ts_auto_poly <- ts_auto_svm_poly(
#'   .data = data,
#'   .num_cores = 5,
#'   .date_col = date_col,
#'   .value_col = value,
#'   .rsamp_obj = splits,
#'   .formula = value ~ .,
#'   .grid_size = 3
#' )
#'
#' ts_auto_poly$recipe_info
#' }
#'
#' @return
#' A list
#'
#' @export
#'

ts_auto_svm_poly <- function(.data, .date_col, .value_col, .formula, .rsamp_obj,
                             .prefix = "ts_svm_poly", .tune = TRUE, .grid_size = 10,
                             .num_cores = 1, .cv_assess = 12, .cv_skip = 3,
                             .cv_slice_limit = 6, .best_metric = "rmse",
                             .bootstrap_final = FALSE){

    # Tidyeval ----
    date_col_var_expr <- rlang::enquo(.date_col)
    value_col_var_expr <- rlang::enquo(.value_col)
    sampling_object <- .rsamp_obj

    # Cross Validation
    cv_assess = as.numeric(.cv_assess)
    cv_skip   = as.numeric(.cv_skip)
    cv_slice  = as.numeric(.cv_slice_limit)

    # Tuning Grid
    grid_size   <- as.numeric(.grid_size)
    num_cores   <- as.numeric(.num_cores)
    best_metric <- as.character(.best_metric)

    # Data and splits
    splits   <- .rsamp_obj
    data_tbl <- dplyr::as_tibble(.data)

    # Checks ----
    if (rlang::quo_is_missing(date_col_var_expr)){
        rlang::abort(
            message = "'.date_col' must be supplied.",
            use_cli_format = TRUE
        )
    }

    if (rlang::quo_is_missing(value_col_var_expr)){
        rlang::abort(
            message = "'.value_col' must be supplied.",
            use_cli_format = TRUE
        )
    }

    if (!inherits(x = splits, what = "rsplit")){
        rlang::abort(
            message = "'.rsamp_obj' must have class rsplit, use the rsample package.",
            use_cli_format = TRUE
        )
    }

    # Recipe ----
    # Get the initial recipe call
    recipe_call <- get_recipe_call(match.call())

    rec_syntax <- paste0(.prefix, "_recipe") %>%
        assign_value(!!recipe_call)

    rec_obj <- recipes::recipe(formula = .formula, data = data_tbl)

    rec_obj <- rec_obj %>%
        timetk::step_timeseries_signature({{date_col_var_expr}}) %>%
        timetk::step_holiday_signature({{date_col_var_expr}}) %>%
        recipes::step_novel(recipes::all_nominal_predictors()) %>%
        recipes::step_mutate_at(tidyselect::vars_select_helpers$where(is.character)
                                , fn = ~ as.factor(.)) %>%
        recipes::step_dummy(recipes::all_nominal(), one_hot = TRUE) %>%
        recipes::step_normalize(recipes::all_numeric_predictors(), -date_col_index.num) %>%
        recipes::step_nzv(recipes::all_predictors(), -date_col_index.num) %>%
        recipes::step_corr(recipes::all_numeric_predictors(), threshold = 0)

    # Tune/Spec ----
    if (.tune){
        model_spec <- parsnip::svm_poly(
            cost         = tune::tune(),
            degree       = tune::tune(),
            scale_factor = tune::tune(),
            margin       = tune::tune()
        )
    } else {
        model_spec <- parsnip::svm_poly()
    }

    model_spec <- model_spec %>%
        parsnip::set_mode(mode = "regression") %>%
        parsnip::set_engine("kernlab")

    # Workflow ----
    wflw <- workflows::workflow() %>%
        workflows::add_recipe(rec_obj) %>%
        workflows::add_model(model_spec)

    # Tuning Grid ----
    if (.tune){

        tuning_grid_spec <- dials::grid_latin_hypercube(
            hardhat::extract_parameter_set_dials(model_spec),
            size = grid_size
        )

        # Make TS CV ----
        tscv <- timetk::time_series_cv(
            data        = rsample::training(splits),
            date_var    = {{date_col_var_expr}},
            cumulative  = TRUE,
            assess      = cv_assess,
            skip        = cv_skip,
            slice_limit = cv_slice
        )

        # Start parallel backend
        modeltime::parallel_start(num_cores)

        # Tune the workflow
        tuned_results <- wflw %>%
            tune::tune_grid(
                resamples = tscv,
                grid      = tuning_grid_spec,
                metrics   = modeltime::default_forecast_accuracy_metric_set()
            )

        # Stop parallel backend
        modeltime::parallel_stop()

        # Get the best result set by a specified metric
        best_result_set <- tuned_results %>%
            tune::show_best(metric = best_metric, n = 1)

        # Plot results
        tune_results_plt <- tuned_results %>%
            tune::autoplot() +
            ggplot2::theme_minimal() +
            ggplot2::geom_smooth(se = FALSE)

        # Make final workflow
        wflw_fit <- wflw %>%
            tune::finalize_workflow(
                tuned_results %>%
                    tune::show_best(metric = best_metric, n = Inf) %>%
                    dplyr::slice(1)
            ) %>%
            parsnip::fit(rsample::training(splits))

    } else {
        wflw_fit <- wflw %>%
            parsnip::fit(rsample::training(splits))
    }

    # Calibrate and Plot ----
    cap <- healthyR.ts::calibrate_and_plot(
        wflw_fit,
        .splits_obj  = splits,
        .data        = data_tbl,
        .interactive = TRUE,
        .print_info  = FALSE
    )

    # Return ----
    output <- list(
        recipe_info = list(
            recipe_call   = recipe_call,
            recipe_syntax = rec_syntax,
            rec_obj       = rec_obj
        ),
        model_info = list(
            model_spec  = model_spec,
            wflw        = wflw,
            fitted_wflw = wflw_fit,
            was_tuned   = ifelse(.tune, "tuned", "not_tuned")
        ),
        model_calibration = list(
            plot            = cap$plot,
            calibration_tbl = cap$calibration_tbl,
            model_accuracy  = cap$model_accuracy
        )
    )

    if (.tune){
        output$tuned_info = list(
            tuning_grid      = tuning_grid_spec,
            tscv             = tscv,
            tuned_results    = tuned_results,
            grid_size        = grid_size,
            best_metric      = best_metric,
            best_result_set  = best_result_set,
            tuning_grid_plot = tune_results_plt,
            plotly_grid_plot = plotly::ggplotly(tune_results_plt)
        )
    }

    return(invisible(output))
}
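For a quick fit without hyperparameter tuning, the `.tune = FALSE` branch skips the Latin hypercube grid, the time-series CV, and the parallel backend, and simply fits the default `parsnip::svm_poly()` spec. A minimal sketch, reusing the `data` and `splits` objects from the roxygen example above (the `fast_fit` name is only illustrative):

# Quick, untuned fit: no grid, no CV, no parallel workers
fast_fit <- ts_auto_svm_poly(
  .data      = data,
  .date_col  = date_col,
  .value_col = value,
  .rsamp_obj = splits,
  .formula   = value ~ .,
  .tune      = FALSE   # fit parsnip::svm_poly() with engine defaults
)

fast_fit$model_info$was_tuned  # "not_tuned"; the returned list has no $tuned_info element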
Example:
> ts_auto_poly
$recipe_info
$recipe_info$recipe_call
recipe(.data = data, .date_col = date_col, .value_col = value,
    .formula = value ~ ., .rsamp_obj = splits, .grid_size = 3,
    .num_cores = 5)

$recipe_info$recipe_syntax
[1] "ts_svm_poly_recipe <-"
[2] "\n recipe(.data = data, .date_col = date_col, .value_col = value, .formula = value ~ \n ., .rsamp_obj = splits, .grid_size = 3, .num_cores = 5)"

$recipe_info$rec_obj
Recipe

Inputs:

      role #variables
   outcome          1
 predictor          1

Operations:

Timeseries signature features from date_col
Holiday signature features from date_col
Novel factor level assignment for recipes::all_nominal_predictors()
Variable mutation for tidyselect::vars_select_helpers$where(is.character)
Dummy variables from recipes::all_nominal()
Centering and scaling for recipes::all_numeric_predictors(), -date_col_index.num
Sparse, unbalanced variable filter on recipes::all_predictors(), -date_col_index.num
Correlation filter on recipes::all_numeric_predictors()

$model_info
$model_info$model_spec
Polynomial Support Vector Machine Specification (regression)

Main Arguments:
  cost = tune::tune()
  degree = tune::tune()
  scale_factor = tune::tune()
  margin = tune::tune()

Computational engine: kernlab

$model_info$wflw
== Workflow ====================================================================
Preprocessor: Recipe
Model: svm_poly()

-- Preprocessor ----------------------------------------------------------------
8 Recipe Steps

* step_timeseries_signature()
* step_holiday_signature()
* step_novel()
* step_mutate_at()
* step_dummy()
* step_normalize()
* step_nzv()
* step_corr()

-- Model -----------------------------------------------------------------------
Polynomial Support Vector Machine Specification (regression)

Main Arguments:
  cost = tune::tune()
  degree = tune::tune()
  scale_factor = tune::tune()
  margin = tune::tune()

Computational engine: kernlab

$model_info$fitted_wflw
== Workflow [trained] ==========================================================
Preprocessor: Recipe
Model: svm_poly()

-- Preprocessor ----------------------------------------------------------------
8 Recipe Steps

* step_timeseries_signature()
* step_holiday_signature()
* step_novel()
* step_mutate_at()
* step_dummy()
* step_normalize()
* step_nzv()
* step_corr()

-- Model -----------------------------------------------------------------------
Support Vector Machine object of class "ksvm"

SV type: eps-svr  (regression)
 parameter : epsilon = 0.152480707721164  cost C = 16.5459568852013

Polynomial kernel function.
 Hyperparameters : degree = 2  scale = 0.00332334037030967  offset = 1

Number of Support Vectors : 77

Objective Function Value : -374.4418
Training error : 0.161677

$model_info$was_tuned
[1] "tuned"

$model_calibration
$model_calibration$plot

$model_calibration$calibration_tbl
# Modeltime Table
# A tibble: 1 x 5
  .model_id .model     .model_desc .type .calibration_data
      <int> <list>     <chr>       <chr> <list>
1         1 <workflow> KERNLAB     Test  <tibble [12 x 4]>

$model_calibration$model_accuracy
# A tibble: 1 x 9
  .model_id .model_desc .type   mae  mape  mase smape  rmse    rsq
      <int> <chr>       <chr> <dbl> <dbl> <dbl> <dbl> <dbl>  <dbl>
1         1 KERNLAB     Test   62.6  11.7  1.30  12.9  88.2 0.0354

$tuned_info
$tuned_info$tuning_grid
# A tibble: 3 x 4
      cost degree scale_factor margin
     <dbl>  <int>        <dbl>  <dbl>
1  0.00121      3     1.95e- 5 0.0747
2  0.481        1     1.08e-10 0.0216
3 16.5          2     3.32e- 3 0.152

$tuned_info$tscv
# Time Series Cross Validation Plan
# A tibble: 6 x 2
  splits           id
  <list>           <chr>
1 <split [120/12]> Slice1
2 <split [117/12]> Slice2
3 <split [114/12]> Slice3
4 <split [111/12]> Slice4
5 <split [108/12]> Slice5
6 <split [105/12]> Slice6

$tuned_info$tuned_results
# Tuning results
# NA
# A tibble: 6 x 4
  splits           id     .metrics          .notes
  <list>           <chr>  <list>            <list>
1 <split [120/12]> Slice1 <tibble [18 x 8]> <tibble [0 x 3]>
2 <split [117/12]> Slice2 <tibble [18 x 8]> <tibble [0 x 3]>
3 <split [114/12]> Slice3 <tibble [18 x 8]> <tibble [0 x 3]>
4 <split [111/12]> Slice4 <tibble [18 x 8]> <tibble [0 x 3]>
5 <split [108/12]> Slice5 <tibble [18 x 8]> <tibble [0 x 3]>
6 <split [105/12]> Slice6 <tibble [18 x 8]> <tibble [0 x 3]>

$tuned_info$grid_size
[1] 3

$tuned_info$best_metric
[1] "rmse"

$tuned_info$best_result_set
# A tibble: 1 x 10
   cost degree scale_factor margin .metric .estimator  mean     n std_err .config
  <dbl>  <int>        <dbl>  <dbl> <chr>   <chr>      <dbl> <int>   <dbl> <chr>
1  16.5      2      0.00332  0.152 rmse    standard    67.5     6    2.65 Preprocessor1_Model3

$tuned_info$tuning_grid_plot
`geom_smooth()` using method = 'loess' and formula 'y ~ x'

$tuned_info$plotly_grid_plot
There were 50 or more warnings (use warnings() to see the first 50)
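The returned list can feed straight into the usual modeltime workflow for refitting and forecasting. A minimal sketch, assuming the `ts_auto_poly`, `data`, and `splits` objects from the example above; the "12 months" horizon is only illustrative:

library(dplyr)
library(rsample)
library(modeltime)

# Put the fitted workflow from the boilerplate output into a modeltime table
# and calibrate it on the hold-out split
calib_tbl <- modeltime_table(ts_auto_poly$model_info$fitted_wflw) %>%
  modeltime_calibrate(new_data = testing(splits))

# Out-of-sample accuracy; should reproduce $model_calibration$model_accuracy above
calib_tbl %>% modeltime_accuracy()

# Refit on the full data set and forecast forward
calib_tbl %>%
  modeltime_refit(data = data) %>%
  modeltime_forecast(h = "12 months", actual_data = data) %>%
  plot_modeltime_forecast(.interactive = FALSE)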