spsanderson / healthyR.ai

healthyR.ai - AI package for the healthyverse
http://www.spsanderson.com/healthyR.ai/
Other
16 stars 6 forks source link

hai_xgboost_data_prepper() #275

Closed spsanderson closed 2 years ago

spsanderson commented 2 years ago

Function:

#' Prep Data for XGBoost - Recipe
#'
#' @family Preprocessor
#' @family XGBoost
#'
#' @author Steven P. Sanderson II, MPH
#'
#' @details This function will automatically prep your data.frame/tibble for
#' use in the XGBoost algorithm.
#'
#' This function will output a recipe specification.
#'
#' @description Automatically prep a data.frame/tibble for use in the xgboost algorithm.
#' 
#' @seealso \url{https://parsnip.tidymodels.org/reference/details_boost_tree_xgboost.html}
#'
#' @param .data The data that you are passing to the function. Can be any type
#' of data that is accepted by the `data` parameter of the `recipes::recipe()`
#' function.
#' @param .recipe_formula The formula that is going to be passed. For example
#' if you are using the `diamonds` data then the formula would most likely be something
#' like `price ~ .`
#'
#' @examples
#' # Regression
#' hai_xgboost_data_prepper(.data = diamonds, .recipe_formula = price ~ .)
#' reg_obj <- hai_xgboost_data_prepper(diamonds, price ~ .)
#' get_juiced_data(reg_obj)
#' 
#' # Classification
#' hai_xgboost_data_prepper(Titanic, Survived ~ .)
#' cla_obj <- hai_xgboost_data_prepper(Titanic, Survived ~ .)
#' get_juiced_data(cla_obj)
#'
#' @return
#' A recipe object
#'
#' @export
#'

hai_xgboost_data_prepper <- function(.data, .recipe_formula) {

  # Base recipe ----
  # Start from the supplied formula and data; each step below is appended to
  # the specification in the order written.
  xgb_rec <- recipes::recipe(.recipe_formula, data = .data)

  # Character columns to factors ----
  xgb_rec <- recipes::step_string2factor(
    xgb_rec,
    tidyselect::vars_select_helpers$where(is.character)
  )

  # Nominal predictors ----
  # Add the novel-level step first, then the dummy-variable step, both on the
  # same selector.
  xgb_rec <- recipes::step_novel(xgb_rec, recipes::all_nominal_predictors())
  xgb_rec <- recipes::step_dummy(xgb_rec, recipes::all_nominal_predictors())

  # Zero variance filter ----
  xgb_rec <- recipes::step_zv(xgb_rec, recipes::all_predictors())

  # Return ----
  # The recipe specification with the four steps attached.
  xgb_rec
}

Example:

> hai_xgboost_data_prepper(.data = diamonds, .recipe_formula = price ~ .)
Recipe

Inputs:

      role #variables
   outcome          1
 predictor          9

Operations:

Factor variables from tidyselect::vars_select_helpers$where(is.character)
Novel factor level assignment for recipes::all_nominal_predictors()
Dummy variables from recipes::all_nominal_predictors()
Zero variance filter on recipes::all_predictors()
> reg_obj <- hai_xgboost_data_prepper(diamonds, price ~ .)
> get_juiced_data(reg_obj)
# A tibble: 53,940 x 27
   carat depth table     x     y     z price  cut_1  cut_2  cut_3  cut_4   cut_5 color_1
   <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <int>  <dbl>  <dbl>  <dbl>  <dbl>   <dbl>   <dbl>
 1  0.23  61.5    55  3.95  3.98  2.43   326  0.359 -0.109 -0.522 -0.567 -0.315  -0.386 
 2  0.21  59.8    61  3.89  3.84  2.31   326  0.120 -0.436 -0.298  0.378  0.630  -0.386 
 3  0.23  56.9    65  4.05  4.07  2.31   327 -0.359 -0.109  0.522 -0.567  0.315  -0.386 
 4  0.29  62.4    58  4.2   4.23  2.63   334  0.120 -0.436 -0.298  0.378  0.630   0.231 
 5  0.31  63.3    58  4.34  4.35  2.75   335 -0.359 -0.109  0.522 -0.567  0.315   0.386 
 6  0.24  62.8    57  3.94  3.96  2.48   336 -0.120 -0.436  0.298  0.378 -0.630   0.386 
 7  0.24  62.3    57  3.95  3.98  2.47   336 -0.120 -0.436  0.298  0.378 -0.630   0.231 
 8  0.26  61.9    55  4.07  4.11  2.53   337 -0.120 -0.436  0.298  0.378 -0.630   0.0772
 9  0.22  65.1    61  3.87  3.78  2.49   337 -0.598  0.546 -0.373  0.189 -0.0630 -0.386 
10  0.23  59.4    61  4     4.05  2.39   338 -0.120 -0.436  0.298  0.378 -0.630   0.0772
# ... with 53,930 more rows, and 14 more variables: color_2 <dbl>, color_3 <dbl>,
#   color_4 <dbl>, color_5 <dbl>, color_6 <dbl>, color_7 <dbl>, clarity_1 <dbl>,
#   clarity_2 <dbl>, clarity_3 <dbl>, clarity_4 <dbl>, clarity_5 <dbl>, clarity_6 <dbl>,
#   clarity_7 <dbl>, clarity_8 <dbl>
> 
> # Classification
> hai_xgboost_data_prepper(Titanic, Survived ~ .)
Recipe

Inputs:

      role #variables
   outcome          1
 predictor          4

Operations:

Factor variables from tidyselect::vars_select_helpers$where(is.character)
Novel factor level assignment for recipes::all_nominal_predictors()
Dummy variables from recipes::all_nominal_predictors()
Zero variance filter on recipes::all_predictors()
> cla_obj <- hai_xgboost_data_prepper(Titanic, Survived ~ .)
> get_juiced_data(cla_obj)
# A tibble: 32 x 7
       n Survived Class_X2nd Class_X3rd Class_Crew Sex_Male Age_Child
   <dbl> <fct>         <dbl>      <dbl>      <dbl>    <dbl>     <dbl>
 1     0 No                0          0          0        1         1
 2     0 No                1          0          0        1         1
 3    35 No                0          1          0        1         1
 4     0 No                0          0          1        1         1
 5     0 No                0          0          0        0         1
 6     0 No                1          0          0        0         1
 7    17 No                0          1          0        0         1
 8     0 No                0          0          1        0         1
 9   118 No                0          0          0        1         0
10   154 No                1          0          0        1         0
# ... with 22 more rows