spsanderson / healthyR.ai

healthyR.ai - AI package for the healthyverse
http://www.spsanderson.com/healthyR.ai/
Other
16 stars 6 forks source link

Dummy encode character and factor columns #207

Closed spsanderson closed 2 years ago

spsanderson commented 2 years ago

Function:

#' Build a preprocessing recipe for KNN-style models
#'
#' Creates an unprepped `recipes` recipe from `.data` and `.recipe_formula`
#' that:
#' 1. assigns previously unseen levels of nominal predictors to a new level
#'    (`step_novel`),
#' 2. one-hot dummy encodes all nominal predictors (`step_dummy`),
#' 3. removes zero-variance predictors (`step_zv`),
#' 4. centers and scales all numeric columns (`step_normalize`).
#'
#' @param .data The data used to define the recipe. Data frames are used
#'   as-is; `table` objects (e.g. `Titanic`) are expanded to a data frame
#'   whose count column is named `n`; anything else is passed through
#'   `as.data.frame()`.
#' @param .recipe_formula A formula describing outcome and predictors,
#'   e.g. `Species ~ .`.
#'
#' @return An unprepped recipe object; call `prep()` and `bake()` on it to
#'   obtain the processed data.
#'
#' @examples
#' \dontrun{
#' hai_knn_data_prepper(iris, Species ~ .)
#' }
hai_knn_data_prepper <- function(.data, .recipe_formula) {

  # Checks ----
  if (!inherits(.recipe_formula, "formula")) {
    stop(
      "`.recipe_formula` must be a formula, e.g. `Species ~ .`",
      call. = FALSE
    )
  }

  # Data ----
  # The recipe must be built from the caller's data, not from whatever
  # `data_tbl` happens to exist in an enclosing/global environment.
  if (is.data.frame(.data)) {
    data_tbl <- .data
  } else if (inherits(.data, "table")) {
    # Contingency tables become one row per cell with the count in `n`;
    # keep the classifying variables as character so the recipe decides
    # how to treat them.
    data_tbl <- as.data.frame(
      .data,
      responseName = "n",
      stringsAsFactors = FALSE
    )
  } else {
    data_tbl <- as.data.frame(.data)
  }

  # Recipe ----
  rec_obj <- recipes::recipe(.recipe_formula, data = data_tbl) %>%
    recipes::step_novel(recipes::all_nominal_predictors()) %>%
    recipes::step_dummy(recipes::all_nominal_predictors(), one_hot = TRUE) %>%
    recipes::step_zv(recipes::all_predictors()) %>%
    recipes::step_normalize(recipes::all_numeric())

  # Return ----
  return(rec_obj)

}

Examples:

> hai_knn_data_prepper(iris, Species ~ .)
Recipe

Inputs:

      role #variables
   outcome          1
 predictor          4

Operations:

Dummy variables from recipes::all_nominal_predictors()
Centering and scaling for recipes::all_numeric()

> hai_knn_data_prepper(iris, Species ~ .) %>% prep() %>% bake(iris)
# A tibble: 150 x 5
   Sepal.Length Sepal.Width Petal.Length Petal.Width Species
          <dbl>       <dbl>        <dbl>       <dbl> <fct>  
 1       -0.898      1.02          -1.34       -1.31 setosa 
 2       -1.14      -0.132         -1.34       -1.31 setosa 
 3       -1.38       0.327         -1.39       -1.31 setosa 
 4       -1.50       0.0979        -1.28       -1.31 setosa 
 5       -1.02       1.25          -1.34       -1.31 setosa 
 6       -0.535      1.93          -1.17       -1.05 setosa 
 7       -1.50       0.786         -1.34       -1.18 setosa 
 8       -1.02       0.786         -1.28       -1.31 setosa 
 9       -1.74      -0.361         -1.34       -1.31 setosa 
10       -1.14       0.0979        -1.28       -1.44 setosa 
# ... with 140 more rows

> hai_knn_data_prepper(Titanic, Survived ~ .) %>% prep() %>% bake(Titanic)
# A tibble: 32 x 10
        n Survived Class_X1st Class_X2nd Class_X3rd Class_Crew Sex_Female Sex_Male
    <dbl> <fct>         <dbl>      <dbl>      <dbl>      <dbl>      <dbl>    <dbl>
 1 -0.506 No            1.70      -0.568     -0.568     -0.568     -0.984    0.984
 2 -0.506 No           -0.568      1.70      -0.568     -0.568     -0.984    0.984
 3 -0.248 No           -0.568     -0.568      1.70      -0.568     -0.984    0.984
 4 -0.506 No           -0.568     -0.568     -0.568      1.70      -0.984    0.984
 5 -0.506 No            1.70      -0.568     -0.568     -0.568      0.984   -0.984
 6 -0.506 No           -0.568      1.70      -0.568     -0.568      0.984   -0.984
 7 -0.381 No           -0.568     -0.568      1.70      -0.568      0.984   -0.984
 8 -0.506 No           -0.568     -0.568     -0.568      1.70       0.984   -0.984
 9  0.362 No            1.70      -0.568     -0.568     -0.568     -0.984    0.984
10  0.627 No           -0.568      1.70      -0.568     -0.568     -0.984    0.984
# ... with 22 more rows, and 2 more variables: Age_Adult <dbl>, Age_Child <dbl>