EmilHvitfeldt / extrasteps

More Steps for the 'recipes' Package
https://emilhvitfeldt.github.io/extrasteps/
Other
6 stars 1 forks source link

Combinatorial dummy variables #49

Open EmilHvitfeldt opened 1 year ago

EmilHvitfeldt commented 1 year ago

Instead of one dummy variable per level (a, b, c), you have one dummy variable per pair of levels: a or b, a or c, b or c.

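The main problem: you would need some way to reduce the number of resulting dummy variables, because k levels produce k(k - 1)/2 pairwise columns (21 columns for the 7 levels in the example below).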

library(tidyverse)
library(modeldata)
data("ames")
# Build one 0/1 indicator column per unordered pair of zoning levels:
# out[r, j] is 1 when observation r belongs to either level of pair j.
x <- ames$MS_Zoning
combs <- combinat::combn(levels(x), 2)

# Each column of `combs` holds one pair of levels; vapply() binds the
# per-pair indicator vectors column-wise into an integer matrix.
out <- vapply(
  seq_len(ncol(combs)),
  function(pair_idx) as.integer(x %in% combs[, pair_idx]),
  integer(length(x))
)

# Show the first three observations and their pairwise indicator rows.
head(x, 3)
#> [1] Residential_Low_Density  Residential_High_Density Residential_Low_Density 
#> 7 Levels: Floating_Village_Residential ... I_all
head(out, 3)
#>      [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10] [,11] [,12] [,13] [,14]
#> [1,]    0    1    0    0    0    0    1    0    0     0     0     1     1     1
#> [2,]    1    0    0    0    0    0    1    1    1     1     1     0     0     0
#> [3,]    0    1    0    0    0    0    1    0    0     0     0     1     1     1
#>      [,15] [,16] [,17] [,18] [,19] [,20] [,21]
#> [1,]     1     0     0     0     0     0     0
#> [2,]     0     0     0     0     0     0     0
#> [3,]     1     0     0     0     0     0     0

library(recipes)
#> 
#> Attaching package: 'recipes'
#> The following object is masked from 'package:stringr':
#> 
#>     fixed
#> The following object is masked from 'package:stats':
#> 
#>     step
# Standardize the pairwise indicators, reduce them with PCA (threshold = 0.9),
# and plot the observations on the first two components.
pca_plot_rec <- recipe(~., data = as.data.frame(out))
pca_plot_rec <- step_normalize(pca_plot_rec, all_predictors())
pca_plot_rec <- step_pca(pca_plot_rec, all_predictors(), threshold = 0.9)
pca_plot_data <- bake(prep(pca_plot_rec), new_data = NULL)

ggplot(pca_plot_data, aes(PC1, PC2)) +
  geom_point()

# PCA on the standardized pairwise indicators (threshold = 0.9); print the
# retained component scores.
pca_scores_rec <- step_pca(
  step_normalize(recipe(~., data = as.data.frame(out)), all_predictors()),
  all_predictors(),
  threshold = 0.9
)
bake(prep(pca_scores_rec), new_data = NULL)
#> # A tibble: 2,930 × 4
#>      PC1     PC2     PC3       PC4
#>    <dbl>   <dbl>   <dbl>     <dbl>
#>  1 -1.65  0.0840 -0.0140  0.000213
#>  2  4.90 -5.70   11.7    10.5     
#>  3 -1.65  0.0840 -0.0140  0.000213
#>  4 -1.65  0.0840 -0.0140  0.000213
#>  5 -1.65  0.0840 -0.0140  0.000213
#>  6 -1.65  0.0840 -0.0140  0.000213
#>  7 -1.65  0.0840 -0.0140  0.000213
#>  8 -1.65  0.0840 -0.0140  0.000213
#>  9 -1.65  0.0840 -0.0140  0.000213
#> 10 -1.65  0.0840 -0.0140  0.000213
#> # … with 2,920 more rows
# Alternative reduction: standardize the pairwise indicators, then apply
# step_corr() with its default settings and print the columns that remain.
corr_rec <- recipe(~., data = as.data.frame(out))
corr_rec <- step_normalize(corr_rec, all_predictors())
corr_rec <- step_corr(corr_rec, all_predictors())
bake(prep(corr_rec), new_data = NULL)
#> # A tibble: 2,930 × 7
#>        V3     V6     V9    V10    V18     V19     V20
#>     <dbl>  <dbl>  <dbl>  <dbl>  <dbl>   <dbl>   <dbl>
#>  1 -0.508 -0.225 -0.100 -0.134 -0.434 -0.0964 -0.0370
#>  2 -0.508 -0.225 10.0    7.44  -0.434 -0.0964 -0.0370
#>  3 -0.508 -0.225 -0.100 -0.134 -0.434 -0.0964 -0.0370
#>  4 -0.508 -0.225 -0.100 -0.134 -0.434 -0.0964 -0.0370
#>  5 -0.508 -0.225 -0.100 -0.134 -0.434 -0.0964 -0.0370
#>  6 -0.508 -0.225 -0.100 -0.134 -0.434 -0.0964 -0.0370
#>  7 -0.508 -0.225 -0.100 -0.134 -0.434 -0.0964 -0.0370
#>  8 -0.508 -0.225 -0.100 -0.134 -0.434 -0.0964 -0.0370
#>  9 -0.508 -0.225 -0.100 -0.134 -0.434 -0.0964 -0.0370
#> 10 -0.508 -0.225 -0.100 -0.134 -0.434 -0.0964 -0.0370
#> # … with 2,920 more rows
# Standardize the pairwise indicators, filter them with step_corr() (default
# settings), and print the surviving columns.
corr_filtered <- bake(
  prep(
    step_corr(
      step_normalize(recipe(~., data = as.data.frame(out)), all_predictors()),
      all_predictors()
    )
  ),
  new_data = NULL
)
corr_filtered
#> # A tibble: 2,930 × 7
#>        V3     V6     V9    V10    V18     V19     V20
#>     <dbl>  <dbl>  <dbl>  <dbl>  <dbl>   <dbl>   <dbl>
#>  1 -0.508 -0.225 -0.100 -0.134 -0.434 -0.0964 -0.0370
#>  2 -0.508 -0.225 10.0    7.44  -0.434 -0.0964 -0.0370
#>  3 -0.508 -0.225 -0.100 -0.134 -0.434 -0.0964 -0.0370
#>  4 -0.508 -0.225 -0.100 -0.134 -0.434 -0.0964 -0.0370
#>  5 -0.508 -0.225 -0.100 -0.134 -0.434 -0.0964 -0.0370
#>  6 -0.508 -0.225 -0.100 -0.134 -0.434 -0.0964 -0.0370
#>  7 -0.508 -0.225 -0.100 -0.134 -0.434 -0.0964 -0.0370
#>  8 -0.508 -0.225 -0.100 -0.134 -0.434 -0.0964 -0.0370
#>  9 -0.508 -0.225 -0.100 -0.134 -0.434 -0.0964 -0.0370
#> 10 -0.508 -0.225 -0.100 -0.134 -0.434 -0.0964 -0.0370
#> # … with 2,920 more rows

Created on 2022-10-31 with reprex v2.0.2