tidymodels / recipes

Pipeable steps for feature engineering and data preprocessing to prepare for modeling
https://recipes.tidymodels.org
Other
560 stars 112 forks source link

`juice()` is reconverting strings to factors #317

Closed DavisVaughan closed 10 months ago

DavisVaughan commented 5 years ago

I want to use a recipe in train() and don't see the option to pass strings_as_factors = FALSE, so my first step is step_string2factor(all_nominal()).

In a test scenario, I run the following:

data(okc)
string_test <- recipe(~ diet + location, data = okc) %>% step_factor2string(diet)
prep(string_test, okc, strings_as_factors = TRUE, retain = TRUE) %>% juice

The output shows diet as a factor, which suggests to me that the step_factor2string was ignored. Above it is mentioned "that prep() defaults with strings_as_factor = TRUE. So its converting Species to a factor before running any steps" so I understand in my test case:

Apologies if I am doing something wrong, or if not, whether this should be listed as a new issue - it seemed related to this one...

Originally posted by @mlduarte in https://github.com/tidymodels/recipes/issues/311#issuecomment-479265304

DavisVaughan commented 5 years ago

reprex

suppressPackageStartupMessages(library(recipes))
data(okc)

string_test <- recipe(~ diet + location, data = okc) %>% 
  step_factor2string(diet)

p_rec <- prep(string_test, okc, strings_as_factors = TRUE, retain = TRUE)

# diet is character
p_rec$template
#> # A tibble: 59,855 x 2
#>    diet              location           
#>    <chr>             <fct>              
#>  1 strictly anything south san francisco
#>  2 mostly other      oakland            
#>  3 anything          san francisco      
#>  4 vegetarian        berkeley           
#>  5 <NA>              san francisco      
#>  6 mostly anything   san francisco      
#>  7 strictly anything san francisco      
#>  8 mostly anything   san francisco      
#>  9 strictly anything belvedere tiburon  
#> 10 mostly anything   san mateo          
#> # … with 59,845 more rows

# but there are levels stored for `diet`
names(p_rec$levels)
#> [1] "diet"     "location"

# so juice() converts them to factors
# is this expected? I wouldn't think
# juice() would do anything like this
juice(p_rec)
#> # A tibble: 59,855 x 2
#>    diet              location           
#>    <fct>             <fct>              
#>  1 strictly anything south san francisco
#>  2 mostly other      oakland            
#>  3 anything          san francisco      
#>  4 vegetarian        berkeley           
#>  5 <NA>              san francisco      
#>  6 mostly anything   san francisco      
#>  7 strictly anything san francisco      
#>  8 mostly anything   san francisco      
#>  9 strictly anything belvedere tiburon  
#> 10 mostly anything   san mateo          
#> # … with 59,845 more rows

Created on 2019-04-04 by the reprex package (v0.2.1.9000)

EmilHvitfeldt commented 11 months ago

updated reprex:

library(recipes)
library(modeldata)

glimpse(hpc_cv)
#> Rows: 3,467
#> Columns: 7
#> $ obs      <fct> VF, VF, VF, VF, VF, VF, VF, VF, VF, VF, VF, VF, VF, VF, VF, V…
#> $ pred     <fct> VF, VF, VF, VF, VF, VF, VF, VF, VF, VF, F, F, VF, VF, VF, VF,…
#> $ VF       <dbl> 0.9136340, 0.9380672, 0.9473710, 0.9289077, 0.9418764, 0.9510…
#> $ F        <dbl> 0.07786694, 0.05710623, 0.04946767, 0.06528949, 0.05430830, 0…
#> $ M        <dbl> 0.0084791470, 0.0048164471, 0.0031562870, 0.0057871789, 0.003…
#> $ L        <dbl> 1.991225e-05, 1.011557e-05, 4.999849e-06, 1.564496e-05, 7.294…
#> $ Resample <chr> "Fold01", "Fold01", "Fold01", "Fold01", "Fold01", "Fold01", "…

string_test <- recipe(~ obs + pred + Resample, data = hpc_cv) %>% 
  step_factor2string(obs)

p_rec <- prep(string_test, hpc_cv, strings_as_factors = TRUE)

p_rec$template
#> # A tibble: 3,467 × 3
#>    obs   pred  Resample
#>    <chr> <fct> <fct>   
#>  1 VF    VF    Fold01  
#>  2 VF    VF    Fold01  
#>  3 VF    VF    Fold01  
#>  4 VF    VF    Fold01  
#>  5 VF    VF    Fold01  
#>  6 VF    VF    Fold01  
#>  7 VF    VF    Fold01  
#>  8 VF    VF    Fold01  
#>  9 VF    VF    Fold01  
#> 10 VF    VF    Fold01  
#> # ℹ 3,457 more rows

# but there are levels stored for `obs`
names(p_rec$levels)
#> [1] "obs"      "pred"     "Resample"

# so juice() and bake() convert them to factors. Is this expected? I wouldn't
# think juice() would do anything like this
juice(p_rec)
#> # A tibble: 3,467 × 3
#>    obs   pred  Resample
#>    <fct> <fct> <fct>   
#>  1 VF    VF    Fold01  
#>  2 VF    VF    Fold01  
#>  3 VF    VF    Fold01  
#>  4 VF    VF    Fold01  
#>  5 VF    VF    Fold01  
#>  6 VF    VF    Fold01  
#>  7 VF    VF    Fold01  
#>  8 VF    VF    Fold01  
#>  9 VF    VF    Fold01  
#> 10 VF    VF    Fold01  
#> # ℹ 3,457 more rows

bake(p_rec, new_data = NULL)
#> # A tibble: 3,467 × 3
#>    obs   pred  Resample
#>    <fct> <fct> <fct>   
#>  1 VF    VF    Fold01  
#>  2 VF    VF    Fold01  
#>  3 VF    VF    Fold01  
#>  4 VF    VF    Fold01  
#>  5 VF    VF    Fold01  
#>  6 VF    VF    Fold01  
#>  7 VF    VF    Fold01  
#>  8 VF    VF    Fold01  
#>  9 VF    VF    Fold01  
#> 10 VF    VF    Fold01  
#> # ℹ 3,457 more rows

bake(p_rec, new_data = hpc_cv)
#> # A tibble: 3,467 × 3
#>    obs   pred  Resample
#>    <fct> <fct> <fct>   
#>  1 VF    VF    Fold01  
#>  2 VF    VF    Fold01  
#>  3 VF    VF    Fold01  
#>  4 VF    VF    Fold01  
#>  5 VF    VF    Fold01  
#>  6 VF    VF    Fold01  
#>  7 VF    VF    Fold01  
#>  8 VF    VF    Fold01  
#>  9 VF    VF    Fold01  
#> 10 VF    VF    Fold01  
#> # ℹ 3,457 more rows

Created on 2023-09-18 with reprex v2.0.2

github-actions[bot] commented 9 months ago

This issue has been automatically locked. If you believe you have found a related problem, please file a new issue (with a reprex https://reprex.tidyverse.org) and link to this issue.