topepo / caret

caret (Classification And Regression Training): an R package that contains miscellaneous functions for training and plotting classification and regression models
http://topepo.github.io/caret/index.html
1.61k stars 634 forks source link

(1) preProcess not working inside train? (2) make preProcess transform response inside train. #1338

Open abanihas opened 1 year ago

abanihas commented 1 year ago

Functions: `train`, `preProcess`. Bug? Categorical variables are preprocessed differently inside `train` vs. outside `train`. Bug? `train` does not appear to preprocess the data, despite reporting that it does. Enhancement: make `preProcess` transform the response inside `train`. Thanks for caret and tidymodels!

# Reprex: compare centering/scaling applied before caret::train() with the
# same methods requested through train()'s `preProcess` argument.
library(caret)
#> Loading required package: ggplot2
#> Loading required package: lattice

# Keep the response (mpg), one numeric predictor (wt) and one predictor that
# is converted to a factor below (am).
df <- mtcars[c("mpg", "wt", "am")]
# Change am to factor ("0" or "1")
df$am <- factor(df$am)
# Show one row for each level of am.
df[c(1, 4), ]
#>                 mpg    wt am
#> Mazda RX4      21.0 2.620  1
#> Hornet 4 Drive 21.4 3.215  0
str(df, nchar.max = 20)
#> 'data.frame':    32 obs. of  3 variables:
#>  $ mpg: num  21 21| __truncated__ ...
#>  $ wt : num  2.62 | __truncated__ ...
#>  $ am : Factor w/ 2 levels "0","1": 2 2 2 1 1 1 1 1 1 1 ...

# preProcess only transforms predictors inside train, not response.
# Exclude response (mpg) to make preProcesses of two models similar.
# The transform is estimated from the two predictor columns only and is then
# applied to the whole data frame with predict().
preProcValues <- caret::preProcess(df[c("wt", "am")], method = c("center", "scale"))
trainTransformed <- predict(preProcValues, df)
# Both fits below share this formula; only the preprocessing route differs.
model_formula <- formula(mpg ~ wt + am)

# transformed outside train
# Fit on the already-transformed data, with no `preProcess` argument.
preproc_out <- caret::train(
  form = model_formula,
  data = trainTransformed,
  method = "lm"
)

# transformed inside train
# Fit on the raw data and ask train() to center and scale.
preproc_in <- caret::train(
  form = model_formula,
  data = df,
  method = "lm",
  preProcess = c("center", "scale")
)

# Print both preprocessing summaries. The standalone one was estimated on a
# data frame in which `am` is a factor, and reports one ignored variable.
preProcValues # Categorical variable ignored outside train
#> Created from 32 samples and 2 variables
#> 
#> Pre-processing:
#>   - centered (1)
#>   - ignored (1)
#>   - scaled (1)
# NOTE(review): presumably the formula interface has already expanded `am`
# into a numeric dummy column by the time train() estimates its transform,
# which would explain "centered (2)" -- confirm against the caret docs.
preproc_in$preProcess # Categorical variable not ignored inside train
#> Created from 32 samples and 2 variables
#> 
#> Pre-processing:
#>   - centered (2)
#>   - ignored (0)
#>   - scaled (2)
# Compare the data stored on each fitted object for the same two rows.
preproc_out$trainingData[c(1, 4), ] # centred and scaled outside train
#>                .outcome           wt am
#> Mazda RX4          21.0 -0.610399567  1
#> Hornet 4 Drive     21.4 -0.002299538  0
# NOTE(review): trainingData may simply hold the data as passed to train(),
# with the transform applied at fit/predict time -- verify before concluding
# that no preprocessing happened inside train().
preproc_in$trainingData[c(1, 4), ] # not centred and scaled inside train, but says they were
#>                .outcome    wt am
#> Mazda RX4          21.0 2.620  1
#> Hornet 4 Drive     21.4 3.215  0

# identical() is an exact comparison: FALSE only says the two objects differ
# somewhere, not which component differs.
identical(preProcValues, preproc_in$preProcess)
#> [1] FALSE
identical(preproc_out$trainingData, preproc_in$trainingData)
#> [1] FALSE
# NOTE(review): compare coefficients or predictions with all.equal() to see
# whether the two fits differ materially -- not checked in this reprex.
identical(preproc_out$finalModel, preproc_in$finalModel)
#> [1] FALSE

Created on 2023-05-06 with reprex v2.0.2

Session info ``` r sessioninfo::session_info() #> ─ Session info ─────────────────────────────────────────────────────────────── #> setting value #> version R version 4.2.3 (2023-03-15) #> os macOS Ventura 13.3.1 #> system aarch64, darwin20 #> ui X11 #> language (EN) #> collate en_US.UTF-8 #> ctype en_US.UTF-8 #> tz America/Toronto #> date 2023-05-06 #> pandoc 2.19.2 @ /Applications/RStudio.app/Contents/Resources/app/quarto/bin/tools/ (via rmarkdown) #> #> ─ Packages ─────────────────────────────────────────────────────────────────── #> package * version date (UTC) lib source #> caret * 6.0-94 2023-03-21 [1] CRAN (R 4.2.0) #> class 7.3-22 2023-05-03 [1] CRAN (R 4.2.3) #> cli 3.6.1 2023-03-23 [1] CRAN (R 4.2.0) #> codetools 0.2-19 2023-02-01 [1] CRAN (R 4.2.3) #> colorspace 2.1-0 2023-01-23 [1] CRAN (R 4.2.0) #> data.table 1.14.8 2023-02-17 [1] CRAN (R 4.2.0) #> digest 0.6.31 2022-12-11 [1] CRAN (R 4.2.0) #> dplyr 1.1.2 2023-04-20 [1] CRAN (R 4.2.0) #> evaluate 0.21 2023-05-05 [1] CRAN (R 4.2.0) #> fansi 1.0.4 2023-01-22 [1] CRAN (R 4.2.0) #> fastmap 1.1.1 2023-02-24 [1] CRAN (R 4.2.0) #> foreach 1.5.2 2022-02-02 [1] CRAN (R 4.2.0) #> fs 1.6.2 2023-04-25 [1] CRAN (R 4.2.0) #> future 1.32.0 2023-03-07 [1] CRAN (R 4.2.0) #> future.apply 1.10.0 2022-11-05 [1] CRAN (R 4.2.0) #> generics 0.1.3 2022-07-05 [1] CRAN (R 4.2.0) #> ggplot2 * 3.4.2 2023-04-03 [1] CRAN (R 4.2.0) #> globals 0.16.2 2022-11-21 [1] CRAN (R 4.2.0) #> glue 1.6.2 2022-02-24 [1] CRAN (R 4.2.0) #> gower 1.0.1 2022-12-22 [1] CRAN (R 4.2.0) #> gtable 0.3.3 2023-03-21 [1] CRAN (R 4.2.0) #> hardhat 1.3.0 2023-03-30 [1] CRAN (R 4.2.3) #> htmltools 0.5.5 2023-03-23 [1] CRAN (R 4.2.0) #> ipred 0.9-14 2023-03-09 [1] CRAN (R 4.2.0) #> iterators 1.0.14 2022-02-05 [1] CRAN (R 4.2.0) #> knitr 1.42 2023-01-25 [1] CRAN (R 4.2.0) #> lattice * 0.21-8 2023-04-05 [1] CRAN (R 4.2.0) #> lava 1.7.2.1 2023-02-27 [1] CRAN (R 4.2.0) #> lifecycle 1.0.3 2022-10-07 [1] CRAN (R 4.2.0) #> listenv 0.9.0 2022-12-16 [1] CRAN (R 4.2.0) 
#> lubridate 1.9.2 2023-02-10 [1] CRAN (R 4.2.0) #> magrittr 2.0.3 2022-03-30 [1] CRAN (R 4.2.0) #> MASS 7.3-60 2023-05-04 [1] CRAN (R 4.2.0) #> Matrix 1.5-4 2023-04-04 [1] CRAN (R 4.2.0) #> ModelMetrics 1.2.2.2 2020-03-17 [1] CRAN (R 4.2.0) #> munsell 0.5.0 2018-06-12 [1] CRAN (R 4.2.0) #> nlme 3.1-162 2023-01-31 [1] CRAN (R 4.2.3) #> nnet 7.3-19 2023-05-03 [1] CRAN (R 4.2.3) #> parallelly 1.35.0 2023-03-23 [1] CRAN (R 4.2.0) #> pillar 1.9.0 2023-03-22 [1] CRAN (R 4.2.0) #> pkgconfig 2.0.3 2019-09-22 [1] CRAN (R 4.2.0) #> plyr 1.8.8 2022-11-11 [1] CRAN (R 4.2.0) #> pROC 1.18.0 2021-09-03 [1] CRAN (R 4.2.0) #> prodlim 2023.03.31 2023-04-02 [1] CRAN (R 4.2.0) #> purrr 1.0.1 2023-01-10 [1] CRAN (R 4.2.0) #> R.cache 0.16.0 2022-07-21 [1] CRAN (R 4.2.0) #> R.methodsS3 1.8.2 2022-06-13 [1] CRAN (R 4.2.0) #> R.oo 1.25.0 2022-06-12 [1] CRAN (R 4.2.0) #> R.utils 2.12.2 2022-11-11 [1] CRAN (R 4.2.0) #> R6 2.5.1 2021-08-19 [1] CRAN (R 4.2.0) #> Rcpp 1.0.10 2023-01-22 [1] CRAN (R 4.2.0) #> recipes 1.0.6 2023-04-25 [1] CRAN (R 4.2.0) #> reprex 2.0.2 2022-08-17 [1] CRAN (R 4.2.0) #> reshape2 1.4.4 2020-04-09 [1] CRAN (R 4.2.0) #> rlang 1.1.1 2023-04-28 [1] CRAN (R 4.2.0) #> rmarkdown 2.21 2023-03-26 [1] CRAN (R 4.2.3) #> rpart 4.1.19 2022-10-21 [1] CRAN (R 4.2.3) #> rstudioapi 0.14 2022-08-22 [1] CRAN (R 4.2.0) #> scales 1.2.1 2022-08-20 [1] CRAN (R 4.2.0) #> sessioninfo 1.2.2 2021-12-06 [1] CRAN (R 4.2.0) #> stringi 1.7.12 2023-01-11 [1] CRAN (R 4.2.0) #> stringr 1.5.0 2022-12-02 [1] CRAN (R 4.2.0) #> styler 1.9.1 2023-03-04 [1] CRAN (R 4.2.0) #> survival 3.5-5 2023-03-12 [1] CRAN (R 4.2.0) #> tibble 3.2.1 2023-03-20 [1] CRAN (R 4.2.0) #> tidyselect 1.2.0 2022-10-10 [1] CRAN (R 4.2.0) #> timechange 0.2.0 2023-01-11 [1] CRAN (R 4.2.0) #> timeDate 4022.108 2023-01-07 [1] CRAN (R 4.2.0) #> utf8 1.2.3 2023-01-31 [1] CRAN (R 4.2.0) #> vctrs 0.6.2 2023-04-19 [1] CRAN (R 4.2.0) #> withr 2.5.0 2022-03-03 [1] CRAN (R 4.2.0) #> xfun 0.39 2023-04-20 [1] CRAN (R 4.2.0) #> yaml 2.3.7 
2023-01-23 [1] CRAN (R 4.2.0) #> #> [1] /Library/Frameworks/R.framework/Versions/4.2-arm64/Resources/library #> #> ────────────────────────────────────────────────────────────────────────────── ```