Closed 2020new closed 4 years ago
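Hi, the unprocessed test data (`data_in_scope_test`, not the already-baked `test_data`) should be supplied to the `response()` and `predict()` functions when using the recipe, because a model fit from a recipe applies the recipe's preprocessing steps to `newdata` itself. In particular, the "Test set performance" code should be as follows.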
obs_1 <- response(model_fit_rec, newdata = data_in_scope_test)
pred_1 <- predict(model_fit_rec, newdata = data_in_scope_test, type = "prob")
performance(obs_1, pred_1)
Thanks!
I get two different AUC values when I use the formula and when I use the recipe. `library(recipes) library(MachineShop) library(rsample) Telco_customer <- read.csv("D:/Quellen/Telecom customer/WAFn-UseC-Telco-Customer-Churn.csv") data_set <- Telco_customer%>%dplyr::select(-"customerID")%>%tidyr::drop_na()
Rename the target variable (Churn in my case) to Target
data_in_scope <- data_set%>% plyr::rename(c("Churn" = "Target"))
Severity of class imbalance
round(prop.table(table(data_in_scope$Target)), 2)
Split data into train and test data
set.seed(2020) train_test_split_data <- initial_split(data_in_scope) data_in_scope_train <- training(train_test_split_data) data_in_scope_test <- testing(train_test_split_data)
Pre-processing the data with {recipes}
set.seed(2020) rec_app <- recipe(Target ~., data = data_in_scope_train) %>% # Fomula step_dummy(all_nominal(), -Target) %>% # convert nominal data into one or more numeric. step_corr(all_predictors()) %>% # remove variables that have large absolute correlations with other variables. step_center(all_numeric(), -all_outcomes())%>% # normalize numeric data to have a mean of zero. step_scale(all_numeric(), -all_outcomes()) # normalize numeric data to have a standard deviation of one. trained_rec_app <- prep(rec_app, training = data_in_scope_train, retain = TRUE)
Apply to train and test set
train_data <- as.data.frame(juice(trained_rec_app)) test_data <- as.data.frame( bake(trained_rec_app, new_data = data_in_scope_test))
#####################################
Parallel processing
library(doParallel) registerDoParallel(cores = 3)
Model fit using formula
set.seed(2020) model_fit_fo <- fit(Target ~ ., data = train_data, model = GBMModel)
Test set performance
obs_fo <- response(model_fit_fo, newdata = test_data) pred_fo <- predict(model_fit_fo, newdata = test_data, type = "prob") performance(obs_fo, pred_fo)
Model fit using recipe
set.seed(2020) model_fit_rec <- fit(rec_app, model= GBMModel)
Test set performance
obs_1 <- response(model_fit_rec, newdata = test_data) pred_1 <- predict(model_fit_rec, newdata = test_data, type = "prob") performance(obs_1, pred_1)`