set.seed(2020)
rec <- recipe(Target ~.,
data = data_in_scope_train) %>% # Fomula
step_dummy(all_nominal(), -Target) %>% # convert nominal data into one or more numeric.
step_corr(all_predictors()) %>% # remove variables that have large absolute
correlations with other variables.
step_center(all_numeric(), -all_outcomes()) %>% # normalize numeric data to have a mean of zero.
step_scale(all_numeric(), -all_outcomes()) # normalize numeric data to have a standard deviation of one.
trained_rec <- prep(rec,
training = data_in_scope_train,
retain = TRUE)
Explainers build using keras model are not saved correct and fail with the message:
> Error: object of type 'closure' is not subsettable
renv::init()
Sys.setenv(RETICULATE_PYTHON = "/home/paulc/anaconda3/envs/polynote/bin/python") Sys.setenv(TENSORFLOW_PYTHON = "/home/paulc/anaconda3/envs/polynote/bin/python")
1.0 LOAD LIBRARY
suppressMessages(library(tidyverse)) suppressMessages(library(data.table))
> Warning: package 'data.table' was built under R version 4.0.3
suppressMessages(library(Matrix)) suppressMessages(library(matrixStats)) suppressMessages(library(magrittr)) suppressMessages(library(here)) suppressMessages(library(archivist)) suppressMessages(library(recipes)) suppressMessages(library(yardstick)) suppressMessages(library(tidymodels))
> Warning: package 'broom' was built under R version 4.0.3
> Warning: package 'modeldata' was built under R version 4.0.3
> Warning: package 'parsnip' was built under R version 4.0.3
suppressMessages(library(reticulate)) suppressMessages(library(keras)) suppressMessages(library(tune)) suppressMessages(library(workflows)) suppressMessages(library(foreach)) suppressMessages(library(doParallel)) suppressMessages(library(doSNOW)) suppressMessages(library(parallel)) suppressMessages(library(DALEX))
> Warning: package 'DALEX' was built under R version 4.0.3
2.0 LOAD DATA
df.data <- read.csv("https://raw.githubusercontent.com/treselle-systems/customer_churn_analysis/master/WA_Fn-UseC_-Telco-Customer-Churn.csv") glimpse(df.data)
> Rows: 7,043
> Columns: 21
> $ customerID "7590-VHVEG", "5575-GNVDE", "3668-QPYBK", "7795-CFOC…
> $ gender "Female", "Male", "Male", "Male", "Female", "Female"…
> $ SeniorCitizen 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
> $ Partner "Yes", "No", "No", "No", "No", "No", "No", "No", "Ye…
> $ Dependents "No", "No", "No", "No", "No", "No", "Yes", "No", "No…
> $ tenure 1, 34, 2, 45, 2, 8, 22, 10, 28, 62, 13, 16, 58, 49, …
> $ PhoneService "No", "Yes", "Yes", "No", "Yes", "Yes", "Yes", "No",…
> $ MultipleLines "No phone service", "No", "No", "No phone service", …
> $ InternetService "DSL", "DSL", "DSL", "DSL", "Fiber optic", "Fiber op…
> $ OnlineSecurity "No", "Yes", "Yes", "Yes", "No", "No", "No", "Yes", …
> $ OnlineBackup "Yes", "No", "Yes", "No", "No", "No", "Yes", "No", "…
> $ DeviceProtection "No", "Yes", "No", "Yes", "No", "Yes", "No", "No", "…
> $ TechSupport "No", "No", "No", "Yes", "No", "No", "No", "No", "Ye…
> $ StreamingTV "No", "No", "No", "No", "No", "Yes", "Yes", "No", "Y…
> $ StreamingMovies "No", "No", "No", "No", "No", "Yes", "No", "No", "Ye…
> $ Contract "Month-to-month", "One year", "Month-to-month", "One…
> $ PaperlessBilling "Yes", "No", "Yes", "No", "Yes", "Yes", "Yes", "No",…
> $ PaymentMethod "Electronic check", "Mailed check", "Mailed check", …
> $ MonthlyCharges 29.85, 56.95, 53.85, 42.30, 70.70, 99.65, 89.10, 29.…
> $ TotalCharges 29.85, 1889.50, 108.15, 1840.75, 151.65, 820.50, 194…
> $ Churn "No", "No", "Yes", "No", "Yes", "Yes", "No", "No", "…
Make copy of df.data and drop the unneeded columns
data_set <- df.data %>% dplyr::select(-"customerID")
Rename the outcome variable (Churn in my case) to Target
data_in_scope <- data_set %>% plyr::rename(c("Churn" = "Target"))
Drop rows with missing value(11 missing values, very small percentage of our total data)
data_in_scope <- data_set %>% plyr::rename(c("Churn" = "Target")) %>% drop_na()
Split data into train and test data and create resamples for tuning
set.seed(2020) train_test_split_data <- initial_split(data_in_scope) data_in_scope_train <- training(train_test_split_data) data_in_scope_test <- testing(train_test_split_data)
Pre-Processing the data with{recipes}
set.seed(2020) rec <- recipe(Target ~., data = data_in_scope_train) %>% # Fomula step_dummy(all_nominal(), -Target) %>% # convert nominal data into one or more numeric. step_corr(all_predictors()) %>% # remove variables that have large absolute
correlations with other variables.
step_center(all_numeric(), -all_outcomes()) %>% # normalize numeric data to have a mean of zero. step_scale(all_numeric(), -all_outcomes()) # normalize numeric data to have a standard deviation of one.
trained_rec <- prep(rec, training = data_in_scope_train, retain = TRUE)
create the train and test set
train_data <- as.data.frame(juice(trained_rec)) test_data <- as.data.frame( bake(trained_rec, new_data = data_in_scope_test))
binary response
yTrain <- as.integer(ifelse(train_data$Target == "Yes", 1, 0)) yTest <- as.integer(ifelse(test_data$Target == "Yes", 1, 0))
predictors
train_X <- train_data[,-5] %>% as.matrix() test_X <- test_data[,-5] %>% as.matrix()
n_features <- ncol(train_X)
path to models
path <- "/home/paulc/projects_Paul/teste/archivist/models" file <- "keras_model.hdf5"
epochs <- 100 batch_size <- 64
3.0 CALLBACK'S
callbacks_list <- list( callback_early_stopping( monitor = "val_loss", patience = 10, min_delta = 0 ), callback_model_checkpoint( filepath = paste0(path, "/", file), monitor = "val_loss", save_best_only = TRUE ), callback_reduce_lr_on_plateau( monitor = "val_loss", factor = 0.1, patience = 2 ), callback_lambda( on_epoch_end = function(epoch, logs) { if (epoch %% 10 == 0) cat("\n") cat(".") } )
)
4.0 RUN MODEL
create_model <- function(Learning_rate = 1e-3, decay = 0){
KERAS clear session
k_clear_session()
set seed TF1
use_session_with_seed(2020, disable_gpu = TRUE, disable_parallel_cpu = FALSE) # seed: for reproducible research
Initialize sequential model
model <- keras_model_sequential() %>%
return(model) } create_model()
> Set session seed to 2020 (disabled GPU)
> Model
> Model: "sequential"
> ____
> Layer (type) Output Shape Param
> ================================================================================
> dense (Dense) (None, 256) 6144
> ____
> dropout (Dropout) (None, 256) 0
> ____
> gaussian_noise (GaussianNoise) (None, 256) 0
> ____
> dropout_1 (Dropout) (None, 256) 0
> ____
> dense_1 (Dense) (None, 64) 16448
> ____
> dropout_2 (Dropout) (None, 64) 0
> ____
> dense_2 (Dense) (None, 32) 2080
> ____
> dropout_3 (Dropout) (None, 32) 0
> ____
> dense_3 (Dense) (None, 1) 33
> ================================================================================
> Total params: 24,705
> Trainable params: 24,705
> Non-trainable params: 0
> ____
final_model <- create_model()
> Set session seed to 2020 (disabled GPU)
Train model
history <- final_model %>% fit( x = train_X , y = yTrain , epochs = epochs , batch_size = batch_size , validation_split = 0.2 , shuffle = TRUE , verbose = TRUE ,callbacks = callbacks_list )
>
> ..........
> ..........
> .
Save model
loaded_model <- load_model_hdf5(paste0(path, "/", file)) loaded_model
> Model
> Model: "sequential"
> ____
> Layer (type) Output Shape Param
> ================================================================================
> dense (Dense) (None, 256) 6144
> ____
> dropout (Dropout) (None, 256) 0
> ____
> gaussian_noise (GaussianNoise) (None, 256) 0
> ____
> dropout_1 (Dropout) (None, 256) 0
> ____
> dense_1 (Dense) (None, 64) 16448
> ____
> dropout_2 (Dropout) (None, 64) 0
> ____
> dense_2 (Dense) (None, 32) 2080
> ____
> dropout_3 (Dropout) (None, 32) 0
> ____
> dense_3 (Dense) (None, 1) 33
> ================================================================================
> Total params: 24,705
> Trainable params: 24,705
> Non-trainable params: 0
> ____
5.0 LOCAL GIT
loka_repo <- "/home/paulc/projects_Paul/teste/archivist/repo" setLocalRepo(loka_repo)
6.0 EXPLAINERS
df.test <- test_data %>% select(-Target)
Prediction functions
custom_predict_keras <- function(object, newdata) {pred <- as.data.frame(predict_proba(object, newdata)) response <- pred$V1 return(response)}
keras_explainer <- DALEX::explain(model = keras_model, data = as.matrix(df.test), y = yTest, predict_function = custom_predict_keras, label = "Neural Network model", colorize = FALSE)%a% asave(repoDir = loka_repo, userTags = c("tags", "keras_explainer"))
> Preparation of a new explainer is initiated
> -> model label : Neural Network model
> -> data : 1758 rows 23 cols
> -> data : rownames to data was added ( from 1 to 1758 )
> -> target variable : 1758 values
> -> predict function : custom_predict_keras
> -> predicted values : the predict_function returns an error when executed ( WARNING )
> -> model_info : package Model of class: function package unrecognized , ver. Unknown , task regression ( default )
> -> residual function : difference between y and yhat ( default )
> -> residuals : the residual_function returns an error when executed ( WARNING )
> A new explainer has been created!
searchInLocalRepo(pattern = "keras_explainer")
> [1] "5ff6baacb6af009334b549035eb9967c" "3d730e3b1cb8111a91632fbb2faad6ee"
> [3] "57f922abdb583dbe113200a37f1188ca"
keras_explainer <- loadFromLocalRepo(md5hash = '3d730e3b1cb8111a91632fbb2faad6ee', value = TRUE) keras_explainer
> Model label: Neural Network model
> Model class: function
> Data head :
> SeniorCitizen tenure MonthlyCharges TotalCharges gender_Male Partner_Yes
> 1 -0.4410523 -0.9121967 -1.171850 -0.8754606 -1.005609 -0.9725976
> 2 -0.4410523 -0.1814047 1.330267 0.3316549 -1.005609 1.0279795
> Dependents_Yes MultipleLines_No.phone.service MultipleLines_Yes
> 1 -0.6512835 2.9915291 -0.8492012
> 2 -0.6512835 -0.3342138 1.1773540
> InternetService_Fiber.optic OnlineSecurity_Yes OnlineBackup_Yes
> 1 -0.8888612 1.5728443 -0.7328433
> 2 1.1248218 -0.6356703 -0.7328433
> DeviceProtection_Yes TechSupport_Yes StreamingTV_Yes
> 1 -0.7307052 -0.6389055 -0.7890825
> 2 1.3682814 1.5648798 1.2670544
> StreamingMovies_No.internet.service StreamingMovies_Yes Contract_One.year
> 1 -0.5186113 -0.7998775 -0.5141934
> 2 -0.5186113 1.2499544 -0.5141934
> Contract_Two.year PaperlessBilling_Yes PaymentMethod_Credit.card..automatic.
> 1 -0.5602159 -1.2048268 -0.5288983
> 2 -0.5602159 0.8298375 -0.5288983
> PaymentMethod_Electronic.check PaymentMethod_Mailed.check
> 1 -0.706135 1.8453643
> 2 1.415891 -0.5417957
vi_keras <- model_parts(explainer = keras_explainer, loss_function = loss_one_minus_auc, type = "difference", B = 10, N = NULL)
> Error: object of type 'closure' is not subsettable
sessionInfo()
> R version 4.0.2 (2020-06-22)
> Platform: x86_64-pc-linux-gnu (64-bit)
> Running under: CentOS Linux 7 (Core)
>
> Matrix products: default
> BLAS/LAPACK: /usr/lib64/libopenblasp-r0.3.3.so
>
> locale:
> [1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C
> [3] LC_TIME=en_US.UTF-8 LC_COLLATE=en_US.UTF-8
> [5] LC_MONETARY=en_US.UTF-8 LC_MESSAGES=en_US.UTF-8
> [7] LC_PAPER=en_US.UTF-8 LC_NAME=C
> [9] LC_ADDRESS=C LC_TELEPHONE=C
> [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
>
> attached base packages:
> [1] parallel stats graphics grDevices datasets utils methods
> [8] base
>
> other attached packages:
> [1] DALEX_2.0.1 doSNOW_1.0.19 snow_0.4-3 doParallel_1.0.16
> [5] iterators_1.0.13 foreach_1.5.1 keras_2.3.0.0 reticulate_1.18
> [9] workflows_0.2.1 tune_0.1.1 rsample_0.0.8 parsnip_0.1.4
> [13] modeldata_0.1.0 infer_0.5.3 dials_0.0.9 scales_1.1.1
> [17] broom_0.7.2 tidymodels_0.1.1 yardstick_0.0.7 recipes_0.1.14
> [21] archivist_2.3.4 here_0.1 magrittr_1.5 matrixStats_0.57.0
> [25] Matrix_1.2-18 data.table_1.13.2 forcats_0.5.0 stringr_1.4.0
> [29] dplyr_1.0.2 purrr_0.3.4 readr_1.4.0 tidyr_1.1.2
> [33] tibble_3.0.4 ggplot2_3.3.2 tidyverse_1.3.0
>
> loaded via a namespace (and not attached):
> [1] readxl_1.3.1 backports_1.1.10 plyr_1.8.6 splines_4.0.2
> [5] listenv_0.8.0 tfruns_1.4 usethis_1.6.3 digest_0.6.27
> [9] htmltools_0.5.0 fansi_0.4.1 memoise_1.1.0 remotes_2.1.1
> [13] globals_0.13.1 modelr_0.1.8 gower_0.2.2 prettyunits_1.1.1
> [17] colorspace_1.4-1 blob_1.2.1 rvest_0.3.6 haven_2.3.1
> [21] xfun_0.17 callr_3.5.1 crayon_1.3.4 RCurl_1.98-1.2
> [25] jsonlite_1.7.1 zeallot_0.1.0 survival_3.1-12 flock_0.7
> [29] glue_1.4.2 gtable_0.3.0 ipred_0.9-9 pkgbuild_1.1.0
> [33] DBI_1.1.0 Rcpp_1.0.5 GPfit_1.0-8 bit_4.0.4
> [37] lava_1.6.8 prodlim_2019.11.13 httr_1.4.2 ellipsis_0.3.1
> [41] pkgconfig_2.0.3 nnet_7.3-14 dbplyr_1.4.4 utf8_1.1.4
> [45] tidyselect_1.1.0 rlang_0.4.8 DiceDesign_1.8-1 munsell_0.5.0
> [49] cellranger_1.1.0 tools_4.0.2 cli_2.1.0 generics_0.1.0
> [53] RSQLite_2.2.1 devtools_2.3.0 evaluate_0.14 yaml_2.2.1
> [57] processx_3.4.4 knitr_1.29.4 bit64_4.0.5 fs_1.5.0
> [61] future_1.19.1 whisker_0.4 xml2_1.3.2 compiler_4.0.2
> [65] rstudioapi_0.11 testthat_2.3.2 reprex_0.3.0 lhs_1.1.1
> [69] stringi_1.5.3 highr_0.8 ps_1.4.0 desc_1.2.0
> [73] lattice_0.20-41 tensorflow_2.2.0 vctrs_0.3.4 pillar_1.4.6
> [77] lifecycle_0.2.0 furrr_0.2.1 bitops_1.0-6 R6_2.5.0
> [81] renv_0.12.0-12 sessioninfo_1.1.1 codetools_0.2-16 MASS_7.3-51.6
> [85] assertthat_0.2.1 pkgload_1.1.0 rprojroot_1.3-2 withr_2.3.0
> [89] hms_0.5.3 grid_4.0.2 rpart_4.1-15 timeDate_3043.102
> [93] class_7.3-17 rmarkdown_2.5 ingredients_2.0 pROC_1.16.2
> [97] lubridate_1.7.9 base64enc_0.1-3
Created on 2020-11-04 by the reprex package (v0.3.0)