pbiecek / archivist

A set of tools for datasets and plots archiving
http://pbiecek.github.io/archivist/
74 stars 9 forks source link

Explainers build on keras model - fail #338

Open PaulCristina opened 4 years ago

PaulCristina commented 4 years ago

Explainers build using keras model are not saved correct and fail with the message:

> Error: object of type 'closure' is not subsettable

renv::init()

Sys.setenv(RETICULATE_PYTHON = "/home/paulc/anaconda3/envs/polynote/bin/python") Sys.setenv(TENSORFLOW_PYTHON = "/home/paulc/anaconda3/envs/polynote/bin/python")

1.0 LOAD LIBRARY

suppressMessages(library(tidyverse)) suppressMessages(library(data.table))

> Warning: package 'data.table' was built under R version 4.0.3

suppressMessages(library(Matrix)) suppressMessages(library(matrixStats)) suppressMessages(library(magrittr)) suppressMessages(library(here)) suppressMessages(library(archivist)) suppressMessages(library(recipes)) suppressMessages(library(yardstick)) suppressMessages(library(tidymodels))

> Warning: package 'broom' was built under R version 4.0.3

> Warning: package 'modeldata' was built under R version 4.0.3

> Warning: package 'parsnip' was built under R version 4.0.3

suppressMessages(library(reticulate)) suppressMessages(library(keras)) suppressMessages(library(tune)) suppressMessages(library(workflows)) suppressMessages(library(foreach)) suppressMessages(library(doParallel)) suppressMessages(library(doSNOW)) suppressMessages(library(parallel)) suppressMessages(library(DALEX))

> Warning: package 'DALEX' was built under R version 4.0.3

2.0 LOAD DATA

df.data <- read.csv("https://raw.githubusercontent.com/treselle-systems/customer_churn_analysis/master/WA_Fn-UseC_-Telco-Customer-Churn.csv") glimpse(df.data)

> Rows: 7,043

> Columns: 21

> $ customerID "7590-VHVEG", "5575-GNVDE", "3668-QPYBK", "7795-CFOC…

> $ gender "Female", "Male", "Male", "Male", "Female", "Female"…

> $ SeniorCitizen 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…

> $ Partner "Yes", "No", "No", "No", "No", "No", "No", "No", "Ye…

> $ Dependents "No", "No", "No", "No", "No", "No", "Yes", "No", "No…

> $ tenure 1, 34, 2, 45, 2, 8, 22, 10, 28, 62, 13, 16, 58, 49, …

> $ PhoneService "No", "Yes", "Yes", "No", "Yes", "Yes", "Yes", "No",…

> $ MultipleLines "No phone service", "No", "No", "No phone service", …

> $ InternetService "DSL", "DSL", "DSL", "DSL", "Fiber optic", "Fiber op…

> $ OnlineSecurity "No", "Yes", "Yes", "Yes", "No", "No", "No", "Yes", …

> $ OnlineBackup "Yes", "No", "Yes", "No", "No", "No", "Yes", "No", "…

> $ DeviceProtection "No", "Yes", "No", "Yes", "No", "Yes", "No", "No", "…

> $ TechSupport "No", "No", "No", "Yes", "No", "No", "No", "No", "Ye…

> $ StreamingTV "No", "No", "No", "No", "No", "Yes", "Yes", "No", "Y…

> $ StreamingMovies "No", "No", "No", "No", "No", "Yes", "No", "No", "Ye…

> $ Contract "Month-to-month", "One year", "Month-to-month", "One…

> $ PaperlessBilling "Yes", "No", "Yes", "No", "Yes", "Yes", "Yes", "No",…

> $ PaymentMethod "Electronic check", "Mailed check", "Mailed check", …

> $ MonthlyCharges 29.85, 56.95, 53.85, 42.30, 70.70, 99.65, 89.10, 29.…

> $ TotalCharges 29.85, 1889.50, 108.15, 1840.75, 151.65, 820.50, 194…

> $ Churn "No", "No", "Yes", "No", "Yes", "Yes", "No", "No", "…

Make copy of df.data and drop the unneeded columns

data_set <- df.data %>% dplyr::select(-"customerID")

Rename the outcome variable (Churn in my case) to Target

data_in_scope <- data_set %>% plyr::rename(c("Churn" = "Target"))

Drop rows with missing value(11 missing values, very small percentage of our total data)

data_in_scope <- data_set %>% plyr::rename(c("Churn" = "Target")) %>% drop_na()

Split data into train and test data and create resamples for tuning

set.seed(2020) train_test_split_data <- initial_split(data_in_scope) data_in_scope_train <- training(train_test_split_data) data_in_scope_test <- testing(train_test_split_data)

Pre-Processing the data with{recipes}

set.seed(2020) rec <- recipe(Target ~., data = data_in_scope_train) %>% # Fomula step_dummy(all_nominal(), -Target) %>% # convert nominal data into one or more numeric. step_corr(all_predictors()) %>% # remove variables that have large absolute

correlations with other variables.

step_center(all_numeric(), -all_outcomes()) %>% # normalize numeric data to have a mean of zero. step_scale(all_numeric(), -all_outcomes()) # normalize numeric data to have a standard deviation of one.

trained_rec <- prep(rec, training = data_in_scope_train, retain = TRUE)

create the train and test set

train_data <- as.data.frame(juice(trained_rec)) test_data <- as.data.frame( bake(trained_rec, new_data = data_in_scope_test))

binary response

yTrain <- as.integer(ifelse(train_data$Target == "Yes", 1, 0)) yTest <- as.integer(ifelse(test_data$Target == "Yes", 1, 0))

predictors

train_X <- train_data[,-5] %>% as.matrix() test_X <- test_data[,-5] %>% as.matrix()

n_features <- ncol(train_X)

path to models

path <- "/home/paulc/projects_Paul/teste/archivist/models" file <- "keras_model.hdf5"

epochs <- 100 batch_size <- 64

3.0 CALLBACK'S

callbacks_list <- list( callback_early_stopping( monitor = "val_loss", patience = 10, min_delta = 0 ), callback_model_checkpoint( filepath = paste0(path, "/", file), monitor = "val_loss", save_best_only = TRUE ), callback_reduce_lr_on_plateau( monitor = "val_loss", factor = 0.1, patience = 2 ), callback_lambda( on_epoch_end = function(epoch, logs) { if (epoch %% 10 == 0) cat("\n") cat(".") } )

)

4.0 RUN MODEL

create_model <- function(Learning_rate = 1e-3, decay = 0){

KERAS clear session

k_clear_session()

set seed TF1

use_session_with_seed(2020, disable_gpu = TRUE, disable_parallel_cpu = FALSE) # seed: for reproducible research

Initialize sequential model

model <- keras_model_sequential() %>%

# First hidden layer
layer_dense(
  units              = 256,
  kernel_initializer = "uniform",
  activation         = "relu",
  input_shape        = n_features) %>%

# Dropout to prevent overfitting
layer_dropout(rate = 0.3) %>%

layer_gaussian_noise(0.1) %>%
layer_dropout(rate = 0.3) %>%

# Second hidden layer
layer_dense(
  units              = 64,
  kernel_initializer = "uniform",
  activation         = "relu") %>%

# Dropout to prevent overfitting
layer_dropout(rate = 0.3) %>%

# Second hidden layer
layer_dense(
  units              = 32,
  kernel_initializer = "uniform",
  activation         = "relu") %>%

# Dropout to prevent overfitting
layer_dropout(rate = 0.2) %>%

# Output layer
layer_dense(
  units              = 1,
  kernel_initializer = "uniform",
  activation         = "sigmoid") %>%

# Compile ANN
compile(
  optimizer = optimizer_adam(lr = Learning_rate, decay = decay), # default if NULL lr=0.001,
  loss      = 'binary_crossentropy',
  metrics   = c('accuracy')
)

return(model) } create_model()

> Set session seed to 2020 (disabled GPU)

> Model

> Model: "sequential"

> ____

> Layer (type) Output Shape Param

> ================================================================================

> dense (Dense) (None, 256) 6144

> ____

> dropout (Dropout) (None, 256) 0

> ____

> gaussian_noise (GaussianNoise) (None, 256) 0

> ____

> dropout_1 (Dropout) (None, 256) 0

> ____

> dense_1 (Dense) (None, 64) 16448

> ____

> dropout_2 (Dropout) (None, 64) 0

> ____

> dense_2 (Dense) (None, 32) 2080

> ____

> dropout_3 (Dropout) (None, 32) 0

> ____

> dense_3 (Dense) (None, 1) 33

> ================================================================================

> Total params: 24,705

> Trainable params: 24,705

> Non-trainable params: 0

> ____

final_model <- create_model()

> Set session seed to 2020 (disabled GPU)

Train model

history <- final_model %>% fit( x = train_X , y = yTrain , epochs = epochs , batch_size = batch_size , validation_split = 0.2 , shuffle = TRUE , verbose = TRUE ,callbacks = callbacks_list )

>

> ..........

> ..........

> .

Save model

loaded_model <- load_model_hdf5(paste0(path, "/", file)) loaded_model

> Model

> Model: "sequential"

> ____

> Layer (type) Output Shape Param

> ================================================================================

> dense (Dense) (None, 256) 6144

> ____

> dropout (Dropout) (None, 256) 0

> ____

> gaussian_noise (GaussianNoise) (None, 256) 0

> ____

> dropout_1 (Dropout) (None, 256) 0

> ____

> dense_1 (Dense) (None, 64) 16448

> ____

> dropout_2 (Dropout) (None, 64) 0

> ____

> dense_2 (Dense) (None, 32) 2080

> ____

> dropout_3 (Dropout) (None, 32) 0

> ____

> dense_3 (Dense) (None, 1) 33

> ================================================================================

> Total params: 24,705

> Trainable params: 24,705

> Non-trainable params: 0

> ____

5.0 LOCAL GIT

loka_repo <- "/home/paulc/projects_Paul/teste/archivist/repo" setLocalRepo(loka_repo)

6.0 EXPLAINERS

df.test <- test_data %>% select(-Target)

Prediction functions

custom_predict_keras <- function(object, newdata) {pred <- as.data.frame(predict_proba(object, newdata)) response <- pred$V1 return(response)}

keras_explainer <- DALEX::explain(model = keras_model, data = as.matrix(df.test), y = yTest, predict_function = custom_predict_keras, label = "Neural Network model", colorize = FALSE)%a% asave(repoDir = loka_repo, userTags = c("tags", "keras_explainer"))

> Preparation of a new explainer is initiated

> -> model label : Neural Network model

> -> data : 1758 rows 23 cols

> -> data : rownames to data was added ( from 1 to 1758 )

> -> target variable : 1758 values

> -> predict function : custom_predict_keras

> -> predicted values : the predict_function returns an error when executed ( WARNING )

> -> model_info : package Model of class: function package unrecognized , ver. Unknown , task regression ( default )

> -> residual function : difference between y and yhat ( default )

> -> residuals : the residual_function returns an error when executed ( WARNING )

> A new explainer has been created!

searchInLocalRepo(pattern = "keras_explainer")

> [1] "5ff6baacb6af009334b549035eb9967c" "3d730e3b1cb8111a91632fbb2faad6ee"

> [3] "57f922abdb583dbe113200a37f1188ca"

keras_explainer <- loadFromLocalRepo(md5hash = '3d730e3b1cb8111a91632fbb2faad6ee', value = TRUE) keras_explainer

> Model label: Neural Network model

> Model class: function

> Data head :

> SeniorCitizen tenure MonthlyCharges TotalCharges gender_Male Partner_Yes

> 1 -0.4410523 -0.9121967 -1.171850 -0.8754606 -1.005609 -0.9725976

> 2 -0.4410523 -0.1814047 1.330267 0.3316549 -1.005609 1.0279795

> Dependents_Yes MultipleLines_No.phone.service MultipleLines_Yes

> 1 -0.6512835 2.9915291 -0.8492012

> 2 -0.6512835 -0.3342138 1.1773540

> InternetService_Fiber.optic OnlineSecurity_Yes OnlineBackup_Yes

> 1 -0.8888612 1.5728443 -0.7328433

> 2 1.1248218 -0.6356703 -0.7328433

> DeviceProtection_Yes TechSupport_Yes StreamingTV_Yes

> 1 -0.7307052 -0.6389055 -0.7890825

> 2 1.3682814 1.5648798 1.2670544

> StreamingMovies_No.internet.service StreamingMovies_Yes Contract_One.year

> 1 -0.5186113 -0.7998775 -0.5141934

> 2 -0.5186113 1.2499544 -0.5141934

> Contract_Two.year PaperlessBilling_Yes PaymentMethod_Credit.card..automatic.

> 1 -0.5602159 -1.2048268 -0.5288983

> 2 -0.5602159 0.8298375 -0.5288983

> PaymentMethod_Electronic.check PaymentMethod_Mailed.check

> 1 -0.706135 1.8453643

> 2 1.415891 -0.5417957

vi_keras <- model_parts(explainer = keras_explainer, loss_function = loss_one_minus_auc, type = "difference", B = 10, N = NULL)

> Error: object of type 'closure' is not subsettable

sessionInfo()

> R version 4.0.2 (2020-06-22)

> Platform: x86_64-pc-linux-gnu (64-bit)

> Running under: CentOS Linux 7 (Core)

>

> Matrix products: default

> BLAS/LAPACK: /usr/lib64/libopenblasp-r0.3.3.so

>

> locale:

> [1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C

> [3] LC_TIME=en_US.UTF-8 LC_COLLATE=en_US.UTF-8

> [5] LC_MONETARY=en_US.UTF-8 LC_MESSAGES=en_US.UTF-8

> [7] LC_PAPER=en_US.UTF-8 LC_NAME=C

> [9] LC_ADDRESS=C LC_TELEPHONE=C

> [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C

>

> attached base packages:

> [1] parallel stats graphics grDevices datasets utils methods

> [8] base

>

> other attached packages:

> [1] DALEX_2.0.1 doSNOW_1.0.19 snow_0.4-3 doParallel_1.0.16

> [5] iterators_1.0.13 foreach_1.5.1 keras_2.3.0.0 reticulate_1.18

> [9] workflows_0.2.1 tune_0.1.1 rsample_0.0.8 parsnip_0.1.4

> [13] modeldata_0.1.0 infer_0.5.3 dials_0.0.9 scales_1.1.1

> [17] broom_0.7.2 tidymodels_0.1.1 yardstick_0.0.7 recipes_0.1.14

> [21] archivist_2.3.4 here_0.1 magrittr_1.5 matrixStats_0.57.0

> [25] Matrix_1.2-18 data.table_1.13.2 forcats_0.5.0 stringr_1.4.0

> [29] dplyr_1.0.2 purrr_0.3.4 readr_1.4.0 tidyr_1.1.2

> [33] tibble_3.0.4 ggplot2_3.3.2 tidyverse_1.3.0

>

> loaded via a namespace (and not attached):

> [1] readxl_1.3.1 backports_1.1.10 plyr_1.8.6 splines_4.0.2

> [5] listenv_0.8.0 tfruns_1.4 usethis_1.6.3 digest_0.6.27

> [9] htmltools_0.5.0 fansi_0.4.1 memoise_1.1.0 remotes_2.1.1

> [13] globals_0.13.1 modelr_0.1.8 gower_0.2.2 prettyunits_1.1.1

> [17] colorspace_1.4-1 blob_1.2.1 rvest_0.3.6 haven_2.3.1

> [21] xfun_0.17 callr_3.5.1 crayon_1.3.4 RCurl_1.98-1.2

> [25] jsonlite_1.7.1 zeallot_0.1.0 survival_3.1-12 flock_0.7

> [29] glue_1.4.2 gtable_0.3.0 ipred_0.9-9 pkgbuild_1.1.0

> [33] DBI_1.1.0 Rcpp_1.0.5 GPfit_1.0-8 bit_4.0.4

> [37] lava_1.6.8 prodlim_2019.11.13 httr_1.4.2 ellipsis_0.3.1

> [41] pkgconfig_2.0.3 nnet_7.3-14 dbplyr_1.4.4 utf8_1.1.4

> [45] tidyselect_1.1.0 rlang_0.4.8 DiceDesign_1.8-1 munsell_0.5.0

> [49] cellranger_1.1.0 tools_4.0.2 cli_2.1.0 generics_0.1.0

> [53] RSQLite_2.2.1 devtools_2.3.0 evaluate_0.14 yaml_2.2.1

> [57] processx_3.4.4 knitr_1.29.4 bit64_4.0.5 fs_1.5.0

> [61] future_1.19.1 whisker_0.4 xml2_1.3.2 compiler_4.0.2

> [65] rstudioapi_0.11 testthat_2.3.2 reprex_0.3.0 lhs_1.1.1

> [69] stringi_1.5.3 highr_0.8 ps_1.4.0 desc_1.2.0

> [73] lattice_0.20-41 tensorflow_2.2.0 vctrs_0.3.4 pillar_1.4.6

> [77] lifecycle_0.2.0 furrr_0.2.1 bitops_1.0-6 R6_2.5.0

> [81] renv_0.12.0-12 sessioninfo_1.1.1 codetools_0.2-16 MASS_7.3-51.6

> [85] assertthat_0.2.1 pkgload_1.1.0 rprojroot_1.3-2 withr_2.3.0

> [89] hms_0.5.3 grid_4.0.2 rpart_4.1-15 timeDate_3043.102

> [93] class_7.3-17 rmarkdown_2.5 ingredients_2.0 pROC_1.16.2

> [97] lubridate_1.7.9 base64enc_0.1-3

Created on 2020-11-04 by the reprex package (v0.3.0)

pbiecek commented 3 years ago

@maksymiuks any idea?