using drwhy with tidyflow

gutama commented 3 years ago

Hi

I want to use DrWhy (e.g. modelStudio) with tidymodels, but I also want to use tidyflow. I tried it, but it doesn't work. Do I have to convert the tidyflow to a workflow, or is there an easier way?

Thank you.

with kind regards, ginanjar

cimentadaj commented 3 years ago

tidyflow is just a wrapper around workflows so we can just run a tidyflow and pipe the parsnip model to modelStudio:

library(tidyflow)
library(tidymodels)
#> ── Attaching packages ───────────────────────────────── tidymodels 0.1.1.9000 ──
#> ✔ broom     0.7.1          ✔ recipes   0.1.13    
#> ✔ dials     0.0.9.9000     ✔ rsample   0.0.8.9000
#> ✔ dplyr     1.0.4          ✔ tibble    3.0.6.9000
#> ✔ ggplot2   3.3.3          ✔ tidyr     1.1.2     
#> ✔ infer     0.5.3          ✔ tune      0.1.1.9000
#> ✔ modeldata 0.0.2          ✔ workflows 0.2.0.9000
#> ✔ parsnip   0.1.4          ✔ yardstick 0.0.7     
#> ✔ purrr     0.3.4
#> ── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
#> ✖ purrr::discard() masks scales::discard()
#> ✖ dplyr::filter()  masks stats::filter()
#> ✖ dplyr::lag()     masks stats::lag()
#> ✖ recipes::step()  masks stats::step()
library(DALEX)
#> Welcome to DALEX (version: 2.0.1).
#> Find examples and detailed introduction at: https://pbiecek.github.io/ema/
#> Additional features will be available after installation of: ggpubr.
#> Use 'install_dependencies()' to get all suggested dependencies
#> 
#> Attaching package: 'DALEX'
#> The following object is masked from 'package:dplyr':
#> 
#>     explain
library(modelStudio)

# Split mtcars into training and testing sets
test_train <- initial_split(mtcars)
train_df <- training(test_train)
test_df <- testing(test_train)

# Random forest regression specification using the "ranger" engine
mod1 <- set_engine(rand_forest(mode = "regression"), "ranger")

# Assemble the tidyflow (data, formula, model) and fit it
res <- tidyflow(train_df) %>%
  plug_formula(mpg ~ .) %>%
  plug_model(mod1) %>%
  fit()

# create an explainer for the parsnip fit pulled out of the fitted tidyflow
explainer <- DALEX::explain(
  model = pull_tflow_fit(res),
  data = test_df,
  y = test_df$mpg,
  label = "parsnip"
)
#> Preparation of a new explainer is initiated
#>   -> model label       :  parsnip 
#>   -> data              :  8  rows  11  cols 
#>   -> target variable   :  8  values 
#>   -> predict function  :  yhat.model_fit  will be used ( [33m default [39m )
#>   -> predicted values  :  numerical, min =  12.95459 , mean =  20.42622 , max =  27.48106  
#>   -> model_info        :  package parsnip , ver. 0.1.4 , task regression ( [33m default [39m ) 
#>   -> residual function :  difference between y and yhat ( [33m default [39m )
#>   -> residuals         :  numerical, min =  -2.554595 , mean =  1.823778 , max =  6.292993  
#>  [32m A new explainer has been created! [39m

# make a studio for the model by passing the DALEX explainer to modelStudio();
# no `new_observation` is supplied here, so modelStudio takes observations
# from the explainer's data (see the message printed below)
modelStudio(explainer)
#> `new_observation` argument is NULL. `new_observation_n` observations needed to calculate local explanations are taken from the data.

gutama commented 3 years ago

Hi, thanks for the explanation

I am trying to use cv

# Random forest specification whose hyperparameters are marked for tuning
cv_mod <- set_engine(
  rand_forest(
    mode = "regression",
    trees = tune(),
    mtry = tune(),
    min_n = tune()
  ),
  "ranger"
)

# Tidyflow with a resampling step (cross-validation) and a regular grid
res2 <- tidyflow(train_df) %>%
  plug_formula(mpg ~ .) %>%
  plug_model(cv_mod) %>%
  plug_resample(vfold_cv) %>%
  plug_grid(grid_regular) %>%
  fit()

# Complete the tidyflow using the "rmse" metric
best <- complete_tflow(res2, metric = "rmse")

# NOTE(review): `best$fit` is handed to explain() directly here. The output
# quoted below reports it as "Model of class: stage_fit", falls back to
# yhat.default, and the predict function errors.
explainer <- explain(best$fit,
                     data = test_df,
                     y = test_df$mpg,
                     label = "parsnip")

But if I use cross-validation, the output is:

Preparation of a new explainer is initiated
  -> model label       : parsnip
  -> data              : 8 rows 11 cols
  -> target variable   : 8 values
  -> predict function  : yhat.default will be used ( default )
  -> predicted values  : No value for predict function target column. ( default )
  -> model_info        : package Model of class: stage_fit package unrecognized , ver. Unknown , task regression ( default )
  -> predicted values  : the predict_function returns an error when executed ( WARNING )
  -> residual function : difference between y and yhat ( default )
  -> residuals         : the residual_function returns an error when executed ( WARNING )
A new explainer has been created!

How do I extract the final model?

gutama commented 3 years ago

Seems I did it the wrong way

# Complete the tuned tidyflow, then explain the parsnip fit pulled from it
best_flow <- complete_tflow(res2, metric = "rmse")

explainer <- DALEX::explain(
  model = pull_tflow_fit(best_flow),
  data = test_df,
  y = test_df$mpg,
  label = "parsnip"
)

the output is now

Preparation of a new explainer is initiated
  -> model label       : parsnip
  -> data              : 8 rows 11 cols
  -> target variable   : 8 values
  -> predict function  : yhat.model_fit will be used ( default )
  -> predicted values  : No value for predict function target column. ( default )
  -> model_info        : package parsnip , ver. 0.1.5 , task regression ( default )
  -> predicted values  : numerical, min = 12.79456 , mean = 20.11334 , max = 27.60665
  -> residual function : difference between y and yhat ( default )
  -> residuals         : numerical, min = -2.394563 , mean = 1.199156 , max = 5.69395
A new explainer has been created

Thank you

gutama commented 3 years ago

Hi Jorge,

It seems I can't just pipe the 'parsnip' model into 'modelStudio', because the recipe is not processed:

# Training data pulled from the tidyflow with prep = TRUE
train_data <- pull_tflow_training(final_model, prep = TRUE)

# Explainer with a custom predict function that returns the `.pred` column
explainer <- explain(
  pull_tflow_fit(final_model),
  data = select(train_data, -gdprl),
  y = train_data$gdprl,
  predict_function = function(x, y) {
    pull(predict(x, new_data = y), .pred)
  },
  label = "parsnip"
)

So I need to wrap it in custom helper functions (I took these from dalextest):

# Prediction wrapper returning plain predictions for an explainer.
#
# X.model         fitted object; its mode is read from
#                 X.model$fit$fit$fit$spec$mode
# newdata         data passed on to predict()
# positive_value  accepted but not used by this function
#
# Regression: returns the `.pred` column of predict().
# Classification: returns the matrix of type = "prob" predictions with columns
# named after X.model$fit$fit$fit$lvl, reduced to its second column when it
# has exactly two columns.
# Any other mode raises an error.
custom_predict <- function(X.model, newdata, positive_value) {
  model_mode <- X.model$fit$fit$fit$spec$mode

  if (model_mode == "regression") {
    return(predict(X.model, newdata)$.pred)
  }
  if (model_mode != "classification") {
    stop("Mode specification has to be either classification or regression")
  }

  probs <- as.matrix(predict(X.model, newdata, type = "prob"))
  colnames(probs) <- X.model$fit$fit$fit$lvl
  if (ncol(probs) == 2) {
    return(probs[, 2])
  }
  probs
}

# Build the predictor data frame for an explainer.
#
# Preps the recipe found at recipe_workflow$pre$actions$recipe$recipe_res on
# `dataset`, bakes `dataset` with it and drops the target column.
#
# recipe_workflow  object holding the recipe at the path above
# dataset          data used both to prep the recipe and to bake
# target_variable  name of the target column, as a string
#
# Returns a base data.frame without the target column.
custom_data_expl <- function(recipe_workflow, dataset, target_variable) {
  prepped <- prep(recipe_workflow$pre$actions$recipe$recipe_res, dataset)
  baked <- bake(prepped, dataset)

  # all_of() states explicitly that `target_variable` holds a column *name*.
  # A bare `-target_variable` would pick a column literally called
  # "target_variable" if one existed, and tidyselect deprecates that form.
  as.data.frame(select(baked, -all_of(target_variable)))
}

# Build the target vector for an explainer.
#
# Preps the recipe found at recipe_workflow$pre$actions$recipe$recipe_res on
# `dataset`, bakes `dataset` with it and extracts the target column.
#
# recipe_workflow  object holding the recipe at the path above
# dataset          data used both to prep the recipe and to bake
# target_variable  name of the target column, as a string
#
# Returns the target column unchanged when it is numeric (regression);
# otherwise a 0/1 vector where 1 marks the value "pos". Errors if the column
# is not present in the baked data.
custom_y_expl <- function(recipe_workflow, dataset, target_variable) {
  prepped <- prep(recipe_workflow$pre$actions$recipe$recipe_res, dataset)
  baked <- bake(prepped, dataset)

  # Extract the column *named by* `target_variable`. The previous
  # `target_variable == 'pos'` compared the name string itself with 'pos',
  # so the result was a constant 0 for every row.
  target <- baked[[target_variable]]
  if (is.null(target)) {
    stop("Column `", target_variable, "` not found in the baked data",
         call. = FALSE)
  }

  # A numeric target is already usable as `y`; recoding it against "pos"
  # would wipe it out.
  if (is.numeric(target)) {
    return(target)
  }
  ifelse(target == "pos", 1, 0)
}
# Return the object stored at recipe_workflow$fit$fit.
custom_model_expl <- function(recipe_workflow) {
  fit_stage <- recipe_workflow$fit
  fit_stage$fit
}

# Extract selected rows of the processed predictors as new observations.
#
# Preps the recipe found at recipe_workflow$pre$actions$recipe$recipe_res on
# `dataset`, bakes `dataset` with it, drops the target column and keeps the
# requested rows.
#
# recipe_workflow  object holding the recipe at the path above
# dataset          data used both to prep the recipe and to bake
# target_variable  name of the target column, as a string
# rownumber        row index (or indices) to keep
#
# Returns a base data.frame with the selected rows.
custom_new_obs <- function(recipe_workflow, dataset, target_variable,
                           rownumber) {
  prepped <- prep(recipe_workflow$pre$actions$recipe$recipe_res, dataset)

  # all_of() states explicitly that `target_variable` holds a column *name*;
  # a bare `-target_variable` is ambiguous and deprecated by tidyselect.
  predictors <- as.data.frame(
    select(bake(prepped, dataset), -all_of(target_variable))
  )

  # drop = FALSE keeps a data.frame even when one predictor column remains.
  predictors[rownumber, , drop = FALSE]
}

# Training data pulled from the tidyflow (no `prep` argument this time);
# the custom_* helpers are given this data together with the tidyflow
train_data <- pull_tflow_training(final_model)

explainer_xgboost <- DALEX::explain(
  custom_model_expl(final_model),
  data = custom_data_expl(final_model, train_data, "gdprl"),
  y = custom_y_expl(final_model, train_data, "gdprl"),
  label = "xgboost",
  predict_function = custom_predict
)

But it doesn't work :(

Luckily, there is an easier way if we use DALEXtra:

library(DALEXtra)

# Explain the object stored at final_model$fit$fit$wflow with DALEXtra
explainer <- explain_tidymodels(
  final_model$fit$fit$wflow,
  data = select(train_data, -gdprl),
  y = train_data$gdprl,
  label = "xgboost"
)

now it works

I do have questions

Why doesn't tidyflow have a convenience method to pull the original workflow?
Why doesn't it work like tibble and data.frame? I mean, a tidyflow should also be a workflow. (I am not an R expert, by the way.)

Thank you

cimentadaj / tidyflow

using drwhy with tidyflow #31