Closed gutama closed 3 years ago
tidyflow is just a wrapper around workflows, so we can simply fit a tidyflow and pass the resulting parsnip model to modelStudio:
library(tidyflow)
library(tidymodels)
#> ── Attaching packages ───────────────────────────────── tidymodels 0.1.1.9000 ──
#> ✔ broom 0.7.1 ✔ recipes 0.1.13
#> ✔ dials 0.0.9.9000 ✔ rsample 0.0.8.9000
#> ✔ dplyr 1.0.4 ✔ tibble 3.0.6.9000
#> ✔ ggplot2 3.3.3 ✔ tidyr 1.1.2
#> ✔ infer 0.5.3 ✔ tune 0.1.1.9000
#> ✔ modeldata 0.0.2 ✔ workflows 0.2.0.9000
#> ✔ parsnip 0.1.4 ✔ yardstick 0.0.7
#> ✔ purrr 0.3.4
#> ── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
#> ✖ purrr::discard() masks scales::discard()
#> ✖ dplyr::filter() masks stats::filter()
#> ✖ dplyr::lag() masks stats::lag()
#> ✖ recipes::step() masks stats::step()
library(DALEX)
#> Welcome to DALEX (version: 2.0.1).
#> Find examples and detailed introduction at: https://pbiecek.github.io/ema/
#> Additional features will be available after installation of: ggpubr.
#> Use 'install_dependencies()' to get all suggested dependencies
#>
#> Attaching package: 'DALEX'
#> The following object is masked from 'package:dplyr':
#>
#> explain
library(modelStudio)
# Split mtcars into a training part (used for fitting) and a held-out test part.
test_train <- initial_split(mtcars)
train_df <- training(test_train)
test_df <- testing(test_train)

# Random forest regression specification, fitted through the ranger engine.
mod1 <- set_engine(rand_forest(mode = "regression"), "ranger")

# Build the tidyflow step by step (data -> formula -> model) and fit it.
res <-
  train_df %>%
  tidyflow() %>%
  plug_formula(mpg ~ .) %>%
  plug_model(mod1) %>%
  fit()

# Wrap the fitted parsnip model in a DALEX explainer, evaluated on the test set.
explainer <- explain(
  model = pull_tflow_fit(res),
  data = test_df,
  y = test_df$mpg,
  label = "parsnip"
)
#> Preparation of a new explainer is initiated
#> -> model label : parsnip
#> -> data : 8 rows 11 cols
#> -> target variable : 8 values
#> -> predict function : yhat.model_fit will be used ( default )
#> -> predicted values : numerical, min = 12.95459 , mean = 20.42622 , max = 27.48106
#> -> model_info : package parsnip , ver. 0.1.4 , task regression ( default )
#> -> residual function : difference between y and yhat ( default )
#> -> residuals : numerical, min = -2.554595 , mean = 1.823778 , max = 6.292993
#> A new explainer has been created!
# Build the modelStudio for the explainer. No `new_observation` is supplied,
# so (as the message below reports) the observations used for the local
# explanations are taken from the explainer's data.
modelStudio(explainer)
#> `new_observation` argument is NULL. `new_observation_n` observations needed to calculate local explanations are taken from the data.
Hi, thanks for the explanation
I am trying to use cross-validation (CV):
# Random forest regression spec (ranger engine) whose trees, mtry and min_n
# are marked with tune() instead of being fixed.
cv_mod <- rand_forest(mode = "regression",trees = tune(), mtry = tune(), min_n = tune()) %>% set_engine("ranger")
# Tidyflow with a resampling step (vfold_cv) and a tuning grid (grid_regular)
# plugged in before fitting.
res2 <-
train_df %>%
tidyflow() %>%
plug_formula(mpg ~ .) %>%
plug_model(cv_mod) %>%
plug_resample(vfold_cv) %>% # Specify resample: cross-validation
plug_grid(grid_regular) %>%
fit()
# NOTE(review): complete_tflow(metric = "rmse") presumably selects the best
# tuning result by RMSE and finalizes the flow -- confirm in the tidyflow docs.
best <- res2 %>% complete_tflow(metric = "rmse")
# NOTE(review): `best$fit` is not the parsnip model. The output quoted below
# reports it as class `stage_fit`, which DALEX does not recognise, so the
# predict and residual functions fail. The follow-up snippet in this thread
# passes pull_tflow_fit(best_flow) instead.
explainer <- explain(best$fit,
data = test_df,
y = test_df$mpg,
label = "parsnip")
But if I use cross-validation, the output is:
Preparation of a new explainer is initiated
-> model label : parsnip
-> data : 8 rows 11 cols
-> target variable : 8 values
-> predict function : yhat.default will be used ( default )
-> predicted values : No value for predict function target column. ( default )
-> model_info : package Model of class: stage_fit package unrecognized , ver. Unknown , task regression ( default )
-> predicted values : the predict_function returns an error when executed ( WARNING )
-> residual function : difference between y and yhat ( default )
-> residuals : the residual_function returns an error when executed ( WARNING )
A new explainer has been created!
How do I extract the final model?
It seems I did it the wrong way; this works:
# Finalize the tuned flow on the rmse metric, then give DALEX the fitted
# parsnip model pulled out of it (rather than the flow's raw `$fit` slot).
best_flow <- complete_tflow(res2, metric = "rmse")
explainer <- explain(
  model = pull_tflow_fit(best_flow),
  data = test_df,
  y = test_df$mpg,
  label = "parsnip"
)
the output is now
Preparation of a new explainer is initiated
-> model label : parsnip
-> data : 8 rows 11 cols
-> target variable : 8 values
-> predict function : yhat.model_fit will be used ( default )
-> predicted values : No value for predict function target column. ( default )
-> model_info : package parsnip , ver. 0.1.5 , task regression ( default )
-> predicted values : numerical, min = 12.79456 , mean = 20.11334 , max = 27.60665
-> residual function : difference between y and yhat ( default )
-> residuals : numerical, min = -2.394563 , mean = 1.199156 , max = 5.69395
A new explainer has been created
Thank you
Hi Jorge,
It seems I can't just pipe the 'parsnip' model into 'modelStudio',
because the recipe is not processed:
# Pull the training data with prep = TRUE (presumably this returns the data
# after the recipe has been applied -- confirm in the tidyflow docs).
train_data <- pull_tflow_training(final_model, prep = TRUE)

explainer <- explain(
  model = pull_tflow_fit(final_model),
  data = select(train_data, -gdprl),
  y = train_data$gdprl,
  label = "parsnip",
  # `x` is the model and `y` the new data to score; return the `.pred`
  # column as a plain vector.
  predict_function = function(x, y) {
    pull(predict(x, new_data = y), .pred)
  }
)
So I need to wrap it in custom helper functions (I took these from dalextest):
#' Prediction wrapper usable as a DALEX `predict_function`.
#'
#' @param X.model Fitted object with a `predict()` method whose model
#'   specification is reachable at `X.model$fit$fit$fit$spec`.
#' @param newdata Data handed to `predict()`.
#' @param positive_value Unused; kept so existing callers keep working.
#' @return For regression, the `.pred` column of the prediction. For
#'   classification, the class-probability matrix with columns named after
#'   the stored levels, or only its second column when there are exactly two
#'   classes.
custom_predict <- function(X.model, newdata, positive_value) {
  parsnip_fit <- X.model$fit$fit$fit
  mode <- parsnip_fit$spec$mode

  # identical() instead of `==`: a missing (NULL) mode must reach the
  # informative stop() below rather than fail inside `if` with
  # "argument is of length zero".
  if (identical(mode, "classification")) {
    response <- as.matrix(predict(X.model, newdata, type = "prob"))
    colnames(response) <- parsnip_fit$lvl
    if (ncol(response) == 2) {
      # Binary case: keep only the probability of the second level.
      response <- response[, 2]
    }
  } else if (identical(mode, "regression")) {
    response <- predict(X.model, newdata)$.pred
  } else {
    stop("Mode specification has to be either classification or regression")
  }
  response
}
#' Baked predictor data for an explainer.
#'
#' Preps the recipe stored at `recipe_workflow$pre$actions$recipe$recipe_res`
#' on `dataset`, bakes `dataset` with it and drops the target column.
#'
#' @param recipe_workflow Object holding the recipe at the path above.
#' @param dataset Data frame used both to prep the recipe and to bake.
#' @param target_variable Name of the target column, as a string.
#' @return A base data.frame of the baked data without the target column.
custom_data_expl <- function(recipe_workflow, dataset, target_variable) {
  recipe_spec <- recipe_workflow$pre$actions$recipe$recipe_res
  baked <- bake(prep(recipe_spec, dataset), dataset)
  # all_of() makes it explicit that `target_variable` holds a column *name*;
  # a bare external vector inside select() is ambiguous and deprecated.
  as.data.frame(select(baked, -all_of(target_variable)))
}
#' Target vector for an explainer.
#'
#' Preps the recipe stored at `recipe_workflow$pre$actions$recipe$recipe_res`
#' on `dataset`, bakes `dataset` with it and extracts the target column.
#'
#' @param recipe_workflow Object holding the recipe at the path above.
#' @param dataset Data frame used both to prep the recipe and to bake.
#' @param target_variable Name of the target column, as a string.
#' @return A numeric vector: the target itself when it is numeric, otherwise
#'   1 where the target equals "pos" and 0 elsewhere.
custom_y_expl <- function(recipe_workflow, dataset, target_variable) {
  recipe_spec <- recipe_workflow$pre$actions$recipe$recipe_res
  baked <- bake(prep(recipe_spec, dataset), dataset)

  if (!target_variable %in% names(baked)) {
    stop("Target variable '", target_variable, "' not found in the baked data")
  }

  # Extract the column by name. The previous mutate() compared the column
  # *name* string with 'pos', which produced a vector of zeros for any input.
  target <- baked[[target_variable]]

  # Regression targets are already numeric and must not be binarised.
  if (is.numeric(target)) {
    return(target)
  }
  ifelse(target == "pos", 1, 0)
}
#' Extract the nested fitted model (`$fit$fit`) from a fitted workflow-like
#' object.
custom_model_expl <- function(recipe_workflow) {
  recipe_workflow$fit$fit
}
#' Baked predictor rows to use as new observations.
#'
#' Preps the recipe stored at `recipe_workflow$pre$actions$recipe$recipe_res`
#' on `dataset`, bakes `dataset` with it, drops the target column and keeps
#' the requested rows.
#'
#' @param recipe_workflow Object holding the recipe at the path above.
#' @param dataset Data frame used both to prep the recipe and to bake.
#' @param target_variable Name of the target column, as a string.
#' @param rownumber Row index (or indices) to keep.
#' @return A base data.frame with the selected rows of the baked predictors.
custom_new_obs <- function(recipe_workflow, dataset, target_variable, rownumber) {
  recipe_spec <- recipe_workflow$pre$actions$recipe$recipe_res
  baked <- bake(prep(recipe_spec, dataset), dataset)
  # all_of(): `target_variable` is a column name held in a string.
  predictors <- as.data.frame(select(baked, -all_of(target_variable)))
  # drop = FALSE keeps a data.frame even when only one predictor column is left.
  predictors[rownumber, , drop = FALSE]
}
# Training data as stored in the flow (no `prep` argument here); the custom_*
# helpers prep and bake the recipe themselves.
train_data <- pull_tflow_training(final_model)

# Assemble the DALEX explainer from the helper functions defined above.
explainer_xgboost <- DALEX::explain(
  model = custom_model_expl(final_model),
  data = custom_data_expl(final_model, train_data, "gdprl"),
  y = custom_y_expl(final_model, train_data, "gdprl"),
  predict_function = custom_predict,
  label = "xgboost"
)
But it doesn't work :(
Luckily, there is an easier way if we use DALEXtra:
library(DALEXtra)
# Pass the workflow stored inside the tidyflow to DALEXtra's tidymodels
# explainer; predictors are the training data without the target column.
explainer <- explain_tidymodels(
  final_model$fit$fit$wflow,
  data = select(train_data, -gdprl),
  y = train_data$gdprl,
  label = "xgboost"
)
now it works
I do have questions
Thank you
Hi
I just found modelStudio.
I want to use tidymodels, but I also want to use tidyflow. I tried it, but it doesn't work. Do I have to convert the tidyflow to a workflow, or is there an easier way?
Thank you.
with kind regards, ginanjar