rstudio / vetiver-r

Version, share, deploy, and monitor models
https://rstudio.github.io/vetiver-r/
Other
183 stars 27 forks source link

Error when creating vetiver model from a tidymodels-tuned xgboost model #153

Closed datadavidz closed 2 years ago

datadavidz commented 2 years ago

I am getting "Error in xgb.get.handle(model) : invalid xgb.Booster.handle" when using the vetiver_model() function on a tuned xgboost model (tune, then finalize the workflow). I don't get the error if I take out the tune/finalize steps. I got the error on my own data, and I get the same error when trying similar steps with the mtcars data:

library(tidyverse) library(tidymodels) library(xgboost) library(vetiver)

cars <- as_tibble(mtcars)

set.seed(123) cars_split <- initial_split(cars) cars_train <- training(cars_split) cars_test <- testing(cars_split)

cars_rec <- recipe(mpg ~ ., data = cars_train) %>% step_normalize(all_predictors())

set.seed(234) cars_folds <- vfold_cv(cars_train, v = 5)

xgb_wf <- workflow() %>% add_recipe(cars_rec) %>% add_model(xgb_spec)

xgboost_grid <- grid_regular(trees())

set.seed(456)

xgboost_res <- tune_grid( xgb_wf, resamples = cars_folds, grid = xgboost_grid, metrics = metric_set(rmse, rsq, mae), control = control_grid(save_pred = TRUE) )

params_xgboost_best <- xgboost_res %>% select_best("mae")

final_xgboost <- finalize_workflow(xgb_wf, params_xgboost_best)

final_res <- last_fit(final_xgboost, cars_split)

v <- final_res %>% extract_workflow() %>% vetiver_model(model_name = "cars-xgb")

Error in xgb.get.handle(model) : invalid xgb.Booster.handle

juliasilge commented 2 years ago

Hmmmm, I am not able to reproduce this problem, with either the current CRAN or GitHub versions of vetiver:

library(tidymodels)
library(vetiver)
#> 
#> Attaching package: 'vetiver'
#> The following object is masked from 'package:tune':
#> 
#>     load_pkgs

set.seed(123)
cars_split <- initial_split(mtcars)
cars_train <- training(cars_split)
cars_test <- testing(cars_split)

set.seed(234)
cars_folds <- vfold_cv(cars_train, v = 5)

cars_rec <- recipe(mpg ~ ., data = cars_train) %>%
  step_normalize(all_predictors())
xgb_spec <- boost_tree(mode = "regression", trees = tune())
xgb_wf <- workflow(cars_rec, xgb_spec)

xgboost_grid <- grid_regular(trees())

set.seed(456)

xgboost_res <- tune_grid(
  xgb_wf,
  resamples = cars_folds,
  grid = xgboost_grid,
  metrics = metric_set(rmse, rsq, mae),
  control = control_grid(save_pred = TRUE)
)

params_xgboost_best <- xgboost_res %>% select_best("mae")
final_xgboost <- finalize_workflow(xgb_wf, params_xgboost_best)
final_res <- last_fit(final_xgboost, cars_split)

v <- final_res %>%
  extract_workflow() %>%
  vetiver_model(model_name = "cars-xgb")

v
#> 
#> ── cars-xgb ─ <bundled_workflow> model for deployment 
#> A xgboost regression modeling workflow using 10 features

Created on 2022-10-17 with reprex v2.0.2

Could you update your reproducible example to use the reprex package? Using reprex makes it easier to see both the input and output, and for us to re-run the code in a local session. Thanks! 🙌

datadavidz commented 2 years ago

I've narrowed it down: after I run some code with doParallel::registerDoParallel(), I get the error when running the vetiver_model() function until I begin a fresh session. Sorry, I didn't include this statement in the previous code.

library(tidyverse)
#> Warning: package 'tidyverse' was built under R version 4.2.1
#> Warning: package 'ggplot2' was built under R version 4.2.1
#> Warning: package 'tibble' was built under R version 4.2.1
#> Warning: package 'tidyr' was built under R version 4.2.1
#> Warning: package 'readr' was built under R version 4.2.1
#> Warning: package 'purrr' was built under R version 4.2.1
#> Warning: package 'stringr' was built under R version 4.2.1
#> Warning: package 'forcats' was built under R version 4.2.1
library(tidymodels)
#> Warning: package 'tidymodels' was built under R version 4.2.1
#> Warning: package 'broom' was built under R version 4.2.1
#> Warning: package 'dials' was built under R version 4.2.1
#> Warning: package 'scales' was built under R version 4.2.1
#> Warning: package 'infer' was built under R version 4.2.1
#> Warning: package 'modeldata' was built under R version 4.2.1
#> Warning: package 'parsnip' was built under R version 4.2.1
#> Warning: package 'recipes' was built under R version 4.2.1
#> Warning: package 'rsample' was built under R version 4.2.1
#> Warning: package 'tune' was built under R version 4.2.1
#> Warning: package 'workflows' was built under R version 4.2.1
#> Warning: package 'workflowsets' was built under R version 4.2.1
#> Warning: package 'yardstick' was built under R version 4.2.1
library(xgboost)
#> Warning: package 'xgboost' was built under R version 4.2.1
#> 
#> Attaching package: 'xgboost'
#> The following object is masked from 'package:dplyr':
#> 
#>     slice
library(vetiver)
#> Warning: package 'vetiver' was built under R version 4.2.1
#> 
#> Attaching package: 'vetiver'
#> The following object is masked from 'package:tune':
#> 
#>     load_pkgs

cars <- as_tibble(mtcars)

set.seed(123)
cars_split <- initial_split(cars)
cars_train <- training(cars_split)
cars_test <- testing(cars_split)

xgb_spec <- boost_tree(
  trees = tune() 
) %>% 
  set_engine("xgboost") %>% 
  set_mode("regression")

cars_rec <- recipe(mpg ~ ., data = cars_train) %>%
  step_normalize(all_predictors())

set.seed(234)
cars_folds <- vfold_cv(cars_train, v = 5)

xgb_wf <- workflow() %>%
  add_recipe(cars_rec) %>%
  add_model(xgb_spec)

xgboost_grid <- grid_regular(trees())

set.seed(456)

doParallel::registerDoParallel()

xgboost_res <- tune_grid(
  xgb_wf,
  resamples = cars_folds,
  grid = xgboost_grid,
  metrics = metric_set(rmse, rsq, mae),
  control = control_grid(save_pred = TRUE)
)

params_xgboost_best <- xgboost_res %>% select_best("mae")

final_xgboost <- finalize_workflow(xgb_wf, params_xgboost_best)

final_res <- last_fit(final_xgboost, cars_split)

v <- final_res %>%
  extract_workflow() %>%
  vetiver_model(model_name = "cars-xgb")
#> Error in xgb.get.handle(model): invalid xgb.Booster.handle

#Error in xgb.get.handle(model) : invalid xgb.Booster.handle
juliasilge commented 2 years ago

Hmmm, I can't reproduce that either:

library(tidymodels)

set.seed(123)
cars_split <- initial_split(mtcars)
cars_train <- training(cars_split)
cars_test <- testing(cars_split)

set.seed(234)
cars_folds <- vfold_cv(cars_train, v = 5)

cars_rec <- recipe(mpg ~ ., data = cars_train) %>%
  step_normalize(all_predictors())
xgb_spec <- boost_tree(mode = "regression", trees = tune())
xgb_wf <- workflow(cars_rec, xgb_spec)

xgboost_grid <- grid_regular(trees())

doParallel::registerDoParallel()
set.seed(456)
xgboost_res <- tune_grid(
  xgb_wf,
  resamples = cars_folds,
  grid = xgboost_grid,
  metrics = metric_set(rmse, rsq, mae),
  control = control_grid(save_pred = TRUE)
)

params_xgboost_best <- xgboost_res %>% select_best("mae")
final_xgboost <- finalize_workflow(xgb_wf, params_xgboost_best)
final_res <- last_fit(final_xgboost, cars_split)

library(vetiver)
#> 
#> Attaching package: 'vetiver'
#> The following object is masked from 'package:tune':
#> 
#>     load_pkgs

v <- final_res %>%
  extract_workflow() %>%
  vetiver_model(model_name = "cars-xgb")

v
#> 
#> ── cars-xgb ─ <bundled_workflow> model for deployment 
#> A xgboost regression modeling workflow using 10 features

Created on 2022-10-17 with reprex v2.0.2

Are you trying to use doParallel::registerDoParallel() on Windows perhaps? You need to use PSOCK clusters on Windows instead. Have you tried updating all your package versions?

datadavidz commented 2 years ago

Yes, I am using Windows. I tried the PSOCK clusters but was getting the same error. I then updated all my packages, and the error is gone! Thank you, and sorry for taking up your time on this.