ModelOriented / treeshap

Compute SHAP values for your tree-based models using the TreeSHAP algorithm
https://modeloriented.github.io/treeshap/
GNU General Public License v3.0

lightgbm.unify example erroring. #17

Closed: nspyrison closed this issue 1 year ago

nspyrison commented 2 years ago

Following the example in ?lightgbm.unify, training errors before the model can be unified:

library(treeshap)

library(lightgbm)
#> Loading required package: R6
param_lgbm <- list(objective = "regression", max_depth = 2, force_row_wise = TRUE)
data_fifa <- fifa20$data[!colnames(fifa20$data) %in%
                           c('work_rate', 'value_eur', 'gk_diving', 'gk_handling',
                             'gk_kicking', 'gk_reflexes', 'gk_speed', 'gk_positioning')]
data <- na.omit(cbind(data_fifa, fifa20$target))
sparse_data <- as.matrix(data[,-ncol(data)])
x <- lightgbm::lgb.Dataset(sparse_data, label = as.matrix(data[,ncol(data)]))
lgb_data <- lightgbm::lgb.Dataset.construct(x)
lgb_model <- lightgbm::lightgbm(data = lgb_data, params = param_lgbm, save_name = "", verbose = 0)
#> [LightGBM] [Warning] No further splits with positive gain, best gain: -inf
#> (the above warning is repeated 31 times in total; duplicates omitted)
#> Error in bst$save_model(filename = save_name): Model file  is not available for writes
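# The remaining steps of the documented example never run, since training fails above: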
# unified_model <- lightgbm.unify(lgb_model, sparse_data)
# shaps <- treeshap(unified_model, data[1:2, ])
# plot_contribution(shaps, obs = 1)
sessionInfo()
#> R version 4.1.2 (2021-11-01)
#> Platform: x86_64-w64-mingw32/x64 (64-bit)
#> Running under: Windows 10 x64 (build 19042)
#> 
#> Matrix products: default
#> 
#> locale:
#> [1] LC_COLLATE=English_United States.1252 
#> [2] LC_CTYPE=English_United States.1252   
#> [3] LC_MONETARY=English_United States.1252
#> [4] LC_NUMERIC=C                          
#> [5] LC_TIME=English_United States.1252    
#> 
#> attached base packages:
#> [1] stats     graphics  grDevices utils     datasets  methods   base     
#> 
#> other attached packages:
#> [1] lightgbm_3.3.1 R6_2.5.1       treeshap_0.1.1
#> 
#> loaded via a namespace (and not attached):
#>  [1] Rcpp_1.0.7        pillar_1.6.4      compiler_4.1.2    highr_0.9        
#>  [5] R.methodsS3_1.8.1 R.utils_2.11.0    tools_4.1.2       digest_0.6.29    
#>  [9] jsonlite_1.7.2    lattice_0.20-44   evaluate_0.14     lifecycle_1.0.1  
#> [13] tibble_3.1.2      gtable_0.3.0      R.cache_0.15.0    pkgconfig_2.0.3  
#> [17] rlang_0.4.10      Matrix_1.3-4      reprex_2.0.1      DBI_1.1.2        
#> [21] yaml_2.2.1        xfun_0.29         fastmap_1.1.0     dplyr_1.0.7      
#> [25] withr_2.4.3       styler_1.6.2      stringr_1.4.0     knitr_1.37       
#> [29] generics_0.1.1    fs_1.5.2          vctrs_0.3.8       tidyselect_1.1.1 
#> [33] grid_4.1.2        glue_1.4.2        data.table_1.14.2 fansi_0.4.2      
#> [37] rmarkdown_2.11    purrr_0.3.4       ggplot2_3.3.5     magrittr_2.0.1   
#> [41] backports_1.4.1   scales_1.1.1      ellipsis_0.3.2    htmltools_0.5.2  
#> [45] assertthat_0.2.1  colorspace_2.0-2  utf8_1.1.4        stringi_1.5.3    
#> [49] munsell_0.5.0     crayon_1.4.2      R.oo_1.24.0

Created on 2022-01-03 by the reprex package (v2.0.1)
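For context, the error is triggered by save_name = "": the traceback shows lightgbm() saving the fitted booster to disk via bst$save_model(filename = save_name), and an empty filename cannot be written. One possible workaround (a sketch only, assuming the lightgbm 3.3.1 and treeshap 0.1.1 versions from the sessionInfo above, not the fix that was eventually merged) is to train with lightgbm::lgb.train(), which returns the booster without writing a model file:

library(treeshap)
library(lightgbm)

param_lgbm <- list(objective = "regression", max_depth = 2, force_row_wise = TRUE)
data_fifa <- fifa20$data[!colnames(fifa20$data) %in%
                           c('work_rate', 'value_eur', 'gk_diving', 'gk_handling',
                             'gk_kicking', 'gk_reflexes', 'gk_speed', 'gk_positioning')]
data <- na.omit(cbind(data_fifa, fifa20$target))
sparse_data <- as.matrix(data[, -ncol(data)])
x <- lightgbm::lgb.Dataset(sparse_data, label = as.matrix(data[, ncol(data)]))
lgb_data <- lightgbm::lgb.Dataset.construct(x)

# lgb.train() does not take a save_name argument, so nothing is written to disk
lgb_model <- lightgbm::lgb.train(params = param_lgbm, data = lgb_data,
                                 nrounds = 10, verbose = 0)

# the rest of the documented example should then run unchanged
unified_model <- lightgbm.unify(lgb_model, sparse_data)
shaps <- treeshap(unified_model, data[1:2, ])
plot_contribution(shaps, obs = 1)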

nspyrison commented 2 years ago

A working example combining the treeshap and lightgbm documentation examples, maybe something like this:

library(treeshap)
library(lightgbm)
#> Loading required package: R6

data_fifa  <- fifa20$data[!colnames(fifa20$data) %in%
                            c('work_rate', 'value_eur', 'gk_diving', 'gk_handling',
                              'gk_kicking', 'gk_reflexes', 'gk_speed', 'gk_positioning')]

## lightgbm on the full feature matrix (rows with NAs retained, as in the lightgbm example):
lgbm_params <- list(
  num_leaves = 4L, learning_rate = 1.0, objective = "binary", nthread = 2L)
fit <- lightgbm(data = as.matrix(data_fifa), params = lgbm_params,
                label = fifa20$target, nrounds = 2L)
#> [LightGBM] [Info] Number of positive: 18028, number of negative: 250
#> [LightGBM] [Warning] Auto-choosing col-wise multi-threading, the overhead of testing was 0.004856 seconds.
#> You can set `force_col_wise=true` to remove the overhead.
#> [LightGBM] [Info] Total Bins 3441
#> [LightGBM] [Info] Number of data points in the train set: 18278, number of used features: 48
#> [LightGBM] [Info] [binary:BoostFromScore]: pavg=0.986322 -> initscore=4.278220
#> [LightGBM] [Info] Start training from score 4.278220
#> [1] "[1]:  train's binary_logloss:0.100651"
#> [1] "[2]:  train's binary_logloss:0.0992022"

## lightgbm on complete cases only (na.omit, as in the treeshap example):
dat <- na.omit(cbind(data_fifa, fifa20$target))
fit <- lightgbm(data = as.matrix(dat[, -ncol(dat)]), params = lgbm_params,
                label = dat[, ncol(dat)], nrounds = 2L)
#> [LightGBM] [Info] Number of positive: 16032, number of negative: 210
#> [LightGBM] [Warning] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001164 seconds.
#> You can set `force_row_wise=true` to remove the overhead.
#> And if memory is not enough, you can set `force_col_wise=true`.
#> [LightGBM] [Info] Total Bins 2956
#> [LightGBM] [Info] Number of data points in the train set: 16242, number of used features: 48
#> [LightGBM] [Info] [binary:BoostFromScore]: pavg=0.987071 -> initscore=4.335234
#> [LightGBM] [Info] Start training from score 4.335234
#> [1] "[1]:  train's binary_logloss:0.0975157"
#> [1] "[2]:  train's binary_logloss:0.0976132"

Created on 2022-01-05 by the reprex package (v2.0.1)
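From here, the fitted booster can be unified and passed to treeshap, mirroring the calls from the first reprex (a sketch, assuming the treeshap 0.1.1 API and continuing from the complete-cases fit above):

unified <- lightgbm.unify(fit, as.matrix(dat[, -ncol(dat)]))  # unify using the training features
shaps <- treeshap(unified, dat[1:2, ])                        # SHAP values for the first two rows
plot_contribution(shaps, obs = 1)                             # contribution plot for row 1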

krzyzinskim commented 1 year ago

Thanks! I see that it was already fixed in #25.