Closed spedygiorgio closed 7 years ago
You would most likely have to write a custom method. The current code sets the objective function. I can write code around that so that you can pass it in (and I'll intercept it in the ...
if you do).
The main issue with xgboost
is that it doesn't have a single monolithic function call so that we can like ...
to it. This is great from a general programming perspective (and my stuff is moving in that direction too) but it makes it hard to have a single path (the ...
) to insert options.
I am not sure if something change since last year, but I can definitely pass offsets to xgboost. I just feed the xgbMatrixTrain to caret::train after setting the "base_margin" and I get the same prediction that I get from xgb.train.
I haven't found a way to pass the early_stopping_rounds, and having Max say that it is a pain in this issue means that I probably won't succeed... :)
# Libraries --------------------------------------------------------------
library(xgboost)
library(dplyr)
library(caret)
library(insuranceData) # example dataset https://cran.r-project.org/web/packages/insuranceData/insuranceData.pdf
library(pdp)           # partial dependency plots
library(lime)          # Local Interpretable Model-Agnostic Explanations

set.seed(123)
data(dataCar)

# Keep only the response, the offset and the predictors of interest.
mydb <- dataCar %>% select(clm, exposure, veh_value, veh_body,
                           veh_age, gender, area, agecat)

label_var  <- "clm"       # response: claim indicator
offset_var <- "exposure"  # offset: policy exposure (time at risk)

# Every remaining column is a feature.
# NOTE: one_of() is superseded in tidyselect; all_of() is the
# recommended replacement for character vectors of column names.
feature_vars <- mydb %>%
  select(-all_of(c(label_var, offset_var))) %>%
  colnames()

# Prepare data for xgboost (one-hot encoding of categorical/factor data).
myformula <- paste0("~", paste0(feature_vars, collapse = " + ")) %>% as.formula()
dummyFier <- caret::dummyVars(myformula, data = mydb, fullRank = TRUE)
dummyVars.df <- predict(dummyFier, newdata = mydb)
mydb_dummy <- cbind(mydb %>% select(all_of(c(label_var, offset_var))),
                    dummyVars.df)
rm(myformula, dummyFier, dummyVars.df)
# Hold out 20% of the rows for testing, then split the remaining 80%
# into 72% train and 8% watchlist (a 90/10 split of the training pool).
trainIndex <- caret::createDataPartition(
  mydb_dummy[[label_var]],
  p = .80,
  list = FALSE,
  times = 1
)

# Test sets: both the one-hot-encoded frame and the original one.
test_dummy <- mydb_dummy[-trainIndex, ]
test <- mydb[-trainIndex, ]

temp <- mydb_dummy[trainIndex, ]
trainIndex2 <- caret::createDataPartition(
  temp[[label_var]],
  p = .90,
  list = FALSE,
  times = 1
)
train_dummy <- temp[trainIndex2, ]
wlist_dummy <- temp[-trainIndex2, ]
rm(temp)

# Feature columns = everything except the label and the offset.
feature_vars_dummy <- setdiff(colnames(train_dummy), c(label_var, offset_var))
# Build xgboost DMatrices for train / watchlist / test.
# BUG FIX: `missing` must be a single numeric value marking missing cells;
# the string "NAN" is not a valid marker — use NA (the default).
xgbMatrixTrain <- xgb.DMatrix(
  data = train_dummy %>% select(all_of(feature_vars_dummy)) %>% as.matrix(),
  label = train_dummy %>% pull(label_var),
  missing = NA)
xgbMatrixWlist <- xgb.DMatrix(
  data = wlist_dummy %>% select(all_of(feature_vars_dummy)) %>% as.matrix(),
  label = wlist_dummy %>% pull(label_var),
  missing = NA)
xgbMatrixTest <- xgb.DMatrix(
  data = test_dummy %>% select(all_of(feature_vars_dummy)) %>% as.matrix(),
  label = test_dummy %>% pull(label_var),
  missing = NA)

# First issue with caret: specifying a base margin on the matrices.
# base_margin is the per-row baseline prediction xgboost boosts from;
# for a Poisson model with an exposure offset this is log(exposure).
setinfo(xgbMatrixTrain, "base_margin", train_dummy %>% pull(offset_var) %>% log())
setinfo(xgbMatrixTest,  "base_margin", test_dummy  %>% pull(offset_var) %>% log())
setinfo(xgbMatrixWlist, "base_margin", wlist_dummy %>% pull(offset_var) %>% log())

# Second issue with caret: supplying a watchlist.
myWatch <- list(wlist = xgbMatrixWlist,
                train = xgbMatrixTrain)
# Monotone constraint: force predictions to be non-decreasing in veh_value.
# FIX: dplyr::data_frame() is deprecated; tibble() is the replacement.
myConstraint <- tibble(Variable = feature_vars_dummy) %>%
  mutate(direction = ifelse(Variable == "veh_value", 1, 0))

# Booster hyperparameters (canonical underscore spelling for max_depth).
myParam <- list(
  eta = 0.01,
  max_depth = 5,
  gamma = 0.001,
  colsample_bytree = 0.8,
  min_child_weight = 1,
  subsample = 0.8)

# Callbacks: early stopping on the watchlist metric, plus printing the
# evaluation log every 20 rounds.
myCallback <- list(
  cb.early.stop(
    metric_name = "wlist_poisson_nloglik",
    stopping_rounds = 10),
  cb.print.evaluation(period = 20))
# Train the booster directly with xgb.train — this path honors the
# base_margin set on the DMatrix.
# FIX: the documented argument is `nrounds`; `nround` is a deprecated alias.
start <- Sys.time()
booster <- xgb.train(
  params = myParam,
  data = xgbMatrixTrain,
  nrounds = 100,
  objective = "count:poisson",
  eval_metric = "poisson-nloglik",
  monotone_constraints = myConstraint$direction)
  # watchlist = myWatch,
  # callbacks = myCallback)
duree <- Sys.time() - start
test$pred <- predict(booster, newdata = xgbMatrixTest)
# k-fold cross-validation setup for caret (2 folds to keep this fast).
trControl <- caret::trainControl(
  method = "cv",
  number = 2,
  verboseIter = TRUE,
  allowParallel = TRUE
)

# Tuning grid with a single hyperparameter combination.
tuneGridXGB <- expand.grid(
  nrounds = 100,
  eta = 0.01,
  max_depth = 5,
  gamma = 0.001,
  colsample_bytree = 0.8,
  min_child_weight = 1,
  subsample = 0.8
)
# Train the xgboost learner through caret, forwarding xgboost-specific
# arguments (objective, eval_metric, monotone_constraints) via `...`.
start <- Sys.time()
xgbmod <- caret::train(
  x = xgbMatrixTrain,
  y = train_dummy %>% pull(label_var),
  method = "xgbTree",
  trControl = trControl,
  tuneGrid = tuneGridXGB,
  objective = "count:poisson",
  eval_metric = "poisson-nloglik",
  monotone_constraints = myConstraint$direction
)
print(Sys.time() - start)

booster_caret <- xgbmod$finalModel

# Compare the direct xgb.train predictions with the caret-fitted ones.
test$pred_caret <- predict(booster_caret, newdata = xgbMatrixTest)
plot(test$pred, test$pred_caret)
an issue: the out-of-fold predictions generated by adding savePredictions= TRUE to trControl do not appear to take the offset variable into account.
# 10-fold CV, keeping the out-of-fold predictions for inspection.
trControl <- caret::trainControl(
  method = "cv",
  number = 10,
  verboseIter = TRUE,
  allowParallel = TRUE,
  savePredictions = TRUE
)
I then extract the out-of-fold predictions:
# Re-order the saved CV predictions by row index so they line up with the
# training rows, then compare in-sample vs out-of-fold predictions.
train_dummy$out_of_fold_pred <- xgbmod$pred %>%
  arrange(rowIndex) %>%
  pull(pred)
train_dummy$pred_caret <- predict(booster_caret, newdata = xgbMatrixTrain)
plot(train_dummy$pred_caret, train_dummy$out_of_fold_pred)
While pred_caret values are anywhere between 0 and 0.7, the out_of_fold_pred are all between 0.32 and 0.40. This is probably due to the out-of-fold predictions not taking the offset into account. In my model, most of the variance in the predicted values seems to come from the offset value (exposure).
I am not sure if it is possible to adapt train function's syntax to take into account:
The following is a reproducible example:
I would like to perform a grid search with caret's train function. But I am not sure how to set the base margin and the watchlist.