Closed spedygiorgio closed 7 years ago
You would most likely have to write a custom method. The current code sets the objective function. I can write code around that so that you can pass it in (and I'll intercept it in the ...
if you do).
The main issue with xgboost
is that it doesn't have a single monolithic function call so that we can like ...
to it. This is great from a general programming perspective (and my stuff is moving in that direction too) but it makes it hard to have a single path (the ...
) to insert options.
I am not sure if something change since last year, but I can definitely pass offsets to xgboost. I just feed the xgbMatrixTrain to caret::train after setting the "base_margin" and I get the same prediction that I get from xgb.train.
I haven't found a way to pass the early_stopping_rounds, and having Max say that it is a pain in this issue means that I probably won't succeed... :)
# Libraries --------------------------------------------------------------
library(xgboost)
library(dplyr)
library(caret)
library(insuranceData) # example dataset https://cran.r-project.org/web/packages/insuranceData/insuranceData.pdf
library(pdp)           # partial dependency plots
library(lime)          # Local Interpretable Model-Agnostic Explanations

set.seed(123)
data(dataCar)

# Keep only the response, the offset and the predictors of interest.
mydb <- dataCar %>% select(clm, exposure, veh_value, veh_body,
                           veh_age, gender, area, agecat)

label_var  <- "clm"       # response: claim indicator
offset_var <- "exposure"  # offset: policy exposure (time at risk)

# Every remaining column is a feature.
# NOTE: one_of() is superseded in tidyselect; all_of() is the
# recommended replacement for character vectors of column names.
feature_vars <- mydb %>%
  select(-all_of(c(label_var, offset_var))) %>%
  colnames()

# Prepare data for xgboost (one-hot encoding of categorical/factor data).
myformula <- paste0("~", paste0(feature_vars, collapse = " + ")) %>% as.formula()
dummyFier <- caret::dummyVars(myformula, data = mydb, fullRank = TRUE)
dummyVars.df <- predict(dummyFier, newdata = mydb)
mydb_dummy <- cbind(mydb %>% select(all_of(c(label_var, offset_var))),
                    dummyVars.df)
rm(myformula, dummyFier, dummyVars.df)
# Hold out 20% of the rows for testing, then split the remaining 80%
# into 72% train and 8% watchlist (a 90/10 split of the training pool).
trainIndex <- caret::createDataPartition(
  mydb_dummy[[label_var]],
  p = .80,
  list = FALSE,
  times = 1
)

# Test sets: both the one-hot-encoded frame and the original one.
test_dummy <- mydb_dummy[-trainIndex, ]
test <- mydb[-trainIndex, ]

temp <- mydb_dummy[trainIndex, ]
trainIndex2 <- caret::createDataPartition(
  temp[[label_var]],
  p = .90,
  list = FALSE,
  times = 1
)
train_dummy <- temp[trainIndex2, ]
wlist_dummy <- temp[-trainIndex2, ]
rm(temp)

# Feature columns = everything except the label and the offset.
feature_vars_dummy <- setdiff(colnames(train_dummy), c(label_var, offset_var))
# Build xgboost DMatrices for train / watchlist / test.
# BUG FIX: `missing` must be a single numeric value marking missing cells;
# the string "NAN" is not a valid marker — use NA (the default).
xgbMatrixTrain <- xgb.DMatrix(
  data = train_dummy %>% select(all_of(feature_vars_dummy)) %>% as.matrix(),
  label = train_dummy %>% pull(label_var),
  missing = NA)
xgbMatrixWlist <- xgb.DMatrix(
  data = wlist_dummy %>% select(all_of(feature_vars_dummy)) %>% as.matrix(),
  label = wlist_dummy %>% pull(label_var),
  missing = NA)
xgbMatrixTest <- xgb.DMatrix(
  data = test_dummy %>% select(all_of(feature_vars_dummy)) %>% as.matrix(),
  label = test_dummy %>% pull(label_var),
  missing = NA)

# First issue with caret: specifying a base margin on the matrices.
# base_margin is the per-row baseline prediction xgboost boosts from;
# for a Poisson model with an exposure offset this is log(exposure).
setinfo(xgbMatrixTrain, "base_margin", train_dummy %>% pull(offset_var) %>% log())
setinfo(xgbMatrixTest,  "base_margin", test_dummy  %>% pull(offset_var) %>% log())
setinfo(xgbMatrixWlist, "base_margin", wlist_dummy %>% pull(offset_var) %>% log())

# Second issue with caret: supplying a watchlist.
myWatch <- list(wlist = xgbMatrixWlist,
                train = xgbMatrixTrain)
# Monotone constraint: force predictions to be non-decreasing in veh_value.
# FIX: dplyr::data_frame() is deprecated; tibble() is the replacement.
myConstraint <- tibble(Variable = feature_vars_dummy) %>%
  mutate(direction = ifelse(Variable == "veh_value", 1, 0))

# Booster hyperparameters (canonical underscore spelling for max_depth).
myParam <- list(
  eta = 0.01,
  max_depth = 5,
  gamma = 0.001,
  colsample_bytree = 0.8,
  min_child_weight = 1,
  subsample = 0.8)

# Callbacks: early stopping on the watchlist metric, plus printing the
# evaluation log every 20 rounds.
myCallback <- list(
  cb.early.stop(
    metric_name = "wlist_poisson_nloglik",
    stopping_rounds = 10),
  cb.print.evaluation(period = 20))
# Train the booster directly with xgb.train — this path honors the
# base_margin set on the DMatrix.
# FIX: the documented argument is `nrounds`; `nround` is a deprecated alias.
start <- Sys.time()
booster <- xgb.train(
  params = myParam,
  data = xgbMatrixTrain,
  nrounds = 100,
  objective = "count:poisson",
  eval_metric = "poisson-nloglik",
  monotone_constraints = myConstraint$direction)
  # watchlist = myWatch,
  # callbacks = myCallback)
duree <- Sys.time() - start
test$pred <- predict(booster, newdata = xgbMatrixTest)
# k-fold cross-validation setup for caret (2 folds to keep this fast).
trControl <- caret::trainControl(
  method = "cv",
  number = 2,
  verboseIter = TRUE,
  allowParallel = TRUE
)

# Tuning grid with a single hyperparameter combination.
tuneGridXGB <- expand.grid(
  nrounds = 100,
  eta = 0.01,
  max_depth = 5,
  gamma = 0.001,
  colsample_bytree = 0.8,
  min_child_weight = 1,
  subsample = 0.8
)
# Train the xgboost learner through caret, forwarding xgboost-specific
# arguments (objective, eval_metric, monotone_constraints) via `...`.
start <- Sys.time()
xgbmod <- caret::train(
  x = xgbMatrixTrain,
  y = train_dummy %>% pull(label_var),
  method = "xgbTree",
  trControl = trControl,
  tuneGrid = tuneGridXGB,
  objective = "count:poisson",
  eval_metric = "poisson-nloglik",
  monotone_constraints = myConstraint$direction
)
print(Sys.time() - start)

booster_caret <- xgbmod$finalModel

# Compare the direct xgb.train predictions with the caret-fitted ones.
test$pred_caret <- predict(booster_caret, newdata = xgbMatrixTest)
plot(test$pred, test$pred_caret)
an issue: the out-of-fold predictions generated by adding savePredictions= TRUE to trControl do not appear to take the offset variable into account.
# 10-fold CV, keeping the out-of-fold predictions for inspection.
trControl <- caret::trainControl(
  method = "cv",
  number = 10,
  verboseIter = TRUE,
  allowParallel = TRUE,
  savePredictions = TRUE
)
I then extract the out-of-fold predictions:
# Re-order the saved CV predictions by row index so they line up with the
# training rows, then compare in-sample vs out-of-fold predictions.
train_dummy$out_of_fold_pred <- xgbmod$pred %>%
  arrange(rowIndex) %>%
  pull(pred)
train_dummy$pred_caret <- predict(booster_caret, newdata = xgbMatrixTrain)
plot(train_dummy$pred_caret, train_dummy$out_of_fold_pred)
While pred_caret values are anywhere between 0 and 0.7, the out_of_fold_pred are all between 0.32 and 0.40. This is probably due to the out-of-fold predictions not taking the offset into account. In my model, most of the variance in the predicted values seems to come from the offset value (exposure).
I am not sure if it is possible to adapt train function's syntax to take into account:
The following is a reproducible example:
I would like to perform a grid search with caret's train function. But I am not sure how to set the base margin and the watchlist.