topepo / caret

caret (Classification And Regression Training) is an R package that contains miscellaneous functions for training and plotting classification and regression models
http://topepo.github.io/caret/index.html

train() failed for repeatedcv with factor predictors #1240


yPennylane commented 3 years ago

Consider the following adapted cforest model (from the package partykit):

cforest_partykit <- list(label = "Conditional Inference Random Forest with partykit",
              library = c("partykit", "party"),
              loop = NULL,
              type = c("Classification", "Regression"),
              parameters = data.frame(parameter = 'mtry',
                                      class = 'numeric',
                                      label = "#Randomly Selected Predictors"),
              grid = function(x, y, len = NULL, search = "grid"){
                if(search == "grid") {
                  out <- data.frame(mtry = caret::var_seq(p = ncol(x), 
                                                          classification = is.factor(y), 
                                                          len = len))
                } else {
                  out <- data.frame(mtry = unique(sample(1:ncol(x), replace = TRUE, size = len)))
                }
                out
              },
              fit = function(x, y, wts, param, lev, last, classProbs, ...) {

                # make consistent factor levels
                if(any(sapply(x, is.factor))){
                  fac_col_names <- names(grep("factor", sapply(x, class), value = TRUE))
                  # assign only the levels present in this resample
                  for (nm in fac_col_names) {
                    x[[nm]] <- factor(x[[nm]], levels = as.character(unique(x[[nm]])))
                  }
                }

                dat <- if(is.data.frame(x)) x else as.data.frame(x, stringsAsFactors = TRUE)
                dat$.outcome <- y
                theDots <- list(...)

                ## if mtry was also passed via ..., drop it so the tuned value is used
                if(any(names(theDots) == "mtry")) theDots$mtry <- NULL
                mtry <- min(as.integer(param$mtry), ncol(x))

                ## pass in any model weights
                if(!is.null(wts)) theDots$weights <- wts

                modelArgs <- c(list(formula = as.formula(.outcome ~ .),
                                    data = dat,
                                    mtry = mtry),
                               theDots)

                out <- do.call(partykit::cforest, modelArgs)
                out
              },
              predict = function(modelFit, newdata = NULL, submodels = NULL) {
                if(!is.null(newdata) && !is.data.frame(newdata)) newdata <- as.data.frame(newdata, stringsAsFactors = TRUE)

                # make consistent factor levels
                if(any(sapply(newdata, is.factor))){
                  fac_col_names <- names(grep("factor", sapply(newdata, class), value = TRUE))
                  # assign only the levels present in this resample
                  for (nm in fac_col_names) {
                    newdata[[nm]] <- factor(newdata[[nm]], levels = as.character(unique(newdata[[nm]])))
                  }
                }

                ## party builds the levels into the model object, so I'm
                ## going to assume that all the levels will be passed to
                ## the output
                out <- partykit:::predict.cforest(modelFit, newdata = newdata, OOB = TRUE) # predict_party, id?
                if(is.matrix(out)) out <- out[,1]
                if(!is.null(modelFit$'(response)')) out <- as.character(out) #  if(!is.null(modelFit@responses@levels$.outcome)) out <- as.character(out)

                out
              },
              prob = function(modelFit, newdata = NULL, submodels = NULL) { # submodels ?
                if(!is.null(newdata) && !is.data.frame(newdata)) newdata <- as.data.frame(newdata, stringsAsFactors = TRUE)
                obsLevels <- levels(modelFit$'(response)')
                rawProbs <- partykit:::predict.cforest(modelFit, newdata = newdata, OOB = TRUE) # predict(, type = "prob")? id?
                probMatrix <- matrix(unlist(rawProbs), ncol = length(obsLevels), byrow = TRUE)
                out <- data.frame(probMatrix)
                colnames(out) <- obsLevels
                rownames(out) <- NULL
                out
              },
              predictors = function(x, ...) {
                vi <- partykit::varimp(x, ...)
                names(vi)[vi != 0]
              },
              varImp = function(object, ...) {
                variableImp <- partykit::varimp(object, ...)
                out <- data.frame(Overall = variableImp)
                out
              },
              tags = c("Random Forest", "Ensemble Model", "Bagging", "Implicit Feature Selection", "Accepts Case Weights"),
              levels = function(x) levels(x@data@get("response")[,1]),
              sort = function(x) x[order(x[,1]),],
              oob = function(x) {
                obs <- x@data@get("response")[,1]
                pred <- partykit:::predict.cforest(x, OOB = TRUE, newdata = NULL)
                postResample(pred, obs)
              })

This custom model is then used with caret's train() function. Without factor predictors, or without cross-validation, it works fine. The problem appears when factors are used as predictors together with repeatedcv, because not all factor values are present in each fold, yet they still appear among the factor levels:

library(caret)
library(party)
library(partykit)

dat <- as.data.frame(ChickWeight)[1:20,]
dat$class <- as.factor(rep(letters[seq( from = 1, to = 20)], each=1))

# specify folds with createMultiFolds
set.seed(43, kind = "Mersenne-Twister", normal.kind = "Inversion")
folds_train <- caret::createMultiFolds(y = dat$weight,
                                       k = 3,   
                                       times = 2)

# specify trainControl for tuning mtry, using the folds defined above
finalcontrol <- caret::trainControl(search = "grid", method = "repeatedcv", number = 3, repeats = 2, 
                                    index = folds_train, 
                                    savePredictions = TRUE)

preds <- dat[,2:5]
response <- dat[,1]

# tune hyperparameter mtry and build final model
tunegrid <- expand.grid(mtry=c(1,2,3,4)) 
#set.seed(42, kind = "Mersenne-Twister", normal.kind = "Inversion")
model <- caret::train(x = preds, # predictors
                      y = response, # response
                      method = cforest_partykit,
                      metric = "RMSE", 
                      tuneGrid = tunegrid, 
                      trControl = finalcontrol,
                      ntree = 150)

warnings()
1: predictions failed for Fold1.Rep1: mtry=1 Error in model.frame.default(object$predictf, data = newdata, na.action = na.pass,  :
  factor class has new levels a, c, g, k, m, p, s, t

The aim is to identify the levels of each Fold.Rep and assign only those that are actually present in the respective fold:

for (i in 1:length(folds_train)) {

  preds_temp <- preds[folds_train[[i]],]
  # check levels 
  levels(preds_temp$class)
  # which are actually present
  unique(preds_temp$class)
  # assign present levels to each subset
  preds_temp$class <- factor(preds_temp$class, levels = as.character(unique(preds_temp$class)))

}
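
As a quick check (a minimal sketch using the folds_train and preds objects from above), the levels that are missing from each resample can be listed directly:

missing_levels <- lapply(folds_train, function(idx) {
  fold_class <- preds$class[idx]
  # levels() keeps all declared levels, unique() only the values actually present
  setdiff(levels(fold_class), as.character(unique(fold_class)))
})
missing_levels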

I tried to include the assignment of the correct factor levels within the cforest_partykit function (the # make consistent factor levels blocks above), but it seems to have no effect.

How could I implement this in caret's train(), trainControl(), or createDataPartition() functions?

topepo commented 3 years ago

caret doesn't do anything to the factor levels (I went back and verified this), so there's not much that I can do beyond tracking down where the change happens:

The issue is within partykit. cforest() parses the data by passing it to ctree(), which passes it to extree_data(). That function contains a line that redefines the factor levels to include only those observed in the training data.

When new data are given to predict(), it errors out in model.frame() because of that difference in levels.
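
The same behavior can be reproduced outside of caret. Here is a minimal sketch (made-up data, small ntree) that fits a partykit cforest on data in which one declared factor level is never observed, then predicts on data still carrying that level; it should raise the same "new levels" error:

library(partykit)

set.seed(1)
# level "c" is declared but never observed in the training data
train_dat <- data.frame(y = rnorm(10),
                        f = factor(rep(c("a", "b"), 5), levels = c("a", "b", "c")))
fit <- partykit::cforest(y ~ f, data = train_dat, ntree = 10)

# new data whose factor still carries the unobserved level "c"
new_dat <- data.frame(f = factor("c", levels = c("a", "b", "c")))

# errors with "factor f has new levels c" because extree_data()
# kept only the levels seen during fitting
predict(fit, newdata = new_dat)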

The authors of that package are very sharp and probably had a good reason for this. I would suggest contacting them to ask for advice. The maintainer has been very diligent in the past about issues (much better than I am).