topepo / caret

caret (Classification And Regression Training) is an R package that contains miscellaneous functions for training and plotting classification and regression models
http://topepo.github.io/caret/index.html

train() failed for repeatedcv with factor predictors #1240


yPennylane commented 3 years ago

Consider the following adapted cforest model (from the package partykit):

cforest_partykit <- list(label = "Conditional Inference Random Forest with partykit",
              library = c("partykit", "party"),
              loop = NULL,
              type = c("Classification", "Regression"),
              parameters = data.frame(parameter = 'mtry',
                                      class = 'numeric',
                                      label = "#Randomly Selected Predictors"),
              grid = function(x, y, len = NULL, search = "grid"){
                if(search == "grid") {
                  out <- data.frame(mtry = caret::var_seq(p = ncol(x), 
                                                          classification = is.factor(y), 
                                                          len = len))
                } else {
                  out <- data.frame(mtry = unique(sample(1:ncol(x), replace = TRUE, size = len)))
                }
                out
              },
              fit = function(x, y, wts, param, lev, last, classProbs, ...) {

                # make consistent factor levels
                if(any(sapply(x, is.factor))){
                  fac_col_names <- names(grep("factor", sapply(x, class), value = TRUE))
                  # assign only the levels present in this resample
                  for (nm in fac_col_names) {
                    x[[nm]] <- factor(x[[nm]], levels = as.character(unique(x[[nm]])))
                  }
                }

                dat <- if(is.data.frame(x)) x else as.data.frame(x, stringsAsFactors = TRUE)
                dat$.outcome <- y
                theDots <- list(...)

                ## if mtry was also passed via ..., drop it so the tuned value is used
                if(any(names(theDots) == "mtry")) theDots$mtry <- NULL
                mtry <- min(as.integer(param$mtry), ncol(x))

                ## pass in any model weights
                if(!is.null(wts)) theDots$weights <- wts

                modelArgs <- c(list(formula = as.formula(.outcome ~ .),
                                    data = dat,
                                    mtry = mtry),
                               theDots)

                out <- do.call(partykit::cforest, modelArgs)
                out
              },
              predict = function(modelFit, newdata = NULL, submodels = NULL) {
                if(!is.null(newdata) && !is.data.frame(newdata)) newdata <- as.data.frame(newdata, stringsAsFactors = TRUE)

                # make consistent factor levels
                if(any(sapply(newdata, is.factor))){
                  fac_col_names <- names(grep("factor", sapply(newdata, class), value = TRUE))
                  # assign only the levels present in this resample
                  for (nm in fac_col_names) {
                    newdata[[nm]] <- factor(newdata[[nm]], levels = as.character(unique(newdata[[nm]])))
                  }
                }

                ## party builds the levels into the model object, so I'm
                ## going to assume that all the levels will be passed to
                ## the output
                out <- partykit:::predict.cforest(modelFit, newdata = newdata, OOB = TRUE) # predict_party, id?
                if(is.matrix(out)) out <- out[,1]
                if(!is.null(modelFit$'(response)')) out <- as.character(out) #  if(!is.null(modelFit@responses@levels$.outcome)) out <- as.character(out)

                out
              },
              prob = function(modelFit, newdata = NULL, submodels = NULL) { # submodels ?
                if(!is.null(newdata) && !is.data.frame(newdata)) newdata <- as.data.frame(newdata, stringsAsFactors = TRUE)
                obsLevels <- levels(modelFit$'(response)')
                rawProbs <- partykit:::predict.cforest(modelFit, newdata = newdata, OOB = TRUE) # predict(, type = "prob")? id?
                probMatrix <- matrix(unlist(rawProbs), ncol = length(obsLevels), byrow = TRUE)
                out <- data.frame(probMatrix)
                colnames(out) <- obsLevels
                rownames(out) <- NULL
                out
              },
              predictors = function(x, ...) {
                vi <- partykit::varimp(x, ...)
                names(vi)[vi != 0]
              },
              varImp = function(object, ...) {
                variableImp <- partykit::varimp(object, ...)
                out <- data.frame(Overall = variableImp)
                out
              },
              tags = c("Random Forest", "Ensemble Model", "Bagging", "Implicit Feature Selection", "Accepts Case Weights"),
              levels = function(x) levels(x@data@get("response")[,1]),
              sort = function(x) x[order(x[,1]),],
              oob = function(x) {
                obs <- x@data@get("response")[,1]
                pred <- partykit:::predict.cforest(x, OOB = TRUE, newdata = NULL)
                postResample(pred, obs)
              })

This custom model is then used with caret's train() function. Without factor predictors, or without cross-validation, it works fine. The problem appears when factors are used as predictors together with repeatedcv, because not all factor values are present in each fold, yet they still appear among the factor levels:

library(caret)
library(party)
library(partykit)

dat <- as.data.frame(ChickWeight)[1:20,]
dat$class <- as.factor(rep(letters[seq( from = 1, to = 20)], each=1))

# specify folds with createMultiFolds
set.seed(43, kind = "Mersenne-Twister", normal.kind = "Inversion")
folds_train <- caret::createMultiFolds(y = dat$weight,
                                       k = 3,   
                                       times = 2)

# specify trainControl for tuning mtry, using the folds defined above
finalcontrol <- caret::trainControl(search = "grid", method = "repeatedcv", number = 3, repeats = 2, 
                                    index = folds_train, 
                                    savePredictions = TRUE)

preds <- dat[,2:5]
response <- dat[,1]

# tune hyperparameter mtry and build final model
tunegrid <- expand.grid(mtry=c(1,2,3,4)) 
#set.seed(42, kind = "Mersenne-Twister", normal.kind = "Inversion")
model <- caret::train(x = preds, # predictors
                      y = response, # response
                      method = cforest_partykit,
                      metric = "RMSE", 
                      tuneGrid = tunegrid, 
                      trControl = finalcontrol,
                      ntree = 150)

warnings()
1: predictions failed for Fold1.Rep1: mtry=1 Error in model.frame.default(object$predictf, data = newdata, na.action = na.pass,  :
  factor class has new levels a, c, g, k, m, p, s, t

The aim is to identify the levels of each Fold.Rep and assign only those that are actually present in the respective fold:

for (i in 1:length(folds_train)) {

  preds_temp <- preds[folds_train[[i]],]
  # check levels 
  levels(preds_temp$class)
  # which are actually present
  unique(preds_temp$class)
  # assign present levels to each subset
  preds_temp$class <- factor(preds_temp$class, levels = as.character(unique(preds_temp$class)))

}
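
As a quick check (a minimal sketch using the folds_train and preds objects from above), the levels that are missing from each resample can be listed directly:

missing_levels <- lapply(folds_train, function(idx) {
  fold_class <- preds$class[idx]
  # levels() keeps all declared levels, unique() only the values actually present
  setdiff(levels(fold_class), as.character(unique(fold_class)))
})
missing_levels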

I tried to include the assignment of the correct factor levels within the cforest_partykit function (the # make consistent factor levels blocks above), but it seems to have no effect.

How could I implement this in caret's train(), trainControl(), or createDataPartition() functions?

topepo commented 3 years ago

caret doesn't do anything to the factor levels (I went back and verified this), so there's not much that I can do beyond tracking down where the change happens:

The issue is within partykit. cforest() parses the data by passing it to ctree(), which passes it to extree_data(). That function contains a line that redefines the factor levels to include only those observed in the training data.

When new data are given to predict(), it errors out in model.frame() because of that difference in levels.
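
The same behavior can be reproduced outside of caret. Here is a minimal sketch (made-up data, small ntree) that fits a partykit cforest on data in which one declared factor level is never observed, then predicts on data still carrying that level; it should raise the same "new levels" error:

library(partykit)

set.seed(1)
# level "c" is declared but never observed in the training data
train_dat <- data.frame(y = rnorm(10),
                        f = factor(rep(c("a", "b"), 5), levels = c("a", "b", "c")))
fit <- partykit::cforest(y ~ f, data = train_dat, ntree = 10)

# new data whose factor still carries the unobserved level "c"
new_dat <- data.frame(f = factor("c", levels = c("a", "b", "c")))

# errors with "factor f has new levels c" because extree_data()
# kept only the levels seen during fitting
predict(fit, newdata = new_dat)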

The authors of that package are very sharp and probably had a good reason for this. I would suggest contacting them to ask for advice. The maintainer has been very diligent in the past about issues (much better than I am).