h2oai / h2o-3

H2O is an Open Source, Distributed, Fast & Scalable Machine Learning Platform: Deep Learning, Gradient Boosting (GBM) & XGBoost, Random Forest, Generalized Linear Modeling (GLM with Elastic Net), K-Means, PCA, Generalized Additive Models (GAM), RuleFit, Support Vector Machine (SVM), Stacked Ensembles, Automatic Machine Learning (AutoML), etc.
http://h2o.ai
Apache License 2.0

Multinomial Stacked Ensemble fails with either XGBoost or Naive Bayes base model #12668

Closed · exalate-issue-sync[bot] closed this issue 1 year ago

exalate-issue-sync[bot] commented 1 year ago

Stacked Ensemble fails on multinomial classification problems whenever the base models include either an XGBoost or a Naive Bayes classifier. Unfortunately, there is no test coverage for these two cases, so we also need to update the SE multinomial unit tests (R/Py).

Reported on Stack Overflow: https://stackoverflow.com/questions/51606637/error-in-h2o-h2o-stackedensemble-dont-know-how-to-determine-the-distribution

Here's the error:

{code}
Error: water.exceptions.H2OIllegalArgumentException: water.exceptions.H2OIllegalArgumentException: Don't know how to determine the distribution for a multinomial classifier.
{code}
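For the new unit tests, a much smaller case should hit the same code path as the full repro below. A minimal sketch (not from the original report; it assumes the built-in iris dataset, default hyperparameters, and an H2O build where XGBoost is available):

{code}
library(h2o)
h2o.init()

# A 3-class (multinomial) problem on a small built-in dataset
train <- as.h2o(iris)
y <- "Species"
x <- setdiff(names(train), y)
nfolds <- 5

# Any cross-validated XGBoost (or Naive Bayes) base model should trigger the bug
xgb <- h2o.xgboost(x = x, y = y, training_frame = train,
                   nfolds = nfolds, fold_assignment = "Modulo",
                   keep_cross_validation_predictions = TRUE, seed = 1)
gbm <- h2o.gbm(x = x, y = y, training_frame = train,
               nfolds = nfolds, fold_assignment = "Modulo",
               keep_cross_validation_predictions = TRUE, seed = 1)

# Expected failure: "Don't know how to determine the distribution
# for a multinomial classifier"
ensemble <- h2o.stackedEnsemble(x = x, y = y, training_frame = train,
                                base_models = c(xgb@model_id, gbm@model_id))
{code}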

Repro in R here:

{code}
#######################################################################
# Minimum reproducible example for Stack Overflow
#######################################################################
# R version: 3.4.4 (2018-03-15)
# H2O cluster version: 3.21.0.4376
# OS: Linux (Azure Data Science VM)

# Installing and loading necessary libraries
cat("\n Installing and loading necessary libraries \n")
libsNeeded <- c("dplyr", "data.table", "randomForest", "stringr", "doParallel", "parallel", "doSNOW", "rlang", "nlme", "MASS", "survival", "stringi", "dummies", "missRanger", "cluster", "e1071", "xgboost", "ranger", "caret")
if(length(setdiff(libsNeeded, rownames(installed.packages()))) > 0){
  install.packages(setdiff(libsNeeded, rownames(installed.packages())))
}
lapply(libsNeeded, require, character.only = TRUE)

# Installing the latest H2O if not done already:
install.packages("h2o", type = "source", repos = c("http://h2o-release.s3.amazonaws.com/h2o/latest_stable_R"))

library(h2o)

# Starting an H2O cluster
h2o.init(max_mem_size = "23g")

library(MASS)  # the Cars93 dataset ships with the MASS package
dataFrame <- Cars93

# Removing rows where Passengers is 2, 7, or 8: these levels are rare, and for
# demonstration purposes we want to avoid errors caused by near-empty classes
dataFrame <- dataFrame[!(dataFrame$Passengers %in% c("2", "7", "8")), ]

# Making the dependent variable a factor
dataFrame$Passengers <- as.factor(dataFrame$Passengers)

# Defining the variables to be used in modeling
depVars <- "Passengers"
indepNumVars <- c("Price", "MPG.highway", "EngineSize", "Horsepower")
indepFactVars <- c("AirBags", "Type")

# Keeping only the columns of interest
dataFrame <- dataFrame[, c(indepFactVars, indepNumVars, depVars)]

# Converting the independent factor variables into dummy variables
dataFrame <- dummy.data.frame(dataFrame, names = colnames(dataFrame[, indepFactVars]), sep = "")
names(dataFrame) <- gsub(" ", "", names(dataFrame))

# Creating the train and test datasets
trainIndex <- createDataPartition(dataFrame[, depVars], times = 1, p = 0.75)
trainingData <- dataFrame[trainIndex$Resample1, ]
testingData <- dataFrame[-trainIndex$Resample1, ]

# H2O frames
train <- as.h2o(trainingData)
test <- as.h2o(testingData)

# Perform PCA
depData <- train[, depVars]
train <- train[, setdiff(names(train), c(depVars))]

pca_model <- h2o.prcomp(training_frame = train, model_id = NULL, ignore_const_cols = TRUE,
                        transform = "STANDARDIZE", pca_method = "GramSVD", k = 10,
                        max_iterations = 5000, seed = -1, score_each_iteration = TRUE,
                        use_all_factor_levels = FALSE, compute_metrics = TRUE,
                        max_runtime_secs = 0, impute_missing = TRUE)

cum_prop <- pca_model@model$model_summary["Cumulative Proportion", ]
print(cum_prop)
cum_prop_to_consider <- length(cum_prop[cum_prop < .95]) + 1
cat("\n\n Number of principal components that explain 95% variance = ", cum_prop_to_consider, "\n\n")

trainPCA <- h2o.predict(pca_model, train)
if(cum_prop_to_consider > ncol(trainPCA)){
  trainPCA <- trainPCA[, 1:(cum_prop_to_consider - 1)]
} else {
  trainPCA <- trainPCA[, 1:cum_prop_to_consider]
}

# pca_data <- as.data.table(pca_data)  # stray line: 'pca_data' is never defined

trainPCA[, depVars] <- depData[, depVars]

# Preparing the test data:
testPCA <- h2o.predict(pca_model, test)
if(cum_prop_to_consider > ncol(testPCA)){
  testPCA <- testPCA[, 1:(cum_prop_to_consider - 1)]
} else {
  testPCA <- testPCA[, 1:cum_prop_to_consider]
}
testPCA[, depVars] <- test[, depVars]

# For classification, the response should be a factor
trainPCA[, depVars] <- as.factor(trainPCA[, depVars])
testPCA[, depVars] <- as.factor(testPCA[, depVars])

# Weights of the training data:
trainPCA$weightage <- ifelse(trainPCA[, depVars] == "5", 1,
                      ifelse(trainPCA[, depVars] == "4", 2,
                      ifelse(trainPCA[, depVars] == "6", 2, 1)))

# Number of CV folds (to generate level-one data for stacking)
nfolds <- 5

####################################################################################################
# Stacked Ensemble modeling
####################################################################################################

modelIteration <- Sys.Date()
modelIteration <- gsub("-", "_", modelIteration)
i <- "withInsp"

# Train & cross-validate a Random Forest
ModelOneRF <- h2o.randomForest(x = setdiff(colnames(trainPCA), depVars),
                               y = depVars,
                               training_frame = trainPCA,
                               ntrees = 15,
                               nfolds = nfolds,
                               fold_assignment = "Stratified",
                               max_depth = 30,
                               min_rows = 1,
                               mtries = 3,
                               keep_cross_validation_predictions = TRUE,
                               seed = 1,
                               verbose = TRUE,
                               weights_column = "weightage",
                               model_id = paste0(i, "_ModelOneRF_", modelIteration))

cat("\n\n Mean accuracy of Random Forest Model (on cross validation):", ModelOneRF@model$cross_validation_metrics_summary[1, 1], "\n\n")
perf_RF <- h2o.performance(model = ModelOneRF, newdata = testPCA)
cat("\n\n Accuracy of Random Forest Model (on test data):", 1 - perf_RF@metrics$mean_per_class_error, "\n\n")

# Train & cross-validate an XGBoost model
ModelTwoXGBoost <- h2o.xgboost(x = setdiff(colnames(trainPCA), depVars),
                               y = depVars,
                               training_frame = trainPCA,
                               nfolds = nfolds,
                               fold_assignment = "Stratified",
                               weights_column = "weightage",
                               ntrees = 15,
                               max_depth = 20,
                               min_rows = 1,
                               learn_rate = 0.1,
                               eta = 0.3,
                               keep_cross_validation_predictions = TRUE,
                               seed = 1,
                               verbose = TRUE,
                               model_id = paste0(i, "_ModelTwoXGBoost_", modelIteration))

cat("\n\n Mean accuracy of XGBoost Model (on cross validation):", ModelTwoXGBoost@model$cross_validation_metrics_summary[1, 1], "\n\n")
perf_XGBoost <- h2o.performance(model = ModelTwoXGBoost, newdata = testPCA)
cat("\n\n Accuracy of XGBoost Model (on test data):", 1 - perf_XGBoost@metrics$mean_per_class_error, "\n\n")

# Train & cross-validate a Generalized Linear Model (GLM)
ModelThreeGLM <- h2o.glm(family = "multinomial",
                         x = setdiff(colnames(trainPCA), depVars),
                         y = depVars,
                         training_frame = trainPCA,
                         nfolds = nfolds,
                         fold_assignment = "Stratified",
                         weights_column = "weightage",
                         alpha = 0.0,
                         lambda_search = TRUE,
                         standardize = TRUE,
                         seed = 1,
                         verbose = TRUE,
                         model_id = paste0(i, "_ModelThreeGLM_", modelIteration),
                         keep_cross_validation_predictions = TRUE)

cat("\n\n Mean accuracy of GLM Model (on cross validation):", ModelThreeGLM@model$cross_validation_metrics_summary[1, 1], "\n\n")
perf_GLM <- h2o.performance(model = ModelThreeGLM, newdata = testPCA)
cat("\n\n Accuracy of GLM Model (on test data):", 1 - perf_GLM@metrics$mean_per_class_error, "\n\n")

# Train & cross-validate a Gradient Boosting Machine (GBM)
ModelFourGBM <- h2o.gbm(x = setdiff(colnames(trainPCA), depVars),
                        y = depVars,
                        training_frame = trainPCA,
                        nfolds = nfolds,
                        fold_assignment = "Stratified",
                        weights_column = "weightage",
                        ntrees = 10,
                        max_depth = 20,
                        seed = 1,
                        learn_rate = 0.05,
                        learn_rate_annealing = 0.99,
                        verbose = TRUE,
                        keep_cross_validation_predictions = TRUE,
                        model_id = paste0(i, "_ModelFourGBM_", modelIteration))

cat("\n\n Mean accuracy of GBM Model (on cross validation):", ModelFourGBM@model$cross_validation_metrics_summary[1, 1], "\n\n")
perf_GBM <- h2o.performance(model = ModelFourGBM, newdata = testPCA)
cat("\n\n Accuracy of GBM Model (on test data):", 1 - perf_GBM@metrics$mean_per_class_error, "\n\n")

# Train & cross-validate a Naive Bayes model
ModelFiveBayes <- h2o.naiveBayes(x = setdiff(colnames(trainPCA), c(depVars, "weightage")),
                                 y = depVars,
                                 training_frame = trainPCA,
                                 nfolds = nfolds,
                                 fold_assignment = "Stratified",
                                 weights_column = "weightage",
                                 seed = 1,
                                 # verbose = T,
                                 keep_cross_validation_predictions = TRUE,
                                 model_id = paste0(i, "_ModelFiveBayes_", modelIteration))

cat("\n\n Mean accuracy of Naive Bayes Model (on cross validation):", ModelFiveBayes@model$cross_validation_metrics_summary[1, 1], "\n\n")
perf_Bayes <- h2o.performance(model = ModelFiveBayes, newdata = testPCA)
cat("\n\n Accuracy of Naive Bayes Model (on test data):", 1 - perf_Bayes@metrics$mean_per_class_error, "\n\n")

# ERROR
# Train a stacked ensemble using all five base models above
ensemble <- h2o.stackedEnsemble(x = setdiff(colnames(trainPCA), c(depVars, "weightage")),
                                y = depVars,
                                training_frame = trainPCA,
                                # model_id = paste0(i, "ModelEnsemble", modelIteration),
                                model_id = paste0(i, "_ModelEnsemble_2_", modelIteration),
                                base_models = c(ModelOneRF@model_id, ModelTwoXGBoost@model_id, ModelThreeGLM@model_id, ModelFourGBM@model_id, ModelFiveBayes@model_id),
                                metalearner_algorithm = "drf",
                                metalearner_nfolds = nfolds)

# WORKS (breaks when you add XGBoost or Naive Bayes)
# Train a stacked ensemble using the GLM, GBM, and RF above
ensemble <- h2o.stackedEnsemble(# x = setdiff(colnames(trainPCA), c(depVars, "weightage")),
                                y = depVars,
                                training_frame = trainPCA,
                                model_id = paste0(i, "ModelEnsemble", modelIteration),
                                # model_id = paste0(i, "_ModelEnsemble_2_", modelIteration),
                                base_models = c(ModelThreeGLM@model_id, ModelFourGBM@model_id, ModelOneRF@model_id),
                                metalearner_algorithm = "drf",
                                metalearner_nfolds = nfolds)

{code}
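As a follow-up, once the WORKS call above finishes, the ensemble can be scored the same way as the base models. A short sketch (not part of the original report; it reuses the testPCA frame from the repro):

{code}
# Evaluate the (working) ensemble on the held-out frame, like the base models above
perf_ensemble <- h2o.performance(model = ensemble, newdata = testPCA)
cat("\n\n Accuracy of Stacked Ensemble (on test data):", 1 - perf_ensemble@metrics$mean_per_class_error, "\n\n")
{code}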

hasithjp commented 1 year ago

JIRA Issue Migration Info

Jira Issue: PUBDEV-5814
Assignee: Erin LeDell
Reporter: Erin LeDell
State: Closed
Fix Version: 3.20.0.5
Attachments: N/A
Development PRs: Available

Linked PRs from JIRA

https://github.com/h2oai/h2o-3/pull/2713
https://github.com/h2oai/h2o-3/pull/2714
https://github.com/h2oai/h2o-3/pull/2715