JIRA Issue Migration Info
Jira Issue: PUBDEV-5814
Assignee: Erin LeDell
Reporter: Erin LeDell
State: Closed
Fix Version: 3.20.0.5
Attachments: N/A
Development PRs: Available
Linked PRs from JIRA
https://github.com/h2oai/h2o-3/pull/2713
https://github.com/h2oai/h2o-3/pull/2714
https://github.com/h2oai/h2o-3/pull/2715
Stacked Ensemble fails during multinomial classification when the base models include an XGBoost or Naive Bayes classifier. Unfortunately, these two cases have no test coverage, so we need to update the SE multinomial unit tests (R/Py).
Reported on Stack Overflow: https://stackoverflow.com/questions/51606637/error-in-h2o-h2o-stackedensemble-dont-know-how-to-determine-the-distribution
Here's the error:

{code}
Error: water.exceptions.H2OIllegalArgumentException: water.exceptions.H2OIllegalArgumentException: Don't know how to determine the distribution for a multinomial classifier.
{code}
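For a quicker check than the full script below, the failure can be reproduced with a much smaller sketch (assuming a current h2o R client; the iris dataset and model names here are illustrative, not from the original report):

{code}
library(h2o)
h2o.init()

# Any 3-class response makes the problem multinomial
train <- as.h2o(iris)
y <- "Species"
x <- setdiff(names(train), y)

# Base learners must keep their CV predictions for stacking
nb  <- h2o.naiveBayes(x = x, y = y, training_frame = train,
                      nfolds = 3, fold_assignment = "Modulo",
                      keep_cross_validation_predictions = TRUE)
gbm <- h2o.gbm(x = x, y = y, training_frame = train,
               nfolds = 3, fold_assignment = "Modulo",
               keep_cross_validation_predictions = TRUE, seed = 1)

# On affected builds this throws:
# "Don't know how to determine the distribution for a multinomial classifier."
ens <- h2o.stackedEnsemble(x = x, y = y, training_frame = train,
                           base_models = list(nb, gbm))
{code}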
Repro in R here:
{code}
#######################################################################
# Minimum reproducible example for Stackoverflow
#######################################################################

# R version: 3.4.4 (2018-03-15)
# H2O cluster version: 3.21.0.4376
# OS: Linux (Azure Data Science VM)

# Installing and loading necessary libraries
cat("\n Installing and loading necessary libraries \n") libsNeeded <- c("dplyr", "data.table", "randomForest", "stringr","doParallel", "parallel", "doSNOW", "rlang", "nlme", "MASS", "survival", "stringi", "dummies", "missRanger","cluster", "e1071","xgboost","ranger", "caret") if(length(setdiff(libsNeeded, rownames(installed.packages()))) > 0){ install.packages(setdiff(libsNeeded, rownames(installed.packages()))) } lapply(libsNeeded, require, character.only = T)
# Installing latest H2O if not done already:
install.packages("h2o", type="source", repos=(c("http://h2o-release.s3.amazonaws.com/h2o/latest_stable_R")))
library(h2o)
# Starting an H2O cluster
h2o.init(max_mem_size = "23g")
library(MASS)  # the Cars93 dataset ships with MASS; there is no "CARS" package
dataFrame <- Cars93
# Removing rows where Passengers is 2, 7, or 8: their occurrence frequency is low,
# and for demonstration purposes we want to avoid errors caused by rare classes
dataFrame <- dataFrame[!(dataFrame$Passengers %in% c("2", "7", "8")),]
# Making the dependent variable a factor
dataFrame$Passengers <- as.factor(dataFrame$Passengers)
# Defining the variables to be used in modeling
depVars <- "Passengers" indepNumVars <- c("Price","MPG.highway","EngineSize","Horsepower") indepFactVars <- c("AirBags","Type")
# Keeping only columns of interest
dataFrame <- dataFrame[,c(indepFactVars,indepNumVars,depVars)]
# Converting the categorical predictors into dummy variables:
dataFrame <- dummy.data.frame(dataFrame, names = colnames(dataFrame[, indepFactVars]), sep = "")
names(dataFrame) <- gsub(" ", "", names(dataFrame))
# Creating the train and test datasets
trainIndex <- createDataPartition(dataFrame[, depVars], times = 1, p = 0.75)
trainingData <- dataFrame[trainIndex$Resample1, ]
testingData <- dataFrame[-trainIndex$Resample1, ]
# H2O Frames
train <- as.h2o(trainingData)
test <- as.h2o(testingData)
# Perform PCA
depData <- train[, depVars]
train <- train[, setdiff(names(train), c(depVars))]
pca_model <- h2o.prcomp(training_frame = train, model_id = NULL, ignore_const_cols = TRUE,
                        transform = "STANDARDIZE", pca_method = "GramSVD", k = 10,
                        max_iterations = 5000, seed = -1, score_each_iteration = TRUE,
                        use_all_factor_levels = FALSE, compute_metrics = TRUE,
                        max_runtime_secs = 0, impute_missing = TRUE)
cum_prop <- pca_model@model$model_summary["Cumulative Proportion", ]
print(cum_prop)
cum_prop_to_consider <- length(cum_prop[cum_prop < .95]) + 1
cat("\n\n Number of principal components that explain 95% variance = ",cum_prop_to_consider,"\n\n")
trainPCA <- h2o.predict(pca_model, train)
# Keep only the components needed to explain 95% of the variance,
# capped at the number of components actually available
if(cum_prop_to_consider > ncol(trainPCA)){
  trainPCA <- trainPCA[, 1:ncol(trainPCA)]
}else{
  trainPCA <- trainPCA[, 1:cum_prop_to_consider]
}
trainPCA[, depVars] <- depData[, depVars]
# Preparing the test data:
testPCA <- h2o.predict(pca_model, test)
if(cum_prop_to_consider > ncol(testPCA)){
  testPCA <- testPCA[, 1:ncol(testPCA)]
}else{
  testPCA <- testPCA[, 1:cum_prop_to_consider]
}
testPCA[, depVars] <- test[, depVars]
# For classification, the response should be a factor
trainPCA[, depVars] <- as.factor(trainPCA[, depVars])
testPCA[, depVars] <- as.factor(testPCA[, depVars])
# Weights of the training data:
trainPCA$weightage <- ifelse(trainPCA[, depVars] == "5", 1,
                      ifelse(trainPCA[, depVars] == "4", 2,
                      ifelse(trainPCA[, depVars] == "6", 2, 1)))
# Number of CV folds (to generate level-one data for stacking)
nfolds <- 5
####################################################################################################
# Stacked Ensemble modeling
####################################################################################################
modelIteration <- Sys.Date()
modelIteration <- gsub("-", "_", modelIteration)
i <- "withInsp"
Train & Cross-validate a RF
ModelOneRF <- h2o.randomForest(x = setdiff(colnames(trainPCA), depVars),
                               y = depVars,
                               training_frame = trainPCA,
                               ntrees = 15,
                               nfolds = nfolds,
                               fold_assignment = "Stratified",
                               max_depth = 30,
                               min_rows = 1,
                               mtries = 3,
                               keep_cross_validation_predictions = TRUE,
                               seed = 1,
                               verbose = TRUE)

cat("\n\n Mean accuracy of Random Forest Model (on cross validation):",
    ModelOneRF@model$cross_validation_metrics_summary[1, 1], "\n\n")
perf_RF <- h2o.performance(model = ModelOneRF, newdata = testPCA)
cat("\n\n Accuracy of Random Forest Model (on test data):",
    1 - perf_RF@metrics$mean_per_class_error, "\n\n")
Train & Cross-validate a XGBoost
ModelTwoXGBoost <- h2o.xgboost(x = setdiff(colnames(trainPCA), depVars),
                               y = depVars,
                               training_frame = trainPCA,
                               nfolds = nfolds,
                               fold_assignment = "Stratified",
                               weights_column = "weightage",
                               ntrees = 15,
                               max_depth = 20,
                               min_rows = 1,
                               learn_rate = 0.1,
                               eta = 0.3,
                               keep_cross_validation_predictions = TRUE,
                               seed = 1,
                               verbose = TRUE)

cat("\n\n Mean accuracy of XGBoost Model (on cross validation):",
    ModelTwoXGBoost@model$cross_validation_metrics_summary[1, 1], "\n\n")
perf_XGBoost <- h2o.performance(model = ModelTwoXGBoost, newdata = testPCA)
cat("\n\n Accuracy of XGBoost Model (on test data):",
    1 - perf_XGBoost@metrics$mean_per_class_error, "\n\n")
# Train and cross-validate a Generalized Linear Model (GLM)
ModelThreeGLM <- h2o.glm(family = "multinomial",
                         x = setdiff(colnames(trainPCA), depVars),
                         y = depVars,
                         training_frame = trainPCA,
                         nfolds = nfolds,
                         fold_assignment = "Stratified",
                         weights_column = "weightage",
                         alpha = 0.0,
                         lambda_search = TRUE,
                         standardize = TRUE,
                         keep_cross_validation_predictions = TRUE,  # required for stacking (assumed truncated from the original)
                         seed = 1,
                         verbose = TRUE)

cat("\n\n Mean accuracy of GLM Model (on cross validation):",
    ModelThreeGLM@model$cross_validation_metrics_summary[1, 1], "\n\n")
perf_GLM <- h2o.performance(model = ModelThreeGLM, newdata = testPCA)
cat("\n\n Accuracy of GLM Model (on test data):",
    1 - perf_GLM@metrics$mean_per_class_error, "\n\n")
# Train and cross-validate a Gradient Boosting Machine (GBM)
ModelFourGBM <- h2o.gbm(x = setdiff(colnames(trainPCA), depVars),
                        y = depVars,
                        training_frame = trainPCA,
                        nfolds = nfolds,
                        fold_assignment = "Stratified",
                        weights_column = "weightage",
                        ntrees = 10,
                        max_depth = 20,
                        learn_rate = 0.05,
                        learn_rate_annealing = 0.99,
                        keep_cross_validation_predictions = TRUE,  # required for stacking (assumed truncated from the original)
                        seed = 1,
                        verbose = TRUE)

cat("\n\n Mean accuracy of GBM Model (on cross validation):",
    ModelFourGBM@model$cross_validation_metrics_summary[1, 1], "\n\n")
perf_GBM <- h2o.performance(model = ModelFourGBM, newdata = testPCA)
cat("\n\n Accuracy of GBM Model (on test data):",
    1 - perf_GBM@metrics$mean_per_class_error, "\n\n")
# Train and cross-validate a Naïve Bayes model
ModelFiveBayes <- h2o.naiveBayes(x = setdiff(colnames(trainPCA), c(depVars, "weightage")),
                                 y = depVars,
                                 training_frame = trainPCA,
                                 nfolds = nfolds,
                                 fold_assignment = "Stratified",
                                 weights_column = "weightage",
                                 keep_cross_validation_predictions = TRUE)  # required for stacking (assumed truncated from the original)

cat("\n\n Mean accuracy of Naive Bayes Model (on cross validation):",
    ModelFiveBayes@model$cross_validation_metrics_summary[1, 1], "\n\n")
perf_Bayes <- h2o.performance(model = ModelFiveBayes, newdata = testPCA)
cat("\n\n Accuracy of Naive Bayes Model (on test data):",
    1 - perf_Bayes@metrics$mean_per_class_error, "\n\n")
# ERROR
# Train a stacked ensemble using all five base models above
# (base_models list reconstructed here; the argument was truncated in the migrated issue)
ensemble <- h2o.stackedEnsemble(x = setdiff(colnames(trainPCA), c(depVars, "weightage")),
                                y = depVars,
                                training_frame = trainPCA,
                                model_id = paste0(i, "ModelEnsemble", modelIteration),
                                base_models = list(ModelOneRF, ModelTwoXGBoost, ModelThreeGLM,
                                                   ModelFourGBM, ModelFiveBayes))
# WORKS (breaks when you add XGBoost or Naive Bayes)
# Train a stacked ensemble using only the GBM and RF above
# (base_models list reconstructed from the comment above; truncated in the migrated issue)
ensemble <- h2o.stackedEnsemble(#x = setdiff(colnames(trainPCA), c(depVars, "weightage")),
                                y = depVars,
                                training_frame = trainPCA,
                                model_id = paste0(i, "ModelEnsemble", modelIteration),
                                base_models = list(ModelOneRF, ModelFourGBM))
{code}
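On the test-coverage side, the R half of the update could look roughly like this (a minimal sketch using plain stopifnot rather than the actual runit harness in h2o-3; the dataset and model names are illustrative):

{code}
library(h2o)
h2o.init()

train <- as.h2o(iris)  # 3-class response -> multinomial
y <- "Species"
x <- setdiff(names(train), y)

# One base learner of each previously uncovered type
xgb <- h2o.xgboost(x = x, y = y, training_frame = train,
                   nfolds = 3, fold_assignment = "Modulo",
                   keep_cross_validation_predictions = TRUE, seed = 1)
nb  <- h2o.naiveBayes(x = x, y = y, training_frame = train,
                      nfolds = 3, fold_assignment = "Modulo",
                      keep_cross_validation_predictions = TRUE)

# Before the fix this threw H2OIllegalArgumentException; after it, both
# training and scoring should succeed
ens <- h2o.stackedEnsemble(x = x, y = y, training_frame = train,
                           base_models = list(xgb, nb))
stopifnot(is(ens, "H2OMultinomialModel"))
stopifnot(nrow(h2o.predict(ens, train)) == nrow(train))
{code}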