setting gbm's balance_classes to True produces suspect models

exalate-issue-sync[bot] commented 1 year ago

covtype <- h2o.uploadFile("smalldata/covtype/covtype.20k.data") covtype[,55] <- covtype[,55]==6 covtype[,55] <- as.factor(covtype[,55]) s <- h2o.runif(covtype) train <- covtype[s <= 0.8,] valid <- covtype[s > 0.8,] library(caret) cov_df <- as.data.frame(train[,55]) cov_df$C55 <- as.factor(cov_df$C55) train_df <- as.data.frame(train[,-55]) train_balanced <- as.h2o(upSample(train_df,cov_df$C55))

################## hh_imbalanced<-h2o.gbm(x=c(1:54),y=55,ntrees=10,min_rows=5,learn_rate=0.2,training_frame=train,distribution="bernoulli",balance_classes=F) hh_balanced <-h2o.gbm(x=c(1:54),y=55,ntrees=10,min_rows=5,learn_rate=0.2,training_frame=train,distribution="bernoulli",balance_classes=T)

pred_imbalanced <- predict(hh_imbalanced, valid) pred_imbalanced_p1_df <- as.data.frame(pred_imbalanced$p1) pred_balanced <- predict(hh_balanced, valid) pred_balanced_p1_df <- as.data.frame(pred_balanced$p1) hist(pred_balanced_p1_df$p1) # **** this histogram is suspect ****

################## hh_imbalanced_glm<-h2o.glm(x=c(1:54),y=55,training_frame=train,family="binomial") hh_balanced_glm<-h2o.glm(x=c(1:54),y=55,training_frame=train_balanced,family="binomial")

pred_imbalanced_glm <- predict(hh_imbalanced_glm, valid) pred_imbalanced_p1_df_glm <- as.data.frame(pred_imbalanced_glm$p1) pred_balanced_glm <- predict(hh_balanced_glm, valid) pred_balanced_p1_df_glm <- as.data.frame(pred_balanced_glm$p1) hist(pred_balanced_p1_df_glm$p1) # this histogram is NOT suspect

################## hh_imbalanced_gbm<-h2o.gbm(x=c(1:54),y=55,ntrees=10,min_rows=5,learn_rate=0.2,training_frame=train,distribution="bernoulli") hh_balanced_gbm <-h2o.gbm(x=c(1:54),y=55,ntrees=10,min_rows=5,learn_rate=0.2,training_frame=train_balanced,distribution="bernoulli")

pred_imbalanced_gbm <- predict(hh_imbalanced_gbm, valid) pred_imbalanced_p1_df_gbm <- as.data.frame(pred_imbalanced_gbm$p1) pred_balanced_gbm <- predict(hh_balanced_gbm, valid) pred_balanced_p1_df_gbm <- as.data.frame(pred_balanced_gbm$p1) hist(pred_balanced_p1_df_gbm$p1) # this histogram is NOT suspect

exalate-issue-sync[bot] commented 1 year ago

Erin LeDell commented: To clarify: The issue is that when using the balance_classes argument, the minority class will produce predicted values that are lower than expected. In the example above, the preds from the upsampled GBM range from [0,0.5] instead of [0,1]. We expect to see the same distribution of predicted values regardless of whether balance_classes is used.

exalate-issue-sync[bot] commented 1 year ago

Arno Candel commented: what does DL / DRF do?

exalate-issue-sync[bot] commented 1 year ago

Arno Candel commented: library(h2o) h<-h2o.init() setwd("/users/arno/h2o-3") covtype <- h2o.uploadFile("smalldata/covtype/covtype.20k.data") covtype[,55] <- covtype[,55]==6 covtype[,55] <- as.factor(covtype[,55]) s <- h2o.runif(covtype) train <- covtype[s <= 0.8,] valid <- covtype[s > 0.8,]

library(caret) cov_df <- as.data.frame(train[,55]) cov_df$C55 <- as.factor(cov_df$C55) train_df <- as.data.frame(train[,-55]) train_balanced <- as.h2o(upSample(train_df,cov_df$C55)) dim(train)

################## R GBM imbalanced_gbm <-gbm(C55~.,distribution = "bernoulli",verbose = T, data = as.data.frame(train),n.trees = 10,interaction.depth = 5,n.minobsinnode = 5,shrinkage = .1,bag.fraction = 1,train.fraction = 1)

balanced_gbm <-gbm(Class~.,distribution = "bernoulli",verbose = T, data = as.data.frame(train_balanced),n.trees = 10,interaction.depth = 5,n.minobsinnode = 5,shrinkage = .1,bag.fraction = 1,train.fraction = 1)

hist(predict(imbalanced_gbm,newdata = valid,type = "response",n.trees=10)) hist(predict(balanced_gbm,newdata = valid,type = "response",n.trees=10))

################## GBM hh_imbalanced_gbm <-h2o.gbm(x=c(1:54),y=55,ntrees=10,min_rows=5,learn_rate=0.1,training_frame=train,distribution="bernoulli") hh_balanced_gbm <-h2o.gbm(x=c(1:54),y=55,ntrees=10,min_rows=5,learn_rate=0.1,training_frame=train_balanced,distribution="bernoulli") hh_balanced_gbm_bc <-h2o.gbm(x=c(1:54),y=55,ntrees=10,min_rows=5,learn_rate=0.1,training_frame=train,distribution="bernoulli",balance_classes=T)

pred_imbalanced_gbm <- predict(hh_imbalanced_gbm, valid) pred_imbalanced_p1_df_gbm <- as.data.frame(pred_imbalanced_gbm$p1) hist(pred_imbalanced_p1_df_gbm$p1) # OK

pred_balanced_gbm <- predict(hh_balanced_gbm, valid) pred_balanced_p1_df_gbm <- as.data.frame(pred_balanced_gbm$p1) hist(pred_balanced_p1_df_gbm$p1) # OK

pred_balanced_gbm_bc <- predict(hh_balanced_gbm_bc, valid) pred_balanced_p1_df_gbm_bc <- as.data.frame(pred_balanced_gbm_bc$p1) hist(pred_balanced_p1_df_gbm_bc$p1) ############################################ LIMITED RANGE ###########################################

h2o.auc(hh_imbalanced_gbm) h2o.auc(hh_balanced_gbm) h2o.auc(hh_balanced_gbm_bc) plot(hh_imbalanced_gbm) plot(hh_balanced_gbm) plot(hh_balanced_gbm_bc)

################## GLM hh_imbalanced_glm<-h2o.glm(x=c(1:54),y=55,training_frame=train,family="binomial") hh_balanced_glm<-h2o.glm(x=c(1:54),y=55,training_frame=train_balanced,family="binomial")

pred_imbalanced_glm <- predict(hh_imbalanced_glm, valid) pred_imbalanced_p1_df_glm <- as.data.frame(pred_imbalanced_glm$p1) hist(pred_imbalanced_p1_df_glm$p1) # OK

pred_balanced_glm <- predict(hh_balanced_glm, valid) pred_balanced_p1_df_glm <- as.data.frame(pred_balanced_glm$p1) hist(pred_balanced_p1_df_glm$p1) # OK

################## DRF hh_imbalanced_drf <-h2o.randomForest(x=c(1:54),y=55,training_frame=train) hh_balanced_drf <-h2o.randomForest(x=c(1:54),y=55,training_frame=train_balanced) hh_balanced_drf_bc <-h2o.randomForest(x=c(1:54),y=55,training_frame=train, balance_classes=T,binomial_double_trees=F,ntrees=1)

pred_imbalanced_drf <- predict(hh_imbalanced_drf, valid) pred_imbalanced_p1_df_drf <- as.data.frame(pred_imbalanced_drf$p1) hist(pred_imbalanced_p1_df_drf$p1) # OK

pred_balanced_drf <- predict(hh_balanced_drf, valid) pred_balanced_p1_df_drf <- as.data.frame(pred_balanced_drf$p1) hist(pred_balanced_p1_df_drf$p1) # OK

pred_balanced_drf_bc <- predict(hh_balanced_drf_bc, valid) pred_balanced_p1_df_drf_bc <- as.data.frame(pred_balanced_drf_bc$p1) hist(pred_balanced_p1_df_drf_bc$p1) # OK

################## DL hh_imbalanced_dl <-h2o.deeplearning(x=c(1:54),y=55,training_frame=train) hh_balanced_dl <-h2o.deeplearning(x=c(1:54),y=55,training_frame=train_balanced) hh_balanced_dl_bc <-h2o.deeplearning(x=c(1:54),y=55,training_frame=train, balance_classes=T)

pred_imbalanced_dl <- predict(hh_imbalanced_dl, valid) pred_imbalanced_p1_df_dl <- as.data.frame(pred_imbalanced_dl$p1) hist(pred_imbalanced_p1_df_dl$p1) # OK

pred_balanced_dl <- predict(hh_balanced_dl, valid) pred_balanced_p1_df_dl <- as.data.frame(pred_balanced_dl$p1) hist(pred_balanced_p1_df_dl$p1) # OK

pred_balanced_dl_bc <- predict(hh_balanced_dl_bc, valid) pred_balanced_p1_df_dl_bc <- as.data.frame(pred_balanced_dl_bc$p1) hist(pred_balanced_p1_df_dl_bc$p1) # OK

exalate-issue-sync[bot] commented 1 year ago

Arno Candel commented: It seems that using more trees lessens this problem.

For balance_classes=F (and e.g., for learning_rate=1/10) the predictions p/10 and 1-p/10 still cover the full range of 0...1, but p/10*correction[0] and (1-p/10)*correction[1] do not (even after scaling by their sum). Note that "correction" is the ratio of prior fraction to modeled fraction and can be >>1.

exalate-issue-sync[bot] commented 1 year ago

Arno Candel commented: http://gking.harvard.edu/files/0s.pdf Eq (27)

exalate-issue-sync[bot] commented 1 year ago

Arno Candel commented: public static double[] correctProbabilities(double[] scored, double[] priorClassDist, double[] modelClassDist) { double probsum=0; for( int c=1; c<scored.length; c++ ) { final double original_fraction = priorClassDist[c-1]; final double oversampled_fraction = modelClassDist[c-1]; assert(!Double.isNaN(scored[c])) : "Predicted NaN class probability"; if (original_fraction != 0 && oversampled_fraction != 0) scored[c] *= original_fraction / oversampled_fraction; probsum += scored[c]; } if (probsum>0) for (int i=1;i<scored.length;++i) scored[i] /= probsum; return scored; }

original_fraction = 0.9 and 0.1 for class0/1 oversampled_fraction = 0.5 and 0.5

what does this transformation do to the unit interval?

This plots p0 as a function of the pre-corrected probability

p0 <- function(x) { x*0.9/0.5/(x*0.9/0.5+(1-x)*0.2) } plot(p0)

it's a bijection of 0..1 <-> 0..1, so looks good.

This plots p1 as a function of the pre-corrected probability

p1 <- function(x) { x*0.2/((1-x)*0.9/0.5+x*0.2) } plot(p1)

exalate-issue-sync[bot] commented 1 year ago

Arno Candel commented: Look at the plot above. In short, to get a minority-class probability of > 0.6, you need the model to predict > 0.9 on the balanced dataset.

This is not easy for a GBM model that only has a few trees and limited depth. It basically doesn't overfit that quickly. Make the GBM deeper or increase the learning rate to see that the range of 0..1 is hit quicker.

exalate-issue-sync[bot] commented 1 year ago

Arno Candel commented: by Nidhi:

pred_imbalanced <- predict(hh_imbalanced, valid)

pred_imbalanced <- as.data.frame(predict(hh_imbalanced, valid)) summary(pred_imbalanced) predict p0 p1
Min. :0.0000 Min. :0.1119 Min. :0.03543
1st Qu.:0.0000 1st Qu.:0.5325 1st Qu.:0.06521
Median :0.0000 Median :0.9203 Median :0.07971
Mean :0.1925 Mean :0.7458 Mean :0.25420
3rd Qu.:0.0000 3rd Qu.:0.9348 3rd Qu.:0.46749
Max. :1.0000 Max. :0.9646 Max. :0.88810

p0 = pred_imbalanced$p0

x = (p0*0.9/0.5/(p0*0.9/0.5+(1-p0)*0.2)) summary(x) Min. 1st Qu. Median Mean 3rd Qu. Max. 0.5314 0.9111 0.9905 0.9233 0.9923 0.9959

x = ((1-p0)*0.2/(p0*0.9/0.5+(1-p0)*0.2)) summary(x) Min. 1st Qu. Median Mean 3rd Qu. Max. 0.004064 0.007691 0.009532 0.076720 0.088880 0.468600

exalate-issue-sync[bot] commented 1 year ago

Arno Candel commented: In short, the above confirms that our math is doing what it's doing :)

p1 0...0.9 maps to 0...0.5 as seen in the plot above.

exalate-issue-sync[bot] commented 1 year ago

Arno Candel commented: Behavior is understood and correct as far as we can tell.

Solution:

Higher learning rate, deeper trees, more trees. Basically, the model needs to predict the full range of 0...1 on a balanced dataset in order to predict the full range of 0...1 after correcting the probabilities. See the attached plot for the mapping of probabilities in a dataset with 10% target class.

exalate-issue-sync[bot] commented 1 year ago

Arno Candel commented: Even NeuroBayes (the foundation of a whole company) http://arxiv.org/pdf/1102.3876v2.pdf uses the exact same technique:

Eq (20) of http://arxiv.org/pdf/1102.3876v2.pdf is the same as Eq (28) of http://gking.harvard.edu/files/0s.pdf

exalate-issue-sync[bot] commented 1 year ago

Arno Candel commented: From Nidhi: http://www.researchgate.net/publication/24395913_Balanced_gradient_boosting_from_imbalanced_data_for_clinical_outcome_prediction

exalate-issue-sync[bot] commented 1 year ago

Arno Candel commented: Attached oversampled stratified version of the file.

arno@lappy:~$ cat stratified.csv | wc -l 28568 arno@lappy:~$ cat stratified.csv | sort | uniq | wc -l 16019

exalate-issue-sync[bot] commented 1 year ago

Arno Candel commented: train_mybalanced <- h2o.importFile("/users/arno/stratified.csv") train_mybalanced$C55 <- as.factor(train_mybalanced$C55)

hh_balanced_gbm_bc_my <-h2o.gbm(x=1:ncol(train_mybalanced),y="C55",ntrees=10,min_rows=5,learn_rate=0.1,training_frame=train_mybalanced,distribution="bernoulli",balance_classes=F)

pred_balanced_gbm_bc_my <- predict(hh_balanced_gbm_bc_my, valid) pred_balanced_p1_df_gbm_bc_my <- as.data.frame(pred_balanced_gbm_bc_my$p1) hist(pred_balanced_p1_df_gbm_bc_my$p1, (0:100)*0.01)

100-bin histogram of predictions on the validation set is very similar to the one on train_balanced. So the H2O-internal sampling is fine.

exalate-issue-sync[bot] commented 1 year ago

Arno Candel commented: library(h2o) h<-h2o.init() setwd("/users/arno/h2o-3") covtype <- h2o.uploadFile("smalldata/covtype/covtype.20k.data") covtype[,55] <- covtype[,55]==6 covtype[,55] <- as.factor(covtype[,55]) s <- h2o.runif(covtype) train <- covtype[s <= 0.8,] valid <- covtype[s > 0.8,]

library(caret) cov_df <- as.data.frame(train[,55]) cov_df$C55 <- as.factor(cov_df$C55) train_df <- as.data.frame(train[,-55]) train_balanced <- as.h2o(upSample(train_df,cov_df$C55)) dim(train)

train_mybalanced <- h2o.importFile("/users/arno/stratified.csv") train_mybalanced train_mybalanced$C55 <- as.factor(train_mybalanced$C55)

hh_imbalanced_gbm <-h2o.gbm(x=c(1:54),y=55,ntrees=10,min_rows=5,learn_rate=0.1,training_frame=train,distribution="bernoulli") hh_balanced_gbm <-h2o.gbm(x=c(1:54),y=55,ntrees=10,min_rows=5,learn_rate=0.1,training_frame=train_balanced,distribution="bernoulli") hh_balanced_gbm_bc <-h2o.gbm(x=c(1:54),y=55,ntrees=10,min_rows=5,learn_rate=0.1,training_frame=train,distribution="bernoulli",balance_classes=T) hh_balanced_gbm_bc_my <-h2o.gbm(x=1:ncol(train_mybalanced),y="C55",ntrees=10,min_rows=5,learn_rate=0.1,training_frame=train_mybalanced,distribution="bernoulli",balance_classes=F)

pred_imbalanced_gbm <- predict(hh_imbalanced_gbm, valid) pred_imbalanced_p1_df_gbm <- as.data.frame(pred_imbalanced_gbm$p1) hist(pred_imbalanced_p1_df_gbm$p1, (0:100)*0.01) # OK

pred_balanced_gbm <- predict(hh_balanced_gbm, valid) pred_balanced_p1_df_gbm <- as.data.frame(pred_balanced_gbm$p1) hist(pred_balanced_p1_df_gbm$p1, (0:100)*0.01) # OK

pred_balanced_gbm_bc <- predict(hh_balanced_gbm_bc, valid) pred_balanced_p1_df_gbm_bc <- as.data.frame(pred_balanced_gbm_bc$p1) hist(pred_balanced_p1_df_gbm_bc$p1, (0:100)*0.01) ############################################ LIMITED RANGE ###########################################

pred_balanced_gbm_bc_my <- predict(hh_balanced_gbm_bc_my, valid) pred_balanced_p1_df_gbm_bc_my <- as.data.frame(pred_balanced_gbm_bc_my$p1) hist(pred_balanced_p1_df_gbm_bc_my$p1, (0:100)*0.01)

exalate-issue-sync[bot] commented 1 year ago

Arno Candel commented: This will lead to a fine model with great predictions:

hh_balanced_gbm_bc <-h2o.gbm(x=c(1:54),y=55,ntrees=50,min_rows=5,learn_rate=.5,validation_frame=valid,training_frame=train,distribution="bernoulli",balance_classes=T)

Again, especially for balance_classes=T, the algorithm has to make confident predictions for the posterior adjusted probabilities to be spanning the full range. In short, pick a model for which the validation error has converged (or is about to go up).

exalate-issue-sync[bot] commented 1 year ago

Arno Candel commented: TODO: Repro this behavior in R's GBM.

exalate-issue-sync[bot] commented 1 year ago

Arno Candel commented: From Hank: http://www.datascienceassn.org/sites/default/files/Predicting%20good%20probabilities%20with%20supervised%20learning.pdf

DinukaH2O commented 1 year ago

JIRA Issue Migration Info

Jira Issue: PUBDEV-1774 Assignee: Arno Candel Reporter: Eric Eckstrand State: Reopened Fix Version: N/A Attachments: Available (Count: 2) Development PRs: N/A

Attachments From Jira

Attachment Name: Screen Shot 2015-07-30 at 6.16.54 PM.png Attached By: Arno Candel File Link:https://h2o-3-jira-github-migration.s3.amazonaws.com/PUBDEV-1774/Screen Shot 2015-07-30 at 6.16.54 PM.png

Attachment Name: stratified.csv Attached By: Arno Candel File Link:https://h2o-3-jira-github-migration.s3.amazonaws.com/PUBDEV-1774/stratified.csv

h2oai / h2o-3

setting gbm's balance_classes to True produces suspect models #14737

This plots p0 as a function of the pre-corrected probability

This plots p1 as a function of the pre-corrected probability