Open exalate-issue-sync[bot] opened 1 year ago
Erin LeDell commented: To clarify: The issue is that when using the balance_classes argument, the minority class will produce predicted values that are lower than expected. In the example above, the preds from the upsampled GBM range from [0,0.5] instead of [0,1]. We expect to see the same distribution of predicted values regardless of whether balance_classes is used.
Arno Candel commented: what does DL / DRF do?
Arno Candel commented: library(h2o) h<-h2o.init() setwd("/users/arno/h2o-3") covtype <- h2o.uploadFile("smalldata/covtype/covtype.20k.data") covtype[,55] <- covtype[,55]==6 covtype[,55] <- as.factor(covtype[,55]) s <- h2o.runif(covtype) train <- covtype[s <= 0.8,] valid <- covtype[s > 0.8,]
library(caret) cov_df <- as.data.frame(train[,55]) cov_df$C55 <- as.factor(cov_df$C55) train_df <- as.data.frame(train[,-55]) train_balanced <- as.h2o(upSample(train_df,cov_df$C55)) dim(train)
################## R GBM imbalanced_gbm <-gbm(C55~.,distribution = "bernoulli",verbose = T, data = as.data.frame(train),n.trees = 10,interaction.depth = 5,n.minobsinnode = 5,shrinkage = .1,bag.fraction = 1,train.fraction = 1)
balanced_gbm <-gbm(Class~.,distribution = "bernoulli",verbose = T, data = as.data.frame(train_balanced),n.trees = 10,interaction.depth = 5,n.minobsinnode = 5,shrinkage = .1,bag.fraction = 1,train.fraction = 1)
hist(predict(imbalanced_gbm,newdata = valid,type = "response",n.trees=10)) hist(predict(balanced_gbm,newdata = valid,type = "response",n.trees=10))
################## GBM hh_imbalanced_gbm <-h2o.gbm(x=c(1:54),y=55,ntrees=10,min_rows=5,learn_rate=0.1,training_frame=train,distribution="bernoulli") hh_balanced_gbm <-h2o.gbm(x=c(1:54),y=55,ntrees=10,min_rows=5,learn_rate=0.1,training_frame=train_balanced,distribution="bernoulli") hh_balanced_gbm_bc <-h2o.gbm(x=c(1:54),y=55,ntrees=10,min_rows=5,learn_rate=0.1,training_frame=train,distribution="bernoulli",balance_classes=T)
pred_imbalanced_gbm <- predict(hh_imbalanced_gbm, valid) pred_imbalanced_p1_df_gbm <- as.data.frame(pred_imbalanced_gbm$p1) hist(pred_imbalanced_p1_df_gbm$p1) # OK
pred_balanced_gbm <- predict(hh_balanced_gbm, valid) pred_balanced_p1_df_gbm <- as.data.frame(pred_balanced_gbm$p1) hist(pred_balanced_p1_df_gbm$p1) # OK
pred_balanced_gbm_bc <- predict(hh_balanced_gbm_bc, valid) pred_balanced_p1_df_gbm_bc <- as.data.frame(pred_balanced_gbm_bc$p1) hist(pred_balanced_p1_df_gbm_bc$p1) ############################################ LIMITED RANGE ###########################################
h2o.auc(hh_imbalanced_gbm) h2o.auc(hh_balanced_gbm) h2o.auc(hh_balanced_gbm_bc) plot(hh_imbalanced_gbm) plot(hh_balanced_gbm) plot(hh_balanced_gbm_bc)
################## GLM hh_imbalanced_glm<-h2o.glm(x=c(1:54),y=55,training_frame=train,family="binomial") hh_balanced_glm<-h2o.glm(x=c(1:54),y=55,training_frame=train_balanced,family="binomial")
pred_imbalanced_glm <- predict(hh_imbalanced_glm, valid) pred_imbalanced_p1_df_glm <- as.data.frame(pred_imbalanced_glm$p1) hist(pred_imbalanced_p1_df_glm$p1) # OK
pred_balanced_glm <- predict(hh_balanced_glm, valid) pred_balanced_p1_df_glm <- as.data.frame(pred_balanced_glm$p1) hist(pred_balanced_p1_df_glm$p1) # OK
################## DRF hh_imbalanced_drf <-h2o.randomForest(x=c(1:54),y=55,training_frame=train) hh_balanced_drf <-h2o.randomForest(x=c(1:54),y=55,training_frame=train_balanced) hh_balanced_drf_bc <-h2o.randomForest(x=c(1:54),y=55,training_frame=train, balance_classes=T,binomial_double_trees=F,ntrees=1)
pred_imbalanced_drf <- predict(hh_imbalanced_drf, valid) pred_imbalanced_p1_df_drf <- as.data.frame(pred_imbalanced_drf$p1) hist(pred_imbalanced_p1_df_drf$p1) # OK
pred_balanced_drf <- predict(hh_balanced_drf, valid) pred_balanced_p1_df_drf <- as.data.frame(pred_balanced_drf$p1) hist(pred_balanced_p1_df_drf$p1) # OK
pred_balanced_drf_bc <- predict(hh_balanced_drf_bc, valid) pred_balanced_p1_df_drf_bc <- as.data.frame(pred_balanced_drf_bc$p1) hist(pred_balanced_p1_df_drf_bc$p1) # OK
################## DL hh_imbalanced_dl <-h2o.deeplearning(x=c(1:54),y=55,training_frame=train) hh_balanced_dl <-h2o.deeplearning(x=c(1:54),y=55,training_frame=train_balanced) hh_balanced_dl_bc <-h2o.deeplearning(x=c(1:54),y=55,training_frame=train, balance_classes=T)
pred_imbalanced_dl <- predict(hh_imbalanced_dl, valid) pred_imbalanced_p1_df_dl <- as.data.frame(pred_imbalanced_dl$p1) hist(pred_imbalanced_p1_df_dl$p1) # OK
pred_balanced_dl <- predict(hh_balanced_dl, valid) pred_balanced_p1_df_dl <- as.data.frame(pred_balanced_dl$p1) hist(pred_balanced_p1_df_dl$p1) # OK
pred_balanced_dl_bc <- predict(hh_balanced_dl_bc, valid) pred_balanced_p1_df_dl_bc <- as.data.frame(pred_balanced_dl_bc$p1) hist(pred_balanced_p1_df_dl_bc$p1) # OK
Arno Candel commented: It seems that more trees lessens this problem.
For balance_classes=F (and e.g., for learning_rate=1/10) the predictions p/10 and 1-p/10 still cover the full range of 0...1, but p/10*correction[0] and (1-p/10)*correction[1] do not (even after scaling by their sum). Note that "correction" is the ratio of prior fraction to modeled fraction and can be >>1.
Arno Candel commented: http://gking.harvard.edu/files/0s.pdf Eq (27)
Arno Candel commented: public static double[] correctProbabilities(double[] scored, double[] priorClassDist, double[] modelClassDist) { double probsum=0; for( int c=1; c<scored.length; c++ ) { final double original_fraction = priorClassDist[c-1]; final double oversampled_fraction = modelClassDist[c-1]; assert(!Double.isNaN(scored[c])) : "Predicted NaN class probability"; if (original_fraction != 0 && oversampled_fraction != 0) scored[c] *= original_fraction / oversampled_fraction; probsum += scored[c]; } if (probsum>0) for (int i=1;i<scored.length;++i) scored[i] /= probsum; return scored; }
original_fraction = 0.9 and 0.1 for class 0/1; oversampled_fraction = 0.5 and 0.5
what does this transformation do to the unit interval?
p0 <- function(x) { x*0.9/0.5/(x*0.9/0.5+(1-x)*0.2) } plot(p0)
it's a bijection of 0..1 <-> 0..1, so looks good.
p1 <- function(x) { x*0.2/((1-x)*0.9/0.5+x*0.2) } plot(p1)
Arno Candel commented: Look at the plot above. In short, to get a minority-class probability of > 0.6, you need the model to predict > 0.9 on the balanced dataset.
This is not easy for a GBM model that only has a few trees and limited depth. It basically doesn't overfit that quickly. Make the GBM deeper or increase the learning rate to see that the range of 0..1 is hit quicker.
Arno Candel commented: by Nidhi:
pred_imbalanced <- predict(hh_imbalanced, valid)
pred_imbalanced <- as.data.frame(predict(hh_imbalanced, valid)) summary(pred_imbalanced) predict p0 p1
Min. :0.0000 Min. :0.1119 Min. :0.03543
1st Qu.:0.0000 1st Qu.:0.5325 1st Qu.:0.06521
Median :0.0000 Median :0.9203 Median :0.07971
Mean :0.1925 Mean :0.7458 Mean :0.25420
3rd Qu.:0.0000 3rd Qu.:0.9348 3rd Qu.:0.46749
Max. :1.0000 Max. :0.9646 Max. :0.88810
p0 = pred_imbalanced$p0
x = (p0*0.9/0.5/(p0*0.9/0.5+(1-p0)*0.2)) summary(x) Min. 1st Qu. Median Mean 3rd Qu. Max. 0.5314 0.9111 0.9905 0.9233 0.9923 0.9959
x = ((1-p0)*0.2/(p0*0.9/0.5+(1-p0)*0.2)) summary(x) Min. 1st Qu. Median Mean 3rd Qu. Max. 0.004064 0.007691 0.009532 0.076720 0.088880 0.468600
Arno Candel commented: In short, the above confirms that our math is doing what it's doing :)
p1 0...0.9 maps to 0...0.5 as seen in the plot above.
Arno Candel commented: Behavior is understood and correct as far as we can tell.
Solution:
Higher learning rate, deeper trees, more trees. Basically, the model needs to predict the full range of 0...1 on a balanced dataset in order to predict the full range of 0...1 after correcting the probabilities. See the attached plot for the mapping of probabilities in a dataset with 10% target class.
Arno Candel commented: Even NeuroBayes (the foundation of a whole company) http://arxiv.org/pdf/1102.3876v2.pdf uses the exact same technique:
Eq (20) of http://arxiv.org/pdf/1102.3876v2.pdf is the same as Eq (28) of http://gking.harvard.edu/files/0s.pdf
Arno Candel commented: From Nidhi: http://www.researchgate.net/publication/24395913_Balanced_gradient_boosting_from_imbalanced_data_for_clinical_outcome_prediction
Arno Candel commented: Attached oversampled stratified version of the file.
arno@lappy:~$ cat stratified.csv | wc -l 28568 arno@lappy:~$ cat stratified.csv | sort | uniq | wc -l 16019
Arno Candel commented: train_mybalanced <- h2o.importFile("/users/arno/stratified.csv") train_mybalanced$C55 <- as.factor(train_mybalanced$C55)
hh_balanced_gbm_bc_my <-h2o.gbm(x=1:ncol(train_mybalanced),y="C55",ntrees=10,min_rows=5,learn_rate=0.1,training_frame=train_mybalanced,distribution="bernoulli",balance_classes=F)
pred_balanced_gbm_bc_my <- predict(hh_balanced_gbm_bc_my, valid) pred_balanced_p1_df_gbm_bc_my <- as.data.frame(pred_balanced_gbm_bc_my$p1) hist(pred_balanced_p1_df_gbm_bc_my$p1, (0:100)*0.01)
100-bin histogram of predictions on the validation set is very similar to the one on train_balanced. So the H2O-internal sampling is fine.
Arno Candel commented: library(h2o) h<-h2o.init() setwd("/users/arno/h2o-3") covtype <- h2o.uploadFile("smalldata/covtype/covtype.20k.data") covtype[,55] <- covtype[,55]==6 covtype[,55] <- as.factor(covtype[,55]) s <- h2o.runif(covtype) train <- covtype[s <= 0.8,] valid <- covtype[s > 0.8,]
library(caret) cov_df <- as.data.frame(train[,55]) cov_df$C55 <- as.factor(cov_df$C55) train_df <- as.data.frame(train[,-55]) train_balanced <- as.h2o(upSample(train_df,cov_df$C55)) dim(train)
train_mybalanced <- h2o.importFile("/users/arno/stratified.csv") train_mybalanced train_mybalanced$C55 <- as.factor(train_mybalanced$C55)
hh_imbalanced_gbm <-h2o.gbm(x=c(1:54),y=55,ntrees=10,min_rows=5,learn_rate=0.1,training_frame=train,distribution="bernoulli") hh_balanced_gbm <-h2o.gbm(x=c(1:54),y=55,ntrees=10,min_rows=5,learn_rate=0.1,training_frame=train_balanced,distribution="bernoulli") hh_balanced_gbm_bc <-h2o.gbm(x=c(1:54),y=55,ntrees=10,min_rows=5,learn_rate=0.1,training_frame=train,distribution="bernoulli",balance_classes=T) hh_balanced_gbm_bc_my <-h2o.gbm(x=1:ncol(train_mybalanced),y="C55",ntrees=10,min_rows=5,learn_rate=0.1,training_frame=train_mybalanced,distribution="bernoulli",balance_classes=F)
pred_imbalanced_gbm <- predict(hh_imbalanced_gbm, valid) pred_imbalanced_p1_df_gbm <- as.data.frame(pred_imbalanced_gbm$p1) hist(pred_imbalanced_p1_df_gbm$p1, (0:100)*0.01) # OK
pred_balanced_gbm <- predict(hh_balanced_gbm, valid) pred_balanced_p1_df_gbm <- as.data.frame(pred_balanced_gbm$p1) hist(pred_balanced_p1_df_gbm$p1, (0:100)*0.01) # OK
pred_balanced_gbm_bc <- predict(hh_balanced_gbm_bc, valid) pred_balanced_p1_df_gbm_bc <- as.data.frame(pred_balanced_gbm_bc$p1) hist(pred_balanced_p1_df_gbm_bc$p1, (0:100)*0.01) ############################################ LIMITED RANGE ###########################################
pred_balanced_gbm_bc_my <- predict(hh_balanced_gbm_bc_my, valid) pred_balanced_p1_df_gbm_bc_my <- as.data.frame(pred_balanced_gbm_bc_my$p1) hist(pred_balanced_p1_df_gbm_bc_my$p1, (0:100)*0.01)
Arno Candel commented: This will lead to a fine model with great predictions:
hh_balanced_gbm_bc <-h2o.gbm(x=c(1:54),y=55,ntrees=50,min_rows=5,learn_rate=.5,validation_frame=valid,training_frame=train,distribution="bernoulli",balance_classes=T)
Again, especially for balance_classes=T, the algorithm has to make confident predictions for the posterior adjusted probabilities to be spanning the full range. In short, pick a model for which the validation error has converged (or is about to go up).
Arno Candel commented: TODO: Repro this behavior in R's GBM.
Arno Candel commented: From Hank: http://www.datascienceassn.org/sites/default/files/Predicting%20good%20probabilities%20with%20supervised%20learning.pdf
JIRA Issue Migration Info
Jira Issue: PUBDEV-1774 Assignee: Arno Candel Reporter: Eric Eckstrand State: Reopened Fix Version: N/A Attachments: Available (Count: 2) Development PRs: N/A
Attachments From Jira
Attachment Name: Screen Shot 2015-07-30 at 6.16.54 PM.png Attached By: Arno Candel File Link:https://h2o-3-jira-github-migration.s3.amazonaws.com/PUBDEV-1774/Screen Shot 2015-07-30 at 6.16.54 PM.png
Attachment Name: stratified.csv Attached By: Arno Candel File Link:https://h2o-3-jira-github-migration.s3.amazonaws.com/PUBDEV-1774/stratified.csv
covtype <- h2o.uploadFile("smalldata/covtype/covtype.20k.data") covtype[,55] <- covtype[,55]==6 covtype[,55] <- as.factor(covtype[,55]) s <- h2o.runif(covtype) train <- covtype[s <= 0.8,] valid <- covtype[s > 0.8,] library(caret) cov_df <- as.data.frame(train[,55]) cov_df$C55 <- as.factor(cov_df$C55) train_df <- as.data.frame(train[,-55]) train_balanced <- as.h2o(upSample(train_df,cov_df$C55))
################## hh_imbalanced<-h2o.gbm(x=c(1:54),y=55,ntrees=10,min_rows=5,learn_rate=0.2,training_frame=train,distribution="bernoulli",balance_classes=F) hh_balanced <-h2o.gbm(x=c(1:54),y=55,ntrees=10,min_rows=5,learn_rate=0.2,training_frame=train,distribution="bernoulli",balance_classes=T)
pred_imbalanced <- predict(hh_imbalanced, valid) pred_imbalanced_p1_df <- as.data.frame(pred_imbalanced$p1) pred_balanced <- predict(hh_balanced, valid) pred_balanced_p1_df <- as.data.frame(pred_balanced$p1) hist(pred_balanced_p1_df$p1) # **** this histogram is suspect ****
################## hh_imbalanced_glm<-h2o.glm(x=c(1:54),y=55,training_frame=train,family="binomial") hh_balanced_glm<-h2o.glm(x=c(1:54),y=55,training_frame=train_balanced,family="binomial")
pred_imbalanced_glm <- predict(hh_imbalanced_glm, valid) pred_imbalanced_p1_df_glm <- as.data.frame(pred_imbalanced_glm$p1) pred_balanced_glm <- predict(hh_balanced_glm, valid) pred_balanced_p1_df_glm <- as.data.frame(pred_balanced_glm$p1) hist(pred_balanced_p1_df_glm$p1) # this histogram is NOT suspect
################## hh_imbalanced_gbm<-h2o.gbm(x=c(1:54),y=55,ntrees=10,min_rows=5,learn_rate=0.2,training_frame=train,distribution="bernoulli") hh_balanced_gbm <-h2o.gbm(x=c(1:54),y=55,ntrees=10,min_rows=5,learn_rate=0.2,training_frame=train_balanced,distribution="bernoulli")
pred_imbalanced_gbm <- predict(hh_imbalanced_gbm, valid) pred_imbalanced_p1_df_gbm <- as.data.frame(pred_imbalanced_gbm$p1) pred_balanced_gbm <- predict(hh_balanced_gbm, valid) pred_balanced_p1_df_gbm <- as.data.frame(pred_balanced_gbm$p1) hist(pred_balanced_p1_df_gbm$p1) # this histogram is NOT suspect