topepo / caret

caret (Classification And Regression Training) R package that contains misc functions for training and plotting classification and regression models
http://topepo.github.io/caret/index.html

Possible bug in learing_curve_dat() #866

Open j-kreis opened 6 years ago

j-kreis commented 6 years ago

I think there is a bug in the caret::learing_curve_dat() function (#278). The performance calculation for the test set is done on the model predictions of both the test and the training data set (line 36 of the function).

Here is my minimal example. I added a few print statements to the caret::learing_curve_dat() function to show the true performance compared to the one that is currently used.

Additionally, the typo in the function name mentioned in #278 has not been fixed.
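
To illustrate the mechanism with a small sketch (made-up numbers, not taken from the example below): extractPrediction() / extractProb() return rows for both the training and the test data, distinguished by the dataType column, so applying the summary function to the full data frame mixes both sets. defaultSummary() is the default summaryFunction used by train():

# sketch with made-up values: a prediction frame like the one returned by extractPrediction(),
# containing both Training and Test rows
preds <- data.frame(obs      = c(1.0, 2.0, 3.0, 4.0, 5.0, 6.0),
                    pred     = c(1.1, 2.2, 2.9, 3.5, 6.0, 5.5),
                    dataType = c("Training", "Training", "Training",
                                 "Test", "Test", "Test"))

caret::defaultSummary(preds)                              # mixed: Training + Test rows
caret::defaultSummary(preds[preds$dataType == "Test", ])  # test-set performance only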

Minimal, reproducible example:

require(plyr)
require(caret)
data(mtcars)

learing_curve_dat <- function(dat, 
                              outcome = NULL,
                              proportion = (1:10)/10, 
                              test_prop = 0, 
                              verbose = TRUE, ...) {
  if(is.null(outcome))
    stop("Please give a character stirng for the outcome column name")
  proportion <- sort(unique(proportion))
  n_size <- length(proportion)

  if(test_prop > 0) {
    for_model <- createDataPartition(dat[, outcome], p = 1 - test_prop, list = FALSE)
  } else for_model <- 1:nrow(dat)

  n <- length(for_model)

  resampled <- vector(mode = "list", length = n_size)
  tested <- if(test_prop > 0) resampled else NULL
  apparent <- resampled
  for(i in seq(along = proportion)) {
    if(verbose) cat("\nTraining for ", round(proportion[i]*100, 1), 
                    "% (n = ", floor(n*proportion[i]), ")\n", sep = "")
    in_mod <- if(proportion[i] < 1) sample(for_model, size = floor(n*proportion[i])) else for_model
    mod <- train(x = dat[in_mod, colnames(dat) != outcome, drop = FALSE],
                 y = dat[in_mod, outcome],
                 ...)

    if(i == 1) perf_names <- mod$perfNames
    resampled[[i]] <- merge(mod$resample, mod$bestTune)
    resampled[[i]]$Training_Size <- length(in_mod)

    if(test_prop > 0) {
      if(!mod$control$classProbs) {
        test_preds <- extractPrediction(list(model = mod), 
                                        testX = dat[-for_model, colnames(dat) != outcome, drop = FALSE],
                                        testY = dat[-for_model, outcome])
      } else {
        test_preds <- extractProb(list(model = mod), 
                                  testX = dat[-for_model, colnames(dat) != outcome, drop = FALSE],
                                  testY = dat[-for_model, outcome])
      }
      # added for debugging: performance split by dataType vs. the mixed value used below
      true_perf <- plyr::ddply(test_preds, .(dataType), 
                               function(x) mod$control$summaryFunction(x, lev = mod$finalModel$obsLevels))
      test_perf <- mod$control$summaryFunction(test_preds, lev = mod$finalModel$obsLevels)
      print(rbind(data.frame(dataType = "Total", 
                             RMSE = test_perf[1], 
                             Rsquared = test_perf[2], 
                             MAE = test_perf[3], row.names = 0), 
                  true_perf))
      test_perf <- as.data.frame(t(test_perf))
      test_perf$Training_Size <- length(in_mod)
      tested[[i]] <- test_perf
      try(rm(test_preds, test_perf), silent = TRUE)
    }

    if(!mod$control$classProbs) {
      app_preds <- extractPrediction(list(model = mod), 
                                     testX = dat[in_mod, colnames(dat) != outcome, drop = FALSE],
                                     testY = dat[in_mod, outcome])
    } else {
      app_preds <- extractProb(list(model = mod), 
                               testX = dat[in_mod, colnames(dat) != outcome, drop = FALSE],
                               testY = dat[in_mod, outcome])
    }
    app_perf <- mod$control$summaryFunction(app_preds, lev = mod$finalModel$obsLevels)
    app_perf <- as.data.frame(t(app_perf))
    app_perf$Training_Size <- length(in_mod)    
    apparent[[i]] <- app_perf

    try(rm(mod, in_mod, app_preds, app_perf), silent = TRUE)
  }

  resampled <- do.call("rbind", resampled)
  resampled <- resampled[, c(perf_names, "Training_Size")]
  resampled$Data <- "Resampling"
  apparent <- do.call("rbind", apparent)
  apparent <- apparent[, c(perf_names, "Training_Size")]
  apparent$Data <- "Training"
  out <- rbind(resampled, apparent)
  if(test_prop > 0) {
    tested <- do.call("rbind", tested)
    tested <- tested[, c(perf_names, "Training_Size")]
    tested$Data <- "Testing"
    out <- rbind(out, tested)
  }
  out
}
dmy <- learing_curve_dat(dat = mtcars, outcome = "disp", test_prop = 0.2, proportion = (1:3)/3)

The result of this code shows that the function only returns the mixed performance, which is calculated on both the training and the test set (see the dataType column in the output below).


Training for 33.3% (n = 9)
  dataType     RMSE  Rsquared      MAE
0    Total 35.30311 0.9450565 26.89221
1     Test 52.65220 0.9229936 40.24089
2 Training 23.83509 0.9763854 20.95947

Training for 66.7% (n = 18)
  dataType     RMSE  Rsquared      MAE
0    Total 29.41880 0.9516958 20.12238
1     Test 56.26877 0.7613311 47.72862
2 Training 18.82012 0.9829672 13.98765

Training for 100% (n = 28)
  dataType     RMSE  Rsquared      MAE
0    Total 26.21689 0.9606756 20.01071
1     Test 31.41360 0.9393171 29.30161
2 Training 25.38782 0.9634528 18.68344

Session Info:


sessionInfo()
R version 3.4.3 (2017-11-30)
Platform: x86_64-w64-mingw32/x64 (64-bit)
Running under: Windows 7 x64 (build 7601) Service Pack 1

Matrix products: default

locale:
[1] LC_COLLATE=German_Germany.1252  LC_CTYPE=German_Germany.1252    LC_MONETARY=German_Germany.1252
[4] LC_NUMERIC=C                    LC_TIME=German_Germany.1252    

attached base packages:
[1] stats     graphics  grDevices utils     datasets  methods   base     

other attached packages:
[1] caret_6.0-78    ggplot2_2.2.1   lattice_0.20-35 plyr_1.8.4     

loaded via a namespace (and not attached):
 [1] tidyselect_0.2.4    purrr_0.2.4         reshape2_1.4.3      kernlab_0.9-25      splines_3.4.3      
 [6] colorspace_1.3-2    stats4_3.4.3        survival_2.41-3     prodlim_1.6.1       rlang_0.2.0        
[11] ModelMetrics_1.1.0  pillar_1.2.1        foreign_0.8-69      glue_1.2.0          withr_2.1.1        
[16] bindrcpp_0.2        foreach_1.4.4       bindr_0.1.1         dimRed_0.1.0        lava_1.6           
[21] robustbase_0.92-8   stringr_1.3.0       timeDate_3043.102   munsell_0.4.3       gtable_0.2.0       
[26] recipes_0.1.2       codetools_0.2-15    psych_1.7.8         parallel_3.4.3      class_7.3-14       
[31] DEoptimR_1.0-8      broom_0.4.3         Rcpp_0.12.15        scales_0.5.0        ipred_0.9-6        
[36] CVST_0.2-1          mnormt_1.5-5        stringi_1.1.7       dplyr_0.7.4         RcppRoll_0.2.2     
[41] ddalpha_1.3.1.1     grid_3.4.3          tools_3.4.3         magrittr_1.5        lazyeval_0.2.1     
[46] tibble_1.4.2        randomForest_4.6-12 tidyr_0.8.0         DRR_0.0.3           pkgconfig_2.0.1    
[51] MASS_7.3-49         Matrix_1.2-12       lubridate_1.7.3     gower_0.1.2         assertthat_0.2.0   
[56] iterators_1.0.9     R6_2.2.2            rpart_4.1-13        sfsmisc_1.1-2       nnet_7.3-12        
[61] nlme_3.1-131.1      compiler_3.4.3
nickyfoto commented 6 years ago

test_perf <- mod$control$summaryFunction(test_preds, lev = mod$finalModel$obsLevels) is wrong because it runs postResample on the entire obs and pred columns regardless of dataType. Since the test result is already calculated and included in true_perf, we can extract it and format it as a named vector so that it works with the code that follows.

A temp solution is:

# test_perf <- mod$control$summaryFunction(test_preds, lev = mod$finalModel$obsLevels)     
test_perf <- as.numeric(true_perf[1,-1])
names(test_perf) <- names(true_perf[1,-1])

Of course, there should be a more elegant way to fix this.
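
One possibility (just a sketch, reusing the variables dat, outcome, for_model, in_mod, mod and tested from the function body above; holdout_x / holdout_y are only illustrative names, and this assumes the regression case where the summary function needs nothing beyond obs and pred columns) would be to predict on the held-out rows directly instead of filtering the extractPrediction() / extractProb() output:

# sketch only: compute test-set performance straight from predictions on the held-out rows
holdout_x <- dat[-for_model, colnames(dat) != outcome, drop = FALSE]  # held-out predictors
holdout_y <- dat[-for_model, outcome]                                 # held-out outcome
test_perf <- mod$control$summaryFunction(
  data.frame(obs = holdout_y, pred = predict(mod, holdout_x)),
  lev = mod$finalModel$obsLevels)
test_perf <- as.data.frame(t(test_perf))
test_perf$Training_Size <- length(in_mod)
tested[[i]] <- test_perf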

nickyfoto commented 6 years ago

I realize that true_perf is the variable @juliangkr added to demonstrate the bug. A possible fix is:

if(test_prop > 0) {
      if(!mod$control$classProbs) {
        test_preds <- extractPrediction(list(model = mod), 
                                        testX = dat[-for_model, colnames(dat) != outcome, drop = FALSE],
                                        testY = dat[-for_model, outcome])
      } else {
        test_preds <- extractProb(list(model = mod), 
                                  testX = dat[-for_model, colnames(dat) != outcome, drop = FALSE],
                                  testY = dat[-for_model, outcome])
      }
      # select only the rows with dataType Test
      test_preds <- test_preds[test_preds$dataType=="Test",]
      test_perf <- mod$control$summaryFunction(test_preds, lev = mod$finalModel$obsLevels)
      test_perf <- as.data.frame(t(test_perf))
      test_perf$Training_Size <- length(in_mod)
      tested[[i]] <- test_perf
      try(rm(test_preds, test_perf), silent = TRUE)
    }

    if(!mod$control$classProbs) {
      # it is not necessary to pass testX/testY here since we only need predictions for the training data
      app_preds <- extractPrediction(list(model = mod))
    } else {
      app_preds <- extractProb(list(model = mod))
    }
    app_perf <- mod$control$summaryFunction(app_preds, lev = mod$finalModel$obsLevels)
    app_perf <- as.data.frame(t(app_perf))
    app_perf$Training_Size <- length(in_mod)    
    apparent[[i]] <- app_perf

    try(rm(mod, in_mod, app_preds, app_perf), silent = TRUE)
  }
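
With a locally patched learing_curve_dat() along these lines, the three curves can then be compared against the training size, for example like this (the plotting code is just a sketch, not part of caret; ggplot2 is already attached according to the session info above):

library(ggplot2)

lc <- learing_curve_dat(dat = mtcars, outcome = "disp",
                        test_prop = 0.2, proportion = (1:3)/3)

# average the resampled results per training size so each Data type gives one line
lc_mean <- aggregate(RMSE ~ Training_Size + Data, data = lc, FUN = mean)

ggplot(lc_mean, aes(x = Training_Size, y = RMSE, color = Data)) +
  geom_point() + geom_line()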