boost-R / mboost

Boosting algorithms for fitting generalized linear, additive and interaction models to potentially high-dimensional data. The current release version can be found on CRAN (http://cran.r-project.org/package=mboost).

mboost_2.6-0 and caret_6.0-70 under WINDOWS R3.3.1 hard crash #44

Closed tobigithub closed 8 years ago

tobigithub commented 8 years ago

Hard crash with a memory error and Rscript termination under Windows. Example:

require(caret)
require(mboost)
require(gbm)
require(randomForest)  # provides the "rf" method

# load iris set
data(iris) 
dim(iris) 

# works 
m <- c("rf" ,"gbm")

# load X and Y (these will be passed to the train function)
X = iris[,1:3]
Y = iris$Species

# this setup calls the caret::train function; the tryCatch construct
# provides minimal error handling so a failing method does not stop the run
trainCall <- function(i) 
    {
         cat("----------------------------------------------------","\n");
         set.seed(123); cat(i," <- loaded\n");
         return(tryCatch(
                train(y = Y, x = X, method = i, trControl = trainControl(method = "cv")),
                error = function(e) NULL))
    }

# use lapply/loop to run everything, required for try/catch error function to work
t2 <- lapply(m, trainCall)

# remove NULL values; only successful methods are kept (provenance is deleted)
t2 <- t2[!sapply(t2, is.null)]

# this setup extracts the results with minimal error handling
# TrainKappa can sometimes be zero while the accuracy SD is still available
# on the Kappa value, see http://epiville.ccnmtl.columbia.edu/popup/how_to_calculate_kappa.html
printCall <- function(i) 
    {
         return(tryCatch(
            {
                cat(sprintf("%-22s", m[i]))
                cat(round(getTrainPerf(t2[[i]])$TrainAccuracy, 4), "\t")
                cat(round(getTrainPerf(t2[[i]])$TrainKappa, 4), "\t")
                cat(t2[[i]]$times$everything[3], "\n")
            },
            error = function(e) NULL))
    }

r2 <- lapply(1:length(t2), printCall)

#-------------------------------------------------------------------------------------

# crashes using blackboost 
m <- c("rf" ,"gbm" ,"blackboost")

# X, Y, trainCall and printCall are unchanged; rerun the same pipeline
t2 <- lapply(m, trainCall)
t2 <- t2[!sapply(t2, is.null)]
r2 <- lapply(1:length(t2), printCall)
> sessionInfo()
R version 3.3.1 (2016-06-21)
Platform: x86_64-w64-mingw32/x64 (64-bit)
Running under: Windows 7 x64 (build 7601) Service Pack 1

locale:
[1] LC_COLLATE=English_United States.1252  
[4] LC_NUMERIC=C                        

attached base packages:
 [1] splines   parallel  stats4    grid      stats     graphics  grDevices utils     datasets  methods   base     

other attached packages:
 [1] gbm_2.1.1           survival_2.39-4     ada_2.0-5           rpart_4.1-10        DT_0.1              randomForest_4.6-12
 [7] doParallel_1.0.10   iterators_1.0.8     foreach_1.4.3       plyr_1.8.4          mboost_2.6-0        stabs_0.5-1        
[13] party_1.0-25        strucchange_1.5-1   sandwich_2.3-4      zoo_1.7-13          modeltools_0.2-21   mvtnorm_1.0-5      
[19] caret_6.0-70        ggplot2_2.1.0       lattice_0.20-33    

loaded via a namespace (and not attached):
 [1] coin_1.1-2         reshape2_1.4.1     colorspace_1.2-6   htmltools_0.3.5    mgcv_1.8-12        e1071_1.6-7       
 [7] nloptr_1.0.4       multcomp_1.4-5     stringr_1.0.0      MatrixModels_0.4-1 munsell_0.4.3      gtable_0.2.0      
[13] htmlwidgets_0.6    codetools_0.2-14   SparseM_1.7        quantreg_5.26      pbkrtest_0.4-6     class_7.3-14      
[19] TH.data_1.0-7      Rcpp_0.12.5        scales_0.4.0       lme4_1.1-12        digest_0.6.9       stringi_1.1.1     
[25] quadprog_1.5-5     tools_3.3.1        magrittr_1.5       car_2.1-2          MASS_7.3-45        Matrix_1.2-6      
[31] pROC_1.8           nnls_1.4           minqa_1.2.4        nnet_7.3-12        nlme_3.1-128       compiler_3.3.1    

Tobias

hofnerb commented 8 years ago

Thanks, @tobigithub, for the bug report. However, I cannot reproduce your hard crash (nor can I obtain results for random forest or gbm). The list t2 is always "empty", i.e., it consists only of NULL entries.

Furthermore, I'd guess that this is a bug related to the caret interface?! (Please use plain mboost code to demonstrate bugs in mboost. We neither know nor control what caret does.)

> session_info()
Session info -------------------------------------------------------------------------------------------------------
 setting  value                                      
 version  R version 3.2.4 Revised (2016-03-16 r70336)
 system   x86_64, mingw32                            
 ui       RStudio (0.99.893)                         
 language (EN)                                       
 collate  German_Germany.1252                        
 tz       Europe/Berlin                              
 date     2016-07-26                                 

Packages -----------------------------------------------------------------------------------------------------------
 package      * version date       source                         
 car            2.1-2   2016-03-25 CRAN (R 3.2.5)                 
 caret        * 6.0-70  2016-06-13 CRAN (R 3.2.5)                 
 codetools      0.2-14  2015-07-15 CRAN (R 3.2.4)                 
 coin           1.1-2   2015-11-16 CRAN (R 3.2.5)                 
 colorspace     1.2-6   2015-03-11 CRAN (R 3.2.5)                 
 devtools     * 1.10.0  2016-01-23 CRAN (R 3.2.4)                 
 digest         0.6.9   2016-01-08 CRAN (R 3.2.4)                 
 foreach        1.4.3   2015-10-13 CRAN (R 3.2.5)                 
 gbm          * 2.1.1   2015-03-11 CRAN (R 3.2.5)                 
 ggplot2      * 2.1.0   2016-03-01 CRAN (R 3.2.5)                 
 gtable         0.2.0   2016-02-26 CRAN (R 3.2.5)                 
 iterators      1.0.8   2015-10-13 CRAN (R 3.2.3)                 
 lattice      * 0.20-33 2015-07-14 CRAN (R 3.2.4)                 
 lme4           1.1-12  2016-04-16 CRAN (R 3.2.5)                 
 magrittr       1.5     2014-11-22 CRAN (R 3.2.5)                 
 MASS           7.3-45  2015-11-10 CRAN (R 3.2.4)                 
 Matrix         1.2-4   2016-03-02 CRAN (R 3.2.4)                 
 MatrixModels   0.4-1   2015-08-22 CRAN (R 3.2.5)                 
 mboost       * 2.6-0   2016-07-04 Github (boost-R/mboost@b893002)
 memoise        1.0.0   2016-01-29 CRAN (R 3.2.4)                 
 mgcv           1.8-12  2016-03-03 CRAN (R 3.2.4)                 
 minqa          1.2.4   2014-10-09 CRAN (R 3.2.5)                 
 modeltools     0.2-21  2013-09-02 CRAN (R 3.2.3)                 
 multcomp       1.4-5   2016-05-04 CRAN (R 3.2.5)                 
 munsell        0.4.3   2016-02-13 CRAN (R 3.2.5)                 
 mvtnorm        1.0-5   2016-02-02 CRAN (R 3.2.3)                 
 nlme           3.1-125 2016-02-27 CRAN (R 3.2.4)                 
 nloptr         1.0.4   2014-08-04 CRAN (R 3.2.5)                 
 nnet           7.3-12  2016-02-02 CRAN (R 3.2.4)                 
 nnls           1.4     2012-03-19 CRAN (R 3.2.3)                 
 party          1.0-25  2015-11-05 CRAN (R 3.2.5)                 
 pbkrtest       0.4-6   2016-01-27 CRAN (R 3.2.5)                 
 plyr         * 1.8.3   2015-06-12 CRAN (R 3.2.5)                 
 quadprog       1.5-5   2013-04-17 CRAN (R 3.2.3)                 
 quantreg       5.26    2016-06-07 CRAN (R 3.2.5)                 
 randomForest * 4.6-12  2015-10-07 CRAN (R 3.2.5)                 
 Rcpp           0.12.5  2016-05-14 CRAN (R 3.2.5)                 
 reshape2       1.4.1   2014-12-06 CRAN (R 3.2.5)                 
 sandwich       2.3-4   2015-09-24 CRAN (R 3.2.5)                 
 scales         0.4.0   2016-02-26 CRAN (R 3.2.5)                 
 SparseM        1.7     2015-08-15 CRAN (R 3.2.3)                 
 stabs        * 0.5-2   2016-05-17 Github (hofnerb/stabs@4d0d759) 
 stringi        1.0-1   2015-10-22 CRAN (R 3.2.3)                 
 stringr        1.0.0   2015-04-30 CRAN (R 3.2.5)                 
 strucchange    1.5-1   2015-06-06 CRAN (R 3.2.5)                 
 survival     * 2.38-3  2015-07-02 CRAN (R 3.2.4)                 
 TH.data        1.0-7   2016-01-28 CRAN (R 3.2.5)                 
 zoo            1.7-13  2016-05-03 CRAN (R 3.2.5)
tobigithub commented 8 years ago

Hi, I totally agree: it could be a DLL dependency issue, package interference, or caret itself; sometimes it is also class imbalance or too few data points. I just confirmed it again: once blackboost is included in the run, the following memory error occurs. I also ran it in sequential mode, same thing. A number of packages are masked, maybe that is the issue. I have simplified it to a one-liner that crashes.

# works
require(caret); X = iris[,1:3]; Y = iris$Species; train(y=Y, x=X, "knn")

# crashes
require(caret); require(mboost); X = iris[,1:3]; Y = iris$Species; train(y=Y, x=X, "blackboost")

[screenshot: blackboost memory error]

Interestingly, there is no issue with the PimaIndians data set. There the blackboost method runs, ranking 83 of 128 machine-learning methods (caret-all-binary-class-PimaIndiansDiabetes); the code is in caret-all-binary-class-PimaIndiansDiabetes.R. So I suspect something in the data structure invokes the error.

The best way to test it would be to use a virtual machine, update to the latest packages, and then run the code below to confirm before starting to debug at the code level. I have no simple blackboost code, only the list of around 100 methods that run with the same code; the script iris-classification-all-fast.R will create that. Another way beyond unit testing would probably be to run the whole UCI ML archive, but surely that's a lot of work. I remember blackboost worked in older releases, so I will just wait for the next release.

Num Name Accuracy Kappa time [s] Model name
97 vglmCumulative 0.968 0.952 2.160 Cumulative Probability Model for Ordinal Data
2 avNNet 0.968 0.951 1.870 Model Averaged Neural Network
96 vglmContRatio 0.966 0.949 2.320 Continuation Ratio Model for Ordinal Data
79 sda 0.966 0.949 1.360 Shrinkage Discriminant Analysis
13 CSimca 0.966 0.948 1.730 SIMCA
41 loclda 0.965 0.948 1.910 Localized Linear Discriminant Analysis
72 rlda 0.965 0.947 1.380 Regularized Linear Discriminant Analysis
38 lda2 0.965 0.947 0.900 Linear Discriminant Analysis
60 pda 0.965 0.947 1.030 Penalized Discriminant Analysis
6 Boruta 0.964 0.946 3.280 Random Forest with Additional Feature Selection
94 treebag 0.964 0.946 1.430 Bagged CART
1 amdai 0.964 0.945 0.830 Adaptive Mixture Discriminant Analysis
31 hdrda 0.964 0.945 1.750 High-Dimensional Regularized Discriminant Analysis
37 lda 0.964 0.945 1.030 Linear Discriminant Analysis
69 rda 0.964 0.945 1.670 Regularized Discriminant Analysis
70 rf 0.964 0.945 1.370 Random Forest
20 extraTrees 0.962 0.943 2.810 Random Forest by Randomization
52 nnet 0.962 0.943 1.430 Neural Network
56 parRF 0.962 0.943 1.310 Parallel Random Forest
95 vglmAdjCat 0.960 0.940 2.920 Adjacent Categories Probability Model for Ordinal Data
78 RSimca 0.960 0.940 1.130 Robust SIMCA
45 mda 0.960 0.939 1.200 Mixture Discriminant Analysis
25 gbm 0.959 0.939 1.260 Stochastic Gradient Boosting
67 ranger 0.959 0.938 1.170 Random Forest
68 rbfDDA 0.959 0.938 1.950 Radial Basis Function Network
53 oblique.tree 0.959 0.938 2.450 Oblique Trees
87 svmLinear 0.959 0.938 1.030 Support Vector Machines with Linear Kernel
88 svmLinear2 0.959 0.938 1.370 Support Vector Machines with Linear Kernel
59 pcaNNet 0.958 0.937 1.460 Neural Networks with Feature Extraction
27 glmnet 0.958 0.937 1.540 glmnet
30 hdda 0.956 0.933 1.100 High Dimensional Discriminant Analysis
39 Linda 0.955 0.932 1.090 Robust Linear Discriminant Analysis
21 fda 0.955 0.932 1.040 Flexible Discriminant Analysis
3 bagFDAGCV 0.954 0.931 3.290 Bagged FDA using gCV Pruning
40 LMT 0.954 0.930 2.770 Logistic Model Trees
65 qda 0.953 0.930 0.980 Quadratic Discriminant Analysis
33 JRip 0.953 0.929 1.810 Rule-Based Classifier
99 wsrf 0.952 0.928 1.240 Weighted Subspace Random Forest
36 knn 0.952 0.928 1.090 k-Nearest Neighbors
42 LogitBoost 0.952 0.927 1.260 Boosted Logistic Regression
18 earth 0.951 0.926 1.140 Multivariate Adaptive Regression Spline
100 xyf 0.951 0.926 2.090 Self-Organizing Maps
26 gcvEarth 0.951 0.926 1.230 Multivariate Adaptive Regression Splines
86 stepQDA 0.951 0.926 2.260 Quadratic Discriminant Analysis with Stepwise Feature Selection
9 C5.0 0.950 0.924 1.100 C5.0
57 PART 0.949 0.923 1.200 Rule-Based Classifier
44 lvq 0.949 0.923 1.280 Learning Vector Quantization
85 stepLDA 0.948 0.921 1.940 Linear Discriminant Analysis with Stepwise Feature Selection
10 C5.0Rules 0.947 0.920 0.760 Single C5.0 Ruleset
75 rpart1SE 0.946 0.919 1.160 CART
76 rpart2 0.946 0.918 1.180 CART
12 cforest 0.946 0.918 2.450 Conditional Inference Random Forest
11 C5.0Tree 0.945 0.918 0.800 Single C5.0 Tree
54 OneR 0.945 0.917 0.990 Single Rule Classification
32 J48 0.945 0.917 1.860 C4.5-like Trees
74 rpart 0.945 0.917 1.000 CART
77 rpartScore 0.945 0.917 1.970 CART or Ordinal Responses
58 partDSA 0.944 0.916 3.310 partDSA
5 bdk 0.944 0.915 1.900 Self-Organizing Map
14 ctree 0.944 0.915 0.950 Conditional Inference Tree
15 ctree2 0.943 0.914 1.360 Conditional Inference Tree
29 hda 0.942 0.912 2.530 Heteroscedastic Discriminant Analysis
49 mlpWeightDecay 0.939 0.909 3.790 Multi-Layer Perceptron
50 mlpWeightDecayML 0.939 0.909 3.940 Multi-Layer Perceptron multiple layers
47 mlp 0.939 0.909 2.370 Multi-Layer Perceptron
48 mlpML 0.939 0.909 2.080 Multi-Layer Perceptron with multiple layers
66 QdaCov 0.939 0.908 1.110 Robust Quadratic Discriminant Analysis
89 svmPoly 0.938 0.907 2.510 Support Vector Machines with Polynomial Kernel
91 svmRadialCost 0.934 0.900 1.300 Support Vector Machines with Radial Basis Function Kernel
90 svmRadial 0.931 0.896 1.430 Support Vector Machines with Radial Basis Function Kernel
92 svmRadialSigma 0.931 0.896 1.630 Support Vector Machines with Radial Basis Function Kernel
93 svmRadialWeights 0.931 0.896 1.390 Support Vector Machines with Class Weights
43 lssvmRadial 0.928 0.891 3.040 Least Squares Support Vector Machine with Radial Basis Function Kernel
64 protoclass 0.924 0.885 1.250 Greedy Prototype Selection
35 kknn 0.919 0.878 1.240 k-Nearest Neighbors
19 elm 0.909 0.862 1.230 Extreme Learning Machine
24 gaussprRadial 0.906 0.859 2.950 Gaussian Process with Radial Basis Function Kernel
55 pam 0.904 0.855 1.120 Nearest Shrunken Centroids
61 PenalizedLDA 0.904 0.855 1.170 Penalized Linear Discriminant Analysis
51 nb 0.904 0.855 1.110 Naive Bayes
16 dda 0.890 0.835 2.980 Diagonal Discriminant Analysis
82 slda 0.838 0.755 1.270 Stabilized Linear Discriminant Analysis
84 spls 0.809 0.713 1.530 Sparse Partial Least Squares
34 kernelpls 0.800 0.699 0.930 Partial Least Squares
63 pls 0.800 0.699 0.980 Partial Least Squares
81 simpls 0.800 0.699 0.990 Partial Least Squares
98 widekernelpls 0.800 0.699 1.100 Partial Least Squares
46 Mlda 0.739 0.537 1.060 Maximum Uncertainty Linear Discriminant Analysis
71 RFlda 0.739 0.537 1.170 Factor-Based Linear Discriminant Analysis
83 sparseLDA 0.672 0.509 1.420 Sparse Linear Discriminant Analysis
4 bayesglm 0.671 0.504 1.030 Bayesian Generalized Linear Model
8 bstSm 0.671 0.504 1.840 Boosted Smoothing Spline
22 gam 0.671 0.504 2.660 Generalized Additive Model using Splines
23 gamLoess 0.671 0.504 1.040 Generalized Additive Model using LOESS
62 plr 0.671 0.504 1.080 Penalized Logistic Regression
73 rocc 0.671 0.504 1.550 ROC-Based Classifier
80 sdwd 0.671 0.504 1.980 Sparse Distance Weighted Discrimination
7 BstLm 0.587 0.383 1.970 Boosted Linear Model
17 dnn 0.326 0.000 2.340 Stacked AutoEncoder Deep Neural Network
28 gpls 1.000 0.000 2.110 Generalized Partial Least Squares
hofnerb commented 8 years ago

Thanks, Tobias, for your updated bug report. I can now recreate your error, yet I do not easily see where it comes from.

Thus, I tried to get iris classification working in mboost directly:

blackboost(Species ~ Sepal.Length + Sepal.Width + Petal.Length, 
           data = iris, family = Binomial())
  Error in family@check_y(y) : 
  response is not a factor at two levels but ‘family = Binomial()’ 

Similarly, if I try family = PropOdds(), I get another error:

 Error in family@check_y(y) : response must be an ordered factor

So my main issue is that I do not know HOW (i.e., with which code and which options) caret calls mboost. There is also no matrix interface for blackboost, only a formula interface. Hence, by default, without manipulating the data, the model cannot be fitted.
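
For comparison, the same call does fit once the response actually has two levels, which is what Binomial() demands (a minimal sketch; the iris subset via droplevels() is my own illustration, not from the thread):

library(mboost)
## Binomial() requires a factor with exactly two levels,
## so drop one of the three iris species first
iris2 <- droplevels(subset(iris, Species != "setosa"))
mod2 <- blackboost(Species ~ Sepal.Length + Sepal.Width + Petal.Length,
                   data = iris2, family = Binomial())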

If you (wrongly) treat the outcome as continuous and fit a linear model, it works:

set.seed(1234)
## fit model
mod <- blackboost(as.numeric(Species) ~ Sepal.Length + Sepal.Width + Petal.Length, 
                  data = iris)
## cross-validate tuning parameter
cvr <- cvrisk(mod)
## optimal number of boosting steps
mstop(cvr)
[1] 42
## I didn't rig this; a nice answer ;)

## now set your model to the optimal mstop value
mstop(mod) <- mstop(cvr)

## plot the predictions
plot(iris$Species, fitted(mod))

[plot: fitted values of mod by iris species]

Although I consider the idea of a single interface very attractive, I think that you lose a lot of fine control. E.g., the user needs to prespecify a maximum mstop value for mboost methods; by default, 100 is used, yet this might be a very poor choice, and I do not know what caret uses here. If chosen too low, this can greatly hamper boosting models.
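
In plain mboost, the upper limit can be raised via boost_control() and the model then trimmed back after cross-validation (a sketch; the value 1000 is an arbitrary choice of mine):

## fit with a generous upper bound instead of the default mstop = 100
mod <- blackboost(as.numeric(Species) ~ Sepal.Length + Sepal.Width + Petal.Length,
                  data = iris, control = boost_control(mstop = 1000))
cvr <- cvrisk(mod)        # cross-validate the number of boosting iterations
mstop(mod) <- mstop(cvr)  # reduce the model to the optimal mstop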

Furthermore, the description of the boosting tuning parameters at http://topepo.github.io/caret/Boosting.html is not correct. There is no such thing as prune, neither for gamboost nor for glmboost, but there are other (less important) tuning parameters.

Finally, mboost is mainly about fitting linear, additive, and other structured models; blackboost is just there to complete the range of models. It is not the main focus or strength of the package.
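
For context, a minimal sketch of that core use case (my own illustration, not from the thread), a boosted linear model via glmboost():

library(mboost)
## boosted linear model with intrinsic variable selection
mod <- glmboost(Sepal.Length ~ Sepal.Width + Petal.Length + Petal.Width,
                data = iris)
coef(mod)  # only the selected covariates get non-zero coefficients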

tobigithub commented 8 years ago

Hi, it's an intrinsic R problem, and surely the error handling of R code sucks. Cheers, Tobias

mvkorpel commented 8 years ago

@hofnerb The following is an example of how to crash R with plain mboost, without caret.

> library(mboost)
Loading required package: parallel
Loading required package: stabs
This is mboost 2.6-0. See ‘package?mboost’ and ‘news(package  = "mboost")’
for a complete list of changes.

> blackboost(Species ~ Sepal.Length + Sepal.Width + Petal.Length, data = iris, family = Multinomial())

 *** caught segfault ***
address 0xe10000a6, cause 'memory not mapped'

An alternative error message I get is:

 *** caught segfault ***
address (nil), cause 'unknown'

The previous errors were obtained on Linux using recent development and patched versions of R. With an older R version (3.0.2):

> blackboost(Species ~ Sepal.Length + Sepal.Width + Petal.Length, data = iris, family = Multinomial())
Error in R_modify_response(c(0.666666666666667, 0.666666666666667, 0.666666666666667,  : 
  REAL() can only be applied to a 'numeric', not a 'bytecode'
> blackboost(Species ~ Sepal.Length + Sepal.Width + Petal.Length, data = iris, family = Multinomial())
*** Error in `/usr/lib/R/bin/exec/R': free(): invalid next size (normal): 0x00000000091fe380 ***
Aborted

I did some digging with print statements and valgrind. It appears that, in this example, the C function R_modify_response() in the party package tries to write 300 numeric values into an array that can only hold 150, which makes a mess of the R internals. This happens in each of the four for loops of the function. Note that 150 is the number of samples in the iris data set.
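
The factor of two matches the class structure of the example (my own back-of-the-envelope reading, not verified against the party sources): Multinomial() works on a response expanded over the non-reference classes, so with 3 species the expanded length is 150 * (3 - 1) = 300, while the party response slots are sized for the original 150 observations:

nrow(iris)                                 # 150 samples
nlevels(iris$Species)                      # 3 classes
nrow(iris) * (nlevels(iris$Species) - 1)   # 300 values written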

As mboost calls internal functions of party via party::party_intern() (as opposed to only using the high-level interfaces), I dare not say where the root of the problem is.

hofnerb commented 8 years ago

Without being a real expert on Multinomial() or on how exactly the party functions are implemented, I think the error is that you used family = Multinomial() with something other than gamboost() (or, equivalently, the mboost() function). From the manual (?Multinomial) you can see that there is some extra work to do to fit multinomial models:

### Multinomial logit model via a linear array model
## One needs to convert the data to a list
myiris <- as.list(iris)
## ... and define a dummy vector with one factor level less
## than the outcome, which is used as reference category.
myiris$class <- factor(levels(iris$Species)[-nlevels(iris$Species)])
## Now fit the linear array model
mlm <- mboost(Species ~ bols(Sepal.Length, df = 2) %O%
                        bols(class, df = 2, contrasts.arg = "contr.dummy"),
              data = myiris,
              family = Multinomial())
coef(mlm) ## one should use more boosting iterations.

This model cannot be properly represented in blackboost() or glmboost(). Hence, we should actually check whether Multinomial() was used within gamboost() or mboost() with the correct model formula. However, this might be non-trivial.
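
A minimal sketch of such a check (entirely hypothetical on my part; it assumes the boost_family class exposes a name slot, and a real fix would also have to inspect the model formula, which is the non-trivial part):

## hypothetical guard for blackboost()/glmboost(): reject Multinomial() early,
## comparing against Multinomial()'s own name slot so the label string does not matter
check_family <- function(family) {
    if (family@name == Multinomial()@name)
        stop("family = Multinomial() is only supported in mboost()/gamboost() ",
             "with a linear array model (%O%); see ?Multinomial")
}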

mvkorpel commented 8 years ago

As a low-level fix, I think the R function fitfun() (in R/btree.R) could check the lengths and types of y and the various slots of object@responses, which requires knowing how party::party_intern(..., fun = "R_modify_response") works. Technically, to avoid overwriting unintended parts of memory, it is only required that each of the other vectors is at least as long as y, but I think the intent is that they should all have the same length. The check would be something like the following code:

obr <- object@responses
## the vectors that party's R_modify_response() reads from / writes to
party_arrays <- list(y, obr@transformations[[1]], obr@test_trafo,
                     obr@predict_trafo, obr@variables[[1]])
## all of them must be numeric and share one common length, otherwise bail out
if (!all(vapply(party_arrays, is.numeric, FALSE)) ||
    length(unique(vapply(party_arrays, length, 0))) != 1) {
    stop("some error message")
}
hofnerb commented 8 years ago

Again, boosted trees are not acceptable here! Hence there is no need to fiddle around with party. One needs a specially prepared data set, and it is absolutely required to use the linear array model via the %O% notation.

mvkorpel commented 8 years ago

In my opinion, that's not "fiddling" with party but being prepared for assumptions made in the "internal" (but exported) functions of party, called by mboost code. When using such obviously dangerous functions, one should take every precaution. Quote from ?party_intern:

This function must not be called under any circumstances.

I do realize that my example call to blackboost() contains an unexpected or undesired family = Multinomial(). That's also what the caret example ends up using. The response to a user or another package doing something inappropriate or unexpected on the R side (as opposed to compiled code) should be a helpful error message, not a crash. If the crash can be solved by other means than what you refer to as fiddling or a "don't do that", then great.

I think the bottom line is that party_intern() is a loaded gun and mboost is being careless with it.

tobigithub commented 8 years ago

Very interesting. The structure of the input data (character, vector, data frame, integer, nested factors) can also contribute to a number of issues. Thanks for all the contributions. Tobias

hofnerb commented 8 years ago

Dear @mvkorpel, thank you for your comments. Yet, as said earlier, we have to throw an error in case a user uses family Multinomial() in an unsupported function.

Using party is absolutely OK. It is also OK to use the party_intern function; it was only created to keep mboost working.

Btw. we have a similar function mboost_intern to make a package of one of our collegues work: https://github.com/boost-R/mboost/blob/master/R/mboost_intern.R

So, mboost is by no means being careless; mboost is the only package supposed to use this function, and the usage is as intended! For now, please let's stop this discussion and move over to fixing issue #46, which is the actual problem.

Btw., is caret really using Multinomial() in the above example? I asked this above, but @tobigithub didn't answer the question. How does caret take care of all the options, and how do I know what it assumes as defaults?

mvkorpel commented 8 years ago

@hofnerb Ok, I think we can agree to disagree (a bit) on the carelessness. Particularly, I agree it's fine to share some of the internals between related packages, and that it's best to stop as early as possible when an inappropriate combination of arguments is used (but it may be non-trivial as you wrote). Meanwhile, if you wish, or until something better comes up, you can view my suggested check code as an "if all else fails" safety device.

Here is a debug log showing that caret uses Multinomial in the example.

hofnerb commented 8 years ago

Thanks a lot. This is good to know. I'll have a look at your patch; I think it might be worthwhile to apply it to prevent the above issue, as you said.

@tobigithub How is caret determining all options? Is there a documentation (especially regarding caret with mboost)?

tobigithub commented 8 years ago

Hi @hofnerb, I think the parameters are modified or passed on (S4/S3), but I am not really an expert. See the example routine for caret's blackboost wrapper.

That would be better asked in the official caret issue tracker (https://github.com/topepo/caret/issues); they are very helpful and knowledgeable.

Some of the other 220 wrapper functions can be found here: 220 caret wrapper functions. Tobias

mvkorpel commented 8 years ago

The caret vignette, available online or via vignette("caret"), points to the main online documentation of the package. The structure of the information caret stores about each learning method is documented here, under the heading "Model Components". For example, this is the blackboost fitting function that uses Multinomial(), in caret version 6.0-71:

> getModelInfo("blackboost", regex = FALSE)[[1]][["fit"]]
function(x, y, wts, param, lev, last, classProbs, ...) {
  theDots <- list(...)

  if(any(names(theDots) == "tree_controls")) {
    theDots$tree_controls$maxdepth <- param$maxdepth
    treeCtl <- theDots$tree_controls
    theDots$tree_controls <- NULL
  } else treeCtl <- ctree_control(maxdepth = param$maxdepth)

  if(any(names(theDots) == "control")) {
    theDots$control$mstop <- param$mstop
    ctl <- theDots$control
    theDots$control <- NULL
  } else ctl <- boost_control(mstop = param$mstop)

  ## this is the relevant branch: a factor response with more than
  ## two levels is given family = Multinomial() by default
  if(!any(names(theDots) == "family")) {
    if(is.factor(y)) {
      theDots$family <- if(length(lev) == 2) Binomial() else Multinomial()
    } else theDots$family <- GaussReg()
  }

  ## pass in any model weights
  if(!is.null(wts)) theDots$weights <- wts

  modelArgs <- c(list(formula = as.formula(".outcome ~ ."),
                      data = if(!is.data.frame(x)) as.data.frame(x) else x,
                      control = ctl,
                      tree_controls = treeCtl),
                 theDots)
  modelArgs$data$.outcome <- y

  out <- do.call("blackboost", modelArgs)
  out$call["data"] <- "data"
  out
}

The model information for blackboost has remained the same since caret 6.0-64, released on January 6, 2016. It seems that prior versions did not use Multinomial().

This README explains how the model code files (.R) are processed into the combined models.RData file shipped with the caret package. The contents of this file can be accessed with the getModelInfo() convenience function, as above.
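
The other components caret stores for a method can be listed the same way (a usage sketch; component names follow the "Model Components" documentation mentioned above):

> library(caret)
> info <- getModelInfo("blackboost", regex = FALSE)[[1]]
> names(info)     # e.g. "label", "library", "parameters", "grid", "fit", "predict", ...
> info$parameters # the tuning parameters: mstop and maxdepth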