boost-R / mboost

Boosting algorithms for fitting generalized linear, additive and interaction models to potentially high-dimensional data. The current release version can be found on CRAN (http://cran.r-project.org/package=mboost).

mboost_2.6-0 and caret_6.0-70 under WINDOWS R3.3.1 hard crash #44

Closed tobigithub closed 8 years ago

tobigithub commented 8 years ago

Hard crash with a memory error and Rscript termination under Windows. Example:

require(caret)
require(mboost)
require(gbm)
require(randomForest)  # provides the "rf" method

# load iris set
data(iris) 
dim(iris) 

# works 
m <- c("rf" ,"gbm")

# load X and Y (these will be passed to the train function)
X = iris[,1:3]
Y = iris$Species

# this setup calls the caret::train function; the tryCatch construct
# provides minimal error handling so a failing method does not stop the run
trainCall <- function(i) 
    {
         cat("----------------------------------------------------","\n");
         set.seed(123); cat(i," <- loaded\n");
         return(tryCatch(
                train(y = Y, x = X, method = i, trControl = trainControl(method = "cv")),
                error = function(e) NULL))
    }

# use lapply/loop to run everything, required for try/catch error function to work
t2 <- lapply(m, trainCall)

# remove NULL values; only successful methods are kept (provenance is deleted)
t2 <- t2[!sapply(t2, is.null)]

# this setup extracts the results with minimal error handling
# TrainKappa can sometimes be zero while the accuracy SD is still available
# on the Kappa value, see http://epiville.ccnmtl.columbia.edu/popup/how_to_calculate_kappa.html
printCall <- function(i) 
    {
         return(tryCatch(
            {
                cat(sprintf("%-22s", m[i]))
                cat(round(getTrainPerf(t2[[i]])$TrainAccuracy, 4), "\t")
                cat(round(getTrainPerf(t2[[i]])$TrainKappa, 4), "\t")
                cat(t2[[i]]$times$everything[3], "\n")
            },
            error = function(e) NULL))
    }

r2 <- lapply(1:length(t2), printCall)

#-------------------------------------------------------------------------------------

# crashes using blackboost 
m <- c("rf" ,"gbm" ,"blackboost")

# X, Y, trainCall and printCall are unchanged; rerun the same pipeline
t2 <- lapply(m, trainCall)
t2 <- t2[!sapply(t2, is.null)]
r2 <- lapply(1:length(t2), printCall)
> sessionInfo()
R version 3.3.1 (2016-06-21)
Platform: x86_64-w64-mingw32/x64 (64-bit)
Running under: Windows 7 x64 (build 7601) Service Pack 1

locale:
[1] LC_COLLATE=English_United States.1252  
[4] LC_NUMERIC=C                        

attached base packages:
 [1] splines   parallel  stats4    grid      stats     graphics  grDevices utils     datasets  methods   base     

other attached packages:
 [1] gbm_2.1.1           survival_2.39-4     ada_2.0-5           rpart_4.1-10        DT_0.1              randomForest_4.6-12
 [7] doParallel_1.0.10   iterators_1.0.8     foreach_1.4.3       plyr_1.8.4          mboost_2.6-0        stabs_0.5-1        
[13] party_1.0-25        strucchange_1.5-1   sandwich_2.3-4      zoo_1.7-13          modeltools_0.2-21   mvtnorm_1.0-5      
[19] caret_6.0-70        ggplot2_2.1.0       lattice_0.20-33    

loaded via a namespace (and not attached):
 [1] coin_1.1-2         reshape2_1.4.1     colorspace_1.2-6   htmltools_0.3.5    mgcv_1.8-12        e1071_1.6-7       
 [7] nloptr_1.0.4       multcomp_1.4-5     stringr_1.0.0      MatrixModels_0.4-1 munsell_0.4.3      gtable_0.2.0      
[13] htmlwidgets_0.6    codetools_0.2-14   SparseM_1.7        quantreg_5.26      pbkrtest_0.4-6     class_7.3-14      
[19] TH.data_1.0-7      Rcpp_0.12.5        scales_0.4.0       lme4_1.1-12        digest_0.6.9       stringi_1.1.1     
[25] quadprog_1.5-5     tools_3.3.1        magrittr_1.5       car_2.1-2          MASS_7.3-45        Matrix_1.2-6      
[31] pROC_1.8           nnls_1.4           minqa_1.2.4        nnet_7.3-12        nlme_3.1-128       compiler_3.3.1    

Tobias

hofnerb commented 8 years ago

Thanks, @tobigithub, for the bug report. However, I cannot reproduce your hard crash (nor can I obtain results for random forest or gbm). The list t2 is always "empty", i.e., it consists only of NULL entries.

Furthermore, I'd guess that this is a bug related to the caret interface?! (Please use plain mboost code to demonstrate bugs in mboost. We neither know nor control what caret does.)

> session_info()
Session info -------------------------------------------------------------------------------------------------------
 setting  value                                      
 version  R version 3.2.4 Revised (2016-03-16 r70336)
 system   x86_64, mingw32                            
 ui       RStudio (0.99.893)                         
 language (EN)                                       
 collate  German_Germany.1252                        
 tz       Europe/Berlin                              
 date     2016-07-26                                 

Packages -----------------------------------------------------------------------------------------------------------
 package      * version date       source                         
 car            2.1-2   2016-03-25 CRAN (R 3.2.5)                 
 caret        * 6.0-70  2016-06-13 CRAN (R 3.2.5)                 
 codetools      0.2-14  2015-07-15 CRAN (R 3.2.4)                 
 coin           1.1-2   2015-11-16 CRAN (R 3.2.5)                 
 colorspace     1.2-6   2015-03-11 CRAN (R 3.2.5)                 
 devtools     * 1.10.0  2016-01-23 CRAN (R 3.2.4)                 
 digest         0.6.9   2016-01-08 CRAN (R 3.2.4)                 
 foreach        1.4.3   2015-10-13 CRAN (R 3.2.5)                 
 gbm          * 2.1.1   2015-03-11 CRAN (R 3.2.5)                 
 ggplot2      * 2.1.0   2016-03-01 CRAN (R 3.2.5)                 
 gtable         0.2.0   2016-02-26 CRAN (R 3.2.5)                 
 iterators      1.0.8   2015-10-13 CRAN (R 3.2.3)                 
 lattice      * 0.20-33 2015-07-14 CRAN (R 3.2.4)                 
 lme4           1.1-12  2016-04-16 CRAN (R 3.2.5)                 
 magrittr       1.5     2014-11-22 CRAN (R 3.2.5)                 
 MASS           7.3-45  2015-11-10 CRAN (R 3.2.4)                 
 Matrix         1.2-4   2016-03-02 CRAN (R 3.2.4)                 
 MatrixModels   0.4-1   2015-08-22 CRAN (R 3.2.5)                 
 mboost       * 2.6-0   2016-07-04 Github (boost-R/mboost@b893002)
 memoise        1.0.0   2016-01-29 CRAN (R 3.2.4)                 
 mgcv           1.8-12  2016-03-03 CRAN (R 3.2.4)                 
 minqa          1.2.4   2014-10-09 CRAN (R 3.2.5)                 
 modeltools     0.2-21  2013-09-02 CRAN (R 3.2.3)                 
 multcomp       1.4-5   2016-05-04 CRAN (R 3.2.5)                 
 munsell        0.4.3   2016-02-13 CRAN (R 3.2.5)                 
 mvtnorm        1.0-5   2016-02-02 CRAN (R 3.2.3)                 
 nlme           3.1-125 2016-02-27 CRAN (R 3.2.4)                 
 nloptr         1.0.4   2014-08-04 CRAN (R 3.2.5)                 
 nnet           7.3-12  2016-02-02 CRAN (R 3.2.4)                 
 nnls           1.4     2012-03-19 CRAN (R 3.2.3)                 
 party          1.0-25  2015-11-05 CRAN (R 3.2.5)                 
 pbkrtest       0.4-6   2016-01-27 CRAN (R 3.2.5)                 
 plyr         * 1.8.3   2015-06-12 CRAN (R 3.2.5)                 
 quadprog       1.5-5   2013-04-17 CRAN (R 3.2.3)                 
 quantreg       5.26    2016-06-07 CRAN (R 3.2.5)                 
 randomForest * 4.6-12  2015-10-07 CRAN (R 3.2.5)                 
 Rcpp           0.12.5  2016-05-14 CRAN (R 3.2.5)                 
 reshape2       1.4.1   2014-12-06 CRAN (R 3.2.5)                 
 sandwich       2.3-4   2015-09-24 CRAN (R 3.2.5)                 
 scales         0.4.0   2016-02-26 CRAN (R 3.2.5)                 
 SparseM        1.7     2015-08-15 CRAN (R 3.2.3)                 
 stabs        * 0.5-2   2016-05-17 Github (hofnerb/stabs@4d0d759) 
 stringi        1.0-1   2015-10-22 CRAN (R 3.2.3)                 
 stringr        1.0.0   2015-04-30 CRAN (R 3.2.5)                 
 strucchange    1.5-1   2015-06-06 CRAN (R 3.2.5)                 
 survival     * 2.38-3  2015-07-02 CRAN (R 3.2.4)                 
 TH.data        1.0-7   2016-01-28 CRAN (R 3.2.5)                 
 zoo            1.7-13  2016-05-03 CRAN (R 3.2.5)
tobigithub commented 8 years ago

Hi, I totally agree: it could be a DLL dependency issue, package interference, or caret itself; sometimes it is also class imbalance or too few data points. I just confirmed it again: once blackboost is included in the run, the following memory error occurs. I also ran it in sequential mode, same thing. A number of packages are masked, maybe that is the issue. I have simplified it to a one-liner that crashes.

# works
require(caret); X = iris[,1:3]; Y = iris$Species; train(y=Y, x=X, "knn")

# crashes
require(caret); require(mboost); X = iris[,1:3]; Y = iris$Species; train(y=Y, x=X, "blackboost")

[screenshot: blackboost memory error]

Interestingly, there is no issue with the PimaIndians data set. There the blackboost method runs, ranking 83 of 128 machine-learning methods (caret-all-binary-class-PimaIndiansDiabetes); the code is in caret-all-binary-class-PimaIndiansDiabetes.R. So I suspect something in the data structure invokes the error.

The best way to test it would be to use a virtual machine, update to the latest packages, and then run the code below to confirm before starting to debug at the code level. I have no simple blackboost code, only the list of around 100 methods that run with the same code; the script iris-classification-all-fast.R will create that. Another way beyond unit testing would probably be to run the whole UCI ML archive, but surely that's a lot of work. I remember blackboost worked in older releases, so I will just wait for the next release.

Num Name Accuracy Kappa time [s] Model name
97 vglmCumulative 0.968 0.952 2.160 Cumulative Probability Model for Ordinal Data
2 avNNet 0.968 0.951 1.870 Model Averaged Neural Network
96 vglmContRatio 0.966 0.949 2.320 Continuation Ratio Model for Ordinal Data
79 sda 0.966 0.949 1.360 Shrinkage Discriminant Analysis
13 CSimca 0.966 0.948 1.730 SIMCA
41 loclda 0.965 0.948 1.910 Localized Linear Discriminant Analysis
72 rlda 0.965 0.947 1.380 Regularized Linear Discriminant Analysis
38 lda2 0.965 0.947 0.900 Linear Discriminant Analysis
60 pda 0.965 0.947 1.030 Penalized Discriminant Analysis
6 Boruta 0.964 0.946 3.280 Random Forest with Additional Feature Selection
94 treebag 0.964 0.946 1.430 Bagged CART
1 amdai 0.964 0.945 0.830 Adaptive Mixture Discriminant Analysis
31 hdrda 0.964 0.945 1.750 High-Dimensional Regularized Discriminant Analysis
37 lda 0.964 0.945 1.030 Linear Discriminant Analysis
69 rda 0.964 0.945 1.670 Regularized Discriminant Analysis
70 rf 0.964 0.945 1.370 Random Forest
20 extraTrees 0.962 0.943 2.810 Random Forest by Randomization
52 nnet 0.962 0.943 1.430 Neural Network
56 parRF 0.962 0.943 1.310 Parallel Random Forest
95 vglmAdjCat 0.960 0.940 2.920 Adjacent Categories Probability Model for Ordinal Data
78 RSimca 0.960 0.940 1.130 Robust SIMCA
45 mda 0.960 0.939 1.200 Mixture Discriminant Analysis
25 gbm 0.959 0.939 1.260 Stochastic Gradient Boosting
67 ranger 0.959 0.938 1.170 Random Forest
68 rbfDDA 0.959 0.938 1.950 Radial Basis Function Network
53 oblique.tree 0.959 0.938 2.450 Oblique Trees
87 svmLinear 0.959 0.938 1.030 Support Vector Machines with Linear Kernel
88 svmLinear2 0.959 0.938 1.370 Support Vector Machines with Linear Kernel
59 pcaNNet 0.958 0.937 1.460 Neural Networks with Feature Extraction
27 glmnet 0.958 0.937 1.540 glmnet
30 hdda 0.956 0.933 1.100 High Dimensional Discriminant Analysis
39 Linda 0.955 0.932 1.090 Robust Linear Discriminant Analysis
21 fda 0.955 0.932 1.040 Flexible Discriminant Analysis
3 bagFDAGCV 0.954 0.931 3.290 Bagged FDA using gCV Pruning
40 LMT 0.954 0.930 2.770 Logistic Model Trees
65 qda 0.953 0.930 0.980 Quadratic Discriminant Analysis
33 JRip 0.953 0.929 1.810 Rule-Based Classifier
99 wsrf 0.952 0.928 1.240 Weighted Subspace Random Forest
36 knn 0.952 0.928 1.090 k-Nearest Neighbors
42 LogitBoost 0.952 0.927 1.260 Boosted Logistic Regression
18 earth 0.951 0.926 1.140 Multivariate Adaptive Regression Spline
100 xyf 0.951 0.926 2.090 Self-Organizing Maps
26 gcvEarth 0.951 0.926 1.230 Multivariate Adaptive Regression Splines
86 stepQDA 0.951 0.926 2.260 Quadratic Discriminant Analysis with Stepwise Feature Selection
9 C5.0 0.950 0.924 1.100 C5.0
57 PART 0.949 0.923 1.200 Rule-Based Classifier
44 lvq 0.949 0.923 1.280 Learning Vector Quantization
85 stepLDA 0.948 0.921 1.940 Linear Discriminant Analysis with Stepwise Feature Selection
10 C5.0Rules 0.947 0.920 0.760 Single C5.0 Ruleset
75 rpart1SE 0.946 0.919 1.160 CART
76 rpart2 0.946 0.918 1.180 CART
12 cforest 0.946 0.918 2.450 Conditional Inference Random Forest
11 C5.0Tree 0.945 0.918 0.800 Single C5.0 Tree
54 OneR 0.945 0.917 0.990 Single Rule Classification
32 J48 0.945 0.917 1.860 C4.5-like Trees
74 rpart 0.945 0.917 1.000 CART
77 rpartScore 0.945 0.917 1.970 CART or Ordinal Responses
58 partDSA 0.944 0.916 3.310 partDSA
5 bdk 0.944 0.915 1.900 Self-Organizing Map
14 ctree 0.944 0.915 0.950 Conditional Inference Tree
15 ctree2 0.943 0.914 1.360 Conditional Inference Tree
29 hda 0.942 0.912 2.530 Heteroscedastic Discriminant Analysis
49 mlpWeightDecay 0.939 0.909 3.790 Multi-Layer Perceptron
50 mlpWeightDecayML 0.939 0.909 3.940 Multi-Layer Perceptron multiple layers
47 mlp 0.939 0.909 2.370 Multi-Layer Perceptron
48 mlpML 0.939 0.909 2.080 Multi-Layer Perceptron with multiple layers
66 QdaCov 0.939 0.908 1.110 Robust Quadratic Discriminant Analysis
89 svmPoly 0.938 0.907 2.510 Support Vector Machines with Polynomial Kernel
91 svmRadialCost 0.934 0.900 1.300 Support Vector Machines with Radial Basis Function Kernel
90 svmRadial 0.931 0.896 1.430 Support Vector Machines with Radial Basis Function Kernel
92 svmRadialSigma 0.931 0.896 1.630 Support Vector Machines with Radial Basis Function Kernel
93 svmRadialWeights 0.931 0.896 1.390 Support Vector Machines with Class Weights
43 lssvmRadial 0.928 0.891 3.040 Least Squares Support Vector Machine with Radial Basis Function Kernel
64 protoclass 0.924 0.885 1.250 Greedy Prototype Selection
35 kknn 0.919 0.878 1.240 k-Nearest Neighbors
19 elm 0.909 0.862 1.230 Extreme Learning Machine
24 gaussprRadial 0.906 0.859 2.950 Gaussian Process with Radial Basis Function Kernel
55 pam 0.904 0.855 1.120 Nearest Shrunken Centroids
61 PenalizedLDA 0.904 0.855 1.170 Penalized Linear Discriminant Analysis
51 nb 0.904 0.855 1.110 Naive Bayes
16 dda 0.890 0.835 2.980 Diagonal Discriminant Analysis
82 slda 0.838 0.755 1.270 Stabilized Linear Discriminant Analysis
84 spls 0.809 0.713 1.530 Sparse Partial Least Squares
34 kernelpls 0.800 0.699 0.930 Partial Least Squares
63 pls 0.800 0.699 0.980 Partial Least Squares
81 simpls 0.800 0.699 0.990 Partial Least Squares
98 widekernelpls 0.800 0.699 1.100 Partial Least Squares
46 Mlda 0.739 0.537 1.060 Maximum Uncertainty Linear Discriminant Analysis
71 RFlda 0.739 0.537 1.170 Factor-Based Linear Discriminant Analysis
83 sparseLDA 0.672 0.509 1.420 Sparse Linear Discriminant Analysis
4 bayesglm 0.671 0.504 1.030 Bayesian Generalized Linear Model
8 bstSm 0.671 0.504 1.840 Boosted Smoothing Spline
22 gam 0.671 0.504 2.660 Generalized Additive Model using Splines
23 gamLoess 0.671 0.504 1.040 Generalized Additive Model using LOESS
62 plr 0.671 0.504 1.080 Penalized Logistic Regression
73 rocc 0.671 0.504 1.550 ROC-Based Classifier
80 sdwd 0.671 0.504 1.980 Sparse Distance Weighted Discrimination
7 BstLm 0.587 0.383 1.970 Boosted Linear Model
17 dnn 0.326 0.000 2.340 Stacked AutoEncoder Deep Neural Network
28 gpls 1.000 0.000 2.110 Generalized Partial Least Squares
hofnerb commented 8 years ago

Thanks, Tobias, for your updated bug report. I can now recreate your error, yet I do not easily see where it comes from.

Thus, I tried to get iris classification working in mboost directly:

blackboost(Species ~ Sepal.Length + Sepal.Width + Petal.Length, 
           data = iris, family = Binomial())
  Error in family@check_y(y) : 
  response is not a factor at two levels but ‘family = Binomial()’ 

Similarly, if I try family = PropOdds(), I get another error:

 Error in family@check_y(y) : response must be an ordered factor

So my main issue is that I do not know HOW (i.e., with which code and which options) caret calls mboost. There is also no matrix interface for blackboost, only a formula interface. Hence, by default, without manipulating the data, the model cannot be fitted.
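
For comparison, the same call does fit once the response actually has two levels, which is what Binomial() demands (a minimal sketch; the iris subset via droplevels() is my own illustration, not from the thread):

library(mboost)
## Binomial() requires a factor with exactly two levels,
## so drop one of the three iris species first
iris2 <- droplevels(subset(iris, Species != "setosa"))
mod2 <- blackboost(Species ~ Sepal.Length + Sepal.Width + Petal.Length,
                   data = iris2, family = Binomial())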

If you (wrongly) treat the outcome as continuous and fit a linear model, it works:

set.seed(1234)
## fit model
mod <- blackboost(as.numeric(Species) ~ Sepal.Length + Sepal.Width + Petal.Length, 
                  data = iris)
## cross-validate tuning parameter
cvr <- cvrisk(mod)
## optimal number of boosting steps
mstop(cvr)
[1] 42
## I didn't rig this; a nice answer ;)

## now set your model to the optimal mstop value
mstop(mod) <- mstop(cvr)

## plot the predictions
plot(iris$Species, fitted(mod))

[plot: fitted values of mod by iris species]

Although I consider the idea of a single interface very attractive, I think that you lose a lot of fine control. E.g., the user needs to prespecify a maximum mstop value for mboost methods; by default, 100 is used, yet this might be a very poor choice, and I do not know what caret uses here. If chosen too low, this can greatly hamper boosting models.
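
In plain mboost, the upper limit can be raised via boost_control() and the model then trimmed back after cross-validation (a sketch; the value 1000 is an arbitrary choice of mine):

## fit with a generous upper bound instead of the default mstop = 100
mod <- blackboost(as.numeric(Species) ~ Sepal.Length + Sepal.Width + Petal.Length,
                  data = iris, control = boost_control(mstop = 1000))
cvr <- cvrisk(mod)        # cross-validate the number of boosting iterations
mstop(mod) <- mstop(cvr)  # reduce the model to the optimal mstop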

Furthermore, the description of the boosting tuning parameters at http://topepo.github.io/caret/Boosting.html is not correct. There is no such thing as prune, neither for gamboost nor for glmboost, but there are other (less important) tuning parameters.

Finally, mboost is mainly about fitting linear, additive, and other structured models; blackboost is just there to complete the range of models. It is not the main focus or strength of the package.
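
For context, a minimal sketch of that core use case (my own illustration, not from the thread), a boosted linear model via glmboost():

library(mboost)
## boosted linear model with intrinsic variable selection
mod <- glmboost(Sepal.Length ~ Sepal.Width + Petal.Length + Petal.Width,
                data = iris)
coef(mod)  # only the selected covariates get non-zero coefficients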

tobigithub commented 8 years ago

Hi, it's an intrinsic R problem, and surely the error handling of R code sucks. Cheers, Tobias

mvkorpel commented 8 years ago

@hofnerb The following is an example of how to crash R with plain mboost, without caret.

> library(mboost)
Loading required package: parallel
Loading required package: stabs
This is mboost 2.6-0. See ‘package?mboost’ and ‘news(package  = "mboost")’
for a complete list of changes.

> blackboost(Species ~ Sepal.Length + Sepal.Width + Petal.Length, data = iris, family = Multinomial())

 *** caught segfault ***
address 0xe10000a6, cause 'memory not mapped'

An alternative error message I get is:

 *** caught segfault ***
address (nil), cause 'unknown'

The previous errors were obtained on Linux using recent development and patched versions of R. With an older R version (3.0.2):

> blackboost(Species ~ Sepal.Length + Sepal.Width + Petal.Length, data = iris, family = Multinomial())
Error in R_modify_response(c(0.666666666666667, 0.666666666666667, 0.666666666666667,  : 
  REAL() can only be applied to a 'numeric', not a 'bytecode'
> blackboost(Species ~ Sepal.Length + Sepal.Width + Petal.Length, data = iris, family = Multinomial())
*** Error in `/usr/lib/R/bin/exec/R': free(): invalid next size (normal): 0x00000000091fe380 ***
Aborted

I did some digging with print statements and valgrind. It appears that, in this example, the C function R_modify_response() in the party package tries to write 300 numeric values into an array that can only hold 150, which makes a mess of the R internals. This happens in each of the four for loops of the function. Note that 150 is the number of samples in the iris data set.
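
The factor of two matches the class structure of the example (my own back-of-the-envelope reading, not verified against the party sources): Multinomial() works on a response expanded over the non-reference classes, so with 3 species the expanded length is 150 * (3 - 1) = 300, while the party response slots are sized for the original 150 observations:

nrow(iris)                                 # 150 samples
nlevels(iris$Species)                      # 3 classes
nrow(iris) * (nlevels(iris$Species) - 1)   # 300 values written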

As mboost calls internal functions of party via party::party_intern() (as opposed to only using the high-level interfaces), I dare not say where the root of the problem is.

hofnerb commented 8 years ago

Without being a real expert on Multinomial() or on how exactly the party functions are implemented, I think the error is that you used family = Multinomial() with something other than gamboost() (or, equivalently, the mboost() function). From the manual (?Multinomial) you can see that there is some extra work to do to fit multinomial models:

### Multinomial logit model via a linear array model
## One needs to convert the data to a list
myiris <- as.list(iris)
## ... and define a dummy vector with one factor level less
## than the outcome, which is used as reference category.
myiris$class <- factor(levels(iris$Species)[-nlevels(iris$Species)])
## Now fit the linear array model
mlm <- mboost(Species ~ bols(Sepal.Length, df = 2) %O%
                        bols(class, df = 2, contrasts.arg = "contr.dummy"),
              data = myiris,
              family = Multinomial())
coef(mlm) ## one should use more boosting iterations.

This model cannot be properly represented in blackboost() or glmboost(). Hence, we should actually check whether Multinomial() was used within gamboost() or mboost() with the correct model formula. However, this might be non-trivial.
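
A minimal sketch of such a check (entirely hypothetical on my part; it assumes the boost_family class exposes a name slot, and a real fix would also have to inspect the model formula, which is the non-trivial part):

## hypothetical guard for blackboost()/glmboost(): reject Multinomial() early,
## comparing against Multinomial()'s own name slot so the label string does not matter
check_family <- function(family) {
    if (family@name == Multinomial()@name)
        stop("family = Multinomial() is only supported in mboost()/gamboost() ",
             "with a linear array model (%O%); see ?Multinomial")
}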

mvkorpel commented 8 years ago

As a low-level fix, I think the R function fitfun() (in R/btree.R) could check the lengths and types of y and the various slots of object@responses, which requires knowing how party::party_intern(..., fun = "R_modify_response") works. Technically, to avoid overwriting unintended parts of memory, it is only required that each of the other vectors is at least as long as y, but I think the intent is that they should all have the same length. The check would be something like the following code:

obr <- object@responses
## the vectors that party's R_modify_response() reads from / writes to
party_arrays <- list(y, obr@transformations[[1]], obr@test_trafo,
                     obr@predict_trafo, obr@variables[[1]])
## all of them must be numeric and share one common length, otherwise bail out
if (!all(vapply(party_arrays, is.numeric, FALSE)) ||
    length(unique(vapply(party_arrays, length, 0))) != 1) {
    stop("some error message")
}
hofnerb commented 8 years ago

Again, boosted trees are not acceptable here! Hence there is no need to fiddle around with party. One needs a specially prepared data set, and it is absolutely required to use the linear array model via the %O% notation.

mvkorpel commented 8 years ago

In my opinion, that's not "fiddling" with party but being prepared for assumptions made in the "internal" (but exported) functions of party, called by mboost code. When using such obviously dangerous functions, one should take every precaution. Quote from ?party_intern:

This function must not be called under any circumstances.

I do realize that my example call to blackboost() contains an unexpected or undesired family = Multinomial(). That's also what the caret example ends up using. The response to a user or another package doing something inappropriate or unexpected on the R side (as opposed to compiled code) should be a helpful error message, not a crash. If the crash can be solved by other means than what you refer to as fiddling or a "don't do that", then great.

I think the bottom line is that party_intern() is a loaded gun and mboost is being careless with it.

tobigithub commented 8 years ago

Very interesting. The structure of the input data (character, vector, data frame, integer, nested factors) can also contribute to a number of issues. Thanks for all the contributions. Tobias

hofnerb commented 8 years ago

Dear @mvkorpel, thank you for your comments. Yet, as said earlier, we have to throw an error in case a user uses family Multinomial() in an unsupported function.

Using party is absolutely OK. It is also OK to use the party_intern function; it was only created to keep mboost working.

Btw. we have a similar function mboost_intern to make a package of one of our collegues work: https://github.com/boost-R/mboost/blob/master/R/mboost_intern.R

So, mboost is by no means being careless; mboost is the only package supposed to use this function, and the usage is as intended! For now, please let's stop this discussion and move over to fixing issue #46, which is the actual problem.

Btw., is caret really using Multinomial() in the above example? I asked this above, but @tobigithub didn't answer the question. How does caret take care of all the options, and how do I know what it assumes as defaults?

mvkorpel commented 8 years ago

@hofnerb Ok, I think we can agree to disagree (a bit) on the carelessness. Particularly, I agree it's fine to share some of the internals between related packages, and that it's best to stop as early as possible when an inappropriate combination of arguments is used (but it may be non-trivial as you wrote). Meanwhile, if you wish, or until something better comes up, you can view my suggested check code as an "if all else fails" safety device.

Here is a debug log showing that caret uses Multinomial in the example.

hofnerb commented 8 years ago

Thanks a lot. This is good to know. I'll have a look at your patch; I think it might be worthwhile to apply it to prevent the above issue, as you said.

@tobigithub How is caret determining all options? Is there a documentation (especially regarding caret with mboost)?

tobigithub commented 8 years ago

Hi @hofnerb, I think the parameters are modified or passed on (S4/S3), but I am not really an expert. See the example routine for caret's blackboost wrapper.

That would be better asked in the official caret issue tracker (https://github.com/topepo/caret/issues); they are very helpful and knowledgeable.

Some of the other 220 wrapper functions can be found here: 220 caret wrapper functions. Tobias

mvkorpel commented 8 years ago

The caret vignette, available online or via vignette("caret"), points to the main online documentation of the package. The structure of the information caret stores about each learning method is documented here, under the heading "Model Components". For example, this is the blackboost fitting function that uses Multinomial(), in caret version 6.0-71:

> getModelInfo("blackboost", regex = FALSE)[[1]][["fit"]]
function(x, y, wts, param, lev, last, classProbs, ...) {
  theDots <- list(...)

  if(any(names(theDots) == "tree_controls")) {
    theDots$tree_controls$maxdepth <- param$maxdepth
    treeCtl <- theDots$tree_controls
    theDots$tree_controls <- NULL
  } else treeCtl <- ctree_control(maxdepth = param$maxdepth)

  if(any(names(theDots) == "control")) {
    theDots$control$mstop <- param$mstop
    ctl <- theDots$control
    theDots$control <- NULL
  } else ctl <- boost_control(mstop = param$mstop)

  ## this is the relevant branch: a factor response with more than
  ## two levels is given family = Multinomial() by default
  if(!any(names(theDots) == "family")) {
    if(is.factor(y)) {
      theDots$family <- if(length(lev) == 2) Binomial() else Multinomial()
    } else theDots$family <- GaussReg()
  }

  ## pass in any model weights
  if(!is.null(wts)) theDots$weights <- wts

  modelArgs <- c(list(formula = as.formula(".outcome ~ ."),
                      data = if(!is.data.frame(x)) as.data.frame(x) else x,
                      control = ctl,
                      tree_controls = treeCtl),
                 theDots)
  modelArgs$data$.outcome <- y

  out <- do.call("blackboost", modelArgs)
  out$call["data"] <- "data"
  out
}

The model information for blackboost has remained the same since caret 6.0-64, released on January 6, 2016. It seems that prior versions did not use Multinomial().

This README explains how the model code files (.R) are processed into the combined models.RData file shipped with the caret package. The contents of this file can be accessed with the getModelInfo() convenience function, as above.
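
The other components caret stores for a method can be listed the same way (a usage sketch; component names follow the "Model Components" documentation mentioned above):

> library(caret)
> info <- getModelInfo("blackboost", regex = FALSE)[[1]]
> names(info)     # e.g. "label", "library", "parameters", "grid", "fit", "predict", ...
> info$parameters # the tuning parameters: mstop and maxdepth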