Closed VeilleData closed 8 years ago
Hi again,
I tried to dive a bit deeper and I can't figure out why glm_stacking fails while gbm_stacking works. The code is the same, but predict fails to identify glmnet1 for the first, and works for the latter.
Maybe that is weird enough to be a hint. :)
@VeilleData Does this error still occur if you update to caretEnsemble 2.0.0 from CRAN?
Hi.
I just tried and, as expected, it works when I give "normal" names, like glmnet, and fails for custom names, like glmnet1.
Here is a printscreen with the version of the package :
The "model_list_big" example from "caretEnsemble-intro" does not work: ` model_list_big <- caretList( Class~., data=training, trControl=my_control, metric="ROC", methodList=c("glm", "rpart"), tuneList=list( rf1=caretModelSpec(method="rf", tuneGrid=data.frame(.mtry=2)), rf2=caretModelSpec(method="rf", tuneGrid=data.frame(.mtry=10), preProcess="pca"), nn=caretModelSpec(method="nnet", tuneLength=2, trace=FALSE) ) )
predict(model_list_big) rf rf.1 nnet glm rpart [1,] 0.182 0.204 0.487500666 5.829182e-11 0.2205882 [2,] 0.188 0.226 0.244237704 2.220446e-16 0.3333333 [3,] 0.234 0.182 0.637914031 8.098000e-11 0.3333333 [4,] 0.138 0.112 0.100435956 2.467826e-11 0.3333333 [5,] 0.224 0.148 0.193155423 2.220446e-16 0.2205882 [6,] 0.164 0.180 0.555640618 9.425793e-11 0.2205882 [7,] 0.104 0.166 0.286350538 2.220446e-16 0.2205882 [8,] 0.036 0.070 0.008511706 2.220446e-16 0.2205882 [9,] 0.124 0.184 0.042814486 2.220446e-16 0.2205882 [10,] 0.118 0.192 0.155571034 2.220446e-16 0.2205882 [11,] 0.186 0.112 0.019707441 2.909450e-11 0.2205882 [12,] 0.154 0.184 0.217652347 6.502066e-11 0.2205882 [13,] 0.236 0.138 0.312549933 2.220446e-16 0.9117647 [14,] 0.214 0.276 0.244357454 4.165557e-11 0.2205882 [15,] 0.074 0.062 0.003261455 2.220446e-16 0.2205882 [16,] 0.210 0.250 0.291384933 2.441685e-10 0.9117647 .............
predict(model_list_big, newdata=testing) rf rf.1 nnet glm rpart [1,] 0.474 0.362 0.308897984 1.000000e+00 0.2205882 [2,] 0.456 0.456 0.658783678 1.000000e+00 0.9117647 [3,] 0.590 0.478 0.990755566 1.000000e+00 0.9117647 [4,] 0.466 0.370 0.007519741 9.999325e-01 0.9117647 [5,] 0.592 0.544 0.985319403 1.000000e+00 0.3333333 [6,] 0.238 0.296 0.001028135 2.345760e-01 0.2205882 [7,] 0.482 0.438 0.615022370 2.220446e-16 0.9117647 [8,] 0.178 0.180 0.041048553 2.220446e-16 0.2205882 [9,] 0.160 0.152 0.014696072 2.220446e-16 0.2205882 ..........
greedy_ensemble <- caretEnsemble( _model_list_big, _ metric="ROC", trControl=trainControl( number=2, summaryFunction=twoClassSummary, classProbs=TRUE ))
predict(greedy_ensemble, newdata=testing, type="prob") Ошибка в eval(expr, envir, enclos) : объект 'rf1' не найден (English: Error in eval(expr, envir, enclos): object 'rf1' not found) `
Full script to reproduce:
# Install the latest dev version of caretEnsemble
devtools::install_github('zachmayer/caretEnsemble')
# Load libraries
library("caretEnsemble")
library("caret")
library("mlbench")
library("pROC")
library("randomForest")
library("nnet")
library("arm")
# Prepare the Sonar data: a 75/25 stratified train/test split, plus a
# bootstrap resampling control object shared by every model in the list.
data(Sonar)
set.seed(107)
inTrain <- createDataPartition(y = Sonar$Class, p = 0.75, list = FALSE)
training <- Sonar[inTrain, ]
testing <- Sonar[-inTrain, ]
# savePredictions = "final" and a fixed index are both required so that
# caretEnsemble/caretStack can reuse the out-of-fold predictions later.
my_control <- trainControl(
  method = "boot",
  number = 25,
  savePredictions = "final",
  classProbs = TRUE,
  index = createResample(training$Class, 25),
  summaryFunction = twoClassSummary
)
# Train the model list: two models via methodList plus three via tuneList.
# The tuneList entries carry custom names (rf1, rf2, nn) that differ from
# their caret method names -- these custom names are what trigger the bug.
model_list_big <- caretList(
  Class ~ .,
  data = training,
  trControl = my_control,
  metric = "ROC",
  methodList = c("glm", "rpart"),
  tuneList = list(
    rf1 = caretModelSpec(method = "rf", tuneGrid = data.frame(.mtry = 2)),
    rf2 = caretModelSpec(method = "rf", tuneGrid = data.frame(.mtry = 10), preProcess = "pca"),
    nn = caretModelSpec(method = "nnet", tuneLength = 2, trace = FALSE)
  )
)
# Combine the model list into a greedy (linear) ensemble, then predict on
# the full Sonar data -- predict() is where the 'object not found' error
# surfaces when custom model names are used.
greedy_ensemble <- caretEnsemble(
  model_list_big,
  metric = "ROC",
  trControl = trainControl(
    number = 2,
    summaryFunction = twoClassSummary,
    classProbs = TRUE
  )
)
summary(greedy_ensemble)
p <- predict(greedy_ensemble, Sonar)
# caretStack: stack the same model list under a bayesglm meta-model and
# predict. (The original comment said "caretList"; this call is caretStack.)
bayes_ensemble <- caretStack(
  model_list_big,
  method = "bayesglm",
  metric = "ROC",
  trControl = trainControl(
    number = 2,
    summaryFunction = twoClassSummary,
    classProbs = TRUE
  )
)
summary(bayes_ensemble)
p <- predict(bayes_ensemble, Sonar)
I can confirm this is now affecting my models as well.
Should be fixed by #198
Confirmed fixed. Thanks @eric-czech
Hi,
It looks like caretList > tuneList has an unexpected behaviour when given names that are not the name of the method used. For example, if I train a glmnet named glmnet1. The model is trained and works, but when I stack the models with caretStack and proceed to predict, an error is raised saying it can't find "glmnet1".
Here is a sample code that raises the issue. Sorry, I could not make it smaller.
`
Load data (iris)
require(data.table) data(iris) iris <- data.table(iris) iris[, Species := factor(ifelse(Species == "setosa", "SETOSA", sample(c("SETOSA", "OTHERS"), 200, replace = T)))]
require(caret) set.seed(42) trainIndex <- createDataPartition(iris$Species, p = .8, list=F)
train <- iris[trainIndex] test <- iris[-trainIndex]
Create individual models
my_control <- trainControl( method="cv", number=3, index=createResample(train$Species, 3), savePredictions="final", classProbs=TRUE, summaryFunction=twoClassSummary )
pp.cs <- c("center", "scale")
model_list <- caretList( Species~., data=train, trControl=my_control, metric="ROC", tuneList=list( glmnet=caretModelSpec(method="glmnet", preProcess=pp.cs, tuneGrid=expand.grid(.alpha = c(0.9), .lambda = c(0.00005))), gbm=caretModelSpec(method="gbm", preProcess=pp.cs,
tuneGrid=expand.grid(.interaction.depth = c(2), .n.trees = c(300), .shrinkage = c(0.01), .n.minobsinnode = c(1))), rf=caretModelSpec(method="rf", preProcess=pp.cs, tuneGrid= expand.grid(.mtry = c(round(sqrt(ncol(train)))))) ) )
Ensemble
greedy_ensemble <- caretEnsemble( model_list, metric="ROC", trControl=trainControl( number=2, summaryFunction=twoClassSummary, classProbs=TRUE )) summary(greedy_ensemble)
Stacking
glm_ensemble <- caretStack( model_list, method="glm", metric="ROC", trControl=trainControl( method="cv", number=3, savePredictions="final", classProbs=TRUE, summaryFunction=twoClassSummary ) )
gbm_ensemble <- caretStack( model_list, method="gbm", metric="ROC", trControl=trainControl( method="cv", number=3, savePredictions="final", classProbs=TRUE, summaryFunction=twoClassSummary ) )
Predictions
library("caTools")
Predictions for "normal" models
model_preds <- lapply(model_list, predict, newdata=test, type="prob") model_preds <- lapply(model_preds, function(x) x[,"SETOSA"]) model_preds <- data.frame(model_preds)
Predictions for greedy ensemble
ens_preds <- predict(greedy_ensemble, newdata=test, type="prob") model_preds$ensemble <- ens_preds
Predictions for glm and gbm ensemble
model_preds$glm_stacking <- predict(glm_ensemble, newdata=test, type="prob") CF <- coef(glm_ensemble$ens_model$finalModel)[-1]
model_preds$gbm_stacking <- predict(gbm_ensemble, newdata=test, type="prob") CF <- coef(gbm_ensemble$ens_model$finalModel)[-1]
Compare all
caTools::colAUC(model_preds, test$Species)