ecpolley / SuperLearner

Current version of the SuperLearner R package
272 stars 72 forks source link

NNloglik and CV.SuperLearner returning wrong SL predictions #109

Open ecpolley opened 6 years ago

ecpolley commented 6 years ago

The combination of `CV.SuperLearner` with the `NNloglik` method appears to be shuffling the `SL.predict` values in the output. Example below:

library(SuperLearner)
# Fix the RNG state so the simulated data are reproducible.
set.seed(23432)

# Training set ----
# n observations of p standard-normal predictors, stored as columns X1..Xp.
n <- 50
p <- 10
X <- data.frame(
  matrix(
    rnorm(n * p),
    nrow = n,
    ncol = p,
    dimnames = list(NULL, paste0("X", seq_len(p)))
  )
)

# Continuous outcome: a nonlinear function of X1, X2 and X3 plus N(0, 1) noise.
Y <- X[, 1] +
  5 * sqrt(abs(X[, 2] * X[, 3])) +
  as.numeric(X[, 2] > 1) -
  as.numeric(X[, 3] < -1) +
  2 * as.numeric(X[, 3] > 0) +
  rnorm(n)

## Continuous outcome, method.NNLS ----
## Define the learner library, then cross-validate the Super Learner with
## V = n outer folds and V = n - 1 inner folds.
SL.library <- c("SL.glm", "SL.gam", "SL.glmnet", "SL.lm")
test <- CV.SuperLearner(
  Y = Y,
  X = X,
  SL.library = SL.library,
  method = "method.NNLS",
  cvControl = list(V = n),
  innerCvControl = list(list(V = n - 1))
)
summary(test)
coef(test)

## Compare the ensemble predictions against the library predictions
## (column 2 is presumably the second learner listed in SL.library).
cor(test$SL.predict, test$library.predict)
cbind(test$SL.predict, test$library.predict[, 2])
plot(test$SL.predict, test$library.predict[, 2])

## Binary outcome, method.NNLS ----
## Dichotomise Y at 5 and repeat the cross-validated fit with a binomial family.
Y2 <- as.numeric(Y > 5)
SL.library <- c("SL.glm", "SL.randomForest", "SL.glmnet")
test <- CV.SuperLearner(
  Y = Y2,
  X = X,
  SL.library = SL.library,
  method = "method.NNLS",
  family = binomial(),
  cvControl = list(V = n),
  innerCvControl = list(list(V = n - 1))
)
summary(test)
coef(test)

## Same diagnostics as for the continuous outcome.
cor(test$SL.predict, test$library.predict)
cbind(test$SL.predict, test$library.predict[, 2])
plot(test$SL.predict, test$library.predict[, 2])

## Binary outcome, method.NNloglik ----
## Same data, library and CV settings as the binomial NNLS run; only the
## method argument differs. This is the configuration the issue is about.
Y2 <- as.numeric(Y > 5)
SL.library <- c("SL.glm", "SL.randomForest", "SL.glmnet")
test <- CV.SuperLearner(
  Y = Y2,
  X = X,
  SL.library = SL.library,
  method = "method.NNloglik",
  family = binomial(),
  cvControl = list(V = n),
  innerCvControl = list(list(V = n - 1))
)
summary(test)
coef(test)

cor(test$SL.predict, test$library.predict)
# Open question: why did the rows of SL.predict get scrambled here?
cbind(test$SL.predict, test$library.predict[, 2])
plot(test$SL.predict, test$library.predict[, 2])

# Call SuperLearner() directly (no outer CV wrapper), method.NNLS ----
test <- SuperLearner(
  Y = Y2,
  X = X,
  SL.library = SL.library,
  method = "method.NNLS",
  family = binomial(),
  cvControl = list(V = n)
)
summary(test)
coef(test)

# Compare ensemble predictions with the library predictions.
cor(test$SL.predict, test$library.predict)
cbind(test$SL.predict, test$library.predict[, 2])

# Call SuperLearner() directly with method.NNloglik, for comparison with the
# CV.SuperLearner() output for the same method.
test <- SuperLearner(
  Y = Y2,
  X = X,
  SL.library = SL.library,
  method = "method.NNloglik",
  family = binomial(),
  cvControl = list(V = n)
)
summary(test)
coef(test)

# Compare ensemble predictions with the library predictions.
cor(test$SL.predict, test$library.predict)
cbind(test$SL.predict, test$library.predict[, 2])