# Stuff for Course 5 (Reproducible Research) in the Data Science Specialization at Coursera.
# From a week-1 lecture, "Structure of a Data Analysis (Part 2)":
# Numeric copy of the outcome: the integer codes of `type`, shifted down by one.
# Presumably `type` is a two-level factor (nonspam/spam), which would make this
# a 0/1 response -- confirm against where trainSpam is built.
trainSpam[["numType"]] <- as.numeric(trainSpam[["type"]]) - 1

# Cost passed to cv.glm: counts the cases where the observed response `x`
# disagrees with the prediction `y` thresholded at 0.5.
costFunction <- function(x, y) {
  sum((y > 0.5) != x)
}

library(boot) # Provides cv.glm

# One cross-validated error per candidate predictor (the first 55 columns).
cvError <- rep(NA, 55)
for (i in seq_along(cvError)) {
  # Single-predictor logistic regression: numType ~ <i-th column of trainSpam>.
  lmFormula <- reformulate(names(trainSpam)[i], response = "numType")
  glmFit <- glm(formula = lmFormula, family = "binomial", data = trainSpam)
  # 2-fold cross-validation; the second element of `delta` is stored.
  cvError[i] <- cv.glm(
    data = trainSpam,
    glmfit = glmFit,
    cost = costFunction,
    K = 2
  )$delta[2]
}

### Which predictor has minimum cross-validated error?
names(trainSpam)[which.min(cvError)]
# Fit the final single-predictor logistic regression on the training data.
predictionModel <- glm(
  numType ~ charDollar,
  family = "binomial",
  data = trainSpam
)

# Predictions for the held-out rows. predict() on a glm defaults to the link
# scale, so for this binomial (logit) fit these are log-odds, not probabilities.
predictionTest <- predict(predictionModel, testSpam)

# Label every test row "nonspam", then flip those whose predicted probability
# exceeds 0.5. The classification must come from the test-set predictions, not
# from predictionModel$fitted: those are training-set fitted values, with one
# entry per training row, and do not line up with the rows of testSpam.
# plogis() is the inverse logit, mapping the log-odds to probabilities.
predictedSpam <- rep("nonspam", nrow(testSpam))
predictedSpam[plogis(predictionTest) > 0.5] <- "spam"

# Confusion table: predicted label (rows) against actual type (columns).
table(predictedSpam, testSpam$type)