Pima Indians Diabetes Data Set Classification Tree
Sys.time()
library(rpart) library(rpart.plot) library(caret)
if(!require('mlbench')) { install.packages('mlbench') library('mlbench') }
data(PimaIndiansDiabetes2) #Cleaned data set from R-Project.org summary(PimaIndiansDiabetes2) #Summary of data set
Diabetes.df <- PimaIndiansDiabetes2 Diabetesdf2 <- na.omit(Diabetes.df) #Remove NAs to fix issues with models head(Diabetesdf2) summary(Diabetesdf2) summary(Diabetesdf2$diabetes) #Number of participants without/with diabetes Sys.Date()
install.packages("corrplot") library(corrplot) install.packages("Hmisc") library("Hmisc") Sys.Date() Diabetes.cor <- round(cor(Diabetesdf2[1:8]),1) Diabetes.cor
set.seed(1) #To reproduce the same output of simulation studies
train.index <- sample(c(1:dim(Diabetesdf2)[1]), dim(Diabetesdf2)[1]0.6) #Sample data frame test.index <- sample(c(1:dim(Diabetesdf2)[1]), dim(Diabetes.df)[1]0.4) #Sample data frame
require(caTools) set.seed(3) sample = sample.split(Diabetesdf2$diabetes, SplitRatio = 0.75) train= subset(Diabetesdf2, sample==TRUE) test= subset(Diabetesdf2, sample== FALSE)
train.df <- Diabetesdf2[train.index,]
test.df <- Diabetesdf2[test.index,]
Train.Model <- glm(train$diabetes ~ ., family = binomial(link = "logit"), data = train) Sys.Date() summary(Train.Model) plot(Train.Model)
Sys.Date() Test.model <- glm(test$diabetes ~ ., family = binomial(link = "logit"), data = test) summary(Test.model) View(Test.model)
library(ggplot2) plot(Test.Model)
Train.pred <- predict(Train.Model, type = "link") Sys.Date() summary(Train.pred) Test.pred <- predict(Test.Model, type = "link") summary(Test.pred)
install.packages("ROCR") library("ROCR") Train.ROCR = prediction(train$diabetes, Diabetesdf2) Train.perf = performance(Train.ROCR, "tpr", "fpr")
table(test$diabetes, Test.pred > 0.5)
library(rpart) library(rpart.plot)
Diabetes.Tree.Train <- rpart(train$diabetes ~., data = train, method = "class", minbucket = 10) prp(Diabetes.Tree.Train)##view tree
Diabetes.Tree <- rpart(test$diabetes ~ ., data = test, method = "class", minbucket = 5) Diabetes.Tree prp(Diabetes.Tree)## View tree
Train.Tree.pred <- predict(Diabetes.Tree.Train, newdata = train, type = "class") table(Train.Tree.pred)
Sys.Date() Test.Tree.pred <- predict(Diabetes.Tree, newdata = test, type = "class")
table(Test.Tree.pred)
Sys.Date()
table(train$diabetes, Train.Tree.pred)
sensitivity.Train <- round(72/(26+72),2)
specificity.Train <- round(178/(178+18),2)
sprintf("Sensitivity at 0.7 threshold: %s", sensitivity.Train)
sprintf("Specificity at 0.7 threshold: %s", specificity.Train)
table(test$diabetes, Test.Tree.pred)
sensitivity.Test <- round(24/(24+8),2)
specificity.Test <- round(59/(59+7),2)
sprintf("Sensitivity at 0.7 threshold: %s", sensitivity.Test)
sprintf("Specificity at 0.7 threshold: %s", specificity.Test)