## An introduction on how to use SVM in R (two-class classification only)
## This script studies SVMs using the 'e1071' package
## ===== Create data ======================
# Simulate 20 two-dimensional observations: the first 10 are labelled -1,
# the last 10 are labelled 1.
set.seed(1)
x <- matrix(rnorm(20 * 2), ncol = 2)
y <- c(rep(-1, 10), rep(1, 10))
# Shift the class-1 rows by +1 in both coordinates. Both sides use explicit
# row indexing (note the trailing comma); indexing the matrix as a plain
# vector on the right-hand side only works through logical recycling.
x[y == 1, ] <- x[y == 1, ] + 1
# 3 - y maps class -1 to colour index 4 and class 1 to colour index 2
plot(x, col = (3 - y))
# in order to perform classification rather than regression,
# the response must be a factor
data <- data.frame(x = x, y = as.factor(y))
## ===== Fitting model =====================
# Fit a linear-kernel SVM at three cost values (10, 1 and 0.1) so the effect
# of the cost parameter can be compared.
# the lower the cost is, the more support vectors we have
library(e1071)
svmfit1 <- svm(y ~ ., data = data, kernel = "linear",
               cost = 10, scale = FALSE)
svmfit2 <- svm(y ~ ., data = data, kernel = "linear",
               cost = 1, scale = FALSE)
svmfit3 <- svm(y ~ ., data = data, kernel = "linear",
               cost = 0.1, scale = FALSE)
# the support vectors are denoted by 'X', others are denoted by 'O'
# (support-vector counts below are those recorded in the original notes)
plot(svmfit1, data) # when cost = 10, there are 7 support vectors
plot(svmfit2, data) # when cost = 1, there are 10 support vectors
plot(svmfit3, data) # when cost = .1, there are 16 support vectors
## ===== Tuning =============================
# tuning is cross-validation for svm: tune() refits the model for every
# candidate cost in `ranges` and compares the resulting errors
set.seed(1)
tune.out <- tune(
  svm,
  y ~ .,
  data = data,
  kernel = "linear",
  ranges = list(cost = c(0.001, 0.01, 0.1, 1, 5, 10, 100))
)
# the original notes report that the best cost is 0.1
summary(tune.out)
# the tuning result keeps the best model for us
bestmod <- tune.out[["best.model"]]
## ===== Prediction ==========================
# Build a test set of 20 observations with randomly drawn labels, shifted
# the same way as the training data (class 1 moved by +1).
xtest <- matrix(rnorm(40), nrow = 20, ncol = 2)
ytest <- sample(c(1, -1), size = 20, replace = TRUE)
xtest[ytest == 1, ] <- xtest[ytest == 1, ] + 1
test <- data.frame(x = xtest, y = factor(ytest))
# Predict with the tuned model and cross-tabulate predictions against truth
pred <- predict(bestmod, newdata = test)
xtabs(~ pred + ytest)
## ===== Other types of kernel: radial =============
# a radial kernel needs kernel = "radial" plus a value for gamma;
# the cost value still has to be set as well
set.seed(1)
x <- matrix(rnorm(400), nrow = 200, ncol = 2)
# Row-wise offsets: rows 1-100 move by +2, rows 101-150 by -2, rows 151-200
# stay put. The length-200 vector is recycled down each column, so both
# coordinates of a row receive the same shift.
x <- x + rep(c(2, -2, 0), times = c(100, 50, 50))
# the first 150 rows form class 1, the last 50 rows form class 2
y <- rep(c(1, 2), times = c(150, 50))
data <- data.frame(x = x, y = factor(y))
plot(x, col = y)
# Hold out half of the 200 observations: `train` indexes the training rows
# and data[-train, ] is kept for testing.
# (misclassification counts quoted below are those recorded in the original
# notes; they depend on the random split drawn here)
# setting cost = 1, 7 training observations are misclassified
train <- sample(200, 100)
svmfit <- svm(y ~ ., data = data[train, ], kernel = "radial",
              gamma = 1, cost = 1)
plot(svmfit, data[train, ])
pred <- predict(svmfit, data[train, ])
table(pred = pred, true = y[train])
# increasing the cost fits the training data more closely, but tends to
# cause overfitting
# setting cost = 1e5, only 1 training observation is misclassified
# the decision boundary becomes more irregular
# dev.new() opens a fresh graphics device on any platform, so the previous
# plot is kept (quartz() is only available on macOS)
dev.new()
svmfit <- svm(y ~ ., data = data[train, ], kernel = "radial",
              gamma = 1, cost = 1e5)
plot(svmfit, data[train, ])
pred <- predict(svmfit, data[train, ])
table(pred = pred, true = y[train])
## we need cross-validation (tuning) to choose cost and gamma
tune.out <- tune(svm, y ~ ., data = data[train, ], kernel = "radial",
                 ranges = list(cost = seq(0.1, 2, by = 0.1),
                               gamma = seq(0.05, 0.5, by = 0.05)))
# about 13% of the held-out test observations are misclassified
pred <- predict(tune.out$best.model, data[-train, ])
table(pred = pred, true = y[-train])
## An introduction on how to use SVM in R (two-class classification only)