## Statistical Learning in R

Statistical learning is a branch of machine learning in which computers find patterns without explicit programming, and it has a wide array of applications, from speech recognition to sports analytics. In particular, supervised statistical learning is useful for regression and classification problems, in which we build a model from training data and then test the model on new data. In this short course, we will explore the theory behind classification and also look at which R packages help us perform statistical learning. While a wide variety of methods exist, we will focus on the k-nearest neighbors algorithm and support vector machines.

Presenters: Katie Zagnoli and Ian Laga

Location: Ketchum IB87
Time: Monday, April 10, 5:00 - 7:00 PM

Register for course here

## Two R files for this course: iris.classification.R and SVM.R

iris.classification.R

# Look at data (part of R) ##############################################################
data(iris)
summary(iris)

# Investigate a few 2D plots ############################################################
# Sepal dimensions, one color per species (plot() maps the factor to palette colors 1:3).
plot(iris$Sepal.Length, iris$Sepal.Width, col = iris$Species, pch = 19,
     xlab = 'sepal length', ylab = 'sepal width', main = 'Iris species')
par(xpd = TRUE)  # allow the legend to be drawn outside the plot region
# One legend entry per species; colors 1:3 match those used by plot() above.
legend(7, 5, unique(iris$Species), col = 1:length(unique(iris$Species)), pch = 19)

# Petal dimensions separate the species much more cleanly than the sepal plot.
plot(iris$Petal.Length, iris$Petal.Width, col = iris$Species, pch = 19,
     xlab = 'petal length', ylab = 'petal width', main = 'Iris species')
par(xpd = TRUE)
legend(5.5, 1, unique(iris$Species), col = 1:length(unique(iris$Species)), pch = 19)

# KNN ###################################################################################
# install.packages("class")
library(class)

# Normalize the data? Consider the range for each predictor:
summary(iris)
# Standardizing the predictors ensures that no single feature dominates the
# Euclidean distances used by KNN.
ir.nm <- as.data.frame(lapply(iris[1:4], scale))
summary(ir.nm)

# Divide data into Train and Test (50 test rows, 100 training rows).
set.seed(8)
test.ind <- sample(seq_len(nrow(ir.nm)), 50)
ir.test <- ir.nm[test.ind, ]               # test/train sets only contain predictor variables
ir.train <- ir.nm[-test.ind, ]
test.lab <- iris$Species[test.ind]         # store correct species labels for testing and training sets
train.lab <- iris$Species[-test.ind]
summary(test.lab)

# KNN: classify each test point by majority vote among its k nearest training
# points (see ?knn for details).
iris.pred <- knn(train = ir.train, test = ir.test, cl = train.lab, k = 3)

# Contingency Table / Confusion Matrix  #################################################
table(test.lab, iris.pred)
# 2 misclassifications for set.seed(8), k=3

# Calculate Accuracy: proportion of test points whose predicted label is correct.
mean(iris.pred == test.lab)

# Test Accuracy for different k values
acc <- numeric(15)  # preallocate instead of growing the vector inside the loop
for (i in 1:15) {
  iris.pred <- knn(train = ir.train, test = ir.test, cl = train.lab, k = i)
  acc[i] <- mean(iris.pred == test.lab) * 100
}
plot(acc, xlab = 'K', ylab = 'accuracy [%]',
     main = 'Accuracy for KNN on Iris Species', pch = 19, col = 2)
#########################################################################################

# How does K affect the decision boundary? ##############################################
# Based on code by Professor Will Kleiber

# Synthetic data: two overlapping 2D Gaussian classes (40 points of class 0,
# 60 points of class 1).
set.seed(4)
V0.0 <- rnorm(40, mean = 1, sd = 0.5)
V0.1 <- rnorm(60, mean = 0, sd = 0.5)
V1.0 <- rnorm(40, mean = -1, sd = 1)
V1.1 <- rnorm(60, mean = 0, sd = 1)
dat <- data.frame(class = rep(c(0, 1), times = c(40, 60)), V0 = c(V0.0, V0.1), V1 = c(V1.0, V1.1))
plot(V1 ~ V0, data = dat, asp = 1)   # set aspect ratio to 1 because V0 and V1 have different ranges
points(V1 ~ V0, data = dat, subset = dat$class == 1, col = "red", pch = 19)
points(V1 ~ V0, data = dat, subset = dat$class == 0, col = "blue", pch = 19)
par(xpd = TRUE)
legend(2, 3.3, c('class 0', 'class 1'), pch = c(19, 19), col = c('blue', 'red'))

# Locations at which to make predictions: a regular 60 x 60 grid covering the data.
predict.grid <- as.matrix(expand.grid(seq(-2, 3, length.out = 60), seq(-3.5, 2, length.out = 60)))
dim(predict.grid)
points(predict.grid, col = "green", pch = 19, cex = 0.1)

# Plot decision boundary for different values of k: (Hit Enter to see plots)
vec <- c(1, 2, 3, 4, 5, 6, 7, 10, 18, 27, 38, 50, 100)
for (k in vec) {
  plot(V1 ~ V0, data = dat, asp = 1, main = paste("# of nearest neighbors =", k))
  points(V1 ~ V0, data = dat, subset = dat$class == 1, col = "red", pch = 19)
  points(V1 ~ V0, data = dat, subset = dat$class == 0, col = "blue", pch = 19)

  # Classify every grid point with KNN; cl = dat[, 1] supplies the true
  # training labels. Coloring the grid by prediction reveals the boundary.
  out <- knn(train = dat[, 2:3], test = predict.grid, cl = dat[, 1], k = k)
  points(predict.grid[out == 0, ], col = "springgreen", pch = 19, cex = 0.6)
  points(predict.grid[out == 1, ], col = "yellow", pch = 19, cex = 0.6)
}

SVM.R

# Install once by hand, then just load: running install.packages() on every
# execution is slow and needs a network connection.
# install.packages("e1071")
library(e1071)
# install.packages("ROCR")
library(ROCR)

set.seed(52)
# Keep only versicolor and virginica (rows 51:150) with two predictors
# (Sepal.Length, Sepal.Width) plus the Species label -> a 2-class problem.
dat <- iris[51:150, c(1, 2, 5)]
dat$Species <- factor(dat$Species)  # drop the now-unused "setosa" level

plot(dat$Sepal.Length ~ dat$Sepal.Width, col = "white")  # blank frame to set the axes
points(dat[dat$Species == "versicolor", 2], dat[dat$Species == "versicolor", 1], col = "blue")
points(dat[dat$Species != "versicolor", 2], dat[dat$Species != "versicolor", 1], col = "red", pch = 19)

# 50/50 train/test split.
train <- sample(seq_len(nrow(dat)), size = 50, replace = FALSE)

dat.train <- dat[train, ]
dat.test <- dat[-train, ]

##
## SVM fit
##

## Support vector classifier (linear kernel), fixed cost.
fit.svc <- svm(Species ~ ., data = dat.train, kernel = "linear", cost = 1)
plot(fit.svc, data = dat.train)
# Tuning: cross-validate over a grid of cost values; probability = TRUE so the
# refit best model can emit class probabilities for the ROC curves later.
tune.out <- tune(svm, Species ~ ., data = dat.train, kernel = "linear",
                 ranges = list(cost = seq(0.01, 5, length.out = 100)),
                 probability = TRUE)
fit.svc <- tune.out$best.model

fit.svc
plot(fit.svc, data = dat.train)

## Support vector machine: polynomial kernel (degree 3).
fit.svmp <- svm(Species ~ ., data = dat.train, kernel = "polynomial", cost = 10, degree = 3)
plot(fit.svmp, data = dat.train)
# Tuning: cross-validate the cost parameter with the degree held fixed;
# probability = TRUE so the best model supports probability predictions (ROC).
tune.out <- tune(svm, Species ~ ., data = dat.train, kernel = "polynomial", degree = 3,
                 ranges = list(cost = seq(0.01, 5, length.out = 100)),
                 probability = TRUE)
fit.svmp <- tune.out$best.model

fit.svmp
plot(fit.svmp, data = dat.train)

## Support vector machine: radial (RBF) kernel.
# NOTE(review): the original text plotted fit.svmr before it was created and
# left a dangling ranges= fragment — the fit and tune lines were evidently lost;
# reconstructed here by analogy with the linear and polynomial sections.
fit.svmr <- svm(Species ~ ., data = dat.train, kernel = "radial", cost = 10)
plot(fit.svmr, data = dat.train)
# Tuning: cross-validate the cost parameter; probability = TRUE for the ROC plots.
tune.out <- tune(svm, Species ~ ., data = dat.train, kernel = "radial",
                 ranges = list(cost = seq(0.01, 5, length.out = 100)),
                 probability = TRUE)
fit.svmr <- tune.out$best.model

fit.svmr
plot(fit.svmr, data = dat.train)

##
## Testing set performance
##

# Linear support vector classifier: confusion matrix and accuracy.
tab <- table(true = dat.test[, "Species"], pred = predict(fit.svc, newdata = dat.test))
tab
(tab[1, 1] + tab[2, 2]) / dim(dat.test)[1]  # accuracy = correct / total

# Polynomial-kernel SVM
tab <- table(true = dat.test[, "Species"], pred = predict(fit.svmp, newdata = dat.test))
tab
(tab[1, 1] + tab[2, 2]) / dim(dat.test)[1]

# Radial-kernel SVM
tab <- table(true = dat.test[, "Species"], pred = predict(fit.svmr, newdata = dat.test))
tab
(tab[1, 1] + tab[2, 2]) / dim(dat.test)[1]

##############
## ROC Plots
##############

# Class probabilities on the test set for each model (the models were tuned
# with probability = TRUE, which the attr "probabilities" relies on).
probs.svc <- predict(fit.svc, dat.test, probability = TRUE)
pred.svc <- prediction(attributes(probs.svc)$probabilities[, 2], dat.test$Species)
perf.svc <- performance(pred.svc, "tpr", "fpr")

probs.svmp <- predict(fit.svmp, dat.test, probability = TRUE)
pred.svmp <- prediction(attributes(probs.svmp)$probabilities[, 2], dat.test$Species)
perf.svmp <- performance(pred.svmp, "tpr", "fpr")

probs.svmr <- predict(fit.svmr, dat.test, probability = TRUE)
pred.svmr <- prediction(attributes(probs.svmr)$probabilities[, 2], dat.test$Species)
perf.svmr <- performance(pred.svmr, "tpr", "fpr")

# Overlay all three ROC curves for comparison (the original computed
# perf.svmp and perf.svmr but never plotted them).
plot(perf.svc, col = "red")
plot(perf.svmp, col = "blue", add = TRUE)
plot(perf.svmr, col = "darkgreen", add = TRUE)
legend("bottomright", legend = c("linear", "polynomial", "radial"),
       col = c("red", "blue", "darkgreen"), lty = 1)

######################################################
######################################################

# NOTE(review): rm(list = ls()) wipes the whole workspace; tolerable in a
# classroom script to start the second example fresh, avoid in reusable code.
rm(list = ls())

# Synthetic two-class data: points inside a noisy unit circle are class 1.
set.seed(54)
x <- matrix(rnorm(500 * 2), ncol = 2)
# Vectorized label construction; rnorm(500) consumes the same random draws, in
# the same order, as the original element-wise loop calling rnorm(1) 500 times,
# so the result is identical for this seed.
y <- as.numeric(sqrt(x[, 1]^2 + x[, 2]^2) < 1 + rnorm(500, sd = 0.25))

plot(x[, 1], x[, 2], col = "white")  # blank frame to set the axes
points(x[y == 0, 1], x[y == 0, 2], col = "black")
points(x[y == 1, 1], x[y == 1, 2], col = "red", pch = 19)

dat <- data.frame(X = x, y = as.factor(y))

# 100 training points, 400 test points.
train <- sample(seq_len(nrow(dat)), size = 100, replace = FALSE)

dat.train <- dat[train, ]
dat.test <- dat[-train, ]

##
## SVM fit
##

## Support vector classifier (linear kernel) on the circular data -- a linear
## boundary cannot separate a circle, so expect poor performance here.
fit.svc <- svm(y ~ ., data = dat.train, kernel = "linear", cost = 1)
plot(fit.svc, data = dat.train)
# Tuning: cross-validate over a grid of cost values.
tune.out <- tune(svm, y ~ ., data = dat.train, kernel = "linear",
                 ranges = list(cost = seq(0.01, 5, length.out = 100)))
fit.svc <- tune.out$best.model

fit.svc
plot(fit.svc, data = dat.train)

## Support vector machine: polynomial kernel (degree 3).
fit.svmp <- svm(y ~ ., data = dat.train, kernel = "polynomial", cost = 10, degree = 3)
plot(fit.svmp, data = dat.train)
# Tuning: cross-validate the cost parameter with the degree held fixed.
tune.out <- tune(svm, y ~ ., data = dat.train, kernel = "polynomial", degree = 3,
                 ranges = list(cost = seq(0.01, 5, length.out = 100)))
fit.svmp <- tune.out$best.model

fit.svmp
plot(fit.svmp, data = dat.train)

## Support vector machine: radial (RBF) kernel -- well suited to the circular
## class boundary of this dataset.
# NOTE(review): the original text plotted fit.svmr before it was created and
# left a dangling ranges= fragment — the fit and tune lines were evidently lost;
# reconstructed here by analogy with the linear and polynomial sections.
fit.svmr <- svm(y ~ ., data = dat.train, kernel = "radial", cost = 10)
plot(fit.svmr, data = dat.train)
# Tuning: cross-validate the cost parameter.
tune.out <- tune(svm, y ~ ., data = dat.train, kernel = "radial",
                 ranges = list(cost = seq(0.01, 5, length.out = 100)))
fit.svmr <- tune.out$best.model

fit.svmr
plot(fit.svmr, data = dat.train)

##
## Testing set performance
##

# Linear support vector classifier: confusion matrix and accuracy.
tab <- table(true = dat.test[, "y"], pred = predict(fit.svc, newdata = dat.test))
tab
(tab[1, 1] + tab[2, 2]) / dim(dat.test)[1]  # accuracy = correct / total

# Polynomial-kernel SVM
tab <- table(true = dat.test[, "y"], pred = predict(fit.svmp, newdata = dat.test))
tab
(tab[1, 1] + tab[2, 2]) / dim(dat.test)[1]

# Radial-kernel SVM
tab <- table(true = dat.test[, "y"], pred = predict(fit.svmr, newdata = dat.test))
tab
(tab[1, 1] + tab[2, 2]) / dim(dat.test)[1]