# 17th Nov 2022
# 06:03 am
# DIGIT RECOGNIZER #
# Session setup ----
# The workspace is deliberately NOT cleared: rm(list = ls()) would delete
# whatever the caller already had in their global environment, and every
# object this script uses is (re)assigned before it is read.
set.seed(1001) # fixed seed so the random steps in this script are repeatable
# NOTE(review): hard-coded, machine-specific working directory. Relative paths
# used later in the script are resolved against it; prefer launching the
# script from the project root and dropping this call once that is confirmed.
setwd("~/Assignments/Job 1")
library(data.table)
library(FNN)
library(caret)
# Load data:
# fread(data.table = FALSE) returns a plain data.frame, so a single as.matrix()
# is enough (no intermediate data.table -> data.frame copy).
data <- as.matrix(fread("data/train.csv", data.table = FALSE))
# Keep every 10th row (rows 10, 20, 30, ...). Filtering on the row index rather
# than seq(10, n, by = 10) also works when the file has fewer than 10 rows, and
# drop = FALSE keeps `data` a matrix even if only one row survives.
row_ids <- seq_len(nrow(data))
data <- data[row_ids %% 10 == 0, , drop = FALSE]
# Name of the response column in the CSV.
y_label <- "label"
## DATA PRE-PROCESSING:
# 01. Removing columns with zero-variance:
# Constant columns have a standard deviation of exactly 0; scale() below would
# divide by that 0 and turn them into NaN columns, so they are removed first.
zero_var_cols <- apply(data, 2, sd) == 0
data <- data[, !zero_var_cols, drop = FALSE]
# 02. Use PCA to reduce dimension:
# Predictors are selected by name so the response is excluded wherever it sits
# in the matrix (previously column 1 was assumed while y was looked up by name).
feature_cols <- colnames(data) != y_label
x <- data[, feature_cols, drop = FALSE]
y <- as.factor(data[, y_label])
pca_data <- scale(x) # centre each predictor and scale it to unit variance
pca <- prcomp(pca_data)
# Cumulative share of the total variance explained by the first k components.
cumpro <- cumsum(pca$sdev^2 / sum(pca$sdev^2))
n_plot <- min(200, length(cumpro)) # do not index past the available components
plot(
  cumpro[seq_len(n_plot)],
  xlab = "Principal components",
  ylab = "Amount of explained variance",
  main = "Cumulative variance plot"
)
# Keep the top 100 components (fewer if the data has fewer columns).
# The scores come from pca$x, i.e. the *scaled* data projected onto the
# rotation. Multiplying the unscaled `x` by a rotation fitted on scaled data,
# as was done before, does not yield the principal-component scores.
n_components <- min(100, ncol(pca$rotation))
pca_x <- pca$x[, seq_len(n_components), drop = FALSE]
# 03. Splitting the train data into 70:30
# Draw, without replacement, the row indices that form the training partition;
# every row not drawn goes to the test partition.
prop <- 0.7
n_obs <- nrow(pca_x)
idx <- sample.int(n = n_obs, size = ceiling(prop * n_obs), replace = FALSE)
# Training partition: sampled rows and their labels.
train.x <- pca_x[idx, ]
train.y <- y[idx]
# Test partition: the complement of the sampled rows.
test.x <- pca_x[-idx, ]
test.y <- y[-idx]
###########################
## NAIVE BAYES ESTIMATOR ##
###########################
# Fit a naive Bayes classifier through caret using 10-fold cross-validation,
# then predict on both partitions. The timer spans training plus both
# predict() calls.
start <- proc.time()
model_nb <- train(
  x = train.x, y = train.y,
  method = "nb",
  trControl = trainControl(method = "cv", number = 10)
)
pred_test <- predict(model_nb, test.x)
pred_train <- predict(model_nb, train.x)
end <- proc.time()
time_nb <- end - start
# confusionMatrix() expects predictions as `data` and the ground truth as
# `reference`. Passing them the other way round leaves Accuracy unchanged but
# transposes the table, so per-class statistics end up mislabelled.
train_acc_nb <- confusionMatrix(data = pred_train, reference = train.y)
test_acc_nb <- confusionMatrix(data = pred_test, reference = test.y)
###########################
## K-NEAREST NEIGHBOUR ####
###########################
# Fit a k-nearest-neighbour classifier through caret using 10-fold
# cross-validation; tuneLength = 20 asks caret to try 20 candidate settings of
# the tuning parameter. The timer spans training plus both predict() calls.
start <- proc.time()
model_knn <- train(
  x = train.x, y = train.y,
  method = "knn",
  trControl = trainControl(method = "cv", number = 10),
  tuneLength = 20
)
pred_train <- predict(model_knn, train.x)
pred_test <- predict(model_knn, test.x)
end <- proc.time()
time_knn <- end - start
# confusionMatrix() expects predictions as `data` and the ground truth as
# `reference`. Passing them the other way round leaves Accuracy unchanged but
# transposes the table, so per-class statistics end up mislabelled.
train_acc_knn <- confusionMatrix(data = pred_train, reference = train.y)
test_acc_knn <- confusionMatrix(data = pred_test, reference = test.y)
###########################
## SUPPORT VECTOR MACHINE #
###########################
# NOTE(review): caret's "svmLinear" method appears to be backed by kernlab
# rather than e1071 -- confirm before removing this library() call.
library(e1071)
# Fit a linear-kernel SVM through caret using 10-fold cross-validation, then
# predict on both partitions. The timer spans training plus both predict()
# calls.
start <- proc.time()
model_svm <- train(
  x = train.x, y = train.y,
  method = "svmLinear",
  trControl = trainControl(method = "cv", number = 10)
)
pred_test <- predict(model_svm, test.x)
pred_train <- predict(model_svm, train.x)
end <- proc.time()
time_svm <- end - start
# confusionMatrix() expects predictions as `data` and the ground truth as
# `reference`. Passing them the other way round leaves Accuracy unchanged but
# transposes the table, so per-class statistics end up mislabelled.
train_acc_svm <- confusionMatrix(data = pred_train, reference = train.y)
test_acc_svm <- confusionMatrix(data = pred_test, reference = test.y)
##########################
## COMPARISON ##
##########################
# Values are printed explicitly: bare top-level expressions are only
# auto-printed interactively or under Rscript, and would be silently discarded
# when this file is run with source().
# Training-set accuracy per model.
print(train_acc_nb$overall["Accuracy"])
print(train_acc_knn$overall["Accuracy"])
print(train_acc_svm$overall["Accuracy"])
# Held-out (test-set) accuracy per model.
print(test_acc_nb$overall["Accuracy"])
print(test_acc_knn$overall["Accuracy"])
print(test_acc_svm$overall["Accuracy"])
# Elapsed time per model (training plus prediction on both partitions).
print(time_nb)
print(time_knn)
print(time_svm)