bank.df <- read.csv("UniversalBank.csv", header = TRUE)
dim(bank.df)
head(bank.df)
# Remove the ID and ZIP Code columns; they are identifiers, not predictors
bank.df <- bank.df[ , -c(1, 5)]
head(bank.df)
set.seed(12345)
# Reorder the variables so that the response (Personal.Loan) comes last
bank.df <- bank.df[ , c(1:7, 9:12, 8)]
head(bank.df)
# Look at the new column order; t(t(...)) prints the names as a single column
t(t(names(bank.df)))
# Check for missing values
sum(is.na(bank.df))
# Partition the data: 60% training, 40% validation
train.index <- sample(row.names(bank.df), 0.6 * dim(bank.df)[1])
valid.index <- setdiff(row.names(bank.df), train.index)
train.df <- bank.df[train.index, ]
valid.df <- bank.df[valid.index, ]
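# Optional sanity check (not in the original script): confirm the split sizes
# and that the response rate is similar in both partitions; column 12 is the
# response after the reordering above
dim(train.df)
dim(valid.df)
prop.table(table(train.df[, 12]))
prop.table(table(valid.df[, 12]))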
# New customer to classify; the column names must match the predictor names
# in bank.df (Securities.Account and CD.Account, as read.csv() names them)
new.df <- data.frame(Age = 40, Experience = 10, Income = 84, Family = 2,
                     CCAvg = 2, Education = 2, Mortgage = 0,
                     Securities.Account = 0, CD.Account = 0, Online = 1,
                     CreditCard = 1)
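# Quick illustrative check: every predictor name should appear in new.df,
# so setdiff() should return character(0)
setdiff(names(bank.df)[1:11], names(new.df))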
# Initialize the normalized training, validation, and full data frames
# as copies of the originals
train.norm.df<- train.df
valid.norm.df<- valid.df
bank.norm.df<- bank.df
library(caret)  # caret attaches its dependencies lattice and ggplot2
# Use preProcess() from the caret package to normalize the predictors;
# the centering and scaling parameters are estimated from the training data only
norm.values <- preProcess(train.df[, 1:11], method = c("center", "scale"))
# Normalize only the 11 predictors; leave the response column (12) untouched
train.norm.df[, 1:11] <- predict(norm.values, train.df[, 1:11])
valid.norm.df[, 1:11] <- predict(norm.values, valid.df[, 1:11])
bank.norm.df[, 1:11] <- predict(norm.values, bank.df[, 1:11])
new.norm.df <- predict(norm.values, new.df)
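# Illustrative check (not in the original script): after centering and
# scaling, each normalized training predictor should have mean ~0 and
# standard deviation ~1
round(colMeans(train.norm.df[, 1:11]), 3)
round(apply(train.norm.df[, 1:11], 2, sd), 3)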
# Compute validation accuracy for the odd values of k from 1 to 19
accuracy.df <- data.frame(k = seq(1, 19, 2), accuracy = rep(0, 10))
library(FNN)
for (i in 1:10) {
  # Classify the validation records using the 11 normalized predictors
  knn.pred <- knn(train.norm.df[, 1:11], valid.norm.df[, 1:11],
                  cl = train.norm.df[, 12], k = accuracy.df$k[i])
  # Record the overall accuracy from the confusion matrix
  accuracy.df[i, 2] <- confusionMatrix(factor(knn.pred),
                                       factor(valid.norm.df[, 12]))$overall[1]
}
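# Optional (not in the original script): display the full accuracy table and
# plot accuracy against k for a quick visual comparison
accuracy.df
plot(accuracy.df$k, accuracy.df$accuracy, type = "b",
     xlab = "k", ylab = "Validation accuracy")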
# Find the value of k with the highest validation accuracy
accuracy.df$k[which.max(accuracy.df$accuracy)]
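# A minimal sketch (not in the original script): classify the new customer
# with the best k found above. With prob = TRUE, knn() attaches the
# proportion of neighbors voting for the winning class as an attribute.
best.k <- accuracy.df$k[which.max(accuracy.df$accuracy)]
new.pred <- knn(train.norm.df[, 1:11], new.norm.df,
                cl = train.norm.df[, 12], k = best.k, prob = TRUE)
new.pred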