bank.df <- read.csv("UniversalBank.csv", header = TRUE)
dim(bank.df)
head(bank.df)
# Remove the ID and ZIP Code columns; they are identifiers, not predictors
bank.df <- bank.df[ , -c(1, 5)]
head(bank.df)
set.seed(12345)
# Reorder the variables so that the response (Personal.Loan) comes last
bank.df <- bank.df[ , c(1:7, 9:12, 8)]
head(bank.df)
# Look at the new column order; t(t(...)) prints the names as a single column
t(t(names(bank.df)))
# Check for missing values
sum(is.na(bank.df))
# Partition the data: 60% training, 40% validation
train.index <- sample(row.names(bank.df), 0.6 * dim(bank.df)[1])
valid.index <- setdiff(row.names(bank.df), train.index)
train.df <- bank.df[train.index, ]
valid.df <- bank.df[valid.index, ]
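# Optional sanity check (not in the original script): confirm the split sizes
# and that the response rate is similar in both partitions; column 12 is the
# response after the reordering above
dim(train.df)
dim(valid.df)
prop.table(table(train.df[, 12]))
prop.table(table(valid.df[, 12]))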
# New customer to classify; the column names must match the predictor names
# in bank.df (Securities.Account and CD.Account, as read.csv() names them)
new.df <- data.frame(Age = 40, Experience = 10, Income = 84, Family = 2,
                     CCAvg = 2, Education = 2, Mortgage = 0,
                     Securities.Account = 0, CD.Account = 0, Online = 1,
                     CreditCard = 1)
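# Quick illustrative check: every predictor name should appear in new.df,
# so setdiff() should return character(0)
setdiff(names(bank.df)[1:11], names(new.df))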
# Initialize the normalized training, validation, and full data frames
# as copies of the originals
train.norm.df<- train.df
valid.norm.df<- valid.df
bank.norm.df<- bank.df
library(caret)  # caret attaches its dependencies lattice and ggplot2
# Use preProcess() from the caret package to normalize the predictors;
# the centering and scaling parameters are estimated from the training data only
norm.values <- preProcess(train.df[, 1:11], method = c("center", "scale"))
# Normalize only the 11 predictors; leave the response column (12) untouched
train.norm.df[, 1:11] <- predict(norm.values, train.df[, 1:11])
valid.norm.df[, 1:11] <- predict(norm.values, valid.df[, 1:11])
bank.norm.df[, 1:11] <- predict(norm.values, bank.df[, 1:11])
new.norm.df <- predict(norm.values, new.df)
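# Illustrative check (not in the original script): after centering and
# scaling, each normalized training predictor should have mean ~0 and
# standard deviation ~1
round(colMeans(train.norm.df[, 1:11]), 3)
round(apply(train.norm.df[, 1:11], 2, sd), 3)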
# Compute validation accuracy for the odd values of k from 1 to 19
accuracy.df <- data.frame(k = seq(1, 19, 2), accuracy = rep(0, 10))
library(FNN)
for (i in 1:10) {
  # Classify the validation records using the 11 normalized predictors
  knn.pred <- knn(train.norm.df[, 1:11], valid.norm.df[, 1:11],
                  cl = train.norm.df[, 12], k = accuracy.df$k[i])
  # Record the overall accuracy from the confusion matrix
  accuracy.df[i, 2] <- confusionMatrix(factor(knn.pred),
                                       factor(valid.norm.df[, 12]))$overall[1]
}
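# Optional (not in the original script): display the full accuracy table and
# plot accuracy against k for a quick visual comparison
accuracy.df
plot(accuracy.df$k, accuracy.df$accuracy, type = "b",
     xlab = "k", ylab = "Validation accuracy")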
# Find the value of k with the highest validation accuracy
accuracy.df$k[which.max(accuracy.df$accuracy)]
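# A minimal sketch (not in the original script): classify the new customer
# with the best k found above. With prob = TRUE, knn() attaches the
# proportion of neighbors voting for the winning class as an attribute.
best.k <- accuracy.df$k[which.max(accuracy.df$accuracy)]
new.pred <- knn(train.norm.df[, 1:11], new.norm.df,
                cl = train.norm.df[, 12], k = best.k, prob = TRUE)
new.pred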