# R Programming Assignment Solution on Logistic Regression
#
# 20th Sep 2022
# 06:03 am

# Setup ----
# The workspace is deliberately not cleared with rm(list = ls()): that call
# would delete unrelated objects of anyone who sources this script. Restart
# the R session if a clean environment is required.
#install.packages("AER")
library(AER)
library(ggplot2)
library(dplyr)

# read data:
# Load the CreditCard data set (made available by the packages attached above)
# and keep only its first eight columns; column 1 is later used as the
# response and the remaining seven as predictors.
data(CreditCard)
CreditCard <- CreditCard[, 1:8]

# Part(a): Summary of predictors:
# The first column is dropped so that only the predictor columns are summarised.
summary(CreditCard[-1])

# Part(b): subset cases where age>18
# Keep only the rows whose age is strictly greater than 18.
CreditCard <- CreditCard[with(CreditCard, age > 18), ]

# Part(c): Plot income vs reports:
# Rows with card == 'yes' are drawn in blue and all other rows in red. The
# red layer is added second, so it is drawn on top of the blue one. Both
# layers share the same x/y mapping, which is declared once on the plot.
idx <- CreditCard$card == "yes"
ggplot(mapping = aes(x = income, y = reports)) +
  geom_point(data = CreditCard[idx, ], col = "blue", alpha = 0.4) +
  geom_point(data = CreditCard[!idx, ], col = "red", alpha = 0.4)

# Part(d): Box-plot of income as a function of card acceptance status:
# Colour is mapped from `card` through a named palette instead of being passed
# as a positional vector of length two. The positional form only works when
# exactly two groups are present and ties the colours to the factor level
# order. The palette follows the scatter plot of Part(c): yes = blue, no = red.
# The legend is suppressed because the y axis already labels the two groups.
ggplot(CreditCard, aes(x = income, y = card, colour = card)) +
  geom_boxplot(show.legend = FALSE) +
  scale_colour_manual(values = c(yes = "blue", no = "red"))

# Part(e): Histogram of predictors:
# Draw one histogram per predictor (every column except the first) on a
# 2 x 4 grid of base-graphics panels.
cols <- colnames(CreditCard)[-1]

# par() returns the previous settings, which are restored afterwards so that
# later base plots are not forced onto the 2 x 4 layout.
old_par <- par(mfrow = c(2, 4))

# hist() is called for its side effect only; invisible(lapply()) avoids
# auto-printing the returned histogram objects to the console.
# as.numeric() lets non-numeric columns be plotted too (a factor is plotted
# through its integer codes).
invisible(lapply(cols, function(col) {
  hist(as.numeric(CreditCard[[col]]), xlab = col, main = "")
}))

par(old_par)

# Transforming predictors to take care of skewness:
# share is replaced by its natural log and reports by log(reports + 1)
# (the + 1 presumably keeps zero report counts finite -- confirm).
CreditCard <- transform(
  CreditCard,
  share = log(share),
  reports = log(reports + 1)
)

# Part(f): Fit glm on predictors
# Logistic regression of card on every other column of CreditCard.
fit <- glm(card ~ ., data = CreditCard, family = binomial(link = logit))
summary(fit)

# Part(g): Compute confusion matrix
# In-sample predicted probabilities, classified with a 0.5 cut-off.
# Assumes the fitted probability refers to the 'yes' class, i.e. that 'yes'
# is the second level of card -- TODO confirm.
pred_prob <- predict(fit, newdata = CreditCard[, -1], type = "response")
pred_acceptance <- ifelse(pred_prob > 0.5, "yes", "no")

# The predictions are tabulated as a factor with both classes listed, so the
# table keeps a column for each class even when every prediction falls into
# one of them. Dimensions are named to make rows/columns unambiguous.
table(
  actual = CreditCard$card,
  predicted = factor(pred_acceptance, levels = c("no", "yes"))
)
# Overall correct prediction = (295 + 996) / (295 + 996 + 21) = 98%
# False positive cases: 0
# False negative cases: 21
# NOTE(review): zero false positives may mean that a predictor (e.g. share)
# almost perfectly separates the classes -- check the glm() warnings.

# Part(h): Fit model on train and confusion matrix on test:
# The first 1000 rows form the training set and all remaining rows the test
# set. A logical mask is used instead of 1001:nrow(CreditCard), because that
# sequence counts backwards when the data has 1000 rows or fewer and would
# then select a training row plus a non-existent row as "test" data.
is_train <- seq_len(nrow(CreditCard)) <= 1000
train_data <- CreditCard[is_train, ]
test_data <- CreditCard[!is_train, ]

# Note: this overwrites the full-data model stored in `fit` by Part(f).
fit <- glm(card ~ ., data = train_data, family = binomial(link = logit))

# Out-of-sample predicted probabilities, classified with a 0.5 cut-off.
pred_prob <- predict(fit, newdata = test_data[, -1], type = "response")
pred_acceptance <- ifelse(pred_prob > 0.5, "yes", "no")

# Both classes are listed as levels so the table keeps a column for each
# class even if one of them is never predicted on the test set.
table(
  actual = test_data$card,
  predicted = factor(pred_acceptance, levels = c("no", "yes"))
)
# Overall correct prediction = (74 + 232) / (74 + 6 + 232) = 98%
# False positive cases: 0
# False negative cases: 6