# 20th Sep 2022
# 06:03 am

# NOTE: the workspace is deliberately NOT cleared with rm(list = ls()) here.
# That call would delete every object in the caller's global environment when
# this script is source()d, yet it does not give a clean session (attached
# packages, options and graphics settings all survive). Run the script in a
# fresh R session (e.g. `Rscript`, or restart R) for a reproducible start.

#install.packages("AER")

library(AER)

library(ggplot2)

library(dplyr)

# Load the CreditCard data set and keep only its first eight columns.
# Column 1 is the one excluded from the summary below; the later model
# code uses `card` as the response and the remaining columns as predictors.
data(CreditCard)
CreditCard <- CreditCard[, seq_len(8)]

# Part (a): summary statistics for every retained column except the first.
summary(CreditCard[-1])

# Part (b): keep only the cases whose age is strictly greater than 18.
is_adult <- CreditCard$age > 18
CreditCard <- CreditCard[is_adult, ]

# Shared palette for parts (c) and (d). Naming the values pins each colour to
# a card status, so "yes" is blue and "no" is red in BOTH plots. The earlier
# positional `col = c('blue', 'red')` on the box-plot followed group order
# instead, which could disagree with the scatter plot's colours.
card_colours <- c(yes = "blue", no = "red")

# Part (c): Plot income vs reports, coloured by card acceptance status.
# Mapping `colour = card` (rather than stacking one hard-coded layer per
# status) lets ggplot2 draw a legend explaining what each colour means.
ggplot(CreditCard, aes(x = income, y = reports, colour = card)) +
  geom_point(alpha = 0.4) +
  scale_colour_manual(values = card_colours)

# Part (d): Box-plot of income as a function of card acceptance status.
# `card` is on the y axis, so the boxes are horizontal. The legend is
# suppressed because the axis already labels each box.
ggplot(CreditCard, aes(x = income, y = card, colour = card)) +
  geom_boxplot(show.legend = FALSE) +
  scale_colour_manual(values = card_colours)

# Part (e): Histogram of each predictor (every column except the first).
cols <- colnames(CreditCard)[-1]

# Draw the histograms on a 2 x 4 grid. par() returns the previous settings,
# which are restored afterwards so later base-graphics plots are not silently
# forced onto the same grid.
old_par <- par(mfrow = c(2, 4))

# A plain loop is used because hist() is called only for its side effect;
# sapply() would collect and auto-print every returned histogram object.
for (predictor in cols) {
  # as.numeric() makes non-numeric columns plottable; any factor column is
  # therefore shown by its integer level codes.
  hist(as.numeric(CreditCard[[predictor]]), xlab = predictor, main = "")
}

par(old_par)

# Transform predictors to take care of skewness:
# `share` is replaced by its natural log.
CreditCard[["share"]] <- log(CreditCard[["share"]])

# `reports` is shifted by one before taking the log, so a value of 0 maps to 0
# instead of -Inf.
CreditCard[["reports"]] <- log(CreditCard[["reports"]] + 1)

# Part (f): Fit a logistic regression of `card` on all remaining columns.
fit <- glm(card ~ ., data = CreditCard, family = binomial(link = "logit"))

summary(fit)

# Part (g): Compute the confusion matrix on the data the model was fitted to.
# type = "response" returns fitted probabilities; for a factor response glm()
# models the probability of the non-first level, so this assumes `card` has
# levels c("no", "yes") - TODO confirm with levels(CreditCard$card).
pred_prob <- predict(fit, newdata = CreditCard[, -1], type = "response")

# Classify at a 0.5 threshold. The labels are stored as a factor with both
# levels so table() always returns a full 2 x 2 matrix, even if one class is
# never predicted (a character vector would silently drop that column).
pred_acceptance <- factor(
  ifelse(pred_prob > 0.5, "yes", "no"),
  levels = c("no", "yes")
)

# Rows are the observed status, columns the predicted status.
table(actual = CreditCard$card, predicted = pred_acceptance)

# Overall correct prediction = (295 + 996) / (295 + 996 + 21) = 98%

# False positive cases: 0

# False negative cases: 21

# Part (h): Fit the model on a training set and compute the confusion matrix
# on a held-out test set.
# The first `n_train` rows are used for training and all remaining rows for
# testing.
# NOTE(review): the split follows row order rather than a random sample -
# confirm the rows are not sorted in a way that biases the evaluation.
n_train <- 1000

# Guard against a data set too small to leave any test rows; the previous
# `1001:nrow(CreditCard)` would silently count backwards in that case.
stopifnot(nrow(CreditCard) > n_train)

train_rows <- seq_len(n_train)
train_data <- CreditCard[train_rows, ]
test_data <- CreditCard[-train_rows, ]

fit <- glm(card ~ ., data = train_data, family = binomial(link = "logit"))

# Predicted probabilities for the unseen test rows (response column dropped).
pred_prob <- predict(fit, newdata = test_data[, -1], type = "response")

# Classify at a 0.5 threshold. Both levels are kept so the confusion matrix
# is always 2 x 2, even if one class is never predicted.
pred_acceptance <- factor(
  ifelse(pred_prob > 0.5, "yes", "no"),
  levels = c("no", "yes")
)

# Rows are the observed status, columns the predicted status.
table(actual = test_data$card, predicted = pred_acceptance)

# Overall correct prediction = (74 + 232) / (74 + 6 + 232) = 98%

# False positive cases: 0

# False negative cases: 6