- 15th Jul 2022
- 06:03 am
R Programming Homework Solution on Data Pre-Processing and Exploratory Data Analysis
# Importing the packages required.
library(readr)
library(dplyr)
library(ggplot2)
library(gridExtra)
library(lattice)
library(DataExplorer)
library(grDevices)
library(factoextra)
library(caret)
library(rpart)
library(rpart.plot)
library(randomForest)
library(ranger)
library(Metrics)
library(ROCit)
library(kableExtra)
library(GoodmanKruskal)
# Reading data into the environment
df <- read.csv("Coronaryheartriskstudy.csv")
head(df)
# Creating report to understand the data
create_report(df)
# descriptive statistics
mystats=function(x){
if(class(x)=="numeric"){
Var_Type=class(x)
n<-length(x)
nmiss<-sum(is.na(x))
mean<-mean(x,na.rm=T)
std<-sd(x,na.rm=T)
var<-var(x,na.rm=T)
min<-min(x,na.rm=T)
p1<-quantile(x,0.01,na.rm=T)
p5<-quantile(x,0.05,na.rm=T)
p10<-quantile(x,0.1,na.rm=T)
q1<-quantile(x,0.25,na.rm=T)
q2<-quantile(x,0.5,na.rm=T)
q3<-quantile(x,0.75,na.rm=T)
p90<-quantile(x,0.9,na.rm=T)
p95<-quantile(x,0.95,na.rm=T)
p99<-quantile(x,0.99,na.rm=T)
max<-max(x,na.rm=T)
UC1=mean(x,na.rm=T)+3*sd(x,na.rm=T)
LC1=mean(x,na.rm=T)-3*sd(x,na.rm=T)
UC2=quantile(x,0.99,na.rm=T)
LC2=quantile(x,0.01,na.rm=T)
iqr=IQR(x,na.rm=T)
UC3=q3+1.5*iqr
LC3=q1-1.5*iqr
ot1<-max>UC1 | min ot2<-max>UC2 | min ot3<-max>UC3 | min return(c(Var_Type=Var_Type, n=n,nmiss=nmiss,mean=mean,std=std,var=var,min=min,p1=p1,p5=p5,p10=p10,q1=q1,q2=q2,q3=q3,p90=p90,p95=p95,p99=p99,max=max,ot_m1=ot1,ot_m2=ot2,ot_m2=ot3))
}
else{
Var_Type=class(x)
n<-length(x)
nmiss<-sum(is.na(x))
fre<-table(x)
prop<-prop.table(table(x))
x[is.na(x)]<-x[which.max(prop.table(table(x)))]
return(c(Var_Type=Var_Type, n=n,nmiss=nmiss,freq=fre,proportion=prop))
}
}
#Vector of numaerical variables
num_var= sapply(df,is.numeric)
Other_var= !sapply(df,is.numeric)
#Applying above defined function on numerical variables
my_num_data<-t(data.frame(apply(df[num_var], 2, mystats)))
my_cat_data<-data.frame(t(apply(df[Other_var], 2, mystats)))
View(my_num_data)
View(my_cat_data)
#Missing Value Treatment
df[,num_var] <- apply(data.frame(df[,num_var]), 2, function(x){x <- replace(x, is.na(x), mean(x, na.rm=TRUE))})
df[,Other_var] <- apply(data.frame(df[,Other_var]), 2, function(x){x <- replace(x, is.na(x), which.max(prop.table(table(x))))})
# Outlier capping with p95 and p5.
numeric_vars = names(li)[sapply(df, FUN=is.numeric)]
outlier_treat <- function(x){
UC1 = quantile(x, p=0.95,na.rm=T)
LC1 = quantile(x, p=0.05,na.rm=T)
x[x>UC1]=UC1
x[x
return(x)
}
mydata_num_vars = data.frame(apply(df[numeric_vars], 2, FUN=outlier_treat))
# Univariate plots of the dataset
all_plots <- lapply(X = 1:ncol(df), function(x) ggplot()+
geom_bar(data = df,
aes(x = df[,x]), fill = "skyblue" ,
alpha = 0.6)+
theme_linedraw()+
xlab(colnames(df)[x])
)
#colnames(dataset)
marrangeGrob(grobs=all_plots, nrow=2, ncol=2)
# Bivariate plots of the dataset
ptlist_bi <- list()
for (var in colnames(df) ){
if (class(df[,var]) %in% c("factor","logical") ) {
###print(class(dataset[,var]))
###print("first if")
ptlist_bi[[var]] <- ggplot(data = df) +
geom_bar( aes_string(x = var, fill ="TenYearCHD" ), position = "fill") +
theme_linedraw() +
xlab(var)
} else if (class(df[,var]) %in% c("numeric","double","integer") ) {
###print(class(dataset[,var]))
# #print("second if")
ptlist_bi[[var]] <- ggplot(data = df) +
geom_boxplot(aes_string(y = var, x ="TenYearCHD" )) +
theme_linedraw() +
xlab("TenYearCHD") +
ylab(var)
}
}
#names(ptlist_bi)
marrangeGrob(grobs=ptlist_bi, nrow=2, ncol=2)
# have here, being male is directly related to TenYearCHD, thus the variable male seems a relatively good predictor.
# Similarly, Age seems a good predictor since the patients with TenYearCHD == TRUE, have higher median of age, with
# almost a similar distribution. In contrast, there seems no relation between different categories of the education
# and the response variable. The current Smoker variable shows a slight relation with the response variable, as the
# current smokers have a slightly higher risk of TenYearCHD. With the same manner we can investigate remaining plots.
# Association between qualitative variables
class_list <- lapply(X = 1:ncol(df), function(x) class(df[,x]))
names(class_list) <- colnames(df)
dataset_cat_variables <- subset(x = df, select = names(which(class_list=="logical"| class_list=="factor")))
#colnames(dataset_cat_variables)
dataset_cat_variables$education <- NULL
GKmatrix_dataset <- GKtauDataframe(dataset_cat_variables)
plot(GKmatrix_dataset)
# Modelling
# Load the library caTools
library(caTools)
# Randomly split the data into training and testing sets
set.seed(1000)
# One needs to put between 50% and 80% of data in the training set
split = sample.split(df$TenYearCHD, SplitRatio = 0.65)
# Split up the data using subset
train = subset(df, split == TRUE)
test = subset(df, split == FALSE)
# Logistic Regression Model
framinghamLog = glm(TenYearCHD ~ ., data = train, family = binomial)
summary(framinghamLog)
# Predictions on the test set
predictTest = predict(framinghamLog, type = "response", newdata = test)
# Confusion matrix with threshold of 0.5
ConfMat <- table(test$TenYearCHD, predictTest > 0.5)
ConfMat
# Accuracy
(ConfMat[1, 1] + ConfMat[2, 2])/(ConfMat[1, 1] + ConfMat[1, 2] + ConfMat[2,
1] + ConfMat[2, 2])
# Baseline accuracy
(1069 + 6)/(1069 + 6 + 187 + 11)
library(ROCR)
ROCRpred = prediction(predictTest, test$TenYearCHD)
as.numeric(performance(ROCRpred, "auc")@y.values)