R Programming Homework Solution on Data Pre-Processing and Exploratory Data Analysis

Data Pre-Processing and Exploratory Data Analysis

15th Jul 2022
06:03 am

R Programming Homework Solution on Data Pre-Processing and Exploratory Data Analysis

# Importing the packages required.
library(readr)
library(dplyr)
library(ggplot2)
library(gridExtra)
library(lattice)
library(DataExplorer)
library(grDevices)
library(factoextra)
library(caret)
library(rpart)
library(rpart.plot)
library(randomForest)
library(ranger)
library(Metrics)
library(ROCit)
library(kableExtra)
library(GoodmanKruskal)

# Reading data into the environment
df <- read.csv("Coronaryheartriskstudy.csv")
head(df)

# Creating report to understand the data
create_report(df)

# descriptive statistics

mystats=function(x){
  if(class(x)=="numeric"){
    Var_Type=class(x)
    n<-length(x)
    nmiss<-sum(is.na(x))
    mean<-mean(x,na.rm=T)
    std<-sd(x,na.rm=T)
    var<-var(x,na.rm=T)
    min<-min(x,na.rm=T)
    p1<-quantile(x,0.01,na.rm=T)
    p5<-quantile(x,0.05,na.rm=T)
    p10<-quantile(x,0.1,na.rm=T)
    q1<-quantile(x,0.25,na.rm=T)
    q2<-quantile(x,0.5,na.rm=T)
    q3<-quantile(x,0.75,na.rm=T)
    p90<-quantile(x,0.9,na.rm=T)
    p95<-quantile(x,0.95,na.rm=T)
    p99<-quantile(x,0.99,na.rm=T)
    max<-max(x,na.rm=T)
    UC1=mean(x,na.rm=T)+3*sd(x,na.rm=T)
    LC1=mean(x,na.rm=T)-3*sd(x,na.rm=T)
    UC2=quantile(x,0.99,na.rm=T)
    LC2=quantile(x,0.01,na.rm=T)
    iqr=IQR(x,na.rm=T)
    UC3=q3+1.5*iqr
    LC3=q1-1.5*iqr
    ot1<-max>UC1 | min     ot2<-max>UC2 | min     ot3<-max>UC3 | min     return(c(Var_Type=Var_Type, n=n,nmiss=nmiss,mean=mean,std=std,var=var,min=min,p1=p1,p5=p5,p10=p10,q1=q1,q2=q2,q3=q3,p90=p90,p95=p95,p99=p99,max=max,ot_m1=ot1,ot_m2=ot2,ot_m2=ot3))
  }
  else{
    Var_Type=class(x)
    n<-length(x)
    nmiss<-sum(is.na(x))
    fre<-table(x)
    prop<-prop.table(table(x))
    x[is.na(x)]<-x[which.max(prop.table(table(x)))]

    return(c(Var_Type=Var_Type, n=n,nmiss=nmiss,freq=fre,proportion=prop))
  }
}

#Vector of numaerical variables
num_var= sapply(df,is.numeric)
Other_var= !sapply(df,is.numeric)

#Applying above defined function on numerical variables
my_num_data<-t(data.frame(apply(df[num_var], 2, mystats)))
my_cat_data<-data.frame(t(apply(df[Other_var], 2, mystats)))
View(my_num_data)
View(my_cat_data)

#Missing Value Treatment
df[,num_var] <- apply(data.frame(df[,num_var]), 2, function(x){x <- replace(x, is.na(x), mean(x, na.rm=TRUE))})
df[,Other_var] <- apply(data.frame(df[,Other_var]), 2, function(x){x <- replace(x, is.na(x), which.max(prop.table(table(x))))})

# Outlier capping with p95 and p5.
numeric_vars = names(li)[sapply(df, FUN=is.numeric)]

outlier_treat <- function(x){
  UC1 = quantile(x, p=0.95,na.rm=T)
  LC1 = quantile(x, p=0.05,na.rm=T)
  x[x>UC1]=UC1
  x[x
  return(x)
}

mydata_num_vars = data.frame(apply(df[numeric_vars], 2, FUN=outlier_treat))

# Univariate plots of the dataset

all_plots <- lapply(X = 1:ncol(df), function(x) ggplot()+
                      geom_bar(data = df,
                               aes(x = df[,x]), fill = "skyblue" ,
                               alpha = 0.6)+
                      theme_linedraw()+
                      xlab(colnames(df)[x])
)

#colnames(dataset)
marrangeGrob(grobs=all_plots, nrow=2, ncol=2)

# Bivariate plots of the dataset
ptlist_bi <- list()

for (var in colnames(df) ){
  if (class(df[,var]) %in% c("factor","logical") ) {
    ###print(class(dataset[,var]))
    ###print("first if")
    ptlist_bi[[var]] <- ggplot(data = df) +
      geom_bar( aes_string(x = var, fill ="TenYearCHD" ), position = "fill") +
      theme_linedraw() +
      xlab(var)


  } else if (class(df[,var]) %in% c("numeric","double","integer") ) {
    ###print(class(dataset[,var]))
    # #print("second if")
    ptlist_bi[[var]] <- ggplot(data = df) +
      geom_boxplot(aes_string(y = var, x ="TenYearCHD" )) +
      theme_linedraw() +
      xlab("TenYearCHD") +
      ylab(var)
  }
}

#names(ptlist_bi)
marrangeGrob(grobs=ptlist_bi, nrow=2, ncol=2)

# have here, being male is directly related to TenYearCHD, thus the variable male seems a relatively good predictor.
# Similarly, Age seems a good predictor since the patients with TenYearCHD == TRUE, have higher median of age, with
# almost a similar distribution. In contrast, there seems no relation between different categories of the education
# and the response variable. The current Smoker variable shows a slight relation with the response variable, as the
# current smokers have a slightly higher risk of TenYearCHD. With the same manner we can investigate remaining plots.

# Association between qualitative variables

class_list <- lapply(X = 1:ncol(df), function(x) class(df[,x]))
names(class_list) <- colnames(df)

dataset_cat_variables <- subset(x = df, select = names(which(class_list=="logical"| class_list=="factor")))

#colnames(dataset_cat_variables)
dataset_cat_variables$education <- NULL

GKmatrix_dataset <- GKtauDataframe(dataset_cat_variables)
plot(GKmatrix_dataset)

# Modelling

# Load the library caTools
library(caTools)

# Randomly split the data into training and testing sets
set.seed(1000)
# One needs to put between 50% and 80% of data in the training set
split = sample.split(df$TenYearCHD, SplitRatio = 0.65)

# Split up the data using subset
train = subset(df, split == TRUE)
test = subset(df, split == FALSE)

# Logistic Regression Model
framinghamLog = glm(TenYearCHD ~ ., data = train, family = binomial)
summary(framinghamLog)

# Predictions on the test set
predictTest = predict(framinghamLog, type = "response", newdata = test)

# Confusion matrix with threshold of 0.5
ConfMat <- table(test$TenYearCHD, predictTest > 0.5)
ConfMat

# Accuracy
(ConfMat[1, 1] + ConfMat[2, 2])/(ConfMat[1, 1] + ConfMat[1, 2] + ConfMat[2,
1] + ConfMat[2, 2])
# Baseline accuracy
(1069 + 6)/(1069 + 6 + 187 + 11)

library(ROCR)

ROCRpred = prediction(predictTest, test$TenYearCHD)
as.numeric(performance(ROCRpred, "auc")@y.values)

Data Pre-Processing and Exploratory Data Analysis

Data Pre-Processing and Exploratory Data Analysis

R Programming Homework Solution on Data Pre-Processing and Exploratory Data Analysis

Share this post

Recent Blogs