R Programming Homework Solution on Methods of Data Analysis

8th Jul 2022
06:03 am
df <- read.csv('FP_dataset.csv')
dim(df) # 31 col ,1508 rows
str(df)

## checking for NA
na_count <-sapply(df, function(y) sum(length(which(is.na(y)))))
na_count <- data.frame(na_count) # No na present in the dataset

###### EDA

# Target columns Analysis
library('ggplot2') 
library('dplyr')
library('grid') 
library('gridExtra') 
library('RColorBrewer') 
library('corrplot') 

boxplot(df$ADM_RATE)
summary(df$ADM_RATE)

p1 <- df %>%
  ggplot(aes(ADM_RATE)) +
  geom_histogram(bins = 100, fill = "red") +
  labs(x = "ADM_RATE") +
  ggtitle("ADM_RATE feature distribution")

grid.arrange(p1, layout_matrix = rbind(c(1)))

### Predictor variables
# there are 31 columns out og which 1 is response , left with 30 predictor variable , 
# out of which 2 are categorical namely institution name and postal code and rest are numerical out of the 28 numerical , two are ID columns which we wont be analysisng 
# so we are left with 28 numerical variables

# categorical
# INSTNM

length(unique(df$INSTNM)) # count of unique institutions : 1497

df %>%
  mutate(tar = as.character(INSTNM)) %>%
  group_by(tar) %>%
  count() %>%
  arrange(desc(n)) %>%
  head(10) %>%
  ggplot(aes(reorder(tar, n, FUN = min), n)) +
  geom_col(fill = "blue") +
  coord_flip() +
  labs(x = "INSTNM values", y = "frequency") +
  ggtitle("Top 10 Most frequent INSTNM values")

# state postcode

length(unique(df$STABBR)) #count of unique state post code : 54

df %>%
  mutate(tar = as.character(STABBR)) %>%
  group_by(tar) %>%
  count() %>%
  arrange(desc(n)) %>%
  head(10) %>%
  ggplot(aes(reorder(tar, n, FUN = min), n)) +
  geom_col(fill = "blue") +
  coord_flip() +
  labs(x = "STABBR values", y = "frequency") +
  ggtitle("Top 10 Most frequent STABBR values")

# Numerical
library('data.table')
library('tibble') 
library('tidyr') 
library('stringr') 
library('forcats') 

var_means <- df %>%
  select(-X, -UNITID,-INSTNM,-STABBR,-ADM_RATE) %>%
  summarise_all(funs(mean)) %>%
  gather(everything(), key = "feature", value = "mean")

var_medians <- df %>%
  select(-X, -UNITID,-INSTNM,-STABBR,-ADM_RATE) %>%
  summarise_all(funs(median)) %>%
  gather(everything(), key = "feature", value = "median")

var_sd <- df %>%
  select(-X, -UNITID,-INSTNM,-STABBR,-ADM_RATE) %>%
  summarise_all(funs(sd)) %>%
  gather(everything(), key = "feature", value = "sd")

stat <- df %>%
  select(-X, -UNITID,-INSTNM,-STABBR,-ADM_RATE) %>%
  summarise_all(funs(sum(.<0>%
  gather(everything(), key = "feature", value = "zeros") %>%
  left_join(var_means, by = "feature") %>%
  left_join(var_medians, by = "feature") %>%
  left_join(var_sd, by = "feature")

summary(stat)
# Visualising the above

p1 <- stat %>%
  ggplot(aes(mean)) +
  geom_histogram() +
  labs(x = "Feature means") +
  ggtitle("Feature means")

p2 <- stat %>%
  ggplot(aes(median)) +
  geom_histogram() +
  labs(x = "Feature medians") +
  ggtitle("Feature medians")

p3 <- stat %>%
  ggplot(aes(sd)) +
  geom_histogram() +
  labs(x = "Feature std dev")  +
  ggtitle("Feature std dev")

grid.arrange(p1, p2, p3, layout_matrix = rbind(c(1),c(2),c(3)))

# Correlation with the response variable
# Spearman
spearman_correlations <- df %>%
  select(-X, -UNITID,-INSTNM,-STABBR,-ADM_RATE) %>%
  cor(df$ADM_RATE, method = "spearman") %>%
  as.tibble() %>%
  rename(cor = V1)

ggplot(spearman_correlations, aes(x=cor)) + geom_histogram() + labs(x = "Spearman Correlation coefficient") + ggtitle("Spearman Correlation of numeric predictor variable vs response variable")

# Pearson
Pearson_correlations <- df %>%
  select(-X, -UNITID,-INSTNM,-STABBR,-ADM_RATE) %>%
  cor(df$ADM_RATE, method = "pearson") %>%
  as.tibble() %>%
  rename(cor_p = V1)

ggplot(Pearson_correlations, aes(x=cor_p)) + geom_histogram() + labs(x = "Pearson Correlation coefficient") + ggtitle("Pearson Correlation of numeric predictor variable vs response variable")

# correlation plot (excluding character data type columns)
drop2 <- c('X', 'UNITID','INSTNM','STABBR')
df_ex_char <- df[ , !(names(df) %in% drop2)]
install.packages('corrplot')
library(corrplot)
corrplot(cor(df_ex_char), order = "hclust")

# Important features using xgboost 
install.packages('xgboost')
library(xgboost)
install.packages('Matrix')
library(Matrix)
sparse_matrix <- sparse.model.matrix(ADM_RATE~.-1,data = df_ex_char)
model <- xgboost(data = sparse_matrix, label = df_ex_char$ADM_RATE, max.depth = 6, eta = 0.3, nthread = 4, nrounds = 16, verbose = 2,eval_metric = "merror", objective = "reg:linear")
importance <- xgb.importance(feature_names = sparse_matrix@Dimnames[[2]], model = model)
print(xgb.plot.importance(importance_matrix = importance ,top_n = 10))

## Regression Modelling 
lmADM_RATE = lm(ADM_RATE~., data = df_ex_char) #Create the linear regression
summary(lmADM_RATE)
plot(lmADM_RATE$residuals, pch = 16, col = "red")

## Infuential points
plot(cooks.distance(lmADM_RATE), pch = 16, col = "blue")

# deleting influential points and high residual points and then re modelling

##
HighLeverage <- cooks.distance(lmADM_RATE) > (2/nrow(df_ex_char))
LargeResiduals <- rstudent(lmADM_RATE) > 0.01
df_ex_char_influ_removed <- df_ex_char[!HighLeverage & !LargeResiduals,]
lmADM_RATE_influ_removed<-lm(ADM_RATE ~ ., data = df_ex_char_influ_removed)  
summary(lmADM_RATE_influ_removed)
plot(lmADM_RATE_influ_removed$residuals, pch = 16, col = "red")

# as we can see , in deleting the influential points and the points with high residual values, r2 value is increased from 0.24 to 0.76

## Performing xgboost on the data havinng no influential and high residula points
sparse_matrix <- sparse.model.matrix(ADM_RATE~.-1,data = df_ex_char_influ_removed)
model <- xgboost(data = sparse_matrix, label = df_ex_char_influ_removed$ADM_RATE, max.depth = 6, eta = 0.3, nthread = 4, nrounds = 16, verbose = 2, objective = "reg:linear")
importance <- xgb.importance(feature_names = sparse_matrix@Dimnames[[2]], model = model)
print(xgb.plot.importance(importance_matrix = importance ,top_n = 10))

## Stepwise regression , variable selection using AIC
df <- read.csv('FP_dataset.csv')

df_ex_char <- df
df_ex_char$X <- NULL
df_ex_char$UNITID <- NULL
df_ex_char$INSTNM <- NULL
df_ex_char$STABBR <- NULL

#drop2 <- c('X', 'UNITID','INSTNM','STABBR')

#df_ex_char <- df[ , !(names(df) %in% drop2)]

lm1 <- lm(ADM_RATE ~ . , data = df_ex_char)
selectedMod1 <- step(lm1)
summary(selectedMod1)

all_vifs <- car::vif(selectedMod1)
print(all_vifs) ##There are variable having more than 4 VIF , indicating multicollinearity

## recursively removoing variable having VIF > 4
signif_all <- names(all_vifs)

# Remove vars with VIF> 4 and re-build model until none of VIFs don't exceed 4.
while(any(all_vifs > 4)){
  var_with_max_vif <- names(which(all_vifs == max(all_vifs)))  # get the var with max vif
  signif_all <- signif_all[!(signif_all) %in% var_with_max_vif]  # remove
  myForm <- as.formula(paste("ADM_RATE ~ ", paste (signif_all, collapse=" + "), sep=""))  # new formula
  selectedMod2 <- lm(myForm, data=df_ex_char)  # re-build model with new formula
  all_vifs <- car::vif(selectedMod2)
}
summary(selectedMod2)

# Recursively remove non-significant variables , on the condition that p value < 0>

all_vars <- names(selectedMod2[[1]])[-1]  # names of all X variables
# Get the non-significant vars
summ <- summary(selectedMod2)  # model summary
pvals <- summ[[4]][, 4]  # get all p values
not_significant <- character()  
not_significant <- names(which(pvals > 0.1))
not_significant <- not_significant[!not_significant %in% "(Intercept)"]  

# If there are any non-significant variables, 
while(length(not_significant) > 0){
  all_vars <- all_vars[!all_vars %in% not_significant[1]]
  myForm <- as.formula(paste("ADM_RATE ~ ", paste (all_vars, collapse=" + "), sep=""))  # new formula
  selectedMod3 <- lm(myForm, data=df_ex_char)  # re-build model with new formula
  
  # Get the non-significant vars.
  summ <- summary(selectedMod3)
  pvals <- summ[[4]][, 4]
  not_significant <- character()
  not_significant <- names(which(pvals > 0.1))
  not_significant <- not_significant[!not_significant %in% "(Intercept)"]
}
summary(selectedMod3)
car::vif(selectedMod3)

### step wise regression on removed influential points and high residuals data

lm1 <- lm(ADM_RATE ~ . , data = df_ex_char_influ_removed)
selectedMod1 <- step(lm1) # this step function iterates and builds model , then select one with least aic.
summary(selectedMod1)

all_vifs <- car::vif(selectedMod1)
print(all_vifs) ##There are variable having more than 4 VIF , indicating multicollinearity

## recursively removoing variable having VIF > 4
signif_all <- names(all_vifs)

# Remove vars with VIF> 4 and re-build model until none of VIFs don't exceed 4.
while(any(all_vifs > 4)){
  var_with_max_vif <- names(which(all_vifs == max(all_vifs)))  # get the var with max vif
  signif_all <- signif_all[!(signif_all) %in% var_with_max_vif]  # remove
  myForm <- as.formula(paste("ADM_RATE ~ ", paste (signif_all, collapse=" + "), sep=""))  # new formula
  selectedMod2 <- lm(myForm, data=df_ex_char_influ_removed)  # re-build model with new formula
  all_vifs <- car::vif(selectedMod2)
}
summary(selectedMod2)

# Recursively remove non-significant variables , on the condition that p value < 0>

all_vars <- names(selectedMod2[[1]])[-1]  # names of all X variables
# Get the non-significant vars
summ <- summary(selectedMod2)  # model summary
pvals <- summ[[4]][, 4]  # get all p values
not_significant <- character()  
not_significant <- names(which(pvals > 0.1))
not_significant <- not_significant[!not_significant %in% "(Intercept)"]  

# If there are any non-significant variables, 
while(length(not_significant) > 0){
  all_vars <- all_vars[!all_vars %in% not_significant[1]]
  myForm <- as.formula(paste("ADM_RATE ~ ", paste (all_vars, collapse=" + "), sep=""))  # new formula
  selectedMod3 <- lm(myForm, data=df_ex_char_influ_removed)  # re-build model with new formula
  
  # Get the non-significant vars.
  summ <- summary(selectedMod3)
  pvals <- summ[[4]][, 4]
  not_significant <- character()
  not_significant <- names(which(pvals > 0.1))
  not_significant <- not_significant[!not_significant %in% "(Intercept)"]
}
summary(selectedMod3)
car::vif(selectedMod3)

## BOX COX transformation on the data

install.packages('schoolmath')
library(schoolmath)

df_ex_char_copy <- data.frame(df_ex_char)

for (i in 1:ncol(df_ex_char_copy)){
  
  if (sum(is.positive(df_ex_char_copy[,i])) == 1508){
    bc <- car::powerTransform(df_ex_char_copy[,i], family="bcPower")
    df_ex_char_copy[,i] <- (df_ex_char_copy[,i]) * (as.numeric(bc[1][1]))
  }
}

# linear regression after box cox transformation of data
lmADM_RATE = lm(ADM_RATE~., data = df_ex_char_copy) #Create the linear regression
summary(lmADM_RATE)
R Programming Homework Solution on Methods of Data Analysis

R Programming Homework Solution on Methods of Data Analysis

Share this post

Recent Blogs