df <- read.csv('FP_dataset.csv')
dim(df) # 31 col ,1508 rows
str(df)
## checking for NA
na_count <-sapply(df, function(y) sum(length(which(is.na(y)))))
na_count <- data.frame(na_count) # No na present in the dataset
###### EDA
# Target columns Analysis
library('ggplot2')
library('dplyr')
library('grid')
library('gridExtra')
library('RColorBrewer')
library('corrplot')
boxplot(df$ADM_RATE)
summary(df$ADM_RATE)
p1 <- df %>%
ggplot(aes(ADM_RATE)) +
geom_histogram(bins = 100, fill = "red") +
labs(x = "ADM_RATE") +
ggtitle("ADM_RATE feature distribution")
grid.arrange(p1, layout_matrix = rbind(c(1)))
### Predictor variables
# there are 31 columns out og which 1 is response , left with 30 predictor variable ,
# out of which 2 are categorical namely institution name and postal code and rest are numerical out of the 28 numerical , two are ID columns which we wont be analysisng
# so we are left with 28 numerical variables
# categorical
# INSTNM
length(unique(df$INSTNM)) # count of unique institutions : 1497
df %>%
mutate(tar = as.character(INSTNM)) %>%
group_by(tar) %>%
count() %>%
arrange(desc(n)) %>%
head(10) %>%
ggplot(aes(reorder(tar, n, FUN = min), n)) +
geom_col(fill = "blue") +
coord_flip() +
labs(x = "INSTNM values", y = "frequency") +
ggtitle("Top 10 Most frequent INSTNM values")
# state postcode
length(unique(df$STABBR)) #count of unique state post code : 54
df %>%
mutate(tar = as.character(STABBR)) %>%
group_by(tar) %>%
count() %>%
arrange(desc(n)) %>%
head(10) %>%
ggplot(aes(reorder(tar, n, FUN = min), n)) +
geom_col(fill = "blue") +
coord_flip() +
labs(x = "STABBR values", y = "frequency") +
ggtitle("Top 10 Most frequent STABBR values")
# Numerical
library('data.table')
library('tibble')
library('tidyr')
library('stringr')
library('forcats')
var_means <- df %>%
select(-X, -UNITID,-INSTNM,-STABBR,-ADM_RATE) %>%
summarise_all(funs(mean)) %>%
gather(everything(), key = "feature", value = "mean")
var_medians <- df %>%
select(-X, -UNITID,-INSTNM,-STABBR,-ADM_RATE) %>%
summarise_all(funs(median)) %>%
gather(everything(), key = "feature", value = "median")
var_sd <- df %>%
select(-X, -UNITID,-INSTNM,-STABBR,-ADM_RATE) %>%
summarise_all(funs(sd)) %>%
gather(everything(), key = "feature", value = "sd")
stat <- df %>%
select(-X, -UNITID,-INSTNM,-STABBR,-ADM_RATE) %>%
summarise_all(funs(sum(.<0>%
gather(everything(), key = "feature", value = "zeros") %>%
left_join(var_means, by = "feature") %>%
left_join(var_medians, by = "feature") %>%
left_join(var_sd, by = "feature")
summary(stat)
# Visualising the above
p1 <- stat %>%
ggplot(aes(mean)) +
geom_histogram() +
labs(x = "Feature means") +
ggtitle("Feature means")
p2 <- stat %>%
ggplot(aes(median)) +
geom_histogram() +
labs(x = "Feature medians") +
ggtitle("Feature medians")
p3 <- stat %>%
ggplot(aes(sd)) +
geom_histogram() +
labs(x = "Feature std dev") +
ggtitle("Feature std dev")
grid.arrange(p1, p2, p3, layout_matrix = rbind(c(1),c(2),c(3)))
# Correlation with the response variable
# Spearman
spearman_correlations <- df %>%
select(-X, -UNITID,-INSTNM,-STABBR,-ADM_RATE) %>%
cor(df$ADM_RATE, method = "spearman") %>%
as.tibble() %>%
rename(cor = V1)
ggplot(spearman_correlations, aes(x=cor)) + geom_histogram() + labs(x = "Spearman Correlation coefficient") + ggtitle("Spearman Correlation of numeric predictor variable vs response variable")
# Pearson
Pearson_correlations <- df %>%
select(-X, -UNITID,-INSTNM,-STABBR,-ADM_RATE) %>%
cor(df$ADM_RATE, method = "pearson") %>%
as.tibble() %>%
rename(cor_p = V1)
ggplot(Pearson_correlations, aes(x=cor_p)) + geom_histogram() + labs(x = "Pearson Correlation coefficient") + ggtitle("Pearson Correlation of numeric predictor variable vs response variable")
# correlation plot (excluding character data type columns)
drop2 <- c('X', 'UNITID','INSTNM','STABBR')
df_ex_char <- df[ , !(names(df) %in% drop2)]
install.packages('corrplot')
library(corrplot)
corrplot(cor(df_ex_char), order = "hclust")
# Important features using xgboost
install.packages('xgboost')
library(xgboost)
install.packages('Matrix')
library(Matrix)
sparse_matrix <- sparse.model.matrix(ADM_RATE~.-1,data = df_ex_char)
model <- xgboost(data = sparse_matrix, label = df_ex_char$ADM_RATE, max.depth = 6, eta = 0.3, nthread = 4, nrounds = 16, verbose = 2,eval_metric = "merror", objective = "reg:linear")
importance <- xgb.importance(feature_names = sparse_matrix@Dimnames[[2]], model = model)
print(xgb.plot.importance(importance_matrix = importance ,top_n = 10))
## Regression Modelling
lmADM_RATE = lm(ADM_RATE~., data = df_ex_char) #Create the linear regression
summary(lmADM_RATE)
plot(lmADM_RATE$residuals, pch = 16, col = "red")
## Infuential points
plot(cooks.distance(lmADM_RATE), pch = 16, col = "blue")
# deleting influential points and high residual points and then re modelling
##
HighLeverage <- cooks.distance(lmADM_RATE) > (2/nrow(df_ex_char))
LargeResiduals <- rstudent(lmADM_RATE) > 0.01
df_ex_char_influ_removed <- df_ex_char[!HighLeverage & !LargeResiduals,]
lmADM_RATE_influ_removed<-lm(ADM_RATE ~ ., data = df_ex_char_influ_removed)
summary(lmADM_RATE_influ_removed)
plot(lmADM_RATE_influ_removed$residuals, pch = 16, col = "red")
# as we can see , in deleting the influential points and the points with high residual values, r2 value is increased from 0.24 to 0.76
## Performing xgboost on the data havinng no influential and high residula points
sparse_matrix <- sparse.model.matrix(ADM_RATE~.-1,data = df_ex_char_influ_removed)
model <- xgboost(data = sparse_matrix, label = df_ex_char_influ_removed$ADM_RATE, max.depth = 6, eta = 0.3, nthread = 4, nrounds = 16, verbose = 2, objective = "reg:linear")
importance <- xgb.importance(feature_names = sparse_matrix@Dimnames[[2]], model = model)
print(xgb.plot.importance(importance_matrix = importance ,top_n = 10))
## Stepwise regression , variable selection using AIC
df <- read.csv('FP_dataset.csv')
df_ex_char <- df
df_ex_char$X <- NULL
df_ex_char$UNITID <- NULL
df_ex_char$INSTNM <- NULL
df_ex_char$STABBR <- NULL
#drop2 <- c('X', 'UNITID','INSTNM','STABBR')
#df_ex_char <- df[ , !(names(df) %in% drop2)]
lm1 <- lm(ADM_RATE ~ . , data = df_ex_char)
selectedMod1 <- step(lm1)
summary(selectedMod1)
all_vifs <- car::vif(selectedMod1)
print(all_vifs) ##There are variable having more than 4 VIF , indicating multicollinearity
## recursively removoing variable having VIF > 4
signif_all <- names(all_vifs)
# Remove vars with VIF> 4 and re-build model until none of VIFs don't exceed 4.
while(any(all_vifs > 4)){
var_with_max_vif <- names(which(all_vifs == max(all_vifs))) # get the var with max vif
signif_all <- signif_all[!(signif_all) %in% var_with_max_vif] # remove
myForm <- as.formula(paste("ADM_RATE ~ ", paste (signif_all, collapse=" + "), sep="")) # new formula
selectedMod2 <- lm(myForm, data=df_ex_char) # re-build model with new formula
all_vifs <- car::vif(selectedMod2)
}
summary(selectedMod2)
# Recursively remove non-significant variables , on the condition that p value < 0>
all_vars <- names(selectedMod2[[1]])[-1] # names of all X variables
# Get the non-significant vars
summ <- summary(selectedMod2) # model summary
pvals <- summ[[4]][, 4] # get all p values
not_significant <- character()
not_significant <- names(which(pvals > 0.1))
not_significant <- not_significant[!not_significant %in% "(Intercept)"]
# If there are any non-significant variables,
while(length(not_significant) > 0){
all_vars <- all_vars[!all_vars %in% not_significant[1]]
myForm <- as.formula(paste("ADM_RATE ~ ", paste (all_vars, collapse=" + "), sep="")) # new formula
selectedMod3 <- lm(myForm, data=df_ex_char) # re-build model with new formula
# Get the non-significant vars.
summ <- summary(selectedMod3)
pvals <- summ[[4]][, 4]
not_significant <- character()
not_significant <- names(which(pvals > 0.1))
not_significant <- not_significant[!not_significant %in% "(Intercept)"]
}
summary(selectedMod3)
car::vif(selectedMod3)
### step wise regression on removed influential points and high residuals data
lm1 <- lm(ADM_RATE ~ . , data = df_ex_char_influ_removed)
selectedMod1 <- step(lm1) # this step function iterates and builds model , then select one with least aic.
summary(selectedMod1)
all_vifs <- car::vif(selectedMod1)
print(all_vifs) ##There are variable having more than 4 VIF , indicating multicollinearity
## recursively removoing variable having VIF > 4
signif_all <- names(all_vifs)
# Remove vars with VIF> 4 and re-build model until none of VIFs don't exceed 4.
while(any(all_vifs > 4)){
var_with_max_vif <- names(which(all_vifs == max(all_vifs))) # get the var with max vif
signif_all <- signif_all[!(signif_all) %in% var_with_max_vif] # remove
myForm <- as.formula(paste("ADM_RATE ~ ", paste (signif_all, collapse=" + "), sep="")) # new formula
selectedMod2 <- lm(myForm, data=df_ex_char_influ_removed) # re-build model with new formula
all_vifs <- car::vif(selectedMod2)
}
summary(selectedMod2)
# Recursively remove non-significant variables , on the condition that p value < 0>
all_vars <- names(selectedMod2[[1]])[-1] # names of all X variables
# Get the non-significant vars
summ <- summary(selectedMod2) # model summary
pvals <- summ[[4]][, 4] # get all p values
not_significant <- character()
not_significant <- names(which(pvals > 0.1))
not_significant <- not_significant[!not_significant %in% "(Intercept)"]
# If there are any non-significant variables,
while(length(not_significant) > 0){
all_vars <- all_vars[!all_vars %in% not_significant[1]]
myForm <- as.formula(paste("ADM_RATE ~ ", paste (all_vars, collapse=" + "), sep="")) # new formula
selectedMod3 <- lm(myForm, data=df_ex_char_influ_removed) # re-build model with new formula
# Get the non-significant vars.
summ <- summary(selectedMod3)
pvals <- summ[[4]][, 4]
not_significant <- character()
not_significant <- names(which(pvals > 0.1))
not_significant <- not_significant[!not_significant %in% "(Intercept)"]
}
summary(selectedMod3)
car::vif(selectedMod3)
## BOX COX transformation on the data
install.packages('schoolmath')
library(schoolmath)
df_ex_char_copy <- data.frame(df_ex_char)
for (i in 1:ncol(df_ex_char_copy)){
if (sum(is.positive(df_ex_char_copy[,i])) == 1508){
bc <- car::powerTransform(df_ex_char_copy[,i], family="bcPower")
df_ex_char_copy[,i] <- (df_ex_char_copy[,i]) * (as.numeric(bc[1][1]))
}
}
# linear regression after box cox transformation of data
lmADM_RATE = lm(ADM_RATE~., data = df_ex_char_copy) #Create the linear regression
summary(lmADM_RATE)