# Load the health-expenditure data and take a first look at it ----
df <- read.csv("HealthExpend.csv")
head(df)
summary(df)

# Outliers in the candidate dependent variables: one box plot per column.
boxplot(df$EXPENDOP)
boxplot(df$EXPENDIP)

# Line plot of EXPENDOP in row order (points joined by lines).
plot(df$EXPENDOP, type = "o")

# Drop EXPENDIP; every other column is kept in its original order.
drop <- c("EXPENDIP")
df <- df[, setdiff(names(df), drop)]
names(df)
dim(df)
## data types
# Install a package only when it is missing, so that re-running the script
# does not download and reinstall it every time.
if (!requireNamespace("purrr", quietly = TRUE)) {
  install.packages("purrr")
}
library(purrr)
map(df, class) # class of every column

## number of distinct values in each predictor (character & integer columns)
# EXPENDOP is the dependent variable, so it is left out of the level counts.
drop1 <- c("EXPENDOP")
df_char_int <- df[, !(names(df) %in% drop1), drop = FALSE]
if (!requireNamespace("dplyr", quietly = TRUE)) {
  install.packages("dplyr")
}
library(dplyr)
# across() (dplyr >= 1.0.0) replaces the deprecated summarise_each(funs(...)).
df_char_int %>%
  summarise(across(everything(), n_distinct))
## checking for NA
# Number of missing values per column. vapply() pins the result to one integer
# per column; the outcome is then stored as a one-column data frame.
na_count <- vapply(df, function(y) sum(is.na(y)), integer(1))
na_count <- data.frame(na_count)
# Equivalent list form: lapply(df, function(x) sum(is.na(x)))

## unique values of INDUSCLASS
unique(df$INDUSCLASS)
# na.rm = TRUE keeps the count defined even if INDUSCLASS contains NA.
sum(df$INDUSCLASS == "", na.rm = TRUE) ## 888 observations have blank category
# One of the classes is blank (""), which is around 44.4 % of the rows.
## correlation plot (excluding character data type columns)
# Columns listed here are removed before computing correlations; cor() needs
# every remaining column to be numeric.
drop2 <- c(
  "INDUSCLASS", "PHSTAT", "INCOME", "MARISTAT", "EDUC", "REGION", "RACE"
)
# drop = FALSE guarantees a data frame even if only one column were left.
df_ex_char <- df[, !(names(df) %in% drop2), drop = FALSE]
# Install corrplot only when it is missing instead of on every run.
if (!requireNamespace("corrplot", quietly = TRUE)) {
  install.packages("corrplot")
}
library(corrplot)
# Variables are ordered by hierarchical clustering of the correlation matrix.
corrplot(cor(df_ex_char), order = "hclust")
###### Analysis on independent variables
# The AGE column name carries a prefix (presumably a byte-order mark at the
# start of the CSV header -- confirm) whose spelling depends on the platform
# encoding, so the column is located by its "AGE" suffix instead of a
# hard-coded name. If several names match, the first one is used.
age_col <- grep("AGE$", names(df), value = TRUE)[1]
stopifnot(!is.na(age_col))
boxplot(df[[age_col]])
summary(df[[age_col]])

# count of each level of each column
# Install ggplot2 only when it is missing instead of on every run.
if (!requireNamespace("ggplot2", quietly = TRUE)) {
  install.packages("ggplot2")
}
library(ggplot2)
# qplot() is deprecated, and a histogram (stat_bin) rejects a discrete x.
# geom_bar() counts the rows per value and works for character and integer
# columns alike.
ggplot(df, aes(x = EDUC)) +
  geom_bar()
# Share of rows in each INDUSCLASS category (base graphics).
barplot(prop.table(table(df$INDUSCLASS)))
#### Feature Importance
# Since the INDUSCLASS column has 44.4 % of its values blank, we drop it.
# The other categorical columns (RACE, INCOME, etc.) are dropped as well,
# because label-encoded versions of them (RACE1, INCOME1, etc.) already exist.
drop3 <- c("INDUSCLASS")
# drop2 (the character columns) and drop3 are applied together, so the
# INDUSCLASS removal described above does not silently depend on drop2
# happening to contain it.
df_final <- df[, !(names(df) %in% union(drop2, drop3)), drop = FALSE]
names(df_final)
dim(df_final) # 20 columns and 2000 rows
# Feature importance using xgboost, with the outliers of the dependent
# variable still present.
# Install the modelling packages only when they are missing.
if (!requireNamespace("xgboost", quietly = TRUE)) {
  install.packages("xgboost")
}
library(xgboost)
if (!requireNamespace("Matrix", quietly = TRUE)) {
  install.packages("Matrix")
}
library(Matrix)
# Design matrix of all predictors; "-1" removes the intercept column.
sparse_matrix <- sparse.model.matrix(EXPENDOP ~ . - 1, data = df_final)
# "reg:squarederror" is the current name of the deprecated "reg:linear"
# objective; max_depth is the canonical spelling of max.depth.
model <- xgboost(
  data = sparse_matrix,
  label = df_final$EXPENDOP,
  max_depth = 6,
  eta = 0.3,
  nthread = 4,
  nrounds = 16,
  verbose = 2,
  objective = "reg:squarederror"
)
# colnames() reads the same feature names as the @Dimnames slot, through the
# public accessor.
importance <- xgb.importance(
  feature_names = colnames(sparse_matrix),
  model = model
)
# Plot the ten most important features and print the plotted table.
print(xgb.plot.importance(importance_matrix = importance, top_n = 10))
# Feature importance using xgboost, without the outliers of the dependent
# variable.
# Rows with EXPENDOP above this cutoff are treated as outliers.
expendop_cutoff <- 1155
df_finall <- df_final[df_final$EXPENDOP <= expendop_cutoff, ] # 499 rows removed
# Design matrix of all predictors; "-1" removes the intercept column.
sparse_matrix <- sparse.model.matrix(EXPENDOP ~ . - 1, data = df_finall)
# "reg:squarederror" is the current name of the deprecated "reg:linear"
# objective; max_depth is the canonical spelling of max.depth.
model <- xgboost(
  data = sparse_matrix,
  label = df_finall$EXPENDOP,
  max_depth = 6,
  eta = 0.3,
  nthread = 4,
  nrounds = 16,
  verbose = 2,
  objective = "reg:squarederror"
)
# colnames() reads the same feature names as the @Dimnames slot, through the
# public accessor.
importance <- xgb.importance(
  feature_names = colnames(sparse_matrix),
  model = model
)
# Plot every feature's importance and print the plotted table.
print(xgb.plot.importance(importance_matrix = importance))