- 20th Oct 2022
- 06:03 am
## Data Initialization
library(caret)      # model training and cross-validation (train, trainControl)
library(caTools)    # data splitting utilities
library(xgboost)    # gradient boosting backend for caret's "xgbTree"
library(MLmetrics)  # MultiLogLoss
library(h2o)        # deep learning
library(e1071)      # naiveBayes
library(ggplot2)    # plotting
train_raw <- read.csv("train.csv")
validate <- read.csv("test.csv")
# Let's count the number of NA values in each column
sapply(train_raw, function(x) sum(is.na(x)))
## Principal Component Analysis
data <- train_raw[,-(1:2)] # Drop the id and species columns, keeping only the numeric features
covX <- cov(data) # Covariance matrix
pca <- prcomp(covX) # Perform PCA
#Variance Explained
var_exp <- as.data.frame(pca$sdev^2/sum(pca$sdev^2))
var_exp <- cbind(c(1:ncol(data)),var_exp,cumsum(var_exp[,1]))
colnames(var_exp) <- c("Principal_Components","Variance","Cumulative_Variance")
#Plotting the Variance Curves
#Individual Variance
plot(var_exp$Principal_Components, var_exp$Variance, type = 'b', xlim = c(0, 50), pch = 16,
     xlab = "Principal Components", ylab = "Variance", main = 'Principal Components vs Variance')
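# The heading above mentions variance curves (plural); the cumulative curve, which is
# presumably what motivates the 22-component cut-off used below, can be drawn the same way.
# The 99% threshold in the sanity check is an assumption, not stated in the original.
plot(var_exp$Principal_Components, var_exp$Cumulative_Variance, type = 'b', xlim = c(0, 50),
     pch = 16, xlab = "Principal Components", ylab = "Cumulative Variance",
     main = 'Principal Components vs Cumulative Variance')
which(var_exp$Cumulative_Variance >= 0.99)[1] # first component count reaching 99% of the variance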
## XGBoost Classifier
pca_fin <- pca$rotation[,1:22] # Rotation matrix (194x22)
PCA <- function(X) { # Reduce observations from N x 194 to N x 22
  as.matrix(X) %*% pca_fin
}
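# Note: train_X, test_X and train_Y are never defined in the original script. A minimal
# sketch of what the later code appears to assume: the full set of training features and
# labels, and the features of the hold-out (validate) set with its id column dropped.
train_X <- data                                # all training features (id, species dropped)
test_X  <- validate[, names(validate) != "id"] # hold-out features
train_Y <- factor(train_raw$species)           # class labels used by caret and MultiLogLoss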
train_pca_X <- PCA(train_X)
test_pca_X <- PCA(test_X)
xgb.grid <- expand.grid(
  nrounds = 100,
  max_depth = c(5, 10, 15),
  eta = c(0.5, 0.2, 0.1),
  gamma = c(0, 0.5),
  colsample_bytree = 0.75,
  min_child_weight = 5,
  subsample = 0.66
)
xgb.trcontrol <- trainControl(
  method = "cv",
  number = 3,
  verboseIter = TRUE,
  returnData = FALSE,
  returnResamp = "all",
  classProbs = TRUE,
  allowParallel = TRUE
)
system.time(xgb_m2 <- train(
  x = train_pca_X, y = train_Y,
  verbose = 1,
  trControl = xgb.trcontrol,
  tuneGrid = xgb.grid,
  method = "xgbTree"
))
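# error_xgb is used in the model comparison below but is never computed in the original
# script. A minimal sketch, mirroring the Naive Bayes evaluation: score the fitted caret
# model on the PCA-transformed training data and take the multiclass log loss.
pred_xgb <- predict(xgb_m2, newdata = train_pca_X, type = "prob") # per-class probabilities
error_xgb <- MultiLogLoss(y_true = train_Y, y_pred = as.matrix(pred_xgb))
error_xgb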
## Naive Bayes
pca_data <- data.frame(species = train_raw$species, train_pca_X)
system.time(NB <- naiveBayes(species ~ ., data = pca_data))
pred <- predict(NB,newdata=pca_data[,-1],type='raw')
error_nb <- MultiLogLoss(y_true = pca_data[,1], y_pred = as.matrix(pred))
error_nb
## H2O
train.id <- train_raw$id
train_raw$id <- NULL
test.id <- validate$id
validate$id <- NULL
validate$species <- NA # placeholder so the train and test frames share the same schema
# Start a local instance of the H2O platform so we can build the deep-learning network and make predictions.
localH2O <- h2o.init(max_mem_size = "12g")
h2o.train <- as.h2o(train_raw)
h2o.test <- as.h2o(validate)
set.seed(13579) # note: this seeds R's RNG only; for a reproducible H2O run, pass seed = 13579 (and reproducible = TRUE) to h2o.deeplearning()
## Deep learning with two hidden layers (1024, 512).
system.time(model <- h2o.deeplearning(x = 2:ncol(h2o.train),
y = 1,
training_frame = h2o.train,
activation = "TanhWithDropout",
input_dropout_ratio = 0,
hidden_dropout_ratios = c(0.1,0.1),
balance_classes = F,
hidden = c(1024,512),
epochs = 250,
loss = "CrossEntropy",
categorical_encoding = "OneHotInternal"))
# Prediction: save the model and evaluate it on the training frame
save(model, file = "h2omodel.RData")
ytrain <- h2o.predict(model, h2o.train) # predicted class plus per-class probabilities
error_dl <- MultiLogLoss(y_true = train_raw[,1], y_pred = as.matrix(ytrain[,-1]))
error_dl
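# h2o.test is built above but never scored in the original script. A minimal sketch of
# producing a submission file before shutting H2O down; the exact column layout expected
# downstream is an assumption.
ytest <- h2o.predict(model, h2o.test)
submission <- data.frame(id = test.id, as.data.frame(ytest[, -1])) # drop the predicted-class column
write.csv(submission, "submission.csv", row.names = FALSE)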
h2o.shutdown(prompt = FALSE)
## Model Comparison
error <- data.frame(Model = c("XGBoost", "Naive Bayes", "Deep Learning"),
                    Error = c(error_xgb, error_nb, error_dl))
ggplot(error, aes(x = Model, y = Error)) + geom_bar(stat = 'identity') + theme_bw() +
  ggtitle('Comparison of Model Log Loss')
## Observations
# The chart clearly shows that Deep Learning achieves the lowest log loss. Coupled with the absence of any data preparation (no PCA was required) and the relatively low training time of 68 seconds, Deep Learning is the best option for this problem.
# It is also interesting that Naive Bayes, a basic classifier, achieves a lower error than the more sophisticated XGBoost algorithm.