### PART A ###
# Q1
# Scatter plot matrix of every pair of columns in `shellfish`.
pairs(x = shellfish)
# Q2
# From the plot it is very clear that there is a strong relationship between the predictor and response variables.
# From the scatter plot matrix we can clearly see that MMass is linearly related to Shell and Width but quadratically related to Length and Height.
# Q3
# From the scatter plot matrix it is very clear that the relationship between the predictor and response variables is very strong, so we can apply multiple linear regression to the dataset.
# One problem is that the predictor variables are strongly correlated with each other, which violates an assumption of multiple linear regression, i.e., there is multicollinearity.
# Q4
# Histogram of the raw response to check its shape.
hist(shellfish$MMass)
# As we can see, the response variable is not symmetric, so we apply a
# Box-Cox power transformation to bring it closer to a normal distribution.
library(MASS)
model <- lm(MMass ~ ., data = shellfish)
# boxcox() plots the profile log-likelihood and returns the lambda grid (x)
# together with the log-likelihood at each grid point (y).
box <- boxcox(model)
# Keep the maximising lambda in a variable and reuse it below instead of
# re-typing a rounded copy of the printed value (reported as 0.505050 in the
# answers further down), so the transformation always matches the data.
lambda <- box$x[which.max(box$y)]
lambda
hist(shellfish$MMass^lambda)
# Now, as we can see, the histogram of the transformed response variable is
# approximately normally distributed.
# Now we produce the scatter plot matrix of the transformed data.
shellfish$MMass_t <- shellfish$MMass^lambda
# Drop the raw response by name rather than by column position, so the plot
# does not depend on the column order of `shellfish`.
pairs(shellfish[, names(shellfish) != "MMass", drop = FALSE])
# Q5
# Full model: transformed response on every other column, with the raw
# response MMass removed from the predictors.
m1 <- lm(formula = MMass_t ~ . - MMass, data = shellfish)
print(m1)
library(car)
# Variance inflation factor (VIF) of each predictor in the full model.
print(vif(m1))
# As we can see, the variables have a VIF greater than 10, which implies that
# the multiple correlation coefficient of each of these variables with the
# other predictors is more than 0.9, which is very high.
# Only Shell has a smaller VIF (reported as 8.06), but this is also very high.
# So this implies that there is severe multicollinearity in the model.
# Q6
# Here we perform a stepwise regression procedure to select the best model.
# We apply the forward selection method.
library(leaps)
# The response is the stored transformed column MMass_t (the same response
# used by the lm fits in this script) instead of a re-typed power of MMass,
# and the raw MMass is excluded from the candidate predictors.
# The search result gets its own name so that `m2` only ever refers to the
# final fitted linear model.
fwd <- regsubsets(MMass_t ~ . - MMass, data = shellfish, method = "forward")
summ <- summary(fwd)
summ
names(summ)
summ$adjr2
# As we can see, the 1st model selects only the variable Shell. We use the
# adjusted R-squared criterion to select the best model.
# So we select the 3rd model, which has an adjusted R-squared of 0.9014371.
# The adjusted R-squared of the subsequent models does not improve much.
# So the model is
m2 <- lm(MMass_t ~ Height + Shell + Width, data = shellfish)
# Q 7
# Show the fitted coefficients of the selected model.
print(m2)
# So the final model is
# MMass^(0.505050) = -0.785700 + 0.029063 Height + 0.005439 Shell + 0.029616 Width
# Q 8
# Variance inflation factors of the selected model.
print(vif(m2))
# In this model the VIF corresponding to Width is still greater than 10 and
# the others are greater than 5.
# The values change compared with m1 because this model uses only 3
# variables, whereas m1 used all 4 variables.
# Q 9
# Standard lm diagnostic plots.
plot(x = m2)
# From the above plots we can say there is no problem in the model.
# From the 2nd plot we can clearly see that the residuals and the theoretical
# values coincide with each other.
# From the 3rd plot we can say that there is no specific pattern in the
# residuals against the fitted values, i.e., there is no heteroscedasticity,
# and there are no outliers in the data.
# Q10
# Coefficient table with standard errors, t statistics and p values.
print(summary(m2))
# From the above output we can see that only the variables Height and Shell
# are significant at the alpha = 0.05 level of significance, since the
# p values corresponding to them are less than 0.05,
# but the variable Width does not come out significant at the alpha = 0.05
# level of significance.
# Q11
# New observation to predict for. m2 is fitted on Height, Shell and Width
# only, so the extra Length column is not used by the prediction.
new_obs <- data.frame(Height = 141, Length = 331, Shell = 360, Width = 46)
# Call the generic predict(); it dispatches to the lm method for an lm fit.
pred_t <- predict(m2, newdata = new_obs)
pred_t
# So this is the value of MMass^0.50505 = 6.63242
# Back-transform to the original scale by inverting the power transformation.
# The exponent here must match the one used to create MMass_t.
pred_t^(1 / 0.505050)
# So value of MMass = 6.63242^(1/0.505050) = 42.35574
### PART B ###
# Q 1
library(ggplot2)
# Scatter plot of BP against Age, with the points coloured by Sex.
gp <- ggplot(data = bp, mapping = aes(x = Age, y = BP, colour = Sex)) +
  geom_point()
print(gp)
# Q 2
# Analysis of variance of BP on Age, Sex and their interaction.
# `Age * Sex` already expands to Age + Sex + Age:Sex, so the main effects do
# not need to be listed separately.
m <- aov(BP ~ Age * Sex, data = bp)
# Use the generic summary(); it dispatches to the aov method.
summary(m)
# Q 3
# As we can see, the p value corresponding to Age:Sex is 0.0375 < alpha (0.05),
# which implies that the interaction term of Age and Sex is significant.
# Q 4
# Simple linear regression of BP on Age using every row of `bp`.
model_q4 <- lm(formula = BP ~ Age, data = bp)
print(summary(model_q4))
# Q 5
# `gp` already contains a geom_point() layer, so adding another one would
# draw every point twice; only the linear-fit layer is added here.
# Colour is mapped to Sex in `gp`, so a separate line is fitted for each sex.
# The formula is spelled out (it is the default for method = "lm") to avoid
# the informational message about the formula being used.
gp + stat_smooth(method = "lm", formula = y ~ x)
# Q 6
# Separate regressions of BP on Age, one for each sex.
# For Male
model_male <- lm(formula = BP ~ Age, data = bp[bp$Sex == "Male", ])
summary(model_male)
# For Female
model_female <- lm(formula = BP ~ Age, data = bp[bp$Sex == "Female", ])
summary(model_female)
# Q 7
# a)
# As we can see, for both the male and the female model the p value corresponding to the F statistic is less than 0.05, which implies that both models are significant.
# b)
# As we can see, the Age coefficient for males is about 0.40 (written as both 0.404616 and 0.40616 in these notes; NOTE(review): confirm the exact value from the model_male summary), which implies that if we increase the age by 1 year then BP increases by about 0.40.
# It has a standard error of 0.05745.
# As we can see, the Age coefficient for females is 0.55177, which implies that if we increase the age by 1 year then BP increases by 0.55177.
# It has a standard error of 0.09703.
# As we can see, the coefficient for females is larger than that for males.
# c)
# As we can see, for both males and females R squared is nearly 0.669, which implies that about 66.9% of the variation in BP is explained by Age.