- 4th Nov 2022
- 06:03 am

rm(list = ls())

options(scipen = 999)

set.seed(1)

# Load library:

if(!('ISLR' %in% rownames(installed.packages()))) install.packages('ISLR')

library(ISLR)

if(!('glmnet' %in% rownames(installed.packages()))) install.packages('glmnet')

library(glmnet)

# Load 'College' dataset:

data(College)

College$Private <- ifelse(College$Private == 'Yes',1,0)

# Part(a): Split into train and test:

split_prop <- 0.7

idx <- sample.int(nrow(College),floor(split_prop*nrow(College)),FALSE)

train <- College[idx,]

test <- College[-idx, ]

y_train <- train[,2]

x_train <- as.matrix(train[,-2])

y_test <- test[,2]

x_test <- as.matrix(test[,-2])

# Part(b): Fit Lasso model:

fit <- glmnet(x_train, y_train, family = 'gaussian', alpha = 1)

# Part(c): Cross validation to choose lambda:

cross_validation <- cv.glmnet(x_train, y_train, family = 'gaussian', alpha = 1)

plot(cross_validation)

lambda_cv <- cross_validation$lambda.min

# Part(d): Predicted values of test data based on best lambda:

fit_cv <- glmnet(x_train, y_train, family = 'gaussian', alpha = 1, lambda = lambda_cv)

pred_test <- predict(fit_cv,x_test)

test_MSE <- mean((pred_test - y_test)^2)

# Part(e):

# Test MSE of null model with lambda = infinity

fit_null <- glmnet(x_train, y_train, family = 'gaussian', alpha = 1, lambda = 1e8)

pred_test <- predict(fit_null,x_test)

test_MSE_null <- mean((pred_test - y_test)^2)

#Test MSE of least square regression model with lambda = 0:

fit_ols <- glmnet(x_train, y_train, family = 'gaussian', alpha = 1, lambda = 0)

pred_test <- predict(fit_ols,x_test)

test_MSE_ols <- mean((pred_test - y_test)^2)

# Part(f): Lasso model with whole data:

y <- College[,2]

x <- as.matrix(College[,-2])

fit_lasso <- glmnet(x, y, family = 'gaussian', alpha = 1, lambda = lambda_cv)

coef(fit_lasso)

fit_lasso[["df"]] #Number of non-zero co-effecient

# Part(g): Linear regression model with Lasso predictors:

temp <- coef(fit_lasso)

selected_predictors <- colnames(x)[(temp@i)]

data <- College[,c('Apps',selected_predictors)]

fit_lm <- lm(Apps ~ . , data)

summary(fit_lm)