---
title: "R Assignment"
author: "***"
date: "07/06/2020"
output:
  html_document:
    df_print: paged
  pdf_document: default
  word_document: default
---
```{r setup, include=FALSE}
# Echo every chunk's code in the rendered document. Warnings are kept out of
# the document through knitr's `warning` chunk option instead of being
# disabled for the whole R session with `options(warn = -1)`, so they are
# still reported on the console while rendering.
knitr::opts_chunk$set(echo = TRUE, warning = FALSE)
```
## Data processing
```{r}
library(readr)

# Load the listings and drop every row that contains a missing value.
ToyotaCorolla <- read_csv("ToyotaCorolla.csv")
ToyotaCorolla <- na.omit(ToyotaCorolla)

# Reproducible 60/40 train/test split on row indices.
set.seed(12345)
n <- nrow(ToyotaCorolla)
# seq_len(n) is empty when n == 0, whereas 1:n would yield c(1, 0);
# for n >= 1 the sampled indices are the same as with 1:n.
set_train <- sample(seq_len(n), floor(0.6 * n), replace = FALSE)
train <- ToyotaCorolla[set_train, ]
test <- ToyotaCorolla[-set_train, ]
```
Rows with missing values are removed, and the dataset is then split 60/40 into a training set and a test set.
## Question 1
```{r}
library(rpart)
library(rpart.plot)
library(Metrics)

# Deliberately deep regression tree: a very small complexity penalty
# (cp = 0.0001) and leaves allowed to hold a single observation
# (minbucket = 1). The predictors are resolved through `data = train`, so the
# training frame is not attach()ed to the search path.
regTree <- rpart(
  Price ~ Age_08_04 + KM + Fuel_Type + HP + Automatic + Doors +
    Quarterly_Tax + Mfr_Guarantee + Guarantee_Period + Airco +
    Automatic_airco + CD_Player + Powered_Windows + Sport_Model + Tow_Bar,
  method = "anova",
  control = rpart.control(cp = 0.0001, minbucket = 1),
  data = train
)
rpart.plot(regTree)

# Without `newdata`, predict() returns the fitted values for the training rows.
predicted_train <- predict(regTree)
predicted_test <- predict(regTree, test)

# Train RMSE first, then test RMSE.
rmse(train$Price, predicted_train)
rmse(test$Price, predicted_test)
```
Some of the important variables are Age_08_04, KM, Automatic_airco, Quarterly_Tax and HP. The test RMSE is far higher than the train RMSE, indicating that the model is overfitting. We shall now change the hyperparameters to obtain a better fit.
```{r}
# Second regression tree: no complexity penalty (cp = 0), but every leaf must
# contain at least 25 training observations.
regTree_1 <- rpart(
  Price ~ Age_08_04 + KM + Fuel_Type + HP + Automatic + Doors +
    Quarterly_Tax + Mfr_Guarantee + Guarantee_Period + Airco +
    Automatic_airco + CD_Player + Powered_Windows + Sport_Model + Tow_Bar,
  data = train,
  method = "anova",
  control = rpart.control(minbucket = 25, cp = 0.00)
)

# Fitted values on the training rows and predictions for the held-out rows.
predicted_train_1 <- predict(regTree_1)
predicted_test_1 <- predict(regTree_1, newdata = test)

rpart.plot(regTree_1)

# Train RMSE, then test RMSE.
rmse(actual = train$Price, predicted = predicted_train_1)
rmse(actual = test$Price, predicted = predicted_test_1)
```
Here we see that the test RMSE is slightly lower than the train RMSE, and also lower than the test RMSE of the first model, indicating that we have a better fit.
Now, we shall use cross-validation to choose the complexity parameter.
```{r}
# Show the cross-validation results stored in the previous tree.
plotcp(regTree_1)

# Third regression tree: same leaf-size limit as before, refitted with
# cp = 0.011.
regTree_2 <- rpart(
  Price ~ Age_08_04 + KM + Fuel_Type + HP + Automatic + Doors +
    Quarterly_Tax + Mfr_Guarantee + Guarantee_Period + Airco +
    Automatic_airco + CD_Player + Powered_Windows + Sport_Model + Tow_Bar,
  data = train,
  method = "anova",
  control = rpart.control(minbucket = 25, cp = 0.011)
)

# Fitted values on the training rows and predictions for the held-out rows.
predicted_train_2 <- predict(regTree_2)
predicted_test_2 <- predict(regTree_2, newdata = test)

rpart.plot(regTree_2)

# Train RMSE, then test RMSE.
rmse(actual = train$Price, predicted = predicted_train_2)
rmse(actual = test$Price, predicted = predicted_test_2)
```
We have refitted the tree with `cp = 0.011`.
```{r}
# Build the car to be priced from the first test row, so that every predictor
# column is present, then overwrite the attributes that are specified.
obs <- test[1, ]
obs$Age_08_04 <- 77
obs$KM <- 117000
obs$Fuel_Type <- "Petrol"
obs$HP <- 110
obs$Doors <- 5
obs$Quarterly_Tax <- 100
obs$CD_Player <- 0
obs$Tow_Bar <- 1

# The remaining predictors are NOT set here: they keep whatever values the
# template row test[1, ] has. They are printed so the inputs behind the
# prediction are visible in the report (each one once).
obs$Automatic
obs$Mfr_Guarantee
obs$Guarantee_Period
obs$Airco
obs$Automatic_airco
obs$Powered_Windows
obs$Sport_Model

# Predicted price from the cross-validated regression tree.
predict(regTree_2, obs)
```
The predicted price of the given car is \$8066.45.
## Question 2
```{r}
summary(ToyotaCorolla$Price)

# Discretise Price into 20 equal-width, left-closed bins labelled 1..20.
# The upper break is max(Price) + 1 so that the most expensive car still
# falls inside the last bin.
n_bins <- 20
x <- range(ToyotaCorolla$Price)
# `length.out` fixes the number of breaks at n_bins + 1, which is what the
# n_bins labels passed to cut() require.
seq_cut <- seq(x[1], x[2] + 1, length.out = n_bins + 1)
ToyotaCorolla$Price_1 <- cut(
  ToyotaCorolla$Price,
  seq_cut,
  right = FALSE,
  labels = seq_len(n_bins)
)

# Rebuild the split with the same row indices so that train/test carry the
# new Price_1 column.
train <- ToyotaCorolla[set_train, ]
test <- ToyotaCorolla[-set_train, ]

# Classification tree on the binned price, grown with a very small
# complexity penalty and single-observation leaves.
decTree <- rpart(
  Price_1 ~ Age_08_04 + KM + Fuel_Type + HP + Automatic + Doors +
    Quarterly_Tax + Mfr_Guarantee + Guarantee_Period + Airco +
    Automatic_airco + CD_Player + Powered_Windows + Sport_Model + Tow_Bar,
  method = "class",
  control = rpart.control(cp = 0.0001, minbucket = 1),
  data = train
)
rpart.plot(decTree)
decTree$variable.importance
printcp(decTree)

# Ask for the predicted class label: for a classification tree, predict()
# returns class probabilities unless `type` is given. The reported price
# range is then looked up from the predicted bin instead of being hard-coded.
new_pred <- predict(decTree, obs, type = "class")
new_pred
# Labels are the bin numbers, so convert via character to get the bin index.
pred_bin <- as.integer(as.character(new_pred))
cat("The range is \t", seq_cut[pred_bin], " and \t", seq_cut[pred_bin + 1])
```
Some of the important variables are Age_08_04, KM, Quarterly_Tax, HP and Doors. These are similar to those of the first model, except that Doors replaces Automatic\_airco. There are a total of 312 splits in the decision tree.
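The same car is predicted to fall in interval 4, i.e. the price range from 8572.65 to 9980.2. Note that the regression-tree prediction from Question 1 (\$8066.45) lies slightly below this range, so the two models give close but not identical answers.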