4. Auto {ISLR} R Documentation
Auto Data Set
Description
Gas mileage, horsepower, and other information for 392 vehicles.
Format
A data frame with 392 observations on the following 9 variables.
mpg
miles per gallon
cylinders
Number of cylinders between 4 and 8
displacement
Engine displacement (cu. inches)
horsepower
Engine horsepower
weight
Vehicle weight (lbs.)
acceleration
Time to accelerate from 0 to 60 mph (sec.)
year
Model year (modulo 100)
origin
Origin of car (1. American, 2. European, 3. Japanese)
name
Vehicle name
8. Check if the train dataset and the test dataset add up to the
original dataset
#@ Check if the train dataset and the test dataset add up to the original dataset. ----
# Stack the two splits back together and restore the original row order so the
# result can be compared against the original data by eye.
arrange(bind_rows(dataset.train, dataset.test), rownum)
# # A tibble: 392 x 10
# rownum mpg cylinders displacement horsepower weight acceleration year
# <int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
# 1 1 18 8 307 130 3504 12 70
# 2 2 15 8 350 165 3693 11.5 70
# 3 3 18 8 318 150 3436 11 70
# 4 4 16 8 304 150 3433 12 70
# 5 5 17 8 302 140 3449 10.5 70
# 6 6 15 8 429 198 4341 10 70
# 7 7 14 8 454 220 4354 9 70
# 8 8 14 8 440 215 4312 8.5 70
# 9 9 14 8 455 225 4425 10 70
# 10 10 15 8 390 190 3850 8.5 70
# # ... with 382 more rows, and 2 more variables: origin <dbl>, name <fct>
9. Check if the train dataset and the test dataset add up to the
original dataset
# Structural comparison: TRUE when the recombined splits match the original.
all.equal(target = dataset, current = bind_rows(dataset.train, dataset.test))
# TRUE
10. Save and check the split train dataset and test dataset
Remove the test dataset until the modeling is finished!
#@ Save and check the split train dataset and test dataset. ----
saveRDS(dataset.train, "dataset.train.rds")
saveRDS(dataset.test, "dataset.test.rds")
#@ You may also export to MS Excel format to check the data. ----
# Use TRUE rather than the shorthand T: T is an ordinary variable that can be
# reassigned, so spelling it out avoids a silent logic change.
write.xlsx(dataset.train, "dataset.train.xlsx", asTable = TRUE)
write.xlsx(dataset.test, "dataset.test.xlsx", asTable = TRUE)
openXL("dataset.train.xlsx")
openXL("dataset.test.xlsx")
#@ Remove the original dataset and the test dataset (to make it unseen until the model is fitted). ----
rm(dataset, dataset.test)
11.
12. Visualize mpg vs. horsepower
#@ Visualize mpg vs. horsepower ----
# Tools -> Global Options -> R Markdown -> Show output inline for all R Markdown documents
# Tools -> Global Options -> R Markdown -> "Show output preview in" -> select "Viewer Pane"
# Scatter plot of fuel efficiency against engine horsepower.
ggplot(dataset.train, aes(x = horsepower, y = mpg)) +
  geom_point()
13. Visualize mpg vs. horsepower
# Same scatter plot with a least-squares fit line overlaid.
ggplot(dataset.train, aes(x = horsepower, y = mpg)) +
  geom_point() +
  geom_smooth(method = "lm")
14. Regress mpg vs. horsepower
#@ Regress mpg vs. horsepower ----
# Simple linear regression of fuel efficiency on horsepower.
# (The matched call printed below is identical with the explicit `formula =`.)
model1 <- lm(formula = mpg ~ horsepower, data = dataset.train)
model1
# Call:
# lm(formula = mpg ~ horsepower, data = dataset.train)
#
# Coefficients:
# (Intercept) horsepower
# 40.3404 -0.1617
18. Calculate the training mean squared error
# Overlay a deliberately different straight line (red) on top of the fitted
# least-squares line, for comparing their training errors on the next slide.
ggplot(dataset.train, aes(x = horsepower, y = mpg)) +
  geom_point() +
  geom_smooth(method = "lm") +
  geom_abline(intercept = 50, slope = -0.3, color = "red")
19. Calculate the training mean squared error
# Residual columns mpg.tmp.residual / mpg.tmp2.residual are created on an
# earlier slide (not shown here) -- mpg.tmp for the fitted line, mpg.tmp2 for
# the hand-drawn red line.
# Mean residual is ~0 by construction of least squares; the mean of the
# squared residuals is the training MSE.
mean(dataset.train$mpg.tmp.residual)
mean(dataset.train$mpg.tmp.residual^2)
# [1] -3.697654e-15
# [1] 21.78987
# The fitted line has a much smaller training MSE than the red line.
mean(dataset.train$mpg.tmp.residual^2)
mean(dataset.train$mpg.tmp2.residual^2)
# [1] 21.78987
# [1] 76.1948
29. Fit multiple models using for-loop
#@ Fit multiple models using for-loop, and then save the models as R list of objects. =====
# Fit polynomial regressions of degree 1..5 and keep them in a list so they
# can be compared programmatically later.
model.list <- list()
dataset.train <- readRDS("dataset.train.rds")
dataset <- as.data.frame(dataset.train)
for (degree in 1:5) {
  poly.formula <- as.formula(paste0("mpg ~ poly(horsepower, ", degree, ")"))
  model.list[[degree]] <- glm(poly.formula, data = dataset)
}
30. Calculate the training MSE & test MSE for multiple models
#@ Loading the test dataset (after modeling is finished~!) -----
# Reload the held-out test split that was removed from the workspace earlier.
dataset.test = readRDS("dataset.test.rds")
#@ Define the loss function (optimization objective). -----
# Cf) You may define any function to avoid repetitive codes.
# Mean squared error between observations y and predictions yhat.
MSE <- function(y, yhat) {
  squared.error <- (y - yhat)^2
  mean(squared.error)
}
# Make a table that shows the training error and test error for each model in the model.list. -----
# One row per polynomial degree; trainMSE uses the data each model was fitted
# on (glm stores the response in $y), testMSE uses the held-out split.
df <- data.frame(
  i = 1:length(model.list)
  , trainMSE = model.list %>% map_dbl(~ MSE(.x$y, predict(.x)))
  , testMSE = model.list %>% map_dbl(~ MSE(dataset.test$mpg, predict(.x, newdata = dataset.test)))
)
df
# i trainMSE testMSE
# 1 1 21.78987 26.14142
# 2 2 18.29115 19.82259
# 3 3 18.27482 19.78252
# 4 4 18.12455 19.99969
# 5 5 17.45436 20.18225
#@ Remove the test dataset (before any additional modeling~!) -----
rm(dataset.test)
38. K-fold "random" split of the training dataset
## Visual check of the distribution of the folds ----
# Color each training point by its cross-validation fold assignment; a random
# split should show the folds well mixed across the mpg/horsepower plane.
ggplot(dataset.train, aes(x = horsepower, y = mpg, color = as.factor(fold.index))) +
  geom_point()
39.
40. Fit multiple models in each cross-validation folds
#@ Nested for-loop: (1) Iteration of folds for cross-validation (2) Fit multiple models using for-loop =====
# Save the models as a "nested" list of objects to save the results from "nested" for-loop.
# cv.model.list[[fold]][[degree]] holds the degree-th polynomial model fitted
# with the given fold held out.
max.polynomial <- 10
cv.model.list <- list()
for (fold in sort(unique(dataset.train$fold.index))) {
  cv.model.list[[fold]] <- list()
  # Train on every fold EXCEPT the current one; the held-out fold serves as
  # the validation set for this iteration.
  dataset <- dataset.train %>%
    filter(fold.index != fold) %>%
    as.data.frame
  for (degree in seq_len(max.polynomial)) {
    poly.formula <- as.formula(paste0("mpg ~ poly(horsepower, ", degree, ")"))
    cv.model.list[[fold]][[degree]] <- glm(poly.formula, data = dataset)
  }
}
41. Calculate the training MSE & validation MSE for multiple
models in each cross-validation folds
#@ Define the loss function (optimization objective). -----
# Cf) You may define any function to avoid repetitive codes.
MSE = function(y, yhat) mean((y - yhat)^2)
# Make a table that shows the training error and test error for each
# cross-validation fold & each model in the "nested" model.list. -----
# (The original comment above was wrapped without a leading '#', which made
# the script un-sourceable; it is restored here as a proper comment.)
# NOTE(review): `k` (the number of folds) must already be defined on an
# earlier slide -- confirm it equals length(cv.model.list).
# data_frame() is deprecated in the tibble package; tibble() is the drop-in
# replacement with identical output.
cv.df = tibble(
  cv = rep(1:k, each = max.polynomial)
  , polynomial = rep(1:max.polynomial, k)
) %>% mutate(
  # Training error: each model evaluated on the folds it was fitted to.
  trainMSE = map2_dbl(cv, polynomial, function(i.fold, i) {
    cv.model.list[[i.fold]][[i]] %>% {MSE(.$y, predict(.))}
  })
  # Validation error: each model evaluated on its held-out fold.
  , cvMSE = map2_dbl(cv, polynomial, function(i.fold, i) {
    MSE(
      dataset.train %>% filter(fold.index == i.fold) %>% select(mpg) %>% unlist
      , predict(cv.model.list[[i.fold]][[i]], newdata = dataset.train %>% filter(fold.index == i.fold))
    )
  })
)
cv.df
# # A tibble: 50 x 4
# cv polynomial trainMSE cvMSE
# <int> <int> <dbl> <dbl>
# 1 1 1 22.5 19.3
# 2 1 2 18.9 16.3
# 3 1 3 18.9 16.3
# 4 1 4 18.5 17.4
# 5 1 5 17.6 17.4
# 6 1 6 17.4 16.9
# 7 1 7 17.2 16.3
# 8 1 8 17.0 17.9
# 9 1 9 17.0 17.7
# 10 1 10 17.0 17.6
# # ... with 40 more rows
42. Calculate the (aggregated) training MSE & (aggregated) cv MSE
for multiple models
# Make a table that shows the (aggregated) training error and test error for each model -----
# Average the per-fold errors across folds: one row per polynomial degree.
cv.df.summarize <- cv.df %>%
  group_by(polynomial) %>%
  summarize(trainMSE = mean(trainMSE), cvMSE = mean(cvMSE))
cv.df.summarize
# # A tibble: 10 x 3
# polynomial trainMSE cvMSE
# <int> <dbl> <dbl>
# 1 1 21.7 22.5
# 2 2 18.2 18.8
# 3 3 18.2 18.9
# 4 4 18.0 19.1
# 5 5 17.4 18.2
# 6 6 17.1 18.0
# 7 7 16.7 18.7
# 8 8 16.7 18.8
# 9 9 16.6 19.3
# 10 10 16.6 18.5
43. Visualize the (aggregated) training MSE & cv MSE for multiple
models
# BUG FIX: the original continued onto the next line with a leading `+`.
# R parses the first line as a complete expression, so the stand-alone
# `+ geom_point() + geom_line()` never attaches to the plot (and errors as an
# invalid unary `+`). The `+` must end the preceding line instead.
# Long-to-wide reshape of the two error columns, then one curve per error type.
cv.df.summarize %>%
  gather(key, value, trainMSE, cvMSE) %>%
  ggplot(aes(x = polynomial, y = value, color = key)) +
  geom_point() +
  geom_line()