Install required R software packages
# Install (only when missing) and attach every package this script relies on.
# library() is used for attaching so that a failed installation stops the script
# immediately instead of surfacing later as "could not find function" errors.
for (packagename in c("tidyverse", "openxlsx")) {
  if (!requireNamespace(packagename, quietly = TRUE)) {
    install.packages(packagename)
  }
  library(packagename, character.only = TRUE)
}
Imputation for the training dataset
#@ Load only the training data. Make sure the test data is not loaded before modeling is finished. ----
# If the test data is not sequestered, make a random split and save them first. Then only load the training
# data, so that the test data remain unseen before modeling is finished. ----
# NOTE(review): the URL is empty in this copy of the script -- supply the training-data location before running.
dataset.train = readRDS(url(""))
# Here, we will use a single regression imputation for simplicity. However, multiple imputation is recommended.
imputation.model = glm(bmi1 ~ poly(age1, 2) + sex1, data = dataset.train)
dataset.train = dataset.train %>% mutate(
  bmi1.old = bmi1                      # keep the raw (possibly missing) values for auditing
  , bmi1.is_imputed = is.na(bmi1.old)  # flag the rows that receive an imputed value
  # newdata is required here: without it predict() returns fitted values only for the rows
  # used in the fit (rows with missing bmi1 are dropped), so they would not line up row-by-row.
  , bmi1 = ifelse(bmi1.is_imputed, predict(imputation.model, newdata = dataset.train), bmi1.old)
)
dataset.train %>% select(randid, death, age1, sex1, matches("bmi1")) %>% filter(bmi1.is_imputed)
# # A tibble: 2 x 7
# randid death age1 sex1 bmi1 bmi1.old bmi1.is_imputed
# <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <lgl>
# 1 1600765 0 45 2 26.9 NA TRUE
# 2 6921140 1 64 1 26.3 NA TRUE
Visualize the training dataset with the labels
# Scatter plot of age vs. BMI coloured by the observed outcome (death).
# Small black dots are drawn first to mark rows whose bmi1 was imputed; the large
# translucent points are then drawn on top for every row.
dataset.train %>% ggplot(aes(x = age1, y = bmi1, color = death)) +
  geom_point(data = dataset.train %>% filter(bmi1.is_imputed), color = "black") +
  geom_point(size = 10, alpha = .2) +
  scale_color_gradient(low = "#0000ff80", high = "#ff000080")
Logistic model (1)
# Logistic regression of death on linear terms of age and BMI, fitted on the training data only.
# (`raw = T` is kept as written so the coefficient names match the recorded summary output below.)
model1 = glm(death ~ poly(age1, 1, raw = T) + poly(bmi1, 1, raw = T), family = "binomial", data = dataset.train)
model1 %>% summary #----
# Call:
# glm(formula = death ~ poly(age1, 1, raw = T) + poly(bmi1, 1,
# raw = T), family = "binomial", data = dataset.train)
# Deviance Residuals:
# Min 1Q Median 3Q Max
# -1.8968 -0.7632 -0.4850 0.8773 2.5501
# Coefficients:
# Estimate Std. Error z value Pr(>|z|)
# (Intercept) -8.51201 1.06490 -7.993 1.31e-15 ***
# poly(age1, 1, raw = T) 0.13326 0.01533 8.696 < 2e-16 ***
# poly(bmi1, 1, raw = T) 0.03576 0.02895 1.235 0.217
# ---
# Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
# (Dispersion parameter for binomial family taken to be 1)
# Null deviance: 568.61 on 449 degrees of freedom
# Residual deviance: 465.63 on 447 degrees of freedom
# AIC: 471.63
# Number of Fisher Scoring iterations: 4
# Attach model 1's predicted probability of death to every training row.
dataset.train = mutate(
  dataset.train,
  death.model1.predict.prob = predict(model1, newdata = dataset.train, type = "response")
)
Visualize the logistic model (1) with the fitted probability
# Same scatter plot as for the labels, but coloured by model 1's fitted probability.
# Black dots mark the rows whose bmi1 was imputed.
dataset.train %>% select(randid, death, age1, sex1, bmi1) %>%
  mutate(death.model1.predict = predict(model1, type = "response")) %>%
  ggplot(aes(x = age1, y = bmi1, color = death.model1.predict)) +
  geom_point(data = dataset.train %>% filter(bmi1.is_imputed), color = "black") +
  geom_point(size = 10, alpha = .2) +
  scale_color_gradient(low = "#0000ff80", high = "#ff000080")
The estimated probability can be thresholded
(dichotomized) for a binary classification.
Visualize the logistic model (1) with a cutoff of mean
# Dichotomize model 1's fitted probability at the observed death rate of the training data.
cutoff.value = dataset.train$death %>% mean
dataset.train %>% select(randid, death, age1, sex1, bmi1) %>%
  mutate(death.model1.predict = predict(model1, type = "response") > cutoff.value) %>%
  ggplot(aes(x = age1, y = bmi1, color = death.model1.predict)) +
  geom_point(data = dataset.train %>% filter(bmi1.is_imputed), color = "black") +
  geom_point(size = 10, alpha = .2) +
  scale_color_manual(values = c("#0000ff80", "#ff000080"))
Visualize the logistic model (1) with a cutoff of 0.5
# Dichotomize model 1's fitted probability at a fixed cutoff of 0.5.
cutoff.value = 0.5
dataset.train %>% select(randid, death, age1, sex1, bmi1) %>%
  mutate(death.model1.predict = predict(model1, type = "response") > cutoff.value) %>%
  ggplot(aes(x = age1, y = bmi1, color = death.model1.predict)) +
  geom_point(data = dataset.train %>% filter(bmi1.is_imputed), color = "black") +
  geom_point(size = 10, alpha = .2) +
  scale_color_manual(values = c("#0000ff80", "#ff000080"))
Logistic model (2)
# Logistic regression of death on quadratic (raw polynomial) terms of age and BMI.
model2 = glm(
  formula = death ~ poly(age1, 2, raw = T) + poly(bmi1, 2, raw = T),
  family = "binomial",
  data = dataset.train
)
summary(model2) #----
# Call:
# glm(formula = death ~ poly(age1, 2, raw = T) + poly(bmi1, 2,
# raw = T), family = "binomial", data = dataset.train)
# Deviance Residuals:
# Min 1Q Median 3Q Max
# -2.2106 -0.7227 -0.5243 0.8057 2.1919
# Coefficients:
# Estimate Std. Error z value Pr(>|z|)
# (Intercept) 3.806304 5.819063 0.654 0.5130
# poly(age1, 2, raw = T)1 -0.206718 0.198164 -1.043 0.2969
# poly(age1, 2, raw = T)2 0.003295 0.001924 1.713 0.0868 .
# poly(bmi1, 2, raw = T)1 -0.247094 0.251485 -0.983 0.3258
# poly(bmi1, 2, raw = T)2 0.005231 0.004555 1.148 0.2508
# ---
# Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
# (Dispersion parameter for binomial family taken to be 1)
# Null deviance: 568.61 on 449 degrees of freedom
# Residual deviance: 461.12 on 445 degrees of freedom
# AIC: 471.12
# Number of Fisher Scoring iterations: 4
# Attach model 2's predicted probability of death to every training row.
# newdata is passed explicitly (as is done for model 1) so the prediction vector always has
# one entry per row of dataset.train, even if some rows were dropped while fitting.
dataset.train = dataset.train %>%
  mutate(death.model2.predict.prob = predict(model2, type = "response", newdata = .))
Visualize the logistic model (2) with the fitted probability
# Scatter plot of age vs. BMI coloured by model 2's fitted probability.
# Black dots mark the rows whose bmi1 was imputed.
dataset.train %>% select(randid, death, age1, sex1, bmi1) %>%
  mutate(death.model2.predict = predict(model2, type = "response")) %>%
  ggplot(aes(x = age1, y = bmi1, color = death.model2.predict)) +
  geom_point(data = dataset.train %>% filter(bmi1.is_imputed), color = "black") +
  geom_point(size = 10, alpha = .2) +
  scale_color_gradient(low = "#0000ff80", high = "#ff000080")
Visualize the logistic model (2) with a cutoff of mean
# Dichotomize model 2's fitted probability at the observed death rate of the training data.
cutoff.value = dataset.train$death %>% mean
dataset.train %>% select(randid, death, age1, sex1, bmi1) %>%
  mutate(death.model2.predict = predict(model2, type = "response") > cutoff.value) %>%
  ggplot(aes(x = age1, y = bmi1, color = death.model2.predict)) +
  geom_point(data = dataset.train %>% filter(bmi1.is_imputed), color = "black") +
  geom_point(size = 10, alpha = .2) +
  scale_color_manual(values = c("#0000ff80", "#ff000080"))
Visualize the logistic model (2) with a cutoff of 0.5
# Dichotomize model 2's fitted probability at a fixed cutoff of 0.5.
cutoff.value = 0.5
dataset.train %>% select(randid, death, age1, sex1, bmi1) %>%
  mutate(death.model2.predict = predict(model2, type = "response") > cutoff.value) %>%
  ggplot(aes(x = age1, y = bmi1, color = death.model2.predict)) +
  geom_point(data = dataset.train %>% filter(bmi1.is_imputed), color = "black") +
  geom_point(size = 10, alpha = .2) +
  scale_color_manual(values = c("#0000ff80", "#ff000080"))
Performance of a binary classification test:
Sensitivity, specificity, PPV, NPV, ...
Receiver operating characteristic (ROC)
• Let a continuous random variable X denote the probability estimated for the observation.
• Given a threshold parameter T,
• The observation is classified as "positive" if X > T, and "negative" otherwise.
• X follows a probability density f1(x) if the instance actually belongs to class "positive", and
f0(x) if otherwise.
• Therefore,
• the true positive rate $\mathrm{TPR}(T) = \int_T^{\infty} f_1(x)\,dx$
• and the false positive rate $\mathrm{FPR}(T) = \int_T^{\infty} f_0(x)\,dx$
• The ROC curve plots parametrically TPR(T) versus FPR(T)
Area under the ROC curve (AUROC)
• The ROC curve plots parametrically TPR(T) versus FPR(T)
• Area under the ROC curve (AUROC)
• When using normalized units, the AUROC is equal to the probability that a classifier will
rank a randomly chosen positive instance higher than a randomly chosen negative one
(assuming 'positive' ranks higher than 'negative').
• TPR(T): T -> y(x)
• FPR(T): T -> x
• Large T corresponds to a lower value of x
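$$\mathrm{AUROC} = \int_0^1 \mathrm{TPR}\bigl(\mathrm{FPR}^{-1}(x)\bigr)\,dx = \int_{-\infty}^{\infty} \mathrm{TPR}(T)\,f_0(T)\,dT$$
$$= \int_{-\infty}^{\infty}\int_{-\infty}^{\infty} I(T' > T)\,f_1(T')\,f_0(T)\,dT'\,dT = P(X_1 > X_0),$$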
where X1 is the estimated probability for a positive observation, X0 is the estimated
probability for a negative observation, X follows a probability density f1(x) if the instance
actually belongs to class "positive", and f0(x) if otherwise.
Calculate the training AUROC
#@ functions for calculating AUROC ====
# Build the table of confusion-matrix counts behind an ROC curve.
#
# Args:
#   vec_actual:     observed outcomes; an element counts as "positive" when it compares equal to TRUE
#                   (e.g. TRUE or 1).
#   vec_prediction: predicted scores/probabilities, one per element of vec_actual.
# Returns:
#   A tibble with one row per candidate threshold and the columns
#   threshold, TP, FP, FN, TN, Sensitivity, Specificity.
#   Thresholds are -Inf, the midpoints between consecutive distinct predicted values, and Inf;
#   an observation is classified as positive when its prediction is >= the threshold.
function.vec_actual_prediction.threshold_roc = function(vec_actual, vec_prediction) {
  out = tibble(
    # dplyr::lag is named explicitly: stats::lag would not shift the vector.
    threshold = vec_prediction %>% unique %>% sort(decreasing = FALSE) %>% {(. + dplyr::lag(.))/2} %>%
      replace_na(-Inf) %>% {c(., Inf)}
  ) %>%
    mutate(
      TP = threshold %>% map_dbl(function(i) {sum(vec_actual == TRUE & vec_prediction >= i)})
      , FP = threshold %>% map_dbl(function(i) {sum(vec_actual != TRUE & vec_prediction >= i)})
      , FN = threshold %>% map_dbl(function(i) {sum(vec_actual == TRUE & vec_prediction < i)})
      , TN = threshold %>% map_dbl(function(i) {sum(vec_actual != TRUE & vec_prediction < i)})
      , Sensitivity = TP/(TP+FN)
      , Specificity = TN/(TN+FP)
    )
  out
}
# Area under the ROC curve, computed with the trapezoidal rule.
#
# Args:
#   object.threshold_roc: a data frame / tibble with numeric columns `Sensitivity` and `Specificity`,
#                         one row per threshold (row order does not matter).
# Returns:
#   A single number: the area under the curve of TPR (= Sensitivity) against FPR (= 1 - Specificity).
function.threshold_roc.auc = function(object.threshold_roc) {
  TPR = object.threshold_roc$Sensitivity
  FPR = 1 - object.threshold_roc$Specificity
  # Walk along the curve from left to right (ties in FPR are ordered by TPR).
  curve.order = order(FPR, TPR)
  TPR = TPR[curve.order]
  FPR = FPR[curve.order]
  # Step to the next point on the curve; the last point has no successor, hence the trailing 0.
  dFPR = c(diff(FPR), 0)
  dTPR = c(diff(TPR), 0)
  # Rectangle under each segment plus the triangle on top of it.
  sum(TPR * dFPR) + sum(dTPR * dFPR)/2
}
Calculate the training AUROC (1)
# ROC table for model 1 on the training data: observed death vs. model 1's predicted probability.
dataset.train.threshold_roc.model1 = function.vec_actual_prediction.threshold_roc(
  dataset.train$death, dataset.train$death.model1.predict.prob
)
dataset.train.threshold_roc.model1 %>% as_tibble
# # A tibble: 449 x 7
# threshold TP FP FN TN Sensitivity Specificity
# <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
# 1 -Inf 147 303 0 0 1 0
# 2 0.0388 146 303 1 0 0.993 0
# 3 0.0406 146 302 1 1 0.993 0.00330
# 4 0.0456 146 301 1 2 0.993 0.00660
# 5 0.0495 146 300 1 3 0.993 0.00990
# 6 0.0506 146 299 1 4 0.993 0.0132
# 7 0.0521 146 298 1 5 0.993 0.0165
# 8 0.0540 145 298 2 5 0.986 0.0165
# 9 0.0554 145 297 2 6 0.986 0.0198
# 10 0.0566 145 296 2 7 0.986 0.0231
# # ... with 439 more rows
# Training AUROC of model 1, computed from its ROC table.
dataset.train.threshold_roc.model1 %>% function.threshold_roc.auc
# [1] 0.7788554
Calculate the training AUROC (2)
# ROC table for model 2 on the training data: observed death vs. model 2's predicted probability.
dataset.train.threshold_roc.model2 = function.vec_actual_prediction.threshold_roc(
  dataset.train$death, dataset.train$death.model2.predict.prob
)
dataset.train.threshold_roc.model2 %>% as_tibble
# # A tibble: 449 x 7
# threshold TP FP FN TN Sensitivity Specificity
# <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
# 1 -Inf 147 303 0 0 1 0
# 2 0.0917 146 303 1 0 0.993 0
# 3 0.0931 146 302 1 1 0.993 0.00330
# 4 0.0939 146 301 1 2 0.993 0.00660
# 5 0.0944 146 300 1 3 0.993 0.00990
# 6 0.0948 146 299 1 4 0.993 0.0132
# 7 0.0953 146 298 1 5 0.993 0.0165
# 8 0.0955 146 297 1 6 0.993 0.0198
# 9 0.0956 146 296 1 7 0.993 0.0231
# 10 0.0958 146 295 1 8 0.993 0.0264
# # ... with 439 more rows
# Training AUROC of model 2, computed from its ROC table.
dataset.train.threshold_roc.model2 %>% function.threshold_roc.auc
# [1] 0.7787656
Imputation for the test dataset
#@ Loading the test dataset (after modeling is finished~!) -----
# NOTE(review): the URL is empty in this copy of the script -- supply the test-data location before running.
dataset.test = readRDS(url(""))
# The imputation model fitted on the TRAINING data is reused as-is; it is not refitted on the test data.
# #@ imputation.model = glm(bmi1 ~ poly(age1, 2) + sex1, data = dataset.train) ====
dataset.test = dataset.test %>% mutate(
  bmi1.old = bmi1                      # keep the raw (possibly missing) values for auditing
  , bmi1.is_imputed = is.na(bmi1.old)  # flag the rows that receive an imputed value
  , bmi1 = ifelse(bmi1.is_imputed, predict(imputation.model, newdata = dataset.test), bmi1.old)
)
dataset.test %>% select(randid, death, age1, sex1, matches("bmi1")) %>% filter(bmi1.is_imputed)
# # A tibble: 2 x 7
# randid death age1 sex1 bmi1 bmi1.old bmi1.is_imputed
# <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <lgl>
# 1 2668575 1 64 2 26.2 NA TRUE
# 2 6026757 0 39 2 24.1 NA TRUE
Calculate the test AUROC
#@ predict using dataset.test ----
# Score the test rows with both models (fitted on the training data) in a single step.
dataset.test = mutate(
  dataset.test,
  death.model1.predict.prob = predict(model1, newdata = dataset.test, type = "response"),
  death.model2.predict.prob = predict(model2, newdata = dataset.test, type = "response")
)
select(dataset.test, randid, death, age1, sex1, bmi1, matches("predict.prob"))
# # A tibble: 450 x 7
# randid death age1 sex1 bmi1 death.model1.predict.p~ death.model2.predict.~
# <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
# 1 16799 0 50 2 22.9 0.263 0.230
# 2 69134 1 59 2 20.8 0.523 0.551
# 3 97895 1 65 2 30.5 0.776 0.835
# 4 110542 1 63 2 27.1 0.701 0.733
# 5 170881 1 63 2 29.4 0.718 0.753
# 6 192229 1 39 1 32.5 0.104 0.148
# 7 209115 1 60 1 28.6 0.624 0.617
# 8 309808 0 43 1 25.5 0.134 0.131
# 9 388279 0 39 1 28.3 0.0910 0.115
# 10 431963 0 48 1 24.1 0.222 0.192
# # ... with 440 more rows
Calculate the test AUROC
# ROC table for model 1 on the test data: observed death vs. model 1's predicted probability.
dataset.test.threshold_roc.model1 = function.vec_actual_prediction.threshold_roc(
  dataset.test$death, dataset.test$death.model1.predict.prob
)
dataset.test.threshold_roc.model1 %>% as_tibble
# # A tibble: 447 x 7
# threshold TP FP FN TN Sensitivity Specificity
# <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
# 1 -Inf 159 291 0 0 1 0
# 2 0.0414 159 290 0 1 1 0.00344
# 3 0.0434 159 289 0 2 1 0.00687
# 4 0.0466 159 288 0 3 1 0.0103
# 5 0.0489 158 288 1 3 0.994 0.0103
# 6 0.0509 158 287 1 4 0.994 0.0137
# 7 0.0517 158 286 1 5 0.994 0.0172
# 8 0.0524 158 285 1 6 0.994 0.0206
# 9 0.0531 158 284 1 7 0.994 0.0241
# 10 0.0532 158 283 1 8 0.994 0.0275
# # ... with 437 more rows
# Test AUROC of model 1, computed from its ROC table.
dataset.test.threshold_roc.model1 %>% function.threshold_roc.auc
# [1] 0.7603255
Calculate the test AUROC (2)
# ROC table for model 2 on the test data: observed death vs. model 2's predicted probability.
dataset.test.threshold_roc.model2 = function.vec_actual_prediction.threshold_roc(
  dataset.test$death, dataset.test$death.model2.predict.prob
)
dataset.test.threshold_roc.model2 %>% as_tibble
# # A tibble: 447 x 7
# threshold TP FP FN TN Sensitivity Specificity
# <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
# 1 -Inf 159 291 0 0 1 0
# 2 0.0898 159 290 0 1 1 0.00344
# 3 0.0920 159 289 0 2 1 0.00687
# 4 0.0932 159 288 0 3 1 0.0103
# 5 0.0935 159 287 0 4 1 0.0137
# 6 0.0942 159 286 0 5 1 0.0172
# 7 0.0953 159 285 0 6 1 0.0206
# 8 0.0959 159 284 0 7 1 0.0241
# 9 0.0964 159 283 0 8 1 0.0275
# 10 0.0971 159 282 0 9 1 0.0309
# # ... with 437 more rows
# Test AUROC of model 2, computed from its ROC table.
dataset.test.threshold_roc.model2 %>% function.threshold_roc.auc
# [1] 0.7608442
ROC curves overlaid
# Overlay the test-set ROC curves: model 1 in red, model 2 in blue.
ggplot(mapping = aes(x = 1 - Specificity, y = Sensitivity)) +
  geom_line(
    data = arrange(dataset.test.threshold_roc.model1, 1 - Specificity, Sensitivity),
    color = "red"
  ) +
  geom_line(
    data = arrange(dataset.test.threshold_roc.model2, 1 - Specificity, Sensitivity),
    color = "blue"
  ) +
  coord_cartesian(xlim = c(0,1), ylim = c(0,1))
Fit multiple models using for-loop
#@ Fit multiple models using for-loop, and then save the models as R list of objects. =====
# Fit logistic models of increasing flexibility: model.list[[i]] uses degree-i polynomials
# of both age1 and bmi1, for i = 1..10.
model.list = list()
for (i in seq_len(10)) {
  myformula = as.formula(paste0("death ~ poly(age1, ", i, ")", " + ", "poly(bmi1, ", i, ")"))
  model.list[[i]] = glm(myformula, data = dataset.train, family = "binomial")
}
Calculate the training AUROC & test AUROC for multiple models
# Make a table that shows the training AUROC and test AUROC for each model in the model.list. -----
# One row per model in model.list: its index i, its AUROC on the training data and on the test data.
# Each AUROC is obtained by predicting probabilities for the data set, building the ROC table
# and integrating it.
df = data.frame(
  i = seq_along(model.list)
  , trainAUROC = model.list %>% map_dbl(function(model.object) {
    dataset.train %>% {
      function.vec_actual_prediction.threshold_roc(.$death, predict(model.object, type = "response", newdata = .))
    } %>% function.threshold_roc.auc
  })
  , testAUROC = model.list %>% map_dbl(function(model.object) {
    dataset.test %>% {
      function.vec_actual_prediction.threshold_roc(.$death, predict(model.object, type = "response", newdata = .))
    } %>% function.threshold_roc.auc
  })
)
df
# i trainAUROC testAUROC
# 1 1 0.7788554 0.7603255
# 2 2 0.7787656 0.7608442
# 3 3 0.7779349 0.7668525
# 4 4 0.7819762 0.7646480
# 5 5 0.7822680 0.7607577
# 6 6 0.7821109 0.7607577
# 7 7 0.7861296 0.7568242
# 8 8 0.7820884 0.7510752
# 9 9 0.7829640 0.7449156
# 10 10 0.8003413 0.7516156
#@ Remove the test dataset (before any additional modeling~!) -----
Visualize the training AUROC & test AUROC for multiple models
# Training vs. test AUROC against polynomial degree.
# The `+` must end a line rather than start the next one; otherwise R finishes the plot after
# geom_point() and then fails on the stray `+ geom_line()`.
df %>% gather(key, value, trainAUROC, testAUROC) %>%
  ggplot(aes(x = i, y = value, color = key)) +
  geom_point() +
  geom_line()
Visualize the training dataset with the labels
# Scatter plot of age vs. BMI coloured by the observed outcome (death).
# Black dots mark the rows whose bmi1 was imputed.
dataset.train %>% ggplot(aes(x = age1, y = bmi1, color = death)) +
  geom_point(data = dataset.train %>% filter(bmi1.is_imputed), color = "black") +
  geom_point(size = 10, alpha = .2) +
  scale_color_gradient(low = "#0000ff80", high = "#ff000080")
# Visualize the logistic models (10), (9), (8), (5), (3), (2) and (1) with a cutoff of mean ----
# One plot per model, from the most to the least flexible. The cutoff is the observed death
# rate in the training data, so it is the same for every model and is computed once.
cutoff.value = dataset.train$death %>% mean
for (i in c(10, 9, 8, 5, 3, 2, 1)) {
  model.plot = dataset.train %>% select(randid, death, age1, sex1, bmi1) %>%
    mutate(death.predict = predict(model.list[[i]], type = "response") > cutoff.value) %>%
    ggplot(aes(x = age1, y = bmi1, color = death.predict)) +
    geom_point(data = dataset.train %>% filter(bmi1.is_imputed), color = "black") +
    geom_point(size = 10, alpha = .2) +
    scale_color_manual(values = c("#0000ff80", "#ff000080")) +
    theme_minimal() +
    labs(title = paste0("model.list[[", i, "]]", " with a cutoff of mean"))
  # ggplot objects are not auto-printed inside a loop, so print explicitly.
  print(model.plot)
}
K-fold "random" split of the training dataset
#@ K-fold "random" split of the training dataset -----
# Assign fold labels 1..k cyclically to the rows of `data`.
#
# Args:
#   data: a data frame (only its number of rows is used).
#   k:    number of folds (default 5).
# Returns:
#   An integer vector of length nrow(data): 1, 2, ..., k, 1, 2, ... (not shuffled).
#   A data frame with zero rows yields an empty vector.
function.vec.fold.index = function(data, k = 5) rep_len(seq_len(k), nrow(data))
# Print the fold labels in their unshuffled order (labels 1..5 repeated cyclically down the rows).
dataset.train %>% function.vec.fold.index(k = 5) %>% dput
# Print the same labels after a random permutation; the seed is fixed first so the shuffle is reproducible.
set.seed(12345); dataset.train %>% function.vec.fold.index(k = 5) %>% sample %>% dput
# c(1L, 2L, 3L, 4L, 5L, 1L, 2L, 3L, 4L, 5L, 1L, 2L, 3L, 4L, 5L,
# 1L, 2L, 3L, 4L, 5L, 1L, 2L, 3L, 4L, 5L, 1L, 2L, 3L, 4L, 5L, 1L,
# 2L, 3L, 4L, 5L, 1L, 2L, 3L, 4L, 5L, 1L, 2L, 3L, 4L, 5L, 1L, 2L,
# 3L, 4L, 5L, 1L, 2L, 3L, 4L, 5L, 1L, 2L, 3L, 4L, 5L, 1L, 2L, 3L,
# 4L, 5L, 1L, 2L, 3L, 4L, 5L, 1L, 2L, 3L, 4L, 5L, 1L, 2L, 3L, 4L,
# 5L, 1L, 2L, 3L, 4L, 5L, 1L, 2L, 3L, 4L, 5L, 1L, 2L, 3L, 4L, 5L,
# 1L, 2L, 3L, 4L, 5L, 1L, 2L, 3L, 4L, 5L, 1L, 2L, 3L, 4L, 5L, 1L,
# 2L, 3L, 4L, 5L, 1L, 2L, 3L, 4L, 5L, 1L, 2L, 3L, 4L, 5L, 1L, 2L,
# 3L, 4L, 5L, 1L, 2L, 3L, 4L, 5L, 1L, 2L, 3L, 4L, 5L, 1L, 2L, 3L,
# 4L, 5L, 1L, 2L, 3L, 4L, 5L, 1L, 2L, 3L, 4L, 5L, 1L, 2L, 3L, 4L,
# 5L, 1L, 2L, 3L, 4L, 5L, 1L, 2L, 3L, 4L, 5L, 1L, 2L, 3L, 4L, 5L,
# 1L, 2L, 3L, 4L, 5L, 1L, 2L, 3L, 4L, 5L, 1L, 2L, 3L, 4L, 5L, 1L,
# 2L, 3L, 4L, 5L, 1L, 2L, 3L, 4L, 5L, 1L, 2L, 3L, 4L, 5L, 1L, 2L,
# 3L, 4L, 5L, 1L, 2L, 3L, 4L, 5L, 1L, 2L, 3L, 4L, 5L, 1L, 2L, 3L,
# 4L, 5L, 1L, 2L, 3L, 4L, 5L, 1L, 2L, 3L, 4L, 5L, 1L, 2L, 3L, 4L,
# 5L, 1L, 2L, 3L, 4L, 5L, 1L, 2L, 3L, 4L, 5L, 1L, 2L, 3L, 4L, 5L,
# 1L, 2L, 3L, 4L, 5L, 1L, 2L, 3L, 4L, 5L, 1L, 2L, 3L, 4L, 5L, 1L,
# 2L, 3L, 4L, 5L, 1L, 2L, 3L, 4L, 5L, 1L, 2L, 3L, 4L, 5L, 1L, 2L,
# 3L, 4L, 5L, 1L, 2L, 3L, 4L, 5L, 1L, 2L, 3L, 4L, 5L, 1L, 2L, 3L,
# 4L, 5L, 1L, 2L, 3L, 4L, 5L, 1L, 2L, 3L, 4L, 5L, 1L, 2L, 3L, 4L,
# 5L, 1L, 2L, 3L, 4L, 5L, 1L, 2L, 3L, 4L, 5L, 1L, 2L, 3L, 4L, 5L,
# 1L, 2L, 3L, 4L, 5L, 1L, 2L, 3L, 4L, 5L, 1L, 2L, 3L, 4L, 5L, 1L,
# 2L, 3L, 4L, 5L, 1L, 2L, 3L, 4L, 5L, 1L, 2L, 3L, 4L, 5L, 1L, 2L,
# 3L, 4L, 5L, 1L, 2L, 3L, 4L, 5L, 1L, 2L, 3L, 4L, 5L, 1L, 2L, 3L,
# 4L, 5L, 1L, 2L, 3L, 4L, 5L, 1L, 2L, 3L, 4L, 5L, 1L, 2L, 3L, 4L,
# 5L, 1L, 2L, 3L, 4L, 5L, 1L, 2L, 3L, 4L, 5L, 1L, 2L, 3L, 4L, 5L,
# 1L, 2L, 3L, 4L, 5L, 1L, 2L, 3L, 4L, 5L, 1L, 2L, 3L, 4L, 5L, 1L,
# 2L, 3L, 4L, 5L, 1L, 2L, 3L, 4L, 5L, 1L, 2L, 3L, 4L, 5L, 1L, 2L,
# 3L, 4L, 5L)
# c(5L, 4L, 1L, 2L, 4L, 5L, 5L, 1L, 2L, 2L, 1L, 2L, 3L, 1L, 1L,
# 2L, 4L, 5L, 3L, 1L, 1L, 1L, 4L, 3L, 5L, 1L, 2L, 1L, 1L, 1L, 4L,
# 3L, 4L, 5L, 4L, 1L, 5L, 4L, 5L, 1L, 1L, 1L, 4L, 5L, 1L, 1L, 5L,
# 3L, 3L, 1L, 1L, 1L, 1L, 5L, 1L, 3L, 3L, 2L, 1L, 3L, 4L, 1L, 3L,
# 3L, 3L, 2L, 5L, 3L, 5L, 1L, 2L, 2L, 4L, 2L, 4L, 3L, 3L, 5L, 5L,
# 1L, 3L, 2L, 4L, 3L, 3L, 2L, 1L, 3L, 1L, 2L, 5L, 2L, 4L, 5L, 2L,
# 3L, 1L, 5L, 4L, 5L, 4L, 1L, 5L, 2L, 1L, 2L, 5L, 4L, 1L, 4L, 3L,
# 5L, 1L, 3L, 3L, 4L, 4L, 5L, 3L, 2L, 5L, 5L, 5L, 4L, 2L, 1L, 1L,
# 1L, 5L, 4L, 1L, 2L, 4L, 1L, 4L, 1L, 4L, 3L, 1L, 2L, 2L, 4L, 4L,
# 4L, 5L, 2L, 4L, 5L, 3L, 3L, 5L, 3L, 3L, 5L, 2L, 1L, 1L, 2L, 3L,
# 4L, 5L, 2L, 1L, 4L, 4L, 3L, 3L, 5L, 1L, 4L, 2L, 1L, 4L, 2L, 3L,
# 2L, 3L, 1L, 2L, 5L, 1L, 1L, 4L, 5L, 3L, 4L, 2L, 2L, 3L, 1L, 2L,
# 2L, 5L, 3L, 3L, 2L, 2L, 5L, 1L, 1L, 3L, 3L, 5L, 5L, 4L, 3L, 2L,
# 2L, 5L, 2L, 2L, 2L, 2L, 3L, 2L, 2L, 2L, 5L, 1L, 5L, 1L, 4L, 1L,
# 2L, 3L, 5L, 1L, 3L, 5L, 3L, 3L, 3L, 2L, 2L, 4L, 1L, 2L, 3L, 4L,
# 1L, 5L, 5L, 4L, 3L, 4L, 4L, 5L, 1L, 4L, 3L, 4L, 1L, 2L, 2L, 4L,
# 5L, 2L, 1L, 4L, 4L, 1L, 1L, 3L, 3L, 5L, 5L, 3L, 5L, 2L, 4L, 2L,
# 2L, 3L, 2L, 2L, 3L, 2L, 1L, 3L, 2L, 4L, 4L, 3L, 2L, 5L, 4L, 5L,
# 1L, 3L, 4L, 3L, 1L, 5L, 1L, 4L, 4L, 1L, 3L, 5L, 4L, 2L, 3L, 3L,
# 2L, 5L, 4L, 2L, 2L, 5L, 3L, 4L, 5L, 4L, 4L, 4L, 5L, 1L, 2L, 5L,
# 4L, 5L, 2L, 2L, 5L, 1L, 1L, 3L, 4L, 5L, 3L, 2L, 5L, 4L, 3L, 3L,
# 2L, 5L, 5L, 5L, 5L, 4L, 1L, 4L, 4L, 5L, 1L, 1L, 4L, 5L, 2L, 3L,
# 2L, 3L, 3L, 3L, 4L, 1L, 4L, 4L, 5L, 1L, 5L, 5L, 5L, 3L, 5L, 3L,
# 3L, 2L, 2L, 3L, 2L, 3L, 2L, 1L, 3L, 3L, 1L, 2L, 5L, 3L, 4L, 1L,
# 5L, 3L, 5L, 4L, 2L, 1L, 3L, 2L, 1L, 4L, 3L, 1L, 4L, 3L, 5L, 5L,
# 5L, 4L, 4L, 1L, 1L, 5L, 5L, 3L, 2L, 3L, 5L, 2L, 3L, 5L, 1L, 5L,
# 4L, 2L, 5L, 5L, 1L, 1L, 4L, 3L, 4L, 2L, 1L, 4L, 3L, 4L, 4L, 4L,
# 5L, 3L, 5L, 2L, 4L, 1L, 4L, 3L, 2L, 4L, 4L, 1L, 4L, 2L, 2L, 2L,
# 2L, 2L, 4L)
K-fold "random" split of the training dataset
#@ K-fold split of the training dataset -----
# Keep the original row position as an explicit `rowname` column.
dataset.train = rownames_to_column(dataset.train)
# Do not forget to set the random seed, before performing any randomization tasks (e.g., random sampling).
set.seed(12345)
# Shuffle the cyclic 1..5 labels so each row lands in a random fold.
dataset.train$fold.index = sample(function.vec.fold.index(dataset.train, k = 5))
select(dataset.train, rowname, death, age1, bmi1, fold.index)
# # A tibble: 450 x 5
# rowname death age1 bmi1 fold.index
# <chr> <dbl> <dbl> <dbl> <int>
# 1 1 1 39 22.4 5
# 2 2 1 47 24.2 4
# 3 3 1 52 40.1 1
# 4 4 0 42 28.9 2
# 5 5 1 53 21.5 4
# 6 6 0 47 19.7 5
# 7 7 0 56 23.6 5
# 8 8 0 41 30.6 1
# 9 9 0 53 18.2 2
# 10 10 0 46 20.2 2
# # ... with 440 more rows
K-fold "random" split of the training dataset
# Rows used for fitting when fold 1 is held out ...
dataset.train %>% filter(fold.index != 1) %>% select(rowname, death, age1, bmi1, fold.index)
# ... and the held-out rows of fold 1 itself.
dataset.train %>% filter(fold.index == 1) %>% select(rowname, death, age1, bmi1, fold.index)
# > dataset.train %>% select(rowname, death, age1, bmi1, fold.index) %>% filter(fold.index != 1)
# # A tibble: 360 x 5
# rowname death age1 bmi1 fold.index
# <chr> <dbl> <dbl> <dbl> <int>
# 1 1 1 39 22.4 5
# 2 2 1 47 24.2 4
# 3 4 0 42 28.9 2
# 4 5 1 53 21.5 4
# 5 6 0 47 19.7 5
# 6 7 0 56 23.6 5
# 7 9 0 53 18.2 2
# 8 10 0 46 20.2 2
# 9 12 0 40 26.8 2
# 10 13 1 56 33.0 3
# # ... with 350 more rows
# > dataset.train %>% select(rowname, death, age1, bmi1, fold.index) %>% filter(fold.index == 1)
# # A tibble: 90 x 5
# rowname death age1 bmi1 fold.index
# <chr> <dbl> <dbl> <dbl> <int>
# 1 3 1 52 40.1 1
# 2 8 0 41 30.6 1
# 3 11 0 41 20.5 1
# 4 14 1 47 22.0 1
# 5 15 0 38 22.8 1
# 6 20 1 56 30.8 1
# 7 21 0 60 27.3 1
# 8 22 0 64 27.5 1
# 9 26 0 39 31.0 1
# 10 28 0 55 27.8 1
# # ... with 80 more rows
K-fold "random" split of the training dataset
## Visual check of the distribution of the folds ----
# Each point is one training row, coloured by the fold it was assigned to.
ggplot(dataset.train, aes(x = age1, y = bmi1, color = as.factor(fold.index))) +
  geom_point()
Fit multiple models in each cross-validation folds
#@ Nested for-loop: (1) Iteration of folds for cross-validation (2) Fit multiple models using for-loop =====
# Save the models as a "nested" list of objects to save the results from "nested" for-loop.
# cv.model.list[[i.fold]][[i]] is the degree-i polynomial logistic model fitted on every
# training row EXCEPT those in fold i.fold (which is held out for validation).
max.polynomial = 5
cv.model.list = list()
for (i.fold in sort(unique(dataset.train$fold.index))) {
  cv.model.list[[i.fold]] = list()
  # Fitting data for this fold: all rows outside the held-out fold.
  dataset = dataset.train %>% filter(fold.index != i.fold)
  for (i in seq_len(max.polynomial)) {
    myformula = as.formula(paste0("death ~ poly(age1, ", i, ")", " + ", "poly(bmi1, ", i, ")"))
    cv.model.list[[i.fold]][[i]] = glm(myformula, data = dataset, family = "binomial")
  }
}
Calculate the training AUROC & validation AUROC for multiple
models in each cross-validation folds
#@ Define the loss function (optimization objective). -----
# Cf) You may define any function to avoid repetitive codes.
# Area under the ROC curve for a binary outcome.
#
# Uses the rank-sum identity: AUROC is the probability that a randomly chosen positive case
# receives a higher score than a randomly chosen negative case, with ties counted as 1/2.
#
# Args:
#   y:    observed outcome; an element is "positive" when it equals 1 (or TRUE).
#   yhat: predicted score/probability for each element of y (any monotone scale).
# Returns:
#   A single number in [0, 1]; NA when y does not contain both classes.
AUROC = function(y, yhat) {
  stopifnot(length(y) == length(yhat))
  is.positive = (y == 1)
  n.positive = as.numeric(sum(is.positive))
  n.negative = as.numeric(sum(!is.positive))
  if (n.positive == 0 || n.negative == 0) return(NA_real_)
  # rank() averages tied scores, which gives tied positive/negative pairs a weight of 1/2.
  (sum(rank(yhat)[is.positive]) - n.positive * (n.positive + 1) / 2) / (n.positive * n.negative)
}
# Make a table that shows the training AUROC and validation AUROC for each cross-validation fold &
# each model in the "nested" model list. -----
# Number of folds, taken from the fold labels actually present in the training data.
k = length(unique(dataset.train$fold.index))
cv.df = tibble(
  cv = rep(seq_len(k), each = max.polynomial)
  , polynomial = rep(seq_len(max.polynomial), k)
) %>% mutate(
  # Training score: the model's own response (`$y`) vs. its fitted probabilities.
  trainAUROC = map2_dbl(cv, polynomial, function(i.fold, i) {
    cv.model.list[[i.fold]][[i]] %>% {AUROC(.$y, predict(., type = "response"))}
  })
  # Validation score: observed death in the held-out fold vs. predictions for those rows.
  , cvAUROC = map2_dbl(cv, polynomial, function(i.fold, i) {
    dataset.validation = dataset.train %>% filter(fold.index == i.fold)
    AUROC(
      dataset.validation$death,
      predict(cv.model.list[[i.fold]][[i]], newdata = dataset.validation, type = "response")
    )
  })
)
cv.df
# # A tibble: 50 x 4
# cv polynomial trainAUROC cvAUROC
# <int> <int> <dbl> <dbl>
# 1 1 1 22.5 19.3
# 2 1 2 18.9 16.3
# 3 1 3 18.9 16.3
# 4 1 4 18.5 17.4
# 5 1 5 17.6 17.4
# 6 1 6 17.4 16.9
# 7 1 7 17.2 16.3
# 8 1 8 17.0 17.9
# 9 1 9 17.0 17.7
# 10 1 10 17.0 17.6
# # ... with 40 more rows
Calculate the (aggregated) training AUROC & (aggregated) cv
AUROC for multiple models
# Average the per-fold training and validation scores for each polynomial degree -----
cv.df.summarize = summarize_all(
  group_by(select(cv.df, -cv), polynomial),
  mean
)
# # A tibble: 10 x 3
# polynomial trainAUROC cvAUROC
# <int> <dbl> <dbl>
# 1 1 0.779 0.782
# 2 2 0.779 0.783
# 3 3 0.779 0.780
# 4 4 0.783 0.777
# 5 5 0.783 0.772
# 6 6 0.786 0.763
# 7 7 0.789 0.758
# 8 8 0.787 0.753
# 9 9 0.787 0.749
# 10 10 0.803 0.761
Visualize the (aggregated) training AUROC & cv AUROC for
multiple models
# Fold-averaged training vs. cross-validation score against polynomial degree.
ggplot(
  gather(cv.df.summarize, key, value, trainAUROC, cvAUROC),
  aes(x = polynomial, y = value, color = key)
) +
  geom_point() +
  geom_line()
MH prediction modeling and validation in r (2) classification 190709

