MH prediction modeling and validation in r (2) classification 190709
Install required R software packages
for (packagename in c("tidyverse", "openxlsx")) {
if(!require(packagename, character.only = T)) {install.packages(packagename);
require(packagename, character.only = T)}
}
Imputation for the training dataset
#@ Load only the training data. Make sure the test data is not loaded before modeling is finished. ----
# If the test data is not sequestered, make a random split and save both parts first. Then load only the training
# data, so that the test data remains unseen until modeling is finished. ----
dataset.train = readRDS(url("https://github.com/mkim0710/PH207x/blob/master/fhs.index100le10.rds?raw=true"))
# Here, we will use a single regression imputation for simplicity. However, multiple imputation is recommended.
imputation.model = glm(bmi1 ~ poly(age1, 2) + sex1, data = dataset.train)
dataset.train = dataset.train %>% mutate(
bmi1.old = bmi1
, bmi1.is_imputed = bmi1 %>% {ifelse(is.na(.), T, F)}
, bmi1 = bmi1 %>% {ifelse(is.na(.), predict(imputation.model, newdata = dataset.train), .)} # newdata is required: predict() without it returns fitted values for the complete cases only
)
dataset.train %>% select(randid, death, age1, sex1, matches("bmi1")) %>% filter(bmi1.is_imputed)
# # A tibble: 2 x 7
# randid death age1 sex1 bmi1 bmi1.old bmi1.is_imputed
# <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <lgl>
# 1 1600765 0 45 2 26.9 NA TRUE
# 2 6921140 1 64 1 26.3 NA TRUE
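The single imputation above is easy to follow; a multiple-imputation sketch with the mice package (an added illustration, not part of the original workflow; it assumes mice is installed and uses bmi1.old, the pre-imputation values preserved above) might look like:
library(mice)
# impute the preserved pre-imputation bmi1 values; m = 5 completed datasets
imp = mice(dataset.train %>% select(death, age1, sex1, bmi1 = bmi1.old), m = 5, seed = 12345)
fit = with(imp, glm(death ~ age1 + bmi1, family = binomial)) # fit the model in each completed dataset
summary(pool(fit)) # pool the estimates across imputations (Rubin's rules)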
Visualize the training dataset with the labels
dataset.train %>% ggplot(aes(x = age1, y = bmi1, color = death)) +
geom_point(data = dataset.train %>% filter(bmi1.is_imputed), color = "black") +
geom_point(size = 10, alpha = .2) + scale_color_gradient(low = "#0000ff80", high = "#ff000080") +
theme_minimal()
Logistic model (1)
model1 = glm(death ~ poly(age1, 1, raw = T) + poly(bmi1, 1, raw = T), family = "binomial", data = dataset.train)
model1 %>% summary #----
# Call:
# glm(formula = death ~ poly(age1, 1, raw = T) + poly(bmi1, 1,
# raw = T), family = "binomial", data = dataset.train)
#
# Deviance Residuals:
# Min 1Q Median 3Q Max
# -1.8968 -0.7632 -0.4850 0.8773 2.5501
#
# Coefficients:
# Estimate Std. Error z value Pr(>|z|)
# (Intercept) -8.51201 1.06490 -7.993 1.31e-15 ***
# poly(age1, 1, raw = T) 0.13326 0.01533 8.696 < 2e-16 ***
# poly(bmi1, 1, raw = T) 0.03576 0.02895 1.235 0.217
# ---
# Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
#
# (Dispersion parameter for binomial family taken to be 1)
#
# Null deviance: 568.61 on 449 degrees of freedom
# Residual deviance: 465.63 on 447 degrees of freedom
# AIC: 471.63
#
# Number of Fisher Scoring iterations: 4
dataset.train = dataset.train %>%
mutate(death.model1.predict.prob = predict(model1, type = "response", newdata = .))
Visualize the logistic model (1) with the fitted probability
dataset.train %>% select(randid, death, age1, sex1, bmi1) %>%
mutate(death.model1.predict = predict(model1, type = "response")) %>%
ggplot(aes(x = age1, y = bmi1, color = death.model1.predict)) +
geom_point(data = dataset.train %>% filter(bmi1.is_imputed), color = "black") +
geom_point(size = 10, alpha = .2) + scale_color_gradient(low = "#0000ff80", high = "#ff000080") +
theme_minimal()
The estimated probability can be thresholded (dichotomized) for binary classification.
https://github.com/kenhktsui/Visualizing-Logistic-Regression
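As a minimal added sketch (not in the original deck), the dichotomized predictions from model1 can be cross-tabulated against the observed outcomes:
# confusion table for model1 at a cutoff of 0.5
predicted.class = as.integer(predict(model1, type = "response") > 0.5)
table(actual = dataset.train$death, predicted = predicted.class)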
Visualize the logistic model (1) with a cutoff at the mean
cutoff.value = dataset.train$death %>% mean
dataset.train %>% select(randid, death, age1, sex1, bmi1) %>%
mutate(death.model1.predict = predict(model1, type = "response") > cutoff.value) %>%
ggplot(aes(x = age1, y = bmi1, color = death.model1.predict)) +
geom_point(data = dataset.train %>% filter(bmi1.is_imputed), color = "black") +
geom_point(size = 10, alpha = .2) + scale_color_manual(values = c("#0000ff80", "#ff000080")) +
theme_minimal()
Visualize the logistic model (1) with a cutoff of 0.5
cutoff.value = 0.5
dataset.train %>% select(randid, death, age1, sex1, bmi1) %>%
mutate(death.model1.predict = predict(model1, type = "response") > cutoff.value) %>%
ggplot(aes(x = age1, y = bmi1, color = death.model1.predict)) +
geom_point(data = dataset.train %>% filter(bmi1.is_imputed), color = "black") +
geom_point(size = 10, alpha = .2) + scale_color_manual(values = c("#0000ff80", "#ff000080")) +
theme_minimal()
Logistic model (2)
model2 = glm(death ~ poly(age1, 2, raw = T) + poly(bmi1, 2, raw = T), family = "binomial", data = dataset.train)
model2 %>% summary #----
# Call:
# glm(formula = death ~ poly(age1, 2, raw = T) + poly(bmi1, 2,
# raw = T), family = "binomial", data = dataset.train)
#
# Deviance Residuals:
# Min 1Q Median 3Q Max
# -2.2106 -0.7227 -0.5243 0.8057 2.1919
#
# Coefficients:
# Estimate Std. Error z value Pr(>|z|)
# (Intercept) 3.806304 5.819063 0.654 0.5130
# poly(age1, 2, raw = T)1 -0.206718 0.198164 -1.043 0.2969
# poly(age1, 2, raw = T)2 0.003295 0.001924 1.713 0.0868 .
# poly(bmi1, 2, raw = T)1 -0.247094 0.251485 -0.983 0.3258
# poly(bmi1, 2, raw = T)2 0.005231 0.004555 1.148 0.2508
# ---
# Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
#
# (Dispersion parameter for binomial family taken to be 1)
#
# Null deviance: 568.61 on 449 degrees of freedom
# Residual deviance: 461.12 on 445 degrees of freedom
# AIC: 471.12
#
# Number of Fisher Scoring iterations: 4
dataset.train = dataset.train %>%
mutate(death.model2.predict.prob = predict(model2, type = "response"))
Visualize the logistic model (2) with the fitted probability
dataset.train %>% select(randid, death, age1, sex1, bmi1) %>%
mutate(death.model2.predict = predict(model2, type = "response")) %>%
ggplot(aes(x = age1, y = bmi1, color = death.model2.predict)) +
geom_point(data = dataset.train %>% filter(bmi1.is_imputed), color = "black") +
geom_point(size = 10, alpha = .2) + scale_color_gradient(low = "#0000ff80", high = "#ff000080") +
theme_minimal()
Visualize the logistic model (2) with a cutoff at the mean
cutoff.value = dataset.train$death %>% mean
dataset.train %>% select(randid, death, age1, sex1, bmi1) %>%
mutate(death.model2.predict = predict(model2, type = "response") > cutoff.value) %>%
ggplot(aes(x = age1, y = bmi1, color = death.model2.predict)) +
geom_point(data = dataset.train %>% filter(bmi1.is_imputed), color = "black") +
geom_point(size = 10, alpha = .2) + scale_color_manual(values = c("#0000ff80", "#ff000080")) +
theme_minimal()
Visualize the logistic model (2) with a cutoff of 0.5
cutoff.value = 0.5
dataset.train %>% select(randid, death, age1, sex1, bmi1) %>%
mutate(death.model2.predict = predict(model2, type = "response") > cutoff.value) %>%
ggplot(aes(x = age1, y = bmi1, color = death.model2.predict)) +
geom_point(data = dataset.train %>% filter(bmi1.is_imputed), color = "black") +
geom_point(size = 10, alpha = .2) + scale_color_manual(values = c("#0000ff80", "#ff000080")) +
theme_minimal()
Performance of a binary classification test:
Sensitivity, specificity, PPV, NPV, ...
https://en.wikipedia.org/wiki/Receiver_operating_characteristic
Receiver operating characteristic (ROC)
https://en.wikipedia.org/wiki/Receiver_operating_characteristic
• Let a continuous random variable X denote the probability estimated for the observation.
• Given a threshold parameter T:
• the observation is classified as "positive" if X > T, and "negative" otherwise;
• X follows a probability density $f_1(x)$ if the instance actually belongs to class "positive", and $f_0(x)$ otherwise.
• Therefore:
• the true positive rate is $\mathrm{TPR}(T) = \int_T^{\infty} f_1(x)\,dx$,
• and the false positive rate is $\mathrm{FPR}(T) = \int_T^{\infty} f_0(x)\,dx$.
• The ROC curve plots TPR(T) versus FPR(T) parametrically.
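A numeric sketch of these definitions (an added illustration using model1 from above): the empirical TPR and FPR at an arbitrary threshold T = 0.3.
prob = predict(model1, type = "response")
T.value = 0.3 # an arbitrary threshold for illustration
c(TPR = mean(prob[dataset.train$death == 1] > T.value), # share of positives above T
FPR = mean(prob[dataset.train$death == 0] > T.value)) # share of negatives above T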
Area under the ROC curve (AUROC)
https://en.wikipedia.org/wiki/Receiver_operating_characteristic
• The ROC curve plots TPR(T) versus FPR(T) parametrically.
• When using normalized units, the AUROC is equal to the probability that a classifier will rank a randomly chosen positive instance higher than a randomly chosen negative one (assuming 'positive' ranks higher than 'negative').
• TPR(T): T -> y(x)
• FPR(T): T -> x
• A large T corresponds to a lower value of x.

$$\mathrm{AUROC} = \int_{x=0}^{1} \mathrm{TPR}\left(\mathrm{FPR}^{-1}(x)\right) dx = \int_{\infty}^{-\infty} \mathrm{TPR}(T)\, \mathrm{FPR}'(T)\, dT = \int_{-\infty}^{\infty} \int_{-\infty}^{\infty} I(T' > T)\, f_1(T')\, f_0(T)\, dT'\, dT = P(X_1 > X_0),$$

where $X_1$ is the estimated probability for a positive observation, $X_0$ is the estimated probability for a negative observation, and X follows a probability density $f_1(x)$ if the instance actually belongs to class "positive", and $f_0(x)$ otherwise.
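The last identity, AUROC = P(X1 > X0), can also be computed directly (an added sketch; a tie between a positive and a negative counts as 1/2):
prob = predict(model1, type = "response")
pos = prob[dataset.train$death == 1] # estimated probabilities for positives
neg = prob[dataset.train$death == 0] # estimated probabilities for negatives
mean(outer(pos, neg, ">")) + 0.5 * mean(outer(pos, neg, "==")) # empirical P(X1 > X0)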
Calculate the training AUROC
#@ functions for calculating AUROC ====
function.vec_actual_prediction.threshold_roc = function(vec_actual, vec_prediction) {
# candidate thresholds: midpoints between consecutive sorted unique predictions, bracketed by -Inf and +Inf
out = tibble(threshold = vec_prediction %>% unique %>% sort(decreasing = F) %>% {(. + lag(.))/2} %>%
replace_na(-Inf) %>% {c(., Inf)} ) %>%
mutate(
TP = threshold %>% map_dbl(function(i) {sum(vec_actual == T & vec_prediction >= i)})
, FP = threshold %>% map_dbl(function(i) {sum(vec_actual != T & vec_prediction >= i)})
, FN = threshold %>% map_dbl(function(i) {sum(vec_actual == T & vec_prediction < i)})
, TN = threshold %>% map_dbl(function(i) {sum(vec_actual != T & vec_prediction < i)})
, Sensitivity = TP/(TP+FN)
, Specificity = TN/(TN+FP)
)
out
}
function.threshold_roc.auc = function(object.threshold_roc) {
tmp_df = object.threshold_roc %>%
mutate(
TPR = Sensitivity
, FPR = 1 - Specificity
) %>%
arrange(FPR, TPR) %>%
mutate(
dFPR = c(diff(FPR), 0)
, dTPR = c(diff(TPR), 0)
)
# trapezoidal rule: rectangle areas plus triangular corrections at each step
tmp_df %>% with(sum(TPR * dFPR) + sum(dTPR * dFPR)/2)
}
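As a cross-check (an addition; it assumes the pROC package is installed, which the original deck does not use), the same area can be computed with pROC:
library(pROC)
auc(roc(dataset.train$death, dataset.train$death.model1.predict.prob))
# should agree with function.threshold_roc.auc up to numerical details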
Calculate the training AUROC (1)
dataset.train.threshold_roc.model1 = function.vec_actual_prediction.threshold_roc(dataset.train$death, dataset.train$death.model1.predict.prob)
dataset.train.threshold_roc.model1 %>% as.tibble
# # A tibble: 449 x 7
# threshold TP FP FN TN Sensitivity Specificity
# <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
# 1 -Inf 147 303 0 0 1 0
# 2 0.0388 146 303 1 0 0.993 0
# 3 0.0406 146 302 1 1 0.993 0.00330
# 4 0.0456 146 301 1 2 0.993 0.00660
# 5 0.0495 146 300 1 3 0.993 0.00990
# 6 0.0506 146 299 1 4 0.993 0.0132
# 7 0.0521 146 298 1 5 0.993 0.0165
# 8 0.0540 145 298 2 5 0.986 0.0165
# 9 0.0554 145 297 2 6 0.986 0.0198
# 10 0.0566 145 296 2 7 0.986 0.0231
# # ... with 439 more rows
dataset.train.threshold_roc.model1 %>% function.threshold_roc.auc
# [1] 0.7788554
Calculate the training AUROC (2)
dataset.train.threshold_roc.model2 = function.vec_actual_prediction.threshold_roc(dataset.train$death, dataset.train$death.model2.predict.prob)
dataset.train.threshold_roc.model2 %>% as.tibble
# # A tibble: 449 x 7
# threshold TP FP FN TN Sensitivity Specificity
# <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
# 1 -Inf 147 303 0 0 1 0
# 2 0.0917 146 303 1 0 0.993 0
# 3 0.0931 146 302 1 1 0.993 0.00330
# 4 0.0939 146 301 1 2 0.993 0.00660
# 5 0.0944 146 300 1 3 0.993 0.00990
# 6 0.0948 146 299 1 4 0.993 0.0132
# 7 0.0953 146 298 1 5 0.993 0.0165
# 8 0.0955 146 297 1 6 0.993 0.0198
# 9 0.0956 146 296 1 7 0.993 0.0231
# 10 0.0958 146 295 1 8 0.993 0.0264
# # ... with 439 more rows
dataset.train.threshold_roc.model2 %>% function.threshold_roc.auc
# [1] 0.7787656
Imputation for the test dataset
#@ Loading the test dataset (after modeling is finished~!) -----
dataset.test = readRDS(url("https://github.com/mkim0710/PH207x/blob/master/fhs.index100ge11le20.rds?raw=true"))
# #@ imputation.model = glm(bmi1 ~ poly(age1, 2) + sex1, data = dataset.train) ====
dataset.test = dataset.test %>% mutate(
bmi1.old = bmi1
, bmi1.is_imputed = bmi1 %>% {ifelse(is.na(.), T, F)}
, bmi1 = bmi1 %>% {ifelse(is.na(.), predict(imputation.model, newdata = dataset.test), .)}
)
dataset.test %>% select(randid, death, age1, sex1, matches("bmi1")) %>% filter(bmi1.is_imputed)
# # A tibble: 2 x 7
# randid death age1 sex1 bmi1 bmi1.old bmi1.is_imputed
# <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <lgl>
# 1 2668575 1 64 2 26.2 NA TRUE
# 2 6026757 0 39 2 24.1 NA TRUE
Calculate the test AUROC
#@ predict using dataset.test ----
dataset.test = dataset.test %>%
mutate(death.model1.predict.prob = predict(model1, type = "response", newdata = .))
dataset.test = dataset.test %>%
mutate(death.model2.predict.prob = predict(model2, type = "response", newdata = .))
dataset.test %>% select(randid, death, age1, sex1, bmi1, matches("predict.prob"))
# # A tibble: 450 x 7
# randid death age1 sex1 bmi1 death.model1.predict.p~ death.model2.predict.~
# <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
# 1 16799 0 50 2 22.9 0.263 0.230
# 2 69134 1 59 2 20.8 0.523 0.551
# 3 97895 1 65 2 30.5 0.776 0.835
# 4 110542 1 63 2 27.1 0.701 0.733
# 5 170881 1 63 2 29.4 0.718 0.753
# 6 192229 1 39 1 32.5 0.104 0.148
# 7 209115 1 60 1 28.6 0.624 0.617
# 8 309808 0 43 1 25.5 0.134 0.131
# 9 388279 0 39 1 28.3 0.0910 0.115
# 10 431963 0 48 1 24.1 0.222 0.192
# # ... with 440 more rows
Calculate the test AUROC (1)
dataset.test.threshold_roc.model1 = function.vec_actual_prediction.threshold_roc(dataset.test$death, dataset.test$death.model1.predict.prob)
dataset.test.threshold_roc.model1 %>% as.tibble
# # A tibble: 447 x 7
# threshold TP FP FN TN Sensitivity Specificity
# <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
# 1 -Inf 159 291 0 0 1 0
# 2 0.0414 159 290 0 1 1 0.00344
# 3 0.0434 159 289 0 2 1 0.00687
# 4 0.0466 159 288 0 3 1 0.0103
# 5 0.0489 158 288 1 3 0.994 0.0103
# 6 0.0509 158 287 1 4 0.994 0.0137
# 7 0.0517 158 286 1 5 0.994 0.0172
# 8 0.0524 158 285 1 6 0.994 0.0206
# 9 0.0531 158 284 1 7 0.994 0.0241
# 10 0.0532 158 283 1 8 0.994 0.0275
# # ... with 437 more rows
dataset.test.threshold_roc.model1 %>% function.threshold_roc.auc
# [1] 0.7603255
Calculate the test AUROC (2)
dataset.test.threshold_roc.model2 = function.vec_actual_prediction.threshold_roc(dataset.test$death, dataset.test$death.model2.predict.prob)
dataset.test.threshold_roc.model2 %>% as.tibble
# # A tibble: 447 x 7
# threshold TP FP FN TN Sensitivity Specificity
# <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
# 1 -Inf 159 291 0 0 1 0
# 2 0.0898 159 290 0 1 1 0.00344
# 3 0.0920 159 289 0 2 1 0.00687
# 4 0.0932 159 288 0 3 1 0.0103
# 5 0.0935 159 287 0 4 1 0.0137
# 6 0.0942 159 286 0 5 1 0.0172
# 7 0.0953 159 285 0 6 1 0.0206
# 8 0.0959 159 284 0 7 1 0.0241
# 9 0.0964 159 283 0 8 1 0.0275
# 10 0.0971 159 282 0 9 1 0.0309
# # ... with 437 more rows
dataset.test.threshold_roc.model2 %>% function.threshold_roc.auc
# [1] 0.7608442
ROC curves overlaid
dataset.test.threshold_roc.model1 %>% arrange(1 - Specificity, Sensitivity) %>%
ggplot(aes(x = 1 - Specificity, y = Sensitivity)) +
geom_line(color = "red") + # model1
geom_line(color = "blue", data = dataset.test.threshold_roc.model2 %>% arrange(1 - Specificity, Sensitivity)) + # model2
coord_cartesian(xlim = c(0,1), ylim = c(0,1))
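An alternative sketch (added here): stacking the two ROC tables with a model label lets ggplot2 draw a legend instead of hard-coding the colors.
bind_rows(
dataset.test.threshold_roc.model1 %>% mutate(model = "model1")
, dataset.test.threshold_roc.model2 %>% mutate(model = "model2")
) %>% arrange(model, 1 - Specificity, Sensitivity) %>%
ggplot(aes(x = 1 - Specificity, y = Sensitivity, color = model)) + geom_line() +
coord_cartesian(xlim = c(0,1), ylim = c(0,1))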
Fit multiple models using for-loop
#@ Fit multiple models using for-loop, and then save the models as R list of objects. =====
model.list = list()
for (i in 1:10) {
myformula = as.formula(paste0("death ~ poly(age1, ", i, ")", " + ", "poly(bmi1, ", i, ")"))
model.list[[i]] = glm(myformula, data = dataset.train, family = "binomial")
}
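An equivalent functional-style sketch using purrr::map (an alternative to the for-loop above, not the deck's original code; model.list.alt is a hypothetical name):
model.list.alt = map(1:10, function(i) {
glm(as.formula(paste0("death ~ poly(age1, ", i, ")", " + ", "poly(bmi1, ", i, ")")), data = dataset.train, family = "binomial")
})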
Calculate the training AUROC & test AUROC for multiple models
# Make a table that shows the training AUROC and test AUROC for each model in the model.list. -----
df = data.frame(
i = 1:length(model.list)
, trainAUROC = model.list %>% map_dbl(function(model.object) {
dataset.train %>% {
function.vec_actual_prediction.threshold_roc(.$death, predict(model.object, type = "response", newdata = .))
} %>% function.threshold_roc.auc
})
, testAUROC = model.list %>% map_dbl(function(model.object) {
dataset.test %>% {
function.vec_actual_prediction.threshold_roc(.$death, predict(model.object, type = "response", newdata = .))
} %>% function.threshold_roc.auc
})
)
df
# i trainAUROC testAUROC
# 1 1 0.7788554 0.7603255
# 2 2 0.7787656 0.7608442
# 3 3 0.7779349 0.7668525
# 4 4 0.7819762 0.7646480
# 5 5 0.7822680 0.7607577
# 6 6 0.7821109 0.7607577
# 7 7 0.7861296 0.7568242
# 8 8 0.7820884 0.7510752
# 9 9 0.7829640 0.7449156
# 10 10 0.8003413 0.7516156
#@ Remove the test dataset (before any additional modeling~!) -----
rm(dataset.test)
Visualize the training AUROC & test AUROC for multiple models
df %>% gather(key, value, trainAUROC, testAUROC) %>%
ggplot(aes(x = i, y = value, color = key)) + geom_point() + geom_line()
Visualize the training dataset with the labels
dataset.train %>% ggplot(aes(x = age1, y = bmi1, color = death)) +
geom_point(data = dataset.train %>% filter(bmi1.is_imputed), color = "black") +
geom_point(size = 10, alpha = .2) + scale_color_gradient(low = "#0000ff80", high = "#ff000080") +
theme_minimal()
Visualize the logistic model (10) with a cutoff at the mean
i = 10
cutoff.value = dataset.train$death %>% mean
dataset.train %>% select(randid, death, age1, sex1, bmi1) %>%
mutate(death.predict = predict(model.list[[i]], type = "response") > cutoff.value) %>%
ggplot(aes(x = age1, y = bmi1, color = death.predict)) +
geom_point(data = dataset.train %>% filter(bmi1.is_imputed), color = "black") +
geom_point(size = 10, alpha = .2) + scale_color_manual(values = c("#0000ff80", "#ff000080")) + theme_minimal() +
labs(title = paste0("model.list[[", i, "]]", " with a cutoff at the mean"))
Visualize the logistic model (9) with a cutoff at the mean
i = 9
cutoff.value = dataset.train$death %>% mean
dataset.train %>% select(randid, death, age1, sex1, bmi1) %>%
mutate(death.predict = predict(model.list[[i]], type = "response") > cutoff.value) %>%
ggplot(aes(x = age1, y = bmi1, color = death.predict)) +
geom_point(data = dataset.train %>% filter(bmi1.is_imputed), color = "black") +
geom_point(size = 10, alpha = .2) + scale_color_manual(values = c("#0000ff80", "#ff000080")) + theme_minimal() +
labs(title = paste0("model.list[[", i, "]]", " with a cutoff at the mean"))
Visualize the logistic model (8) with a cutoff at the mean
i = 8
cutoff.value = dataset.train$death %>% mean
dataset.train %>% select(randid, death, age1, sex1, bmi1) %>%
mutate(death.predict = predict(model.list[[i]], type = "response") > cutoff.value) %>%
ggplot(aes(x = age1, y = bmi1, color = death.predict)) +
geom_point(data = dataset.train %>% filter(bmi1.is_imputed), color = "black") +
geom_point(size = 10, alpha = .2) + scale_color_manual(values = c("#0000ff80", "#ff000080")) + theme_minimal() +
labs(title = paste0("model.list[[", i, "]]", " with a cutoff at the mean"))
Visualize the logistic model (5) with a cutoff at the mean
i = 5
cutoff.value = dataset.train$death %>% mean
dataset.train %>% select(randid, death, age1, sex1, bmi1) %>%
mutate(death.predict = predict(model.list[[i]], type = "response") > cutoff.value) %>%
ggplot(aes(x = age1, y = bmi1, color = death.predict)) +
geom_point(data = dataset.train %>% filter(bmi1.is_imputed), color = "black") +
geom_point(size = 10, alpha = .2) + scale_color_manual(values = c("#0000ff80", "#ff000080")) + theme_minimal() +
labs(title = paste0("model.list[[", i, "]]", " with a cutoff at the mean"))
Visualize the logistic model (3) with a cutoff at the mean
i = 3
cutoff.value = dataset.train$death %>% mean
dataset.train %>% select(randid, death, age1, sex1, bmi1) %>%
mutate(death.predict = predict(model.list[[i]], type = "response") > cutoff.value) %>%
ggplot(aes(x = age1, y = bmi1, color = death.predict)) +
geom_point(data = dataset.train %>% filter(bmi1.is_imputed), color = "black") +
geom_point(size = 10, alpha = .2) + scale_color_manual(values = c("#0000ff80", "#ff000080")) + theme_minimal() +
labs(title = paste0("model.list[[", i, "]]", " with a cutoff at the mean"))
Visualize the logistic model (2) with a cutoff at the mean
i = 2
cutoff.value = dataset.train$death %>% mean
dataset.train %>% select(randid, death, age1, sex1, bmi1) %>%
mutate(death.predict = predict(model.list[[i]], type = "response") > cutoff.value) %>%
ggplot(aes(x = age1, y = bmi1, color = death.predict)) +
geom_point(data = dataset.train %>% filter(bmi1.is_imputed), color = "black") +
geom_point(size = 10, alpha = .2) + scale_color_manual(values = c("#0000ff80", "#ff000080")) + theme_minimal() +
labs(title = paste0("model.list[[", i, "]]", " with a cutoff at the mean"))
Visualize the logistic model (1) with a cutoff at the mean
i = 1
cutoff.value = dataset.train$death %>% mean
dataset.train %>% select(randid, death, age1, sex1, bmi1) %>%
mutate(death.predict = predict(model.list[[i]], type = "response") > cutoff.value) %>%
ggplot(aes(x = age1, y = bmi1, color = death.predict)) +
geom_point(data = dataset.train %>% filter(bmi1.is_imputed), color = "black") +
geom_point(size = 10, alpha = .2) + scale_color_manual(values = c("#0000ff80", "#ff000080")) + theme_minimal() +
labs(title = paste0("model.list[[", i, "]]", " with a cutoff at the mean"))
K-fold "random" split of the training dataset
#@ K-fold "random" split of the training dataset -----
function.vec.fold.index = function(data, k = 5) data %>% { rep(1:k, (nrow(.) %/% k) + 1) [1:nrow(.)] }
dataset.train %>% function.vec.fold.index(k = 5) %>% dput
set.seed(12345); dataset.train %>% function.vec.fold.index(k = 5) %>% sample %>% dput
# c(1L, 2L, 3L, 4L, 5L, 1L, 2L, 3L, 4L, 5L, 1L, 2L, 3L, 4L, 5L,
# 1L, 2L, 3L, 4L, 5L, 1L, 2L, 3L, 4L, 5L, 1L, 2L, 3L, 4L, 5L, 1L,
# 2L, 3L, 4L, 5L, 1L, 2L, 3L, 4L, 5L, 1L, 2L, 3L, 4L, 5L, 1L, 2L,
# 3L, 4L, 5L, 1L, 2L, 3L, 4L, 5L, 1L, 2L, 3L, 4L, 5L, 1L, 2L, 3L,
# 4L, 5L, 1L, 2L, 3L, 4L, 5L, 1L, 2L, 3L, 4L, 5L, 1L, 2L, 3L, 4L,
# 5L, 1L, 2L, 3L, 4L, 5L, 1L, 2L, 3L, 4L, 5L, 1L, 2L, 3L, 4L, 5L,
# 1L, 2L, 3L, 4L, 5L, 1L, 2L, 3L, 4L, 5L, 1L, 2L, 3L, 4L, 5L, 1L,
# 2L, 3L, 4L, 5L, 1L, 2L, 3L, 4L, 5L, 1L, 2L, 3L, 4L, 5L, 1L, 2L,
# 3L, 4L, 5L, 1L, 2L, 3L, 4L, 5L, 1L, 2L, 3L, 4L, 5L, 1L, 2L, 3L,
# 4L, 5L, 1L, 2L, 3L, 4L, 5L, 1L, 2L, 3L, 4L, 5L, 1L, 2L, 3L, 4L,
# 5L, 1L, 2L, 3L, 4L, 5L, 1L, 2L, 3L, 4L, 5L, 1L, 2L, 3L, 4L, 5L,
# 1L, 2L, 3L, 4L, 5L, 1L, 2L, 3L, 4L, 5L, 1L, 2L, 3L, 4L, 5L, 1L,
# 2L, 3L, 4L, 5L, 1L, 2L, 3L, 4L, 5L, 1L, 2L, 3L, 4L, 5L, 1L, 2L,
# 3L, 4L, 5L, 1L, 2L, 3L, 4L, 5L, 1L, 2L, 3L, 4L, 5L, 1L, 2L, 3L,
# 4L, 5L, 1L, 2L, 3L, 4L, 5L, 1L, 2L, 3L, 4L, 5L, 1L, 2L, 3L, 4L,
# 5L, 1L, 2L, 3L, 4L, 5L, 1L, 2L, 3L, 4L, 5L, 1L, 2L, 3L, 4L, 5L,
# 1L, 2L, 3L, 4L, 5L, 1L, 2L, 3L, 4L, 5L, 1L, 2L, 3L, 4L, 5L, 1L,
# 2L, 3L, 4L, 5L, 1L, 2L, 3L, 4L, 5L, 1L, 2L, 3L, 4L, 5L, 1L, 2L,
# 3L, 4L, 5L, 1L, 2L, 3L, 4L, 5L, 1L, 2L, 3L, 4L, 5L, 1L, 2L, 3L,
# 4L, 5L, 1L, 2L, 3L, 4L, 5L, 1L, 2L, 3L, 4L, 5L, 1L, 2L, 3L, 4L,
# 5L, 1L, 2L, 3L, 4L, 5L, 1L, 2L, 3L, 4L, 5L, 1L, 2L, 3L, 4L, 5L,
# 1L, 2L, 3L, 4L, 5L, 1L, 2L, 3L, 4L, 5L, 1L, 2L, 3L, 4L, 5L, 1L,
# 2L, 3L, 4L, 5L, 1L, 2L, 3L, 4L, 5L, 1L, 2L, 3L, 4L, 5L, 1L, 2L,
# 3L, 4L, 5L, 1L, 2L, 3L, 4L, 5L, 1L, 2L, 3L, 4L, 5L, 1L, 2L, 3L,
# 4L, 5L, 1L, 2L, 3L, 4L, 5L, 1L, 2L, 3L, 4L, 5L, 1L, 2L, 3L, 4L,
# 5L, 1L, 2L, 3L, 4L, 5L, 1L, 2L, 3L, 4L, 5L, 1L, 2L, 3L, 4L, 5L,
# 1L, 2L, 3L, 4L, 5L, 1L, 2L, 3L, 4L, 5L, 1L, 2L, 3L, 4L, 5L, 1L,
# 2L, 3L, 4L, 5L, 1L, 2L, 3L, 4L, 5L, 1L, 2L, 3L, 4L, 5L, 1L, 2L,
# 3L, 4L, 5L)
# c(5L, 4L, 1L, 2L, 4L, 5L, 5L, 1L, 2L, 2L, 1L, 2L, 3L, 1L, 1L,
# 2L, 4L, 5L, 3L, 1L, 1L, 1L, 4L, 3L, 5L, 1L, 2L, 1L, 1L, 1L, 4L,
# 3L, 4L, 5L, 4L, 1L, 5L, 4L, 5L, 1L, 1L, 1L, 4L, 5L, 1L, 1L, 5L,
# 3L, 3L, 1L, 1L, 1L, 1L, 5L, 1L, 3L, 3L, 2L, 1L, 3L, 4L, 1L, 3L,
# 3L, 3L, 2L, 5L, 3L, 5L, 1L, 2L, 2L, 4L, 2L, 4L, 3L, 3L, 5L, 5L,
# 1L, 3L, 2L, 4L, 3L, 3L, 2L, 1L, 3L, 1L, 2L, 5L, 2L, 4L, 5L, 2L,
# 3L, 1L, 5L, 4L, 5L, 4L, 1L, 5L, 2L, 1L, 2L, 5L, 4L, 1L, 4L, 3L,
# 5L, 1L, 3L, 3L, 4L, 4L, 5L, 3L, 2L, 5L, 5L, 5L, 4L, 2L, 1L, 1L,
# 1L, 5L, 4L, 1L, 2L, 4L, 1L, 4L, 1L, 4L, 3L, 1L, 2L, 2L, 4L, 4L,
# 4L, 5L, 2L, 4L, 5L, 3L, 3L, 5L, 3L, 3L, 5L, 2L, 1L, 1L, 2L, 3L,
# 4L, 5L, 2L, 1L, 4L, 4L, 3L, 3L, 5L, 1L, 4L, 2L, 1L, 4L, 2L, 3L,
# 2L, 3L, 1L, 2L, 5L, 1L, 1L, 4L, 5L, 3L, 4L, 2L, 2L, 3L, 1L, 2L,
# 2L, 5L, 3L, 3L, 2L, 2L, 5L, 1L, 1L, 3L, 3L, 5L, 5L, 4L, 3L, 2L,
# 2L, 5L, 2L, 2L, 2L, 2L, 3L, 2L, 2L, 2L, 5L, 1L, 5L, 1L, 4L, 1L,
# 2L, 3L, 5L, 1L, 3L, 5L, 3L, 3L, 3L, 2L, 2L, 4L, 1L, 2L, 3L, 4L,
# 1L, 5L, 5L, 4L, 3L, 4L, 4L, 5L, 1L, 4L, 3L, 4L, 1L, 2L, 2L, 4L,
# 5L, 2L, 1L, 4L, 4L, 1L, 1L, 3L, 3L, 5L, 5L, 3L, 5L, 2L, 4L, 2L,
# 2L, 3L, 2L, 2L, 3L, 2L, 1L, 3L, 2L, 4L, 4L, 3L, 2L, 5L, 4L, 5L,
# 1L, 3L, 4L, 3L, 1L, 5L, 1L, 4L, 4L, 1L, 3L, 5L, 4L, 2L, 3L, 3L,
# 2L, 5L, 4L, 2L, 2L, 5L, 3L, 4L, 5L, 4L, 4L, 4L, 5L, 1L, 2L, 5L,
# 4L, 5L, 2L, 2L, 5L, 1L, 1L, 3L, 4L, 5L, 3L, 2L, 5L, 4L, 3L, 3L,
# 2L, 5L, 5L, 5L, 5L, 4L, 1L, 4L, 4L, 5L, 1L, 1L, 4L, 5L, 2L, 3L,
# 2L, 3L, 3L, 3L, 4L, 1L, 4L, 4L, 5L, 1L, 5L, 5L, 5L, 3L, 5L, 3L,
# 3L, 2L, 2L, 3L, 2L, 3L, 2L, 1L, 3L, 3L, 1L, 2L, 5L, 3L, 4L, 1L,
# 5L, 3L, 5L, 4L, 2L, 1L, 3L, 2L, 1L, 4L, 3L, 1L, 4L, 3L, 5L, 5L,
# 5L, 4L, 4L, 1L, 1L, 5L, 5L, 3L, 2L, 3L, 5L, 2L, 3L, 5L, 1L, 5L,
# 4L, 2L, 5L, 5L, 1L, 1L, 4L, 3L, 4L, 2L, 1L, 4L, 3L, 4L, 4L, 4L,
# 5L, 3L, 5L, 2L, 4L, 1L, 4L, 3L, 2L, 4L, 4L, 1L, 4L, 2L, 2L, 2L,
# 2L, 2L, 4L)
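A quick check (an added illustration): the shuffled index still assigns exactly 450/5 = 90 rows to each fold.
set.seed(12345)
dataset.train %>% function.vec.fold.index(k = 5) %>% sample %>% table
# each of the 5 folds should contain 90 of the 450 rows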
K-fold "random" split of the training dataset
#@ K-fold split of the training dataset -----
dataset.train = dataset.train %>% rownames_to_column
# Do not forget to set the random seed before performing any randomization tasks (e.g., random sampling).
set.seed(12345); dataset.train$fold.index = dataset.train %>% function.vec.fold.index(k = 5) %>% sample
dataset.train %>% select(rowname, death, age1, bmi1, fold.index)
# # A tibble: 450 x 5
# rowname death age1 bmi1 fold.index
# <chr> <dbl> <dbl> <dbl> <int>
# 1 1 1 39 22.4 5
# 2 2 1 47 24.2 4
# 3 3 1 52 40.1 1
# 4 4 0 42 28.9 2
# 5 5 1 53 21.5 4
# 6 6 0 47 19.7 5
# 7 7 0 56 23.6 5
# 8 8 0 41 30.6 1
# 9 9 0 53 18.2 2
# 10 10 0 46 20.2 2
# # ... with 440 more rows
K-fold "random" split of the training dataset
dataset.train %>% select(rowname, death, age1, bmi1, fold.index) %>% filter(fold.index != 1)
dataset.train %>% select(rowname, death, age1, bmi1, fold.index) %>% filter(fold.index == 1)
# > dataset.train %>% select(rowname, death, age1, bmi1, fold.index) %>% filter(fold.index != 1)
# # A tibble: 360 x 5
# rowname death age1 bmi1 fold.index
# <chr> <dbl> <dbl> <dbl> <int>
# 1 1 1 39 22.4 5
# 2 2 1 47 24.2 4
# 3 4 0 42 28.9 2
# 4 5 1 53 21.5 4
# 5 6 0 47 19.7 5
# 6 7 0 56 23.6 5
# 7 9 0 53 18.2 2
# 8 10 0 46 20.2 2
# 9 12 0 40 26.8 2
# 10 13 1 56 33.0 3
# # ... with 350 more rows
# > dataset.train %>% select(rowname, death, age1, bmi1, fold.index) %>% filter(fold.index == 1)
# # A tibble: 90 x 5
# rowname death age1 bmi1 fold.index
# <chr> <dbl> <dbl> <dbl> <int>
# 1 3 1 52 40.1 1
# 2 8 0 41 30.6 1
# 3 11 0 41 20.5 1
# 4 14 1 47 22.0 1
# 5 15 0 38 22.8 1
# 6 20 1 56 30.8 1
# 7 21 0 60 27.3 1
# 8 22 0 64 27.5 1
# 9 26 0 39 31.0 1
# 10 28 0 55 27.8 1
# # ... with 80 more rows
K-fold "random" split of the training dataset
## Visual check of the distribution of the folds ----
dataset.train %>% ggplot(aes(x = age1, y = bmi1, color = as.factor(fold.index))) + geom_point()
Fit multiple models in each cross-validation fold
#@ Nested for-loop: (1) iterate over the folds for cross-validation; (2) fit multiple models within each fold =====
# Save the models as a "nested" list of objects, mirroring the "nested" for-loop.
max.polynomial = 10
cv.model.list = list()
for (i.fold in sort(unique(dataset.train$fold.index))) {
cv.model.list[[i.fold]] = list()
dataset = dataset.train %>% filter(fold.index != i.fold) %>% as.data.frame
for (i in 1:max.polynomial) {
myformula = as.formula(paste0("death ~ poly(age1, ", i, ")", " + ", "poly(bmi1, ", i, ")"))
cv.model.list[[i.fold]][[i]] = glm(myformula, data = dataset, family = "binomial")
}
}
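A quick structural check of the nested list (an added illustration):
length(cv.model.list) # 5 folds
lengths(cv.model.list) # max.polynomial = 10 fitted models per fold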
Calculate the training AUROC & validation AUROC for multiple models in each cross-validation fold
#@ Define the evaluation metric (optimization objective; here, the AUROC). -----
# Cf) You may define any function to avoid repetitive code.
# The AUROC is computed from the actual labels and the predicted probabilities, reusing the helper functions defined above.
AUROC = function(vec_actual, vec_prediction) {
function.vec_actual_prediction.threshold_roc(vec_actual, vec_prediction) %>% function.threshold_roc.auc
}
# Make a table that shows the training AUROC and validation AUROC for each cross-validation fold & each model in the "nested" model.list. -----
k = 5 # number of folds, matching the split above
cv.df = data_frame(
cv = rep(1:k, each = max.polynomial)
, polynomial = rep(1:max.polynomial, k)
) %>% mutate(
trainAUROC = map2_dbl(cv, polynomial, function(i.fold, i) {
dataset.train %>% filter(fold.index != i.fold) %>% {AUROC(.$death, predict(cv.model.list[[i.fold]][[i]], type = "response", newdata = .))}
})
, cvAUROC = map2_dbl(cv, polynomial, function(i.fold, i) {
dataset.train %>% filter(fold.index == i.fold) %>% {AUROC(.$death, predict(cv.model.list[[i.fold]][[i]], type = "response", newdata = .))}
})
)
cv.df
# # A tibble: 50 x 4
Calculate the (aggregated) training AUROC & (aggregated) cv AUROC for multiple models
# Make a table that shows the (aggregated) training AUROC and cross-validation AUROC for each model -----
cv.df.summarize = cv.df %>% select(-cv) %>% group_by(polynomial) %>% summarize_all(mean)
cv.df.summarize
# # A tibble: 10 x 3
# polynomial trainAUROC cvAUROC
# <int> <dbl> <dbl>
# 1 1 0.779 0.782
# 2 2 0.779 0.783
# 3 3 0.779 0.780
# 4 4 0.783 0.777
# 5 5 0.783 0.772
# 6 6 0.786 0.763
# 7 7 0.789 0.758
# 8 8 0.787 0.753
# 9 9 0.787 0.749
# 10 10 0.803 0.761
Visualize the (aggregated) training AUROC & cv AUROC for multiple models
cv.df.summarize %>% gather(key, value, trainAUROC, cvAUROC) %>%
ggplot(aes(x = polynomial, y = value, color = key)) + geom_point() + geom_line()
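A possible final step (an added sketch, not in the original deck): pick the degree with the highest aggregated cross-validation AUROC, then refit that model on the full training dataset before evaluating on the sequestered test data.
cv.df.summarize %>% filter(cvAUROC == max(cvAUROC)) %>% pull(polynomial)
# refit the selected polynomial degree on all of dataset.train before any test-set evaluation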
MH prediction modeling and validation in r (2) classification 190709

More Related Content

What's hot

Machine Learning Algorithms
Machine Learning AlgorithmsMachine Learning Algorithms
Machine Learning Algorithms
Hichem Felouat
 
20180310 functional programming
20180310 functional programming20180310 functional programming
20180310 functional programming
Chiwon Song
 
Google TensorFlow Tutorial
Google TensorFlow TutorialGoogle TensorFlow Tutorial
Google TensorFlow Tutorial
台灣資料科學年會
 
Digital system design
Digital system designDigital system design
Digital system design
Kuntala Das
 
An introduction to functional programming with go
An introduction to functional programming with goAn introduction to functional programming with go
An introduction to functional programming with go
Eleanor McHugh
 
Introduction to python programming
Introduction to python programmingIntroduction to python programming
Introduction to python programming
Rakotoarison Louis Frederick
 
Java programs - bubble sort, iterator, linked list, hash set, reverse string,...
Java programs - bubble sort, iterator, linked list, hash set, reverse string,...Java programs - bubble sort, iterator, linked list, hash set, reverse string,...
Java programs - bubble sort, iterator, linked list, hash set, reverse string,...Sunil Kumar Gunasekaran
 
TensorFlow Tutorial
TensorFlow TutorialTensorFlow Tutorial
TensorFlow Tutorial
NamHyuk Ahn
 
Computer graphics lab report with code in cpp
Computer graphics lab report with code in cppComputer graphics lab report with code in cpp
Computer graphics lab report with code in cpp
Alamgir Hossain
 
Computer java programs
Computer java programsComputer java programs
Computer java programs
ADITYA BHARTI
 
Chapter2
Chapter2Chapter2
Chapter2
Krishna Kumar
 
Rcommands-for those who interested in R.
Rcommands-for those who interested in R.Rcommands-for those who interested in R.
Rcommands-for those who interested in R.
Dr. Volkan OBAN
 
TensorFlow
TensorFlowTensorFlow
TensorFlow
jirimaterna
 
Pricing Game, 100% Data Sciences
Pricing Game, 100% Data SciencesPricing Game, 100% Data Sciences
Pricing Game, 100% Data Sciences
Arthur Charpentier
 
Introduction to matlab
Introduction to matlabIntroduction to matlab
Introduction to matlabkrishna_093
 
Ocr code
Ocr codeOcr code
Ocr code
wi7sonjoseph
 
Python
PythonPython
Learning to Sample
Learning to SampleLearning to Sample
Learning to Sample
Pooyan Jamshidi
 
4366 chapter7
4366 chapter74366 chapter7
4366 chapter7Sai Kumar
 

What's hot (20)

Machine Learning Algorithms
Machine Learning AlgorithmsMachine Learning Algorithms
Machine Learning Algorithms
 
20180310 functional programming
20180310 functional programming20180310 functional programming
20180310 functional programming
 
Google TensorFlow Tutorial
Google TensorFlow TutorialGoogle TensorFlow Tutorial
Google TensorFlow Tutorial
 
Digital system design
Digital system designDigital system design
Digital system design
 
An introduction to functional programming with go
An introduction to functional programming with goAn introduction to functional programming with go
An introduction to functional programming with go
 
Introduction to python programming
Introduction to python programmingIntroduction to python programming
Introduction to python programming
 
OTLN2012
OTLN2012OTLN2012
OTLN2012
 
Java programs - bubble sort, iterator, linked list, hash set, reverse string,...
Java programs - bubble sort, iterator, linked list, hash set, reverse string,...Java programs - bubble sort, iterator, linked list, hash set, reverse string,...
Java programs - bubble sort, iterator, linked list, hash set, reverse string,...
 
TensorFlow Tutorial
TensorFlow TutorialTensorFlow Tutorial
TensorFlow Tutorial
 
Computer graphics lab report with code in cpp
Computer graphics lab report with code in cppComputer graphics lab report with code in cpp
Computer graphics lab report with code in cpp
 
Computer java programs
Computer java programsComputer java programs
Computer java programs
 
Chapter2
Chapter2Chapter2
Chapter2
 
Rcommands-for those who interested in R.
Rcommands-for those who interested in R.Rcommands-for those who interested in R.
Rcommands-for those who interested in R.
 
TensorFlow
TensorFlowTensorFlow
TensorFlow
 
Pricing Game, 100% Data Sciences
Pricing Game, 100% Data SciencesPricing Game, 100% Data Sciences
Pricing Game, 100% Data Sciences
 
Introduction to matlab
Introduction to matlabIntroduction to matlab
Introduction to matlab
 
Ocr code
Ocr codeOcr code
Ocr code
 
Python
PythonPython
Python
 
Learning to Sample
Learning to SampleLearning to Sample
Learning to Sample
 
4366 chapter7
4366 chapter74366 chapter7
4366 chapter7
 

Similar to MH prediction modeling and validation in r (2) classification 190709

gptips1.0concrete.matConcrete_Data[1030x9 double array]tr.docx
gptips1.0concrete.matConcrete_Data[1030x9  double array]tr.docxgptips1.0concrete.matConcrete_Data[1030x9  double array]tr.docx
gptips1.0concrete.matConcrete_Data[1030x9 double array]tr.docx
whittemorelucilla
 
error 2.pdf101316, 6(46 PM01_errorPage 1 of 5http.docx
error 2.pdf101316, 6(46 PM01_errorPage 1 of 5http.docxerror 2.pdf101316, 6(46 PM01_errorPage 1 of 5http.docx
error 2.pdf101316, 6(46 PM01_errorPage 1 of 5http.docx
SALU18
 
Review questions and answers
Review questions and answersReview questions and answers
Review questions and answers
IIUM
 
CLUSTERGRAM
CLUSTERGRAMCLUSTERGRAM
CLUSTERGRAM
Dr. Volkan OBAN
 
Recursion in C
Recursion in CRecursion in C
Recursion in C
Lakshmi Sarvani Videla
 
Regression and Classification with R
Regression and Classification with RRegression and Classification with R
Regression and Classification with R
Yanchang Zhao
 
Basic c programs updated on 31.8.2020
Basic c programs updated on 31.8.2020Basic c programs updated on 31.8.2020
Basic c programs updated on 31.8.2020
vrgokila
 
Seminar PSU 09.04.2013 - 10.04.2013 MiFIT, Arbuzov Vyacheslav
Seminar PSU 09.04.2013 - 10.04.2013 MiFIT, Arbuzov VyacheslavSeminar PSU 09.04.2013 - 10.04.2013 MiFIT, Arbuzov Vyacheslav
Seminar PSU 09.04.2013 - 10.04.2013 MiFIT, Arbuzov VyacheslavVyacheslav Arbuzov
 
Bigger Data v Better Math
Bigger Data v Better MathBigger Data v Better Math
Bigger Data v Better Math
Brent Schneeman
 
Basic R Data Manipulation
Basic R Data ManipulationBasic R Data Manipulation
Basic R Data Manipulation
Chu An
 
Cpl
CplCpl
R programming language
R programming languageR programming language
R programming language
Alberto Minetti
 
ggtimeseries-->ggplot2 extensions
ggtimeseries-->ggplot2 extensions ggtimeseries-->ggplot2 extensions
ggtimeseries-->ggplot2 extensions
Dr. Volkan OBAN
 
Advanced Data Visualization in R- Somes Examples.
Advanced Data Visualization in R- Somes Examples.Advanced Data Visualization in R- Somes Examples.
Advanced Data Visualization in R- Somes Examples.
Dr. Volkan OBAN
 
Frsa
FrsaFrsa
Frsa
_111
 
Dem 7263 fall 2015 Spatial GLM's
Dem 7263 fall 2015 Spatial GLM'sDem 7263 fall 2015 Spatial GLM's
Dem 7263 fall 2015 Spatial GLM's
Corey Sparks
 
Advanced Data Visualization Examples with R-Part II
Advanced Data Visualization Examples with R-Part IIAdvanced Data Visualization Examples with R-Part II
Advanced Data Visualization Examples with R-Part II
Dr. Volkan OBAN
 

Similar to MH prediction modeling and validation in r (2) classification 190709 (20)

gptips1.0concrete.matConcrete_Data[1030x9 double array]tr.docx
gptips1.0concrete.matConcrete_Data[1030x9  double array]tr.docxgptips1.0concrete.matConcrete_Data[1030x9  double array]tr.docx
gptips1.0concrete.matConcrete_Data[1030x9 double array]tr.docx
 
error 2.pdf101316, 6(46 PM01_errorPage 1 of 5http.docx
error 2.pdf101316, 6(46 PM01_errorPage 1 of 5http.docxerror 2.pdf101316, 6(46 PM01_errorPage 1 of 5http.docx
error 2.pdf101316, 6(46 PM01_errorPage 1 of 5http.docx
 
Review questions and answers
Review questions and answersReview questions and answers
Review questions and answers
 
CLUSTERGRAM
CLUSTERGRAMCLUSTERGRAM
CLUSTERGRAM
 
Recursion in C
Recursion in CRecursion in C
Recursion in C
 
Joclad 2010 d
Joclad 2010 dJoclad 2010 d
Joclad 2010 d
 
Regression and Classification with R
Regression and Classification with RRegression and Classification with R
Regression and Classification with R
 
Basic c programs updated on 31.8.2020
Basic c programs updated on 31.8.2020Basic c programs updated on 31.8.2020
Basic c programs updated on 31.8.2020
 
Seminar PSU 09.04.2013 - 10.04.2013 MiFIT, Arbuzov Vyacheslav
Seminar PSU 09.04.2013 - 10.04.2013 MiFIT, Arbuzov VyacheslavSeminar PSU 09.04.2013 - 10.04.2013 MiFIT, Arbuzov Vyacheslav
Seminar PSU 09.04.2013 - 10.04.2013 MiFIT, Arbuzov Vyacheslav
 
Bigger Data v Better Math
Bigger Data v Better MathBigger Data v Better Math
Bigger Data v Better Math
 
Basic R Data Manipulation
Basic R Data ManipulationBasic R Data Manipulation
Basic R Data Manipulation
 
Cpl
CplCpl
Cpl
 
R programming language
R programming languageR programming language
R programming language
 
Vcs16
Vcs16Vcs16
Vcs16
 
ggtimeseries-->ggplot2 extensions
ggtimeseries-->ggplot2 extensions ggtimeseries-->ggplot2 extensions
ggtimeseries-->ggplot2 extensions
 
Advanced Data Visualization in R- Somes Examples.
Advanced Data Visualization in R- Somes Examples.Advanced Data Visualization in R- Somes Examples.
Advanced Data Visualization in R- Somes Examples.
 
Frsa
FrsaFrsa
Frsa
 
Survey Demo
Survey DemoSurvey Demo
Survey Demo
 
Dem 7263 fall 2015 Spatial GLM's
Dem 7263 fall 2015 Spatial GLM'sDem 7263 fall 2015 Spatial GLM's
Dem 7263 fall 2015 Spatial GLM's
 
Advanced Data Visualization Examples with R-Part II
Advanced Data Visualization Examples with R-Part IIAdvanced Data Visualization Examples with R-Part II
Advanced Data Visualization Examples with R-Part II
 

More from Min-hyung Kim

20230511 Automation of EMR Tasks using AutoHotkey in MS Windows_MKv1.1.pdf
20230511 Automation of EMR Tasks using AutoHotkey in MS Windows_MKv1.1.pdf20230511 Automation of EMR Tasks using AutoHotkey in MS Windows_MKv1.1.pdf
20230511 Automation of EMR Tasks using AutoHotkey in MS Windows_MKv1.1.pdf
Min-hyung Kim
 
20221001 KAFM 의학 형의상학(Medical Ontology) v5 -clean.pptx
20221001 KAFM 의학 형의상학(Medical Ontology) v5 -clean.pptx20221001 KAFM 의학 형의상학(Medical Ontology) v5 -clean.pptx
20221001 KAFM 의학 형의상학(Medical Ontology) v5 -clean.pptx
Min-hyung Kim
 
MH prediction modeling and validation in r (1) regression 190709
MH prediction modeling and validation in r (1) regression 190709MH prediction modeling and validation in r (1) regression 190709
MH prediction modeling and validation in r (1) regression 190709
Min-hyung Kim
 
MH Prediction Modeling and Validation -clean
MH Prediction Modeling and Validation -cleanMH Prediction Modeling and Validation -clean
MH Prediction Modeling and Validation -clean
Min-hyung Kim
 
r for data science 2. grammar of graphics (ggplot2) clean -ref
r for data science 2. grammar of graphics (ggplot2)  clean -refr for data science 2. grammar of graphics (ggplot2)  clean -ref
r for data science 2. grammar of graphics (ggplot2) clean -ref
Min-hyung Kim
 
r for data science 4. exploratory data analysis clean -rev -ref
r for data science 4. exploratory data analysis  clean -rev -refr for data science 4. exploratory data analysis  clean -rev -ref
r for data science 4. exploratory data analysis clean -rev -ref
Min-hyung Kim
 
CDM SynPuf OMOP CDM library(rodbc) library(ggplot2) library(jsonlite) 180403
CDM SynPuf OMOP CDM library(rodbc) library(ggplot2) library(jsonlite) 180403CDM SynPuf OMOP CDM library(rodbc) library(ggplot2) library(jsonlite) 180403
CDM SynPuf OMOP CDM library(rodbc) library(ggplot2) library(jsonlite) 180403
Min-hyung Kim
 

More from Min-hyung Kim (7)

20230511 Automation of EMR Tasks using AutoHotkey in MS Windows_MKv1.1.pdf
20230511 Automation of EMR Tasks using AutoHotkey in MS Windows_MKv1.1.pdf20230511 Automation of EMR Tasks using AutoHotkey in MS Windows_MKv1.1.pdf
20230511 Automation of EMR Tasks using AutoHotkey in MS Windows_MKv1.1.pdf
 
20221001 KAFM 의학 형의상학(Medical Ontology) v5 -clean.pptx
20221001 KAFM 의학 형의상학(Medical Ontology) v5 -clean.pptx20221001 KAFM 의학 형의상학(Medical Ontology) v5 -clean.pptx
20221001 KAFM 의학 형의상학(Medical Ontology) v5 -clean.pptx
 
MH prediction modeling and validation in r (1) regression 190709
MH prediction modeling and validation in r (1) regression 190709MH prediction modeling and validation in r (1) regression 190709
MH prediction modeling and validation in r (1) regression 190709
 
MH Prediction Modeling and Validation -clean
MH Prediction Modeling and Validation -cleanMH Prediction Modeling and Validation -clean
MH Prediction Modeling and Validation -clean
 
r for data science 2. grammar of graphics (ggplot2) clean -ref
r for data science 2. grammar of graphics (ggplot2)  clean -refr for data science 2. grammar of graphics (ggplot2)  clean -ref
r for data science 2. grammar of graphics (ggplot2) clean -ref
 
r for data science 4. exploratory data analysis clean -rev -ref
r for data science 4. exploratory data analysis  clean -rev -refr for data science 4. exploratory data analysis  clean -rev -ref
r for data science 4. exploratory data analysis clean -rev -ref
 
CDM SynPuf OMOP CDM library(rodbc) library(ggplot2) library(jsonlite) 180403
CDM SynPuf OMOP CDM library(rodbc) library(ggplot2) library(jsonlite) 180403CDM SynPuf OMOP CDM library(rodbc) library(ggplot2) library(jsonlite) 180403
CDM SynPuf OMOP CDM library(rodbc) library(ggplot2) library(jsonlite) 180403
 

Recently uploaded

一比一原版(Adelaide毕业证书)阿德莱德大学毕业证如何办理
一比一原版(Adelaide毕业证书)阿德莱德大学毕业证如何办理一比一原版(Adelaide毕业证书)阿德莱德大学毕业证如何办理
一比一原版(Adelaide毕业证书)阿德莱德大学毕业证如何办理
slg6lamcq
 
Levelwise PageRank with Loop-Based Dead End Handling Strategy : SHORT REPORT ...
Levelwise PageRank with Loop-Based Dead End Handling Strategy : SHORT REPORT ...Levelwise PageRank with Loop-Based Dead End Handling Strategy : SHORT REPORT ...
Levelwise PageRank with Loop-Based Dead End Handling Strategy : SHORT REPORT ...
Subhajit Sahu
 
一比一原版(爱大毕业证书)爱丁堡大学毕业证如何办理
一比一原版(爱大毕业证书)爱丁堡大学毕业证如何办理一比一原版(爱大毕业证书)爱丁堡大学毕业证如何办理
一比一原版(爱大毕业证书)爱丁堡大学毕业证如何办理
g4dpvqap0
 
原版制作(swinburne毕业证书)斯威本科技大学毕业证毕业完成信一模一样
原版制作(swinburne毕业证书)斯威本科技大学毕业证毕业完成信一模一样原版制作(swinburne毕业证书)斯威本科技大学毕业证毕业完成信一模一样
原版制作(swinburne毕业证书)斯威本科技大学毕业证毕业完成信一模一样
u86oixdj
 
My burning issue is homelessness K.C.M.O.
My burning issue is homelessness K.C.M.O.My burning issue is homelessness K.C.M.O.
My burning issue is homelessness K.C.M.O.
rwarrenll
 
一比一原版(UniSA毕业证书)南澳大学毕业证如何办理
一比一原版(UniSA毕业证书)南澳大学毕业证如何办理一比一原版(UniSA毕业证书)南澳大学毕业证如何办理
一比一原版(UniSA毕业证书)南澳大学毕业证如何办理
slg6lamcq
 
Adjusting primitives for graph : SHORT REPORT / NOTES
Adjusting primitives for graph : SHORT REPORT / NOTESAdjusting primitives for graph : SHORT REPORT / NOTES
Adjusting primitives for graph : SHORT REPORT / NOTES
Subhajit Sahu
 
Nanandann Nilekani's ppt On India's .pdf
Nanandann Nilekani's ppt On India's .pdfNanandann Nilekani's ppt On India's .pdf
Nanandann Nilekani's ppt On India's .pdf
eddie19851
 
Influence of Marketing Strategy and Market Competition on Business Plan
Influence of Marketing Strategy and Market Competition on Business PlanInfluence of Marketing Strategy and Market Competition on Business Plan
Influence of Marketing Strategy and Market Competition on Business Plan
jerlynmaetalle
 
Data_and_Analytics_Essentials_Architect_an_Analytics_Platform.pptx
Data_and_Analytics_Essentials_Architect_an_Analytics_Platform.pptxData_and_Analytics_Essentials_Architect_an_Analytics_Platform.pptx
Data_and_Analytics_Essentials_Architect_an_Analytics_Platform.pptx
AnirbanRoy608946
 
一比一原版(UofS毕业证书)萨省大学毕业证如何办理
一比一原版(UofS毕业证书)萨省大学毕业证如何办理一比一原版(UofS毕业证书)萨省大学毕业证如何办理
一比一原版(UofS毕业证书)萨省大学毕业证如何办理
v3tuleee
 
做(mqu毕业证书)麦考瑞大学毕业证硕士文凭证书学费发票原版一模一样
做(mqu毕业证书)麦考瑞大学毕业证硕士文凭证书学费发票原版一模一样做(mqu毕业证书)麦考瑞大学毕业证硕士文凭证书学费发票原版一模一样
做(mqu毕业证书)麦考瑞大学毕业证硕士文凭证书学费发票原版一模一样
axoqas
 
The affect of service quality and online reviews on customer loyalty in the E...
The affect of service quality and online reviews on customer loyalty in the E...The affect of service quality and online reviews on customer loyalty in the E...
The affect of service quality and online reviews on customer loyalty in the E...
jerlynmaetalle
 
原版制作(Deakin毕业证书)迪肯大学毕业证学位证一模一样
原版制作(Deakin毕业证书)迪肯大学毕业证学位证一模一样原版制作(Deakin毕业证书)迪肯大学毕业证学位证一模一样
原版制作(Deakin毕业证书)迪肯大学毕业证学位证一模一样
u86oixdj
 
Chatty Kathy - UNC Bootcamp Final Project Presentation - Final Version - 5.23...
Chatty Kathy - UNC Bootcamp Final Project Presentation - Final Version - 5.23...Chatty Kathy - UNC Bootcamp Final Project Presentation - Final Version - 5.23...
Chatty Kathy - UNC Bootcamp Final Project Presentation - Final Version - 5.23...
John Andrews
 
哪里卖(usq毕业证书)南昆士兰大学毕业证研究生文凭证书托福证书原版一模一样
哪里卖(usq毕业证书)南昆士兰大学毕业证研究生文凭证书托福证书原版一模一样哪里卖(usq毕业证书)南昆士兰大学毕业证研究生文凭证书托福证书原版一模一样
哪里卖(usq毕业证书)南昆士兰大学毕业证研究生文凭证书托福证书原版一模一样
axoqas
 
Global Situational Awareness of A.I. and where its headed
Global Situational Awareness of A.I. and where its headedGlobal Situational Awareness of A.I. and where its headed
Global Situational Awareness of A.I. and where its headed
vikram sood
 
Ch03-Managing the Object-Oriented Information Systems Project a.pdf
Ch03-Managing the Object-Oriented Information Systems Project a.pdfCh03-Managing the Object-Oriented Information Systems Project a.pdf
Ch03-Managing the Object-Oriented Information Systems Project a.pdf
haila53
 
ViewShift: Hassle-free Dynamic Policy Enforcement for Every Data Lake
ViewShift: Hassle-free Dynamic Policy Enforcement for Every Data LakeViewShift: Hassle-free Dynamic Policy Enforcement for Every Data Lake
ViewShift: Hassle-free Dynamic Policy Enforcement for Every Data Lake
Walaa Eldin Moustafa
 
Adjusting OpenMP PageRank : SHORT REPORT / NOTES
Adjusting OpenMP PageRank : SHORT REPORT / NOTESAdjusting OpenMP PageRank : SHORT REPORT / NOTES
Adjusting OpenMP PageRank : SHORT REPORT / NOTES
Subhajit Sahu
 

Recently uploaded (20)

一比一原版(Adelaide毕业证书)阿德莱德大学毕业证如何办理
一比一原版(Adelaide毕业证书)阿德莱德大学毕业证如何办理一比一原版(Adelaide毕业证书)阿德莱德大学毕业证如何办理
一比一原版(Adelaide毕业证书)阿德莱德大学毕业证如何办理
 
Levelwise PageRank with Loop-Based Dead End Handling Strategy : SHORT REPORT ...
Levelwise PageRank with Loop-Based Dead End Handling Strategy : SHORT REPORT ...Levelwise PageRank with Loop-Based Dead End Handling Strategy : SHORT REPORT ...
Levelwise PageRank with Loop-Based Dead End Handling Strategy : SHORT REPORT ...
 
一比一原版(爱大毕业证书)爱丁堡大学毕业证如何办理
一比一原版(爱大毕业证书)爱丁堡大学毕业证如何办理一比一原版(爱大毕业证书)爱丁堡大学毕业证如何办理
一比一原版(爱大毕业证书)爱丁堡大学毕业证如何办理
 
原版制作(swinburne毕业证书)斯威本科技大学毕业证毕业完成信一模一样
原版制作(swinburne毕业证书)斯威本科技大学毕业证毕业完成信一模一样原版制作(swinburne毕业证书)斯威本科技大学毕业证毕业完成信一模一样
原版制作(swinburne毕业证书)斯威本科技大学毕业证毕业完成信一模一样
 
My burning issue is homelessness K.C.M.O.
My burning issue is homelessness K.C.M.O.My burning issue is homelessness K.C.M.O.
My burning issue is homelessness K.C.M.O.
 
一比一原版(UniSA毕业证书)南澳大学毕业证如何办理
一比一原版(UniSA毕业证书)南澳大学毕业证如何办理一比一原版(UniSA毕业证书)南澳大学毕业证如何办理
一比一原版(UniSA毕业证书)南澳大学毕业证如何办理
 
Adjusting primitives for graph : SHORT REPORT / NOTES
Adjusting primitives for graph : SHORT REPORT / NOTESAdjusting primitives for graph : SHORT REPORT / NOTES
Adjusting primitives for graph : SHORT REPORT / NOTES
 
Nanandann Nilekani's ppt On India's .pdf
Nanandann Nilekani's ppt On India's .pdfNanandann Nilekani's ppt On India's .pdf
Nanandann Nilekani's ppt On India's .pdf
 
Influence of Marketing Strategy and Market Competition on Business Plan
Influence of Marketing Strategy and Market Competition on Business PlanInfluence of Marketing Strategy and Market Competition on Business Plan
Influence of Marketing Strategy and Market Competition on Business Plan
 
Data_and_Analytics_Essentials_Architect_an_Analytics_Platform.pptx
Data_and_Analytics_Essentials_Architect_an_Analytics_Platform.pptxData_and_Analytics_Essentials_Architect_an_Analytics_Platform.pptx
Data_and_Analytics_Essentials_Architect_an_Analytics_Platform.pptx
 
一比一原版(UofS毕业证书)萨省大学毕业证如何办理
一比一原版(UofS毕业证书)萨省大学毕业证如何办理一比一原版(UofS毕业证书)萨省大学毕业证如何办理
一比一原版(UofS毕业证书)萨省大学毕业证如何办理
 
做(mqu毕业证书)麦考瑞大学毕业证硕士文凭证书学费发票原版一模一样
做(mqu毕业证书)麦考瑞大学毕业证硕士文凭证书学费发票原版一模一样做(mqu毕业证书)麦考瑞大学毕业证硕士文凭证书学费发票原版一模一样
做(mqu毕业证书)麦考瑞大学毕业证硕士文凭证书学费发票原版一模一样
 
The affect of service quality and online reviews on customer loyalty in the E...
The affect of service quality and online reviews on customer loyalty in the E...The affect of service quality and online reviews on customer loyalty in the E...
The affect of service quality and online reviews on customer loyalty in the E...
 
原版制作(Deakin毕业证书)迪肯大学毕业证学位证一模一样
原版制作(Deakin毕业证书)迪肯大学毕业证学位证一模一样原版制作(Deakin毕业证书)迪肯大学毕业证学位证一模一样
原版制作(Deakin毕业证书)迪肯大学毕业证学位证一模一样
 
Chatty Kathy - UNC Bootcamp Final Project Presentation - Final Version - 5.23...
Chatty Kathy - UNC Bootcamp Final Project Presentation - Final Version - 5.23...Chatty Kathy - UNC Bootcamp Final Project Presentation - Final Version - 5.23...
Chatty Kathy - UNC Bootcamp Final Project Presentation - Final Version - 5.23...
 
哪里卖(usq毕业证书)南昆士兰大学毕业证研究生文凭证书托福证书原版一模一样
哪里卖(usq毕业证书)南昆士兰大学毕业证研究生文凭证书托福证书原版一模一样哪里卖(usq毕业证书)南昆士兰大学毕业证研究生文凭证书托福证书原版一模一样
哪里卖(usq毕业证书)南昆士兰大学毕业证研究生文凭证书托福证书原版一模一样
 
Global Situational Awareness of A.I. and where its headed
Global Situational Awareness of A.I. and where its headedGlobal Situational Awareness of A.I. and where its headed
Global Situational Awareness of A.I. and where its headed
 
Ch03-Managing the Object-Oriented Information Systems Project a.pdf
Ch03-Managing the Object-Oriented Information Systems Project a.pdfCh03-Managing the Object-Oriented Information Systems Project a.pdf
Ch03-Managing the Object-Oriented Information Systems Project a.pdf
 
ViewShift: Hassle-free Dynamic Policy Enforcement for Every Data Lake
ViewShift: Hassle-free Dynamic Policy Enforcement for Every Data LakeViewShift: Hassle-free Dynamic Policy Enforcement for Every Data Lake
ViewShift: Hassle-free Dynamic Policy Enforcement for Every Data Lake
 
Adjusting OpenMP PageRank : SHORT REPORT / NOTES
Adjusting OpenMP PageRank : SHORT REPORT / NOTESAdjusting OpenMP PageRank : SHORT REPORT / NOTES
Adjusting OpenMP PageRank : SHORT REPORT / NOTES
 

MH prediction modeling and validation in r (2) classification 190709

  • 1. Install required R software packages for (packagename in c("tidyverse", "openxlsx")) { if(!require(packagename, character.only = T)) {install.packages(packagename); require(packagename, character.only = T)} }
  • 2. Imputation for the training dataset #@ Load only the training data. Make sure the test data is not loaded before modeling is finished. ---- # If the test data is not sequestered, make a random split and save them first. Then only load the training data, so that the test data remain unseen before modeling is finished. ---- dataset.train = readRDS(url("https://github.com/mkim0710/PH207x/blob/master/fhs.index100le10.rds?raw=true")) # Here, we will use a single regression imputation for simplicity. However, multiple imputation is recommended. imputation.model = glm(bmi1 ~ poly(age1, 2) + sex1, data = dataset.train) dataset.train = dataset.train %>% mutate( bmi1.old = bmi1 , bmi1.is_imputed = bmi1 %>% {ifelse(is.na(.), T, F)} , bmi1 = bmi1 %>% {ifelse(is.na(.), predict(imputation.model), .)} ) dataset.train %>% select(randid, death, age1, sex1, matches("bmi1")) %>% filter(bmi1.is_imputed) # # A tibble: 2 x 7 # randid death age1 sex1 bmi1 bmi1.old bmi1.is_imputed # <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <lgl> # 1 1600765 0 45 2 26.9 NA TRUE # 2 6921140 1 64 1 26.3 NA TRUE
  • 3. Visualize the training dataset with the labels dataset.train %>% ggplot(aes(x = age1, y = bmi1, color = death)) + geom_point(data = dataset.train %>% filter(bmi1.is_imputed), color = "black") + geom_point(size = 10, alpha = .2) + scale_color_gradient(low = "#0000ff80", high = "#ff000080") + theme_minimal()
  • 4.
  • 5. Logistic model (1) model1 = glm(death ~ poly(age1, 1, raw = T) + poly(bmi1, 1, raw = T), family = "binomial", data = dataset.train) model1 %>% summary #---- # Call: # glm(formula = death ~ poly(age1, 1, raw = T) + poly(bmi1, 1, # raw = T), family = "binomial", data = dataset.train) # # Deviance Residuals: # Min 1Q Median 3Q Max # -1.8968 -0.7632 -0.4850 0.8773 2.5501 # # Coefficients: # Estimate Std. Error z value Pr(>|z|) # (Intercept) -8.51201 1.06490 -7.993 1.31e-15 *** # poly(age1, 1, raw = T) 0.13326 0.01533 8.696 < 2e-16 *** # poly(bmi1, 1, raw = T) 0.03576 0.02895 1.235 0.217 # --- # Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1 # # (Dispersion parameter for binomial family taken to be 1) # # Null deviance: 568.61 on 449 degrees of freedom # Residual deviance: 465.63 on 447 degrees of freedom # AIC: 471.63 # # Number of Fisher Scoring iterations: 4 dataset.train = dataset.train %>% mutate(death.model1.predict.prob = predict(model1, type = "response", newdata = .))
  • 6. Visualize the logistic model (1) with the fitted probability dataset.train %>% select(randid, death, age1, sex1, bmi1) %>% mutate(death.model1.predict = predict(model1, type = "response")) %>% ggplot(aes(x = age1, y = bmi1, color = death.model1.predict)) + geom_point(data = dataset.train %>% filter(bmi1.is_imputed), color = "black") + geom_point(size = 10, alpha = .2) + scale_color_gradient(low = "#0000ff80", high = "#ff000080") + theme_minimal()
  • 7. The estimated probability can be thresholded (dichotomized) for a binary classification. https://github.com/kenhktsui/Visualizing-Logistic-Regression
  • 8. Visualize the logistic model (1) with a cutoff of mean cutoff.value = dataset.train$death %>% mean dataset.train %>% select(randid, death, age1, sex1, bmi1) %>% mutate(death.model1.predict = predict(model1, type = "response") > cutoff.value) %>% ggplot(aes(x = age1, y = bmi1, color = death.model1.predict)) + geom_point(data = dataset.train %>% filter(bmi1.is_imputed), color = "black") + geom_point(size = 10, alpha = .2) + scale_color_manual(values = c("#0000ff80", "#ff000080")) + theme_minimal()
  • 9. Visualize the logistic model (1) with a cutoff of 0.5 cutoff.value = 0.5 dataset.train %>% select(randid, death, age1, sex1, bmi1) %>% mutate(death.model1.predict = predict(model1, type = "response") > cutoff.value) %>% ggplot(aes(x = age1, y = bmi1, color = death.model1.predict)) + geom_point(data = dataset.train %>% filter(bmi1.is_imputed), color = "black") + geom_point(size = 10, alpha = .2) + scale_color_manual(values = c("#0000ff80", "#ff000080")) + theme_minimal()
  • 10.
  • 11. Logistic model (2) model2 = glm(death ~ poly(age1, 2, raw = T) + poly(bmi1, 2, raw = T), family = "binomial", data = dataset.train) model2 %>% summary #---- # Call: # glm(formula = death ~ poly(age1, 2, raw = T) + poly(bmi1, 2, # raw = T), family = "binomial", data = dataset.train) # # Deviance Residuals: # Min 1Q Median 3Q Max # -2.2106 -0.7227 -0.5243 0.8057 2.1919 # # Coefficients: # Estimate Std. Error z value Pr(>|z|) # (Intercept) 3.806304 5.819063 0.654 0.5130 # poly(age1, 2, raw = T)1 -0.206718 0.198164 -1.043 0.2969 # poly(age1, 2, raw = T)2 0.003295 0.001924 1.713 0.0868 . # poly(bmi1, 2, raw = T)1 -0.247094 0.251485 -0.983 0.3258 # poly(bmi1, 2, raw = T)2 0.005231 0.004555 1.148 0.2508 # --- # Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1 # # (Dispersion parameter for binomial family taken to be 1) # # Null deviance: 568.61 on 449 degrees of freedom # Residual deviance: 461.12 on 445 degrees of freedom # AIC: 471.12 # # Number of Fisher Scoring iterations: 4 dataset.train = dataset.train %>% mutate(death.model2.predict.prob = predict(model2, type = "response"))
  • 12. Visualize the logistic model (2) with the fitted probability
dataset.train %>% select(randid, death, age1, sex1, bmi1) %>%
  mutate(death.model2.predict = predict(model2, type = "response")) %>%
  ggplot(aes(x = age1, y = bmi1, color = death.model2.predict)) +
  geom_point(data = dataset.train %>% filter(bmi1.is_imputed), color = "black") +
  geom_point(size = 10, alpha = .2) +
  scale_color_gradient(low = "#0000ff80", high = "#ff000080") +
  theme_minimal()
  • 13. Visualize the logistic model (2) with a cutoff of mean
cutoff.value = dataset.train$death %>% mean
dataset.train %>% select(randid, death, age1, sex1, bmi1) %>%
  mutate(death.model2.predict = predict(model2, type = "response") > cutoff.value) %>%
  ggplot(aes(x = age1, y = bmi1, color = death.model2.predict)) +
  geom_point(data = dataset.train %>% filter(bmi1.is_imputed), color = "black") +
  geom_point(size = 10, alpha = .2) +
  scale_color_manual(values = c("#0000ff80", "#ff000080")) +
  theme_minimal()
  • 14. Visualize the logistic model (2) with a cutoff of 0.5
cutoff.value = 0.5
dataset.train %>% select(randid, death, age1, sex1, bmi1) %>%
  mutate(death.model2.predict = predict(model2, type = "response") > cutoff.value) %>%
  ggplot(aes(x = age1, y = bmi1, color = death.model2.predict)) +
  geom_point(data = dataset.train %>% filter(bmi1.is_imputed), color = "black") +
  geom_point(size = 10, alpha = .2) +
  scale_color_manual(values = c("#0000ff80", "#ff000080")) +
  theme_minimal()
  • 16. Performance of a binary classification test: Sensitivity, specificity, PPV, NPV, ... https://en.wikipedia.org/wiki/Receiver_operating_characteristic
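(Editor's note: an added illustration, not from the original slides. The four headline metrics fall straight out of the confusion-matrix counts; function.binary_metrics below is a hypothetical helper that assumes 0/1 labels and a logical prediction vector.)
# function.binary_metrics is a hypothetical helper, not part of the original deck.
function.binary_metrics = function(actual, predicted) {
  TP = sum(actual == 1 & predicted);  FP = sum(actual == 0 & predicted)
  FN = sum(actual == 1 & !predicted); TN = sum(actual == 0 & !predicted)
  c(Sensitivity = TP/(TP+FN), Specificity = TN/(TN+FP),
    PPV = TP/(TP+FP), NPV = TN/(TN+FN))
}
# Example: metrics for model1 at the prevalence cutoff.
function.binary_metrics(dataset.train$death,
                        predict(model1, type = "response") > mean(dataset.train$death))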
  • 17. Receiver operating characteristic (ROC) https://en.wikipedia.org/wiki/Receiver_operating_characteristic
  • Let a continuous random variable X denote the probability estimated for the observation.
  • Given a threshold parameter T, the observation is classified as "positive" if X > T, and "negative" otherwise.
  • X follows a probability density f_1(x) if the instance actually belongs to class "positive", and f_0(x) otherwise.
  • Therefore, the true positive rate is $TPR(T) = \int_T^{\infty} f_1(x)\,dx$
  • and the false positive rate is $FPR(T) = \int_T^{\infty} f_0(x)\,dx$.
  • The ROC curve plots TPR(T) versus FPR(T) parametrically in T.
  • 18. Area under the ROC curve (AUROC) https://en.wikipedia.org/wiki/Receiver_operating_characteristic
  • The ROC curve plots TPR(T) versus FPR(T) parametrically in T.
  • When using normalized units, the AUROC equals the probability that the classifier ranks a randomly chosen positive instance higher than a randomly chosen negative one (assuming "positive" ranks higher than "negative").
  • TPR(T) supplies the y-coordinate and FPR(T) the x-coordinate; a larger T corresponds to a smaller x.
  • $AUROC = \int_{x=0}^{1} TPR(FPR^{-1}(x))\,dx = \int_{\infty}^{-\infty} TPR(T)\,FPR'(T)\,dT = \int_{-\infty}^{\infty}\int_{-\infty}^{\infty} I(T' > T)\,f_1(T')\,f_0(T)\,dT'\,dT = P(X_1 > X_0)$,
    where X_1 is the estimated probability for a positive observation, X_0 is the estimated probability for a negative observation, and X follows density f_1(x) if the instance is actually positive and f_0(x) otherwise.
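(Editor's note: a self-contained numerical check of the AUROC = P(X_1 > X_0) identity, added here and not part of the original slides; it uses toy data so it does not depend on anything defined later.)
# Pairwise rank probability on toy scores; ties count as 1/2, matching the trapezoidal ROC area.
set.seed(1)
y = rbinom(100, 1, 0.4)                  # toy 0/1 labels
p = plogis(rnorm(100, mean = y))         # toy scores, shifted higher for positives
x1 = p[y == 1]; x0 = p[y == 0]
mean(outer(x1, x0, ">") + 0.5 * outer(x1, x0, "=="))
# This value equals the area under the empirical ROC curve of (y, p).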
  • 19. Calculate the training AUROC
#@ functions for calculating AUROC ====
function.vec_actual_prediction.threshold_roc = function(vec_actual, vec_prediction) {
  # Candidate thresholds: midpoints between consecutive unique predictions, plus -Inf and Inf.
  out = tibble(
    threshold = vec_prediction %>% unique %>% sort(decreasing = F) %>%
      {(. + lag(.))/2} %>% replace_na(-Inf) %>% {c(., Inf)}
  ) %>% mutate(
    TP = threshold %>% map_dbl(function(i) {sum(vec_actual == T & vec_prediction >= i)})
    , FP = threshold %>% map_dbl(function(i) {sum(vec_actual != T & vec_prediction >= i)})
    , FN = threshold %>% map_dbl(function(i) {sum(vec_actual == T & vec_prediction < i)})
    , TN = threshold %>% map_dbl(function(i) {sum(vec_actual != T & vec_prediction < i)})
    , Sensitivity = TP/(TP+FN)
    , Specificity = TN/(TN+FP)
  )
  out
}
function.threshold_roc.auc = function(object.threshold_roc) {
  tmp_df = object.threshold_roc %>% mutate(
    TPR = Sensitivity
    , FPR = 1 - Specificity
  ) %>% arrange(FPR, TPR) %>% mutate(
    dFPR = c(diff(FPR), 0)
    , dTPR = c(diff(TPR), 0)
  )
  # Trapezoidal rule over the ROC points, sorted so the best predictions come first.
  tmp_df %>% with(sum(TPR * dFPR) + sum(dTPR * dFPR)/2)
}
  • 20. Calculate the training AUROC (1)
dataset.train.threshold_roc.model1 = function.vec_actual_prediction.threshold_roc(dataset.train$death, dataset.train$death.model1.predict.prob)
dataset.train.threshold_roc.model1 %>% as.tibble
# # A tibble: 449 x 7
#    threshold    TP    FP    FN    TN Sensitivity Specificity
#        <dbl> <dbl> <dbl> <dbl> <dbl>       <dbl>       <dbl>
#  1   -Inf      147   303     0     0       1         0
#  2    0.0388   146   303     1     0       0.993     0
#  3    0.0406   146   302     1     1       0.993     0.00330
#  4    0.0456   146   301     1     2       0.993     0.00660
#  5    0.0495   146   300     1     3       0.993     0.00990
#  6    0.0506   146   299     1     4       0.993     0.0132
#  7    0.0521   146   298     1     5       0.993     0.0165
#  8    0.0540   145   298     2     5       0.986     0.0165
#  9    0.0554   145   297     2     6       0.986     0.0198
# 10    0.0566   145   296     2     7       0.986     0.0231
# # ... with 439 more rows
dataset.train.threshold_roc.model1 %>% function.threshold_roc.auc
# [1] 0.7788554
  • 21. Calculate the training AUROC (2)
dataset.train.threshold_roc.model2 = function.vec_actual_prediction.threshold_roc(dataset.train$death, dataset.train$death.model2.predict.prob)
dataset.train.threshold_roc.model2 %>% as.tibble
# # A tibble: 449 x 7
#    threshold    TP    FP    FN    TN Sensitivity Specificity
#        <dbl> <dbl> <dbl> <dbl> <dbl>       <dbl>       <dbl>
#  1   -Inf      147   303     0     0       1         0
#  2    0.0917   146   303     1     0       0.993     0
#  3    0.0931   146   302     1     1       0.993     0.00330
#  4    0.0939   146   301     1     2       0.993     0.00660
#  5    0.0944   146   300     1     3       0.993     0.00990
#  6    0.0948   146   299     1     4       0.993     0.0132
#  7    0.0953   146   298     1     5       0.993     0.0165
#  8    0.0955   146   297     1     6       0.993     0.0198
#  9    0.0956   146   296     1     7       0.993     0.0231
# 10    0.0958   146   295     1     8       0.993     0.0264
# # ... with 439 more rows
dataset.train.threshold_roc.model2 %>% function.threshold_roc.auc
# [1] 0.7787656
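(Editor's note: an optional cross-check, added here and not in the original deck. The hand-rolled AUROC can be compared against an established implementation such as the pROC package, assuming it is installed; minor differences can arise from tie handling.)
# Cross-check the custom trapezoidal AUROC against pROC, if available.
if (require(pROC, quietly = TRUE)) {
  roc.obj = roc(dataset.train$death, dataset.train$death.model1.predict.prob)
  auc(roc.obj)  # should agree with function.threshold_roc.auc above
}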
  • 23. Imputation for the test dataset
#@ Loading the test dataset (after modeling is finished~!) -----
dataset.test = readRDS(url("https://github.com/mkim0710/PH207x/blob/master/fhs.index100ge11le20.rds?raw=true"))
# #@ imputation.model = glm(bmi1 ~ poly(age1, 2) + sex1, data = dataset.train) ====
dataset.test = dataset.test %>% mutate(
  bmi1.old = bmi1
  , bmi1.is_imputed = bmi1 %>% {ifelse(is.na(.), T, F)}
  , bmi1 = bmi1 %>% {ifelse(is.na(.), predict(imputation.model, newdata = dataset.test), .)}
)
dataset.test %>% select(randid, death, age1, sex1, matches("bmi1")) %>% filter(bmi1.is_imputed)
# # A tibble: 2 x 7
#     randid death  age1  sex1  bmi1 bmi1.old bmi1.is_imputed
#      <dbl> <dbl> <dbl> <dbl> <dbl>    <dbl> <lgl>
# 1  2668575     1    64     2  26.2       NA TRUE
# 2  6026757     0    39     2  24.1       NA TRUE
  • 24. Calculate the test AUROC
#@ predict using dataset.test ----
dataset.test = dataset.test %>% mutate(death.model1.predict.prob = predict(model1, type = "response", newdata = .))
dataset.test = dataset.test %>% mutate(death.model2.predict.prob = predict(model2, type = "response", newdata = .))
dataset.test %>% select(randid, death, age1, sex1, bmi1, matches("predict.prob"))
# # A tibble: 450 x 7
#     randid death  age1  sex1  bmi1 death.model1.predict.p~ death.model2.predict.~
#      <dbl> <dbl> <dbl> <dbl> <dbl>                   <dbl>                  <dbl>
#  1   16799     0    50     2  22.9                  0.263                  0.230
#  2   69134     1    59     2  20.8                  0.523                  0.551
#  3   97895     1    65     2  30.5                  0.776                  0.835
#  4  110542     1    63     2  27.1                  0.701                  0.733
#  5  170881     1    63     2  29.4                  0.718                  0.753
#  6  192229     1    39     1  32.5                  0.104                  0.148
#  7  209115     1    60     1  28.6                  0.624                  0.617
#  8  309808     0    43     1  25.5                  0.134                  0.131
#  9  388279     0    39     1  28.3                  0.0910                 0.115
# 10  431963     0    48     1  24.1                  0.222                  0.192
# # ... with 440 more rows
  • 25. Calculate the test AUROC (1)
dataset.test.threshold_roc.model1 = function.vec_actual_prediction.threshold_roc(dataset.test$death, dataset.test$death.model1.predict.prob)
dataset.test.threshold_roc.model1 %>% as.tibble
# # A tibble: 447 x 7
#    threshold    TP    FP    FN    TN Sensitivity Specificity
#        <dbl> <dbl> <dbl> <dbl> <dbl>       <dbl>       <dbl>
#  1   -Inf      159   291     0     0       1         0
#  2    0.0414   159   290     0     1       1         0.00344
#  3    0.0434   159   289     0     2       1         0.00687
#  4    0.0466   159   288     0     3       1         0.0103
#  5    0.0489   158   288     1     3       0.994     0.0103
#  6    0.0509   158   287     1     4       0.994     0.0137
#  7    0.0517   158   286     1     5       0.994     0.0172
#  8    0.0524   158   285     1     6       0.994     0.0206
#  9    0.0531   158   284     1     7       0.994     0.0241
# 10    0.0532   158   283     1     8       0.994     0.0275
# # ... with 437 more rows
dataset.test.threshold_roc.model1 %>% function.threshold_roc.auc
# [1] 0.7603255
  • 26. Calculate the test AUROC (2)
dataset.test.threshold_roc.model2 = function.vec_actual_prediction.threshold_roc(dataset.test$death, dataset.test$death.model2.predict.prob)
dataset.test.threshold_roc.model2 %>% as.tibble
# # A tibble: 447 x 7
#    threshold    TP    FP    FN    TN Sensitivity Specificity
#        <dbl> <dbl> <dbl> <dbl> <dbl>       <dbl>       <dbl>
#  1   -Inf      159   291     0     0       1         0
#  2    0.0898   159   290     0     1       1         0.00344
#  3    0.0920   159   289     0     2       1         0.00687
#  4    0.0932   159   288     0     3       1         0.0103
#  5    0.0935   159   287     0     4       1         0.0137
#  6    0.0942   159   286     0     5       1         0.0172
#  7    0.0953   159   285     0     6       1         0.0206
#  8    0.0959   159   284     0     7       1         0.0241
#  9    0.0964   159   283     0     8       1         0.0275
# 10    0.0971   159   282     0     9       1         0.0309
# # ... with 437 more rows
dataset.test.threshold_roc.model2 %>% function.threshold_roc.auc
# [1] 0.7608442
  • 27. ROC curves overlaid
dataset.test.threshold_roc.model1 %>% arrange(1 - Specificity, Sensitivity) %>%
  ggplot(aes(x = 1 - Specificity, y = Sensitivity)) +
  geom_line(color = "red") +
  geom_line(color = "blue", data = dataset.test.threshold_roc.model2 %>% arrange(1 - Specificity, Sensitivity)) +
  coord_cartesian(xlim = c(0,1), ylim = c(0,1))
  • 29. Fit multiple models using a for-loop
#@ Fit multiple models using a for-loop, and then save the models as an R list of objects. =====
model.list = list()
for (i in 1:10) {
  myformula = as.formula(paste0("death ~ poly(age1, ", i, ")", " + ", "poly(bmi1, ", i, ")"))
  model.list[[i]] = glm(myformula, data = dataset.train, family = "binomial")
}
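(Editor's note: the same loop can be written functionally, an added sketch using purrr, which loading the tidyverse already attaches.)
# Equivalent functional version with purrr::map; behavior matches the loop above.
model.list = map(1:10, function(i) {
  myformula = as.formula(paste0("death ~ poly(age1, ", i, ") + poly(bmi1, ", i, ")"))
  glm(myformula, data = dataset.train, family = "binomial")
})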
  • 30. Calculate the training AUROC & test AUROC for multiple models
# Make a table that shows the training AUROC and test AUROC for each model in the model.list. -----
df = data.frame(
  i = 1:length(model.list)
  , trainAUROC = model.list %>% map_dbl(function(model.object) {
      dataset.train %>%
        { function.vec_actual_prediction.threshold_roc(.$death, predict(model.object, type = "response", newdata = .)) } %>%
        function.threshold_roc.auc
    })
  , testAUROC = model.list %>% map_dbl(function(model.object) {
      dataset.test %>%
        { function.vec_actual_prediction.threshold_roc(.$death, predict(model.object, type = "response", newdata = .)) } %>%
        function.threshold_roc.auc
    })
)
df
#     i trainAUROC testAUROC
# 1   1  0.7788554 0.7603255
# 2   2  0.7787656 0.7608442
# 3   3  0.7779349 0.7668525
# 4   4  0.7819762 0.7646480
# 5   5  0.7822680 0.7607577
# 6   6  0.7821109 0.7607577
# 7   7  0.7861296 0.7568242
# 8   8  0.7820884 0.7510752
# 9   9  0.7829640 0.7449156
# 10 10  0.8003413 0.7516156
#@ Remove the test dataset (before any additional modeling~!) -----
rm(dataset.test)
  • 31. Visualize the training AUROC & test AUROC for multiple models
df %>% gather(key, value, trainAUROC, testAUROC) %>%
  ggplot(aes(x = i, y = value, color = key)) + geom_point() + geom_line()
# Note how the training AUROC trends upward with model complexity while the test AUROC
# peaks at the third-degree polynomial and then falls off: the classic signature of overfitting.
  • 33. Visualize the training dataset with the labels
dataset.train %>% ggplot(aes(x = age1, y = bmi1, color = death)) +
  geom_point(data = dataset.train %>% filter(bmi1.is_imputed), color = "black") +
  geom_point(size = 10, alpha = .2) +
  scale_color_gradient(low = "#0000ff80", high = "#ff000080") +
  theme_minimal()
  • 34. Visualize the logistic model (10) with a cutoff of mean
i = 10
cutoff.value = dataset.train$death %>% mean
dataset.train %>% select(randid, death, age1, sex1, bmi1) %>%
  mutate(death.predict = predict(model.list[[i]], type = "response") > cutoff.value) %>%
  ggplot(aes(x = age1, y = bmi1, color = death.predict)) +
  geom_point(data = dataset.train %>% filter(bmi1.is_imputed), color = "black") +
  geom_point(size = 10, alpha = .2) +
  scale_color_manual(values = c("#0000ff80", "#ff000080")) +
  theme_minimal() +
  labs(title = paste0("model.list[[", i, "]]", " with a cutoff of mean"))
  • 35. Visualize the logistic model (9) with a cutoff of mean
i = 9
cutoff.value = dataset.train$death %>% mean
dataset.train %>% select(randid, death, age1, sex1, bmi1) %>%
  mutate(death.predict = predict(model.list[[i]], type = "response") > cutoff.value) %>%
  ggplot(aes(x = age1, y = bmi1, color = death.predict)) +
  geom_point(data = dataset.train %>% filter(bmi1.is_imputed), color = "black") +
  geom_point(size = 10, alpha = .2) +
  scale_color_manual(values = c("#0000ff80", "#ff000080")) +
  theme_minimal() +
  labs(title = paste0("model.list[[", i, "]]", " with a cutoff of mean"))
  • 36. Visualize the logistic model (8) with a cutoff of mean
i = 8
cutoff.value = dataset.train$death %>% mean
dataset.train %>% select(randid, death, age1, sex1, bmi1) %>%
  mutate(death.predict = predict(model.list[[i]], type = "response") > cutoff.value) %>%
  ggplot(aes(x = age1, y = bmi1, color = death.predict)) +
  geom_point(data = dataset.train %>% filter(bmi1.is_imputed), color = "black") +
  geom_point(size = 10, alpha = .2) +
  scale_color_manual(values = c("#0000ff80", "#ff000080")) +
  theme_minimal() +
  labs(title = paste0("model.list[[", i, "]]", " with a cutoff of mean"))
  • 37. Visualize the logistic model (5) with a cutoff of mean
i = 5
cutoff.value = dataset.train$death %>% mean
dataset.train %>% select(randid, death, age1, sex1, bmi1) %>%
  mutate(death.predict = predict(model.list[[i]], type = "response") > cutoff.value) %>%
  ggplot(aes(x = age1, y = bmi1, color = death.predict)) +
  geom_point(data = dataset.train %>% filter(bmi1.is_imputed), color = "black") +
  geom_point(size = 10, alpha = .2) +
  scale_color_manual(values = c("#0000ff80", "#ff000080")) +
  theme_minimal() +
  labs(title = paste0("model.list[[", i, "]]", " with a cutoff of mean"))
  • 38. Visualize the logistic model (3) with a cutoff of mean
i = 3
cutoff.value = dataset.train$death %>% mean
dataset.train %>% select(randid, death, age1, sex1, bmi1) %>%
  mutate(death.predict = predict(model.list[[i]], type = "response") > cutoff.value) %>%
  ggplot(aes(x = age1, y = bmi1, color = death.predict)) +
  geom_point(data = dataset.train %>% filter(bmi1.is_imputed), color = "black") +
  geom_point(size = 10, alpha = .2) +
  scale_color_manual(values = c("#0000ff80", "#ff000080")) +
  theme_minimal() +
  labs(title = paste0("model.list[[", i, "]]", " with a cutoff of mean"))
  • 39. Visualize the logistic model (2) with a cutoff of mean
i = 2
cutoff.value = dataset.train$death %>% mean
dataset.train %>% select(randid, death, age1, sex1, bmi1) %>%
  mutate(death.predict = predict(model.list[[i]], type = "response") > cutoff.value) %>%
  ggplot(aes(x = age1, y = bmi1, color = death.predict)) +
  geom_point(data = dataset.train %>% filter(bmi1.is_imputed), color = "black") +
  geom_point(size = 10, alpha = .2) +
  scale_color_manual(values = c("#0000ff80", "#ff000080")) +
  theme_minimal() +
  labs(title = paste0("model.list[[", i, "]]", " with a cutoff of mean"))
  • 40. Visualize the logistic model (1) with a cutoff of mean
i = 1
cutoff.value = dataset.train$death %>% mean
dataset.train %>% select(randid, death, age1, sex1, bmi1) %>%
  mutate(death.predict = predict(model.list[[i]], type = "response") > cutoff.value) %>%
  ggplot(aes(x = age1, y = bmi1, color = death.predict)) +
  geom_point(data = dataset.train %>% filter(bmi1.is_imputed), color = "black") +
  geom_point(size = 10, alpha = .2) +
  scale_color_manual(values = c("#0000ff80", "#ff000080")) +
  theme_minimal() +
  labs(title = paste0("model.list[[", i, "]]", " with a cutoff of mean"))
  • 43. K-fold "random" split of the training dataset
#@ K-fold "random" split of the training dataset -----
function.vec.fold.index = function(data, k = 5) data %>% { rep(1:k, (nrow(.) %/% k) + 1)[1:nrow(.)] }
dataset.train %>% function.vec.fold.index(k = 5) %>% dput
set.seed(12345); dataset.train %>% function.vec.fold.index(k = 5) %>% sample %>% dput
# Before shuffling, the fold index simply cycles 1..5 down the 450 rows:
# c(1L, 2L, 3L, 4L, 5L, 1L, 2L, 3L, 4L, 5L, 1L, 2L, 3L, 4L, 5L,
#   ... (the 1L..5L cycle repeats for all 450 rows) ...
#   1L, 2L, 3L, 4L, 5L)
# After shuffling, the same multiset of fold indices appears in random order:
# c(5L, 4L, 1L, 2L, 4L, 5L, 5L, 1L, 2L, 2L, 1L, 2L, 3L, 1L, 1L,
#   ... (remaining shuffled indices omitted) ...
#   2L, 2L, 4L)
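(Editor's note: a compact alternative, added here and not from the deck, folds the cycle-and-shuffle into one expression using length.out.)
# Equivalent one-liner: recycle 1..k to the number of rows, then shuffle.
set.seed(12345)
fold.index = sample(rep(1:5, length.out = nrow(dataset.train)))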
  • 44. K-fold "random" split of the training dataset
#@ K-fold split of the training dataset -----
dataset.train = dataset.train %>% rownames_to_column
# Do not forget to set the random seed before performing any randomization tasks (e.g., random sampling).
set.seed(12345); dataset.train$fold.index = dataset.train %>% function.vec.fold.index(k = 5) %>% sample
dataset.train %>% select(rowname, death, age1, bmi1, fold.index)
# # A tibble: 450 x 5
#    rowname death  age1  bmi1 fold.index
#    <chr>   <dbl> <dbl> <dbl>      <int>
#  1 1           1    39  22.4          5
#  2 2           1    47  24.2          4
#  3 3           1    52  40.1          1
#  4 4           0    42  28.9          2
#  5 5           1    53  21.5          4
#  6 6           0    47  19.7          5
#  7 7           0    56  23.6          5
#  8 8           0    41  30.6          1
#  9 9           0    53  18.2          2
# 10 10          0    46  20.2          2
# # ... with 440 more rows
  • 45. K-fold "random" split of the training dataset
dataset.train %>% select(rowname, death, age1, bmi1, fold.index) %>% filter(fold.index != 1)
dataset.train %>% select(rowname, death, age1, bmi1, fold.index) %>% filter(fold.index == 1)
# > dataset.train %>% select(rowname, death, age1, bmi1, fold.index) %>% filter(fold.index != 1)
# # A tibble: 360 x 5
#    rowname death  age1  bmi1 fold.index
#    <chr>   <dbl> <dbl> <dbl>      <int>
#  1 1           1    39  22.4          5
#  2 2           1    47  24.2          4
#  3 4           0    42  28.9          2
#  4 5           1    53  21.5          4
#  5 6           0    47  19.7          5
#  6 7           0    56  23.6          5
#  7 9           0    53  18.2          2
#  8 10          0    46  20.2          2
#  9 12          0    40  26.8          2
# 10 13          1    56  33.0          3
# # ... with 350 more rows
# > dataset.train %>% select(rowname, death, age1, bmi1, fold.index) %>% filter(fold.index == 1)
# # A tibble: 90 x 5
#    rowname death  age1  bmi1 fold.index
#    <chr>   <dbl> <dbl> <dbl>      <int>
#  1 3           1    52  40.1          1
#  2 8           0    41  30.6          1
#  3 11          0    41  20.5          1
#  4 14          1    47  22.0          1
#  5 15          0    38  22.8          1
#  6 20          1    56  30.8          1
#  7 21          0    60  27.3          1
#  8 22          0    64  27.5          1
#  9 26          0    39  31.0          1
# 10 28          0    55  27.8          1
# # ... with 80 more rows
  • 46. K-fold "random" split of the training dataset
## Visual check of the distribution of the folds ----
dataset.train %>% ggplot(aes(x = age1, y = bmi1, color = as.factor(fold.index))) + geom_point()
  • 48. Fit multiple models in each cross-validation fold
#@ Nested for-loop: (1) iterate over the folds for cross-validation; (2) fit multiple models within each fold. =====
# Save the models as a "nested" list of objects to hold the results of the nested for-loop.
k = 5
max.polynomial = 10  # matches the 50-row cv table and the 10-polynomial summary on the following slides
cv.model.list = list()
for (i.fold in sort(unique(dataset.train$fold.index))) {
  cv.model.list[[i.fold]] = list()
  dataset = dataset.train %>% filter(fold.index != i.fold) %>% as.data.frame
  for (i in 1:max.polynomial) {
    myformula = as.formula(paste0("death ~ poly(age1, ", i, ")", " + ", "poly(bmi1, ", i, ")"))
    cv.model.list[[i.fold]][[i]] = glm(myformula, data = dataset, family = "binomial")
  }
}
  • 49. Calculate the training AUROC & validation AUROC for multiple models in each cross-validation fold
#@ Define the evaluation metric. -----
# Cf) You may define any function to avoid repetitive code. Here the metric is AUROC,
# built from the helper functions defined earlier (the original slide mistakenly used a
# mean-squared-error function and the mpg variable from a different example).
AUROC = function(y, yhat) function.vec_actual_prediction.threshold_roc(y, yhat) %>% function.threshold_roc.auc
# Make a table that shows the training AUROC and validation AUROC for each fold & each model in the "nested" model list. -----
cv.df = data_frame(
  cv = rep(1:k, each = max.polynomial)
  , polynomial = rep(1:max.polynomial, k)
) %>% mutate(
  trainAUROC = map2_dbl(cv, polynomial, function(i.fold, i) {
    cv.model.list[[i.fold]][[i]] %>% {AUROC(.$y, predict(., type = "response"))}
  })
  , cvAUROC = map2_dbl(cv, polynomial, function(i.fold, i) {
    AUROC(
      dataset.train %>% filter(fold.index == i.fold) %>% pull(death)
      , predict(cv.model.list[[i.fold]][[i]], newdata = dataset.train %>% filter(fold.index == i.fold), type = "response")
    )
  })
)
cv.df
# A 50 x 4 tibble: one row per (fold, polynomial) pair; the per-fold values
# aggregate to the summary shown on the next slide.
  • 50. Calculate the (aggregated) training AUROC & (aggregated) cv AUROC for multiple models
# Make a table that shows the (aggregated) training AUROC and cv AUROC for each model. -----
cv.df.summarize = cv.df %>% select(-cv) %>% group_by(polynomial) %>% summarize_all(mean)
cv.df.summarize
# # A tibble: 10 x 3
#    polynomial trainAUROC cvAUROC
#         <int>      <dbl>   <dbl>
#  1          1      0.779   0.782
#  2          2      0.779   0.783
#  3          3      0.779   0.780
#  4          4      0.783   0.777
#  5          5      0.783   0.772
#  6          6      0.786   0.763
#  7          7      0.789   0.758
#  8          8      0.787   0.753
#  9          9      0.787   0.749
# 10         10      0.803   0.761
  • 51. Visualize the (aggregated) training AUROC & cv AUROC for multiple models
cv.df.summarize %>% gather(key, value, trainAUROC, cvAUROC) %>%
  ggplot(aes(x = polynomial, y = value, color = key)) + geom_point() + geom_line()
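(Editor's note: an added closing sketch, not in the original slides, showing how the cross-validated AUROC drives the final model choice: pick the degree with the highest cvAUROC, then refit on the full training data.)
# Select the polynomial degree that maximizes the cross-validated AUROC (degree 2 in the table above).
best.polynomial = cv.df.summarize$polynomial[which.max(cv.df.summarize$cvAUROC)]
best.formula = as.formula(paste0("death ~ poly(age1, ", best.polynomial, ") + poly(bmi1, ", best.polynomial, ")"))
model.final = glm(best.formula, data = dataset.train, family = "binomial")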