The document describes using bootstrap aggregation (bagging) with naive Bayes classification on a heart disease dataset. It performs 100 bootstrap iterations, training a naive Bayes model on each resampled dataset and making predictions on the out-of-bag data. Performance is evaluated using mean and variance of accuracy, kappa, and other metrics across the 100 models. Leave-one-out cross-validation is also used to directly evaluate the naive Bayes model on each observation.
Instead of trees or other weak classifiers, we take naive Bayes, which is not necessarily a weak learner, and evaluate what happens when we cross-validate a not-so-weak learner.
The document describes configuration and usage of the memcached caching server. It shows commands to start memcached, set listening addresses and ports, set memory limits, and check status and settings via the telnet protocol. It also shows integrating memcached monitoring into Nagios/Icinga using checks for TCP connections and specific status metrics.
Comparative Genomics with GMOD and BioPerl, by Jason Stajich
BioPerl is an open source toolkit for bioinformatics data manipulation written in Perl. It contains modules for reading and writing sequence data in common formats, manipulating sequences, parsing BLAST reports and multiple sequence alignments. BioPerl objects represent sequences, features, annotations and search results in a flexible and extensible way. The toolkit is widely used for tasks like sequence analysis, parsing bioinformatics software output, and accessing biological databases.
This document discusses Java Bean Validation for validating objects and properties in Java applications. It covers the main validation annotations like @NotNull, @Size, @Email, and how to implement custom validators. It also provides examples of validating objects in JSF and JUnit test cases. The document is a guide to using Bean Validation in Java applications.
The document contains configuration commands and instructions for network services and security tools like Squid, Snort, iptables etc. It discusses configuring proxy, firewall and intrusion prevention rules to allow or block certain sites, file types and ports. It also contains commands to restart services like Squid, DNS, mail etc and check their status. System monitoring commands like ps, netstat are also included to check if processes are running.
This document provides information about Redis, including what it is, who uses it, data types supported, commands, and examples of usage. Some key points:
- Redis is an open source, in-memory data structure store used as a database, cache, message broker, and queue. It supports strings, hashes, lists, sets, sorted sets, and geospatial indexes.
- Major companies that use Redis include Twitter, GitHub, Pinterest, Snapchat, and Craigslist for use cases like caching, pub/sub, and queuing.
- Redis has advantages over Memcached like the ability to persist data to disk and support data types beyond strings.
- Examples demonstrate basic Redis data
This document discusses using Gevent and RabbitMQ for asynchronous RPC. It describes some limitations of Celery and how Gevent can help overcome them. Gevent is a coroutine-based Python library that uses greenlets to provide asynchronous I/O. RabbitMQ is a message broker that can be used for asynchronous RPC. The document proposes a model for asynchronous RPC using Gevent, RabbitMQ, and greenlets. It provides examples of building applications using this approach, including dispatching tasks and handling results.
The log file reports multiple assert failures in egl::Surface::resetSwapChain and gl::DefaultFramebuffer::completeness functions. These failures indicate that additional swap chains could not be created due to insufficient memory or that the framebuffer completeness check failed.
Data manipulation and visualization in R 20190711 myanmarucsy, by SmartHinJ
This document discusses data manipulation and visualization in R. It begins by introducing R and some of its basic functions and syntax for working with data, including creating variables, vectors, and data frames. It then covers reading in data, exploring and selecting subsets of data, and performing basic operations on vectors and data frames. The goal is to provide an overview of the essential R skills needed for data manipulation and visualization.
Bytes in the Machine: Inside the CPython interpreter, by akaptur
This document discusses Byterun, an interpreter for Python written in Python. It explains key concepts in interpreting Python like lexing, parsing, compiling and interpreting. It describes how a Python virtual machine works using a stack and frames. It shows Python bytecode and how an interpreter executes instructions like LOAD_FAST, BINARY_MODULO, and RETURN_VALUE. It demonstrates that instructions must account for Python's dynamic nature, like strings being able to use % formatting like integers. The goal is to build an interpreter that can run Python programs directly in Python.
Allison Kaptur: Bytes in the Machine: Inside the CPython interpreter, PyGotha..., by akaptur
Byterun is a Python interpreter written in Python with Ned Batchelder. It's architected to mirror the structure of CPython (and be more readable, too)! Learn how the interpreter is constructed, how ignorant the Python compiler is, and how you use a 1,500 line switch statement every day.
Dask is a task scheduler that seamlessly parallelizes Python functions across threads, processes, or cluster nodes. It also offers a DataFrame class (similar to Pandas) that can handle data sets larger than the available memory.
This document discusses using Celery, an asynchronous task queue, to build a distributed workflow for baking pies. It describes Celery's architecture and components like brokers, workers, tasks, and queues. It provides examples of defining tasks, building workflows with primitives like groups and chords, and routing tasks to different queues. The document also covers options for asynchronous and synchronous task execution, periodic tasks, concurrency models, and Celery signals.
Python uses different allocators and memory pools to manage object memory. Small integer and single character objects are stored in pools directly initialized by the interpreter to save memory. Other objects like strings and containers are stored on the heap with reference counting. The garbage collector uses reference counting and mark-and-sweep to collect unreachable objects and free memory.
Sangam 19 - Successful Applications on Autonomous, by Connor McDonald
The autonomous database offers insane levels of performance, but you won't be able to attain that if you are not constructing your SQL statements in a way that is scalable...and, more importantly, secure from hacking.
Another year goes by, and most likely, another data access framework has been invented. It will claim to be the fastest, smartest way to talk to the database, and just like all those that came before it, it will not be. Because the best database access tool has been there for more than 30 years now, and that is PL/SQL. Although we all sometimes fall prey to the mindset of "Oh look, a shiny new tool, we should start using it," the performance and simplicity of PL/SQL remain unmatched. This session looks at the failings of other data access languages, why even a cursory knowledge of PL/SQL will make you a better developer, and how to get the most out of PL/SQL when it comes to database performance.
Detection of errors and potential vulnerabilities in C and C++ code using the..., by Andrey Karpov
The document discusses static analysis of C/C++ code using the PVS-Studio analyzer. It provides examples of errors found by PVS-Studio in various projects, including uninitialized buffers, potential null pointer dereferences, and array overruns. It also describes some of the techniques used by PVS-Studio, such as type inference, data flow analysis, symbolic execution, and pattern-based analysis to detect errors. Method annotations are used to provide information about standard library functions to improve analysis accuracy.
Simple Ways To Be A Better Programmer (OSCON 2007), by Michael Schwern
"Simple Ways To Be A Better Programmer' as presented at OSCON 2007 by Michael G Schwern.
The audio is still out of sync, working on it. Downloading will be available once the sync is done.
Not Just Reactive: Spring 5 & Spring Boot 2 New Features Explained, by Masatoshi Tada
This document contains diagrams and notes from a presentation about new features in Spring Framework 5.0 for Java 8 and 9. It discusses updated features in areas like core, web, data access, security, testing and Spring Boot to take advantage of newer Java versions. Specific topics mentioned include HTTP/2 support, Bean Validation 2.0, OAuth 2.0 authentication, JUnit 5 integration, and Java 9 module system compatibility in Spring Boot applications.
This document provides instructions on how to start, stop, restart, and validate a Solr server. It also describes how to create and delete cores/collections, modify schemas, index data, and perform queries, sorting, highlighting, and faceted search on indexed data.
The document summarizes the author's experience playing a capture the flag (CTF) competition called the 44Con CTF. It describes recon activities like scanning services to identify vulnerabilities. Several services are found to have exploitable issues, including a pastie service with SQL injection, a mail server with remote code execution, and an authentication service with a stack buffer overflow. The author is able to exploit these issues to steal flags, gain a remote shell, and eventually escalate privileges to root through service restart hijacking and a mail service vulnerability. Overall it provides a play-by-play of the reconnaissance and exploitation steps taken during the CTF.
PostgreSQL Procedural Languages: Tips, Tricks and Gotchas, by Jim Mlodgenski
One of the most powerful features of PostgreSQL is its diversity of procedural languages, but with that diversity comes a lot of options.
Did you ever wonder:
- What all of those options are on the CREATE FUNCTION statement?
- How do they affect my application?
- Does my choice of procedural language affect the performance of my statements?
- Should I create a single trigger with IF statements or several simple triggers?
- How do I debug my code?
- Can I tell which line in my function is taking all of the time?
This document discusses storing product and order data as JSON in a database to support an agile development process. It describes creating tables with JSON columns to store this data, and using JSON functions like JSON_VALUE and JSON_TABLE to query and transform the JSON data. Examples are provided of indexing JSON columns for performance and updating product JSON to include unit costs by joining external data. The goal is to enable flexible and rapid evolution of the application through storing data in JSON.
The document discusses Spring Data JPA and entity relationship mappings. It provides examples of mapping entities with relationships like OneToMany and ManyToOne using annotations. It also demonstrates various JPA operations like persisting, updating, deleting entities and querying relationships with examples of the generated SQL.
Beyond PHP - it's not (just) about the code, by Wim Godden
Most PHP developers focus on writing code. But creating Web applications is about much more than just writing PHP. Take a step outside the PHP cocoon and into the big PHP ecosphere to find out how small code changes can make a world of difference on servers and network. This talk is an eye-opener for developers who spend over 80% of their time coding, debugging and testing.
This document demonstrates how to create various graphs and plots using the seaborn library in Python. It loads iris and tips datasets, then shows how to create boxplots, strip plots, violin plots, scatter plots, distribution plots, and pairwise relationship plots to visualize and compare variables in the datasets. Functions used include boxplot, stripplot, violinplot, jointplot, pairplot, distplot, and lmplot. Color palettes and customizing plot appearances are also briefly covered.
This document summarizes the solutions to 7 challenges from the RootedCon CTF 2010 competition by the Plaid Parliament of Pwning security group. The challenges involved gaining administrator access, exploiting login forms, reading fortune files, following links to files, using SQL injections to view data, making an online purchase, and decrypting packed JavaScript. The solutions used techniques like cookie manipulation, file backups, SQL injections, LDAP injections, and JavaScript unpacking.
Webinar: The Whys and Hows of Predictive Modelling, by Edureka!
Predictive analytics is a great technology that can help identify the origin of a problem before it actually happens. It draws on the collective experience of an organization to support better decisions in the future. It offers many strategic advantages, allowing a company to lead when the changes actually happen. Predictive analytics is considered a boon for organizations growing in a highly competitive market.
Topics covered:
1. Beyond OLS: What real life data-sets look like!
2. Decoding Forecasting
3. Handling real life datasets & Building Models in R
4. Forecasting techniques and Plots
Getting more out of Matplotlib with GR, by Josef Heinen
Matplotlib is the most popular graphics library for Python. It is the workhorse plotting utility of the scientific Python world. However, depending on the field of application, the software may be reaching its limits. This is the point where the GR framework will help. GR can be used as a backend for Matplotlib applications and significantly improve the performance and expand their capabilities.
This document compares several machine learning algorithms for a binary classification problem using a census dataset:
1. It builds logistic regression, decision tree, random forest, and boosted tree models on an 80% training set and evaluates their performance on a 10% test set.
2. Tuning is performed on decision tree and random forest models which improves their AUC.
3. The best performing models are boosted trees with an AUC of 0.922 and logistic regression with an AUC of 0.91, as evaluated on the held-out test set.
Easy HTML Tables in RStudio with Tabyl and kableExtra, by Barry DeCicco
This document loads libraries, displays the mtcars dataset header, extracts a subset of the data into a new dataframe, and performs several tabulations and summaries of variables in the mtcars dataset using the tidyverse suite of packages. Key operations include tabulating gear and cyl variables, adding row and column totals, calculating percentages, and formatting outputs for presentation.
This document discusses building regression and classification models in R, including linear regression, generalized linear models, and decision trees. It provides examples of building each type of model using various R packages and datasets. Linear regression is used to predict CPI data. Generalized linear models and decision trees are built to predict body fat percentage. Decision trees are also built on the iris dataset to classify flower species.
The document discusses deep neural network training and the backpropagation algorithm. It describes how gradient descent does not work well for deep neural networks. It then explains the process of training a deep neural network, including data preprocessing, forward propagation, backward propagation, and updating weights. Various activation functions such as sigmoid, ReLU, and ELU are also discussed. Hyperparameter tuning experiments are shown by varying the learning rate, number of epochs, and number of hidden nodes.
NYC Open Data Project II -- Predict Where to Get and Return My Citibike, by Vivian S. Zhang
NYC Data Science Academy, NYC Open Data Meetup, Big Data, Data Science, NYC, Vivian Zhang, SupStat Inc, NYC, GBM, Machine learning, Time Series, Citibike usage prediction, advanced R
This document contains PHP code for a web shell that provides various functions like file management, command execution, and database operations. It starts a session and sets the time limit and error reporting to 0. It then strips slashes from GET/POST/COOKIE variables. The rest of the code handles requests like file upload, download, rename, and delete, and displays menus to call these functions. It also shows server information and has an about page.
JQuery Flot is a charting library that allows creating line, bar, and pie charts. It works across many browsers from IE6+ and has plugins for additional chart types. The document discusses using Flot to display time-series data with tabs, radio buttons, and tooltips. Code examples are provided for building the charts, handling interactions, and blocking elements to indicate loading.
Spark DataFrames provide a structured API for analyzing large datasets using Spark. DataFrames allow users to parse, explore, transform, and summarize data through SQL queries and procedural processing. The demo shows analyzing 8GB of public tweet data using Spark DataFrames in Zeppelin notebooks. DataFrames simplify common data munging tasks and can also be used for machine learning, streaming data, and production data pipelines in Spark.
Using the following code Install Packages pip install .pdf, by picscamshoppe
Using the following code:
##Install Packages
!pip install tensorflow
!pip install matplotlib
!pip install numpy
!pip install pandas
##Import Statements
from datetime import datetime, timedelta
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import tensorflow as tf
##Bringing in our dataset
url = 'https://raw.githubusercontent.com/BeeDrHU/Introduction-to-Python-CSPC-323-/main/sales_forecast.csv'
data = pd.read_csv(url, sep=',')
##Filtering and Cleaning
data = data[['Store', 'Date', 'Temperature', 'Fuel_Price', 'CPI', 'Unemployment', 'Weekly_Sales', 'IsHoliday_y']]
data['Date'] = pd.to_datetime(data['Date'], format='%d/%m/%Y')
data = data.set_index('Date')
data = data.sort_index()
##Checklist and Quality Assurance
data.isnull()
print(f'Number of rows with missing values: {data.isnull().any(axis=1).mean()}')
data.info()
##Subsetting variable to predict
df = data['Weekly_Sales']
df.plot()
##Train and Test Split
start_train = datetime(2010, 2, 5)
end_train = datetime(2011, 12, 30)
end_test = datetime(2012, 7, 13)
msk_train = (data.index >= start_train) & (data.index <= end_train)
msk_test = (data.index >= end_train) & (data.index <= end_test)
df_train = df.loc[msk_train]
df_test = df.loc[msk_test]
df_train.plot()
df_test.plot()
##Normalizing our data
uni_data = df.values.astype(float)
df_train = int(len(df_train))
uni_train_mean = uni_data[:df_train].mean()
uni_train_std = uni_data[:df_train].std()
uni_data = (uni_data - uni_train_mean) / uni_train_std
##Build the features dataset to make the model multivariate
features_considered = ['Temperature', 'Fuel_Price', 'CPI']
features = data[features_considered]
features.index = data.index
##Standardizing the Data
dataset = features.values
data_mean = dataset[:df_train].mean(axis=0)
data_std = dataset[:df_train].std(axis=0)
dataset = (dataset - data_mean) / data_std
##Splitting data into training and testing
x_train = dataset[msk_train]
y_train = uni_data[:df_train]
x_test = dataset[msk_test]
y_test = uni_data[df_train:]
##Defining the model architecture
model = tf.keras.models.Sequential([
    tf.keras.layers.Dense(64, activation='relu', input_shape=[x_train.shape[1]]),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dense(1)
])
##Compiling the model
model.compile(optimizer=tf.keras.optimizers.RMSprop(),
              loss='mae')
##Fitting the model to the training data
history = model.fit(x_train, y_train,
                    epochs=100,
                    batch_size=64,
                    validation_split=0.2,
                    verbose=0)
##Evaluating the model on the test data
results = model.evaluate(x_test, y_test, verbose=0)
print(f'Test loss: {results}')
##Making predictions on new data
predictions = model.predict(x_test)
##Plotting the results
fig, ax = plt.subplots(figsize=(10,5))
ax.plot(np.arange(len(y_test)), y_test, label='Actual')
ax.plot(np.arange(len(predictions)), predictions, label='Predicted')
ax.legend()
plt.title('Actual vs Predicted Weekly Sales')
plt.xlabel('Week')
plt.ylabel('Normalized Sales')
plt.show()
*BUILD the RNN*
##Defining Function to Build.
Boosting is an iterative ensemble method to improve weak learners. GBM uses a gradient descent strategy to boost performance. XGBoost is currently the most popular classifier.
Stacking is a different ensemble method, in which diverse classifiers are combined.
GD is a time-honored numerical technique for finding solutions to functions that do not have analytical solutions.
In this chapter we implement GD in R from scratch.
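As a taste of such an implementation, here is a minimal gradient descent sketch in R, assuming a one-parameter least-squares objective (the data and step size are illustrative, not from the chapter):

# Minimal gradient descent in R: fit y = b*x by least squares.
set.seed(42)
x <- runif(100)
y <- 3 * x + rnorm(100, sd = 0.1)
b <- 0        # initial guess
alpha <- 0.1  # learning rate
for (i in 1:500) {
  grad <- -2 * mean(x * (y - b * x))  # derivative of the mean squared error
  b <- b - alpha * grad               # step against the gradient
}
b  # converges near the true slope of 3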
This document discusses bias and variance in machine learning models. It begins by introducing bias as a stronger force that is always present and harder to eliminate than variance. Several examples of bias are provided. Through simulations of sampling from a normal distribution, it is shown that sample statistics like the mean and standard deviation are always biased compared to the population parameters. Sample size also impacts bias, with larger samples having lower bias. Variance refers to a model's ability to generalize, with higher variance indicating overfitting. The tradeoff between bias and variance is that reducing one increases the other. Several techniques for optimizing this tradeoff are discussed, including cross-validation, bagging, boosting, dimensionality reduction, and changing the model complexity.
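A small simulation in the spirit of that chapter, assuming a N(10, 2) population (the numbers are illustrative): the sample standard deviation comes out biased low, and the bias shrinks as the sample grows.

# Bias of the sample standard deviation, shrinking with sample size.
set.seed(1)
for (n in c(10, 100, 1000)) {
  sds <- replicate(5000, sd(rnorm(n, mean = 10, sd = 2)))
  cat(sprintf("n = %4d: mean sample sd = %.3f (population sd = 2)\n", n, mean(sds)))
}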
This document discusses the k-nearest neighbors (kNN) machine learning algorithm. kNN is a non-parametric, lazy learning algorithm that is used for classification problems. It works by finding the k training examples that are closest in distance to the new data point, and predicting the class based on the majority class among those k neighbors. The key aspects of kNN are that it requires calculating distances between all examples to make predictions, and has no explicit training phase, unlike parametric methods.
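A minimal kNN sketch in R using the class package on iris (an illustrative dataset, not necessarily the one used in the document):

# k-nearest neighbors: no training phase, distances computed at prediction time.
library(class)
set.seed(7)
idx <- sample(nrow(iris), 100)
pred <- knn(train = iris[idx, 1:4], test = iris[-idx, 1:4],
            cl = iris$Species[idx], k = 5)
mean(pred == iris$Species[-idx])  # test-set accuracy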
This document discusses linear discriminant analysis (LDA) and its application to the iris dataset in R. It begins by introducing LDA and providing some useful resources. Then, it uses the klaR package to visualize how the features in the iris dataset segment the class variable. Next, it implements LDA on the iris dataset by defining functions for the LDA calculations and applying them to each feature individually. Finally, it compares the results of the univariate LDA models to a multivariate LDA implementation, finding improved performance with the latter. The document concludes with remarks on parametric classifiers like LDA that make distributional assumptions.
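For comparison with the document's hand-rolled calculations, a compact multivariate LDA on iris via MASS (a library sketch, not the document's own code):

# Multivariate LDA on all four iris features.
library(MASS)
fit <- lda(Species ~ ., data = iris)
pred <- predict(fit, iris)$class
table(predicted = pred, actual = iris$Species)  # in-sample confusion matrix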
This document discusses multivariate Naive Bayes classification. It explains that for classification tasks with multiple predictor variables, we want to calculate the probability of a class given data P(class|data). The Naive Bayes assumption is that predictors are conditionally independent given the class. The document shows how to calculate the probabilities P(class|data) by multiplying the probabilities of each predictor value given the class. It provides code to calculate these probabilities from a heart disease dataset, and to build and evaluate a Naive Bayes classifier on the data.
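A minimal sketch of those calculations with e1071 (iris stands in for the heart disease data; the target column name is a placeholder):

# Naive Bayes: P(class | x) is proportional to P(class) * prod_j P(x_j | class).
library(e1071)
df <- data.frame(target = iris$Species, iris[, 1:4])  # stand-in data
m <- naiveBayes(target ~ ., data = df)
p <- predict(m, df[, -1], type = "raw")  # posterior P(class | data) per row
head(round(p, 3))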
Augmented cognition: toward cyborgs and cognitive computing. Building a knowledge lattice from the ground up, starting from the No Free Lunch theorem and Ockham's razor.
Logistic regression is a bellwether binary classifier. This chapter shows how to use logistic regression. The separation boundary for logistic regression is linear. It is a discriminative, probabilistic classifier.
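A minimal logistic regression sketch with base R glm (the data here is illustrative; the chapter's dataset may differ):

# Logistic regression: a linear boundary in feature space, probabilistic output.
df <- data.frame(y = as.integer(iris$Species == "versicolor"), iris[, 1:4])
m <- glm(y ~ ., data = df, family = binomial)
p <- predict(m, type = "response")  # P(y = 1 | x)
table(predicted = as.integer(p > 0.5), actual = df$y)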
This document provides an overview of machine learning concepts, including:
- Machine learning involves finding patterns in data to perform tasks without being explicitly programmed.
- Supervised learning involves using labeled examples to learn a function that maps inputs to outputs. Classification is a common supervised learning task.
- Popular classification algorithms include logistic regression, naive Bayes, decision trees, and support vector machines. Ensemble methods like random forests can improve performance.
- It is important to properly prepare data and evaluate a model's performance using metrics like accuracy, precision, recall, and ROC curves, as sketched below. Both underfitting and overfitting can impact a model's ability to generalize.
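A sketch of those evaluation metrics computed from a 2x2 confusion table (the counts are made up):

# Accuracy, precision and recall from confusion-table counts.
tp <- 40; fp <- 10; fn <- 5; tn <- 45
c(accuracy  = (tp + tn) / (tp + fp + fn + tn),
  precision = tp / (tp + fp),
  recall    = tp / (tp + fn))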
Genetics and the study of the human genome are fascinating and have the potential to alter our understanding, looking backward or forward.
Genetics will play a significant role -- at least as impactful as the internet -- and its effect will be as lasting as the wheel. Revolutionary changes are afoot, and the world as we know it is over. Much of it is driven by technology. This is a very high-level intro to the basics of genetics, based on lots of reading and consulting genetics experts.
The document evaluates different classifier models for predicting Titanic survivor data: generalized linear models (GLM), decision trees, and random forests. It prepares training and test datasets and uses the ROCR package to calculate performance metrics like AUC for each model. GLM achieved the highest AUC of 0.84, outperforming the decision tree AUC of 0.78 and random forest AUC of 0.82. While random forests typically outperform individual trees, in this case GLM performed best due to its superior lift over other models.
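A sketch of the ROCR AUC calculation used for that comparison (with simulated scores and labels, not the Titanic data):

# AUC with ROCR: build a prediction object, then ask for the "auc" measure.
library(ROCR)
set.seed(3)
labels <- rbinom(200, 1, 0.4)
scores <- 0.3 * labels + 0.7 * runif(200)  # noisy scores correlated with labels
pred <- prediction(scores, labels)
performance(pred, "auc")@y.values[[1]]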
CoGs -- Cognitive Assistants for the WWW.
Next Generation tools for harnessing the internet.
Applications of Machine Learning, Cognitive Computing.
I am proposing a new type of browser and a next-gen httpd/web server: the server will integrate relevant data from multiple sources on its own, and the user agent (browser) will render what is most appropriate for the user, cognitively speaking.
Introduction to Data Analytics, starting with OLS.
This is the first of a series of essays. I will share essays on unsupervised learning, dimensionality reduction and anomaly/outlier detection.
In this short how-to presentation, I am celebrating Unix.
All other systems, and even the internet, would not have been possible but for AT&T making Unix freely available.
What a colossal think tank Unix at AT&T had. What a shame the short-sighted AT&T CEO dismantled it. God only knows what else those brilliant minds would have created for the world. The loss is profoundly ours. And we celebrate Unix.
The R programming language (S from AT&T) does analytics. Here we show only data preparation and loading.
Enhanced data collection methods can help uncover the true extent of child abuse and neglect. This includes Integrated Data Systems from various sources (e.g., schools, healthcare providers, social services) to identify patterns and potential cases of abuse and neglect.
1. **Introduction to Jio Cinema**:
- Brief overview of Jio Cinema as a streaming platform.
- Its significance in the Indian market.
- Introduction to retention and engagement strategies in the streaming industry.
2. **Understanding Retention and Engagement**:
- Define retention and engagement in the context of streaming platforms.
- Importance of retaining users in a competitive market.
- Key metrics used to measure retention and engagement.
3. **Jio Cinema's Content Strategy**:
- Analysis of the content library offered by Jio Cinema.
- Focus on exclusive content, originals, and partnerships.
- Catering to diverse audience preferences (regional, genre-specific, etc.).
- User-generated content and interactive features.
4. **Personalization and Recommendation Algorithms**:
- How Jio Cinema leverages user data for personalized recommendations.
- Algorithmic strategies for suggesting content based on user preferences, viewing history, and behavior.
- Dynamic content curation to keep users engaged.
5. **User Experience and Interface Design**:
- Evaluation of Jio Cinema's user interface (UI) and user experience (UX).
- Accessibility features and device compatibility.
- Seamless navigation and search functionality.
- Integration with other Jio services.
6. **Community Building and Social Features**:
- Strategies for fostering a sense of community among users.
- User reviews, ratings, and comments.
- Social sharing and engagement features.
- Interactive events and campaigns.
7. **Retention through Loyalty Programs and Incentives**:
- Overview of loyalty programs and rewards offered by Jio Cinema.
- Subscription plans and benefits.
- Promotional offers, discounts, and partnerships.
- Gamification elements to encourage continued usage.
8. **Customer Support and Feedback Mechanisms**:
- Analysis of Jio Cinema's customer support infrastructure.
- Channels for user feedback and suggestions.
- Handling of user complaints and queries.
- Continuous improvement based on user feedback.
9. **Multichannel Engagement Strategies**:
- Utilization of multiple channels for user engagement (email, push notifications, SMS, etc.).
- Targeted marketing campaigns and promotions.
- Cross-promotion with other Jio services and partnerships.
- Integration with social media platforms.
10. **Data Analytics and Iterative Improvement**:
- Role of data analytics in understanding user behavior and preferences.
- A/B testing and experimentation to optimize engagement strategies.
- Iterative improvement based on data-driven insights.
## Confusion Matrix and Statistics
##
## nb.tstclass
## 0 1
## 0 28 12
## 1 3 48
##
## Accuracy : 0.8352
## 95% CI : (0.7427, 0.9047)
## No Information Rate : 0.6593
## P-Value [Acc > NIR] : 0.0001482
##
## Kappa : 0.6571
##
## Mcnemar's Test P-Value : 0.0388671
##
## Sensitivity : 0.9032
## Specificity : 0.8000
## Pos Pred Value : 0.7000
## Neg Pred Value : 0.9412
## Prevalence : 0.3407
## Detection Rate : 0.3077
## Detection Prevalence : 0.4396
## Balanced Accuracy : 0.8516
##
## 'Positive' Class : 0
##
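As a sanity check, the headline numbers above can be recomputed directly from the 2x2 table:

# Recompute caret's metrics from the printed counts (positive class: 0).
tab <- matrix(c(28, 3, 12, 48), nrow = 2,
              dimnames = list(prediction = 0:1, reference = 0:1))
sum(diag(tab)) / sum(tab)  # Accuracy: 76/91 = 0.8352
tab[1, 1] / sum(tab[, 1])  # Sensitivity: 28/31 = 0.9032
tab[2, 2] / sum(tab[, 2])  # Specificity: 48/60 = 0.8000
tab[1, 1] / sum(tab[1, ])  # Pos Pred Value: 28/40 = 0.7000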
start_tm <- proc.time()
df <- trcatheart
runModel <- function(df) {naiveBayes(target~., data=df[sample(1:nrow(df), nrow(df), replace=T),])}
lapplyrunmodel <- function(x) runModel(df)
system.time(models <- lapply(1:100, lapplyrunmodel))
## user system elapsed
## 0.32 0.02 0.33
object.size(models)
## 1110448 bytes
end_tm <- proc.time()
print(paste("time taken to run 100 bootstraps", (end_tm-start_tm), sep=":"))
4. ## [1] "time taken to run 100 bootstrapps:0.46"
## [2] "time taken to run 100 bootstrapps:0.02"
## [3] "time taken to run 100 bootstrapps:0.47"
## [4] "time taken to run 100 bootstrapps:NA"
## [5] "time taken to run 100 bootstrapps:NA"
bagging_preds <- lapply(models, FUN=function(M, D=tstcatheart[,-c(9)]) predict(M, D, type='raw'))
bagging_cfm <- lapply(bagging_preds, FUN=function(P, A=tstcatheart[[9]]) {
  pred_class <- unlist(apply(round(P), 1, which.max)) - 1
  pred_tbl <- table(A, pred_class)
  pred_cfm <- caret::confusionMatrix(pred_tbl)
  pred_cfm
})
bagging.perf <- as.data.frame(do.call('rbind', lapply(bagging_cfm, FUN=function(cfm) c(cfm$overall, cfm$byClass))))
bagging.perf.mean <- apply(bagging.perf[bagging.perf$AccuracyPValue < 0.01, -c(6:7)], 2, mean)
bagging.perf.var <- apply(bagging.perf[bagging.perf$AccuracyPValue < 0.01, -c(6:7)], 2, sd)
bagging.perf.var
bagging.perf.var
## Accuracy Kappa AccuracyLower
## 0.01618750 0.03355331 0.01846838
## AccuracyUpper AccuracyNull Sensitivity
## 0.01273569 0.01795716 0.03073122
## Specificity Pos Pred Value Neg Pred Value
## 0.01470108 0.02693220 0.02200582
## Precision Recall F1
## 0.02693220 0.03073122 0.02087685
## Prevalence Detection Rate Detection Prevalence
## 0.01795716 0.01183833 0.00000000
## Balanced Accuracy
## 0.01875328
bagging.perf.mean
## Accuracy Kappa AccuracyLower
## 0.8323565 0.6521225 0.7396540
## AccuracyUpper AccuracyNull Sensitivity
## 0.9023711 0.6496947 0.8891540
## Specificity Pos Pred Value Neg Pred Value
## 0.8025070 0.7077778 0.9300654
## Precision Recall F1
## 0.7077778 0.8891540 0.7876655
## Prevalence Detection Rate Detection Prevalence
## 0.3503053 0.3111111 0.4395604
## Balanced Accuracy
## 0.8458305
(bagging_tm<-proc.time()-start_tm)
## user system elapsed
## 2.35 0.02 2.36
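Note that the code above scores each bootstrap model separately and averages the per-model metrics; the aggregation step of bagging proper would pool the models into a single vote. A sketch, reusing bagging_preds and tstcatheart from above:

# Bagging aggregation: average per-model class probabilities, then majority vote.
avg_prob <- Reduce(`+`, bagging_preds) / length(bagging_preds)
vote_class <- apply(avg_prob, 1, which.max) - 1  # back to 0/1 labels
caret::confusionMatrix(table(tstcatheart[[9]], vote_class))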
N <- nrow(trcatheart)
cv_df <- do.call('rbind', lapply(1:N, FUN=function(idx, data=trcatheart) { # For each observation
  m <- naiveBayes(target~., data=data[-idx,])   # train with ALL other observations
  p <- predict(m, data[idx,-c(9)], type='raw')  # predict that one observation
  # NB returns the probabilities of the classes; as per a Bayesian classifier, we take the class with the higher probability
  pc <- unlist(apply(round(p), 1, which.max)) - 1  # -1 to make the class 0 or 1; which.max returns 1 or 2
  #pred_tbl<-table(data[idx,c(9)],pc)
  #pred_cfm<-caret::confusionMatrix(pred_tbl)
  list(fold=idx, m=m, predicted=pc, actual=data[idx,c(9)])  # store the idx, model, predicted class and actual class
}))
cv_df<-as.data.frame(cv_df)
head(cv_df)
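To summarize the leave-one-out run, the stored per-fold predictions can be pooled into one confusion matrix (a sketch reusing cv_df from above):

# Pool the N leave-one-out predictions into a single confusion matrix.
loocv_tbl <- table(actual = unlist(cv_df$actual), predicted = unlist(cv_df$predicted))
caret::confusionMatrix(loocv_tbl)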