Data mining with the caret package
Kai Xiao and Vivian Zhang @Supstat Inc.
Outline
· Introduction of data mining and caret
· before model training
  - visualization
  - pre-processing
  - data splitting
· building model
  - model training and tuning
  - model performance
  - variable importance
· advanced topics
  - feature selection
  - parallel processing
· exercise
The cross-industry standard process for data mining (CRISP-DM)
Introduction of caret
The caret package (short for Classification And REgression Training) is a set of functions that attempt to streamline the process for creating predictive models. The package contains tools for:
· data splitting
· pre-processing
· feature selection
· model tuning using resampling
· variable importance estimation
A very simple example
library(caret)
str(iris)
set.seed(1)
# pre-process: center and scale the numeric predictors
process <- preProcess(iris[, -5], method = c('center', 'scale'))
dataScaled <- predict(process, iris[, -5])
# data splitting: a stratified 75% training sample
inTrain <- createDataPartition(iris$Species, p = 0.75)[[1]]
length(inTrain)
trainData <- dataScaled[inTrain, ]
trainClass <- iris[inTrain, 5]
testData <- dataScaled[-inTrain, ]
testClass <- iris[-inTrain, 5]
A very simple example
# model tuning
set.seed(1)
fitControl <- trainControl(method = "cv",
                           number = 10)
tunedf <- data.frame(.cp = c(0.01, 0.05, 0.1, 0.3, 0.5))
treemodel <- train(x = trainData,
                   y = trainClass,
                   method = 'rpart',
                   trControl = fitControl,
                   tuneGrid = tunedf)
print(treemodel)
plot(treemodel)
# prediction and performance assessment
treePred <- predict(treemodel, testData)
confusionMatrix(treePred, testClass)
visualizations
The featurePlot function is a wrapper for different lattice plots to visualize the data.

Scatterplot matrix:
featurePlot(x = iris[, 1:4],
            y = iris$Species,
            plot = "pairs",
            ## Add a key at the top
            auto.key = list(columns = 3))

Box plots:
featurePlot(x = iris[, 1:4],
            y = iris$Species,
            plot = "box",
            ## Add a key at the top
            auto.key = list(columns = 3))
pre-processing
Creating Dummy Variables
when <- data.frame(time = c("afternoon", "night", "afternoon",
                            "morning", "morning", "morning",
                            "morning", "afternoon", "afternoon"))
when
# order the factor levels explicitly (morning, afternoon, night)
when$time <- factor(when$time, levels = c("morning", "afternoon", "night"))
mainEffects <- dummyVars(~ time, data = when)
predict(mainEffects, when)
pre-processing
Zero- and Near Zero-Variance Predictors
data <- data.frame(x1 = rnorm(100),
                   x2 = runif(100),
                   x3 = rep(c(0, 1), times = c(2, 98)),
                   x4 = rep(3, length.out = 100))
# diagnose near-zero-variance predictors
nzv <- nearZeroVar(data, saveMetrics = TRUE)
nzv
# remove them
nzv <- nearZeroVar(data)
dataFiltered <- data[, -nzv]
head(dataFiltered)
pre-processing
Identifying Correlated Predictors
set.seed(1)
x1 <- rnorm(100)
x2 <- x1 + rnorm(100, 0.1, 0.1)
x3 <- x1 + rnorm(100, 1, 1)
data <- data.frame(x1, x2, x3)
# flag predictors with pairwise correlations above the cutoff
corrmatrix <- cor(data)
highlyCor <- findCorrelation(corrmatrix, cutoff = 0.75)
dataFiltered <- data[, -highlyCor]
head(dataFiltered)
pre-processing
Identifying Linear Dependencies
set.seed(1)
x1 <- rnorm(100)
x2 <- x1 + rnorm(100, 0.1, 0.1)
x3 <- x1 + rnorm(100, 1, 1)
x4 <- x2 + x3
data <- data.frame(x1, x2, x3, x4)
# find and remove exact linear combinations
comboInfo <- findLinearCombos(data)
dataFiltered <- data[, -comboInfo$remove]
head(dataFiltered)
pre-processing
Centering and Scaling
set.seed(1)
x1 <- rnorm(100)
x2 <- 3 + 3 * x1 + rnorm(100)
x3 <- 2 + 2 * x1 + rnorm(100)
data <- data.frame(x1, x2, x3)
summary(data)
preProc <- preProcess(data, method = c("center", "scale"))
dataProced <- predict(preProc, data)
summary(dataProced)
pre-processing
Imputation: bagImpute / knnImpute
data <- iris[, -5]
# introduce two missing values
data[1, 2] <- NA
data[2, 1] <- NA
# knnImpute fills each missing value from its k nearest neighbours
impu <- preProcess(data, method = 'knnImpute')
dataProced <- predict(impu, data)
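A minimal sketch of the bagged-tree alternative mentioned in the title (bagImpute fits a bagged regression tree for each predictor, which is more accurate but computationally more expensive than knnImpute; the object names here are only illustrative):

impuBag <- preProcess(data, method = 'bagImpute')
dataProcedBag <- predict(impuBag, data)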
pre-processing
transformation: BoxCox / PCA
data <- iris[, -5]
# project the predictors onto their principal components
pcaProc <- preProcess(data, method = 'pca')
dataProced <- predict(pcaProc, data)
head(dataProced)
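The Box-Cox transformation listed in the title can be requested the same way; a minimal sketch (BoxCox requires strictly positive values, which holds for the iris measurements; the object names are illustrative):

bcProc <- preProcess(data, method = 'BoxCox')
dataBC <- predict(bcProc, data)
head(dataBC)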
data splitting
create balanced splits of the data
set.seed(1)
trainIndex <- createDataPartition(iris$Species, p = 0.8, list = FALSE, times = 1)
head(trainIndex)
irisTrain <- iris[trainIndex, ]
irisTest <- iris[-trainIndex, ]
summary(irisTest$Species)
· createResample can be used to make simple bootstrap samples
· createFolds can be used to generate balanced cross-validation groupings from a set of data (see the sketch below)
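A minimal sketch of these two functions (the numbers of resamples and folds are arbitrary choices for illustration):

set.seed(1)
# 5 bootstrap samples, each a vector of row indices
boots <- createResample(iris$Species, times = 5)
str(boots)
# 10 folds, balanced with respect to Species
folds <- createFolds(iris$Species, k = 10)
sapply(folds, length)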
Model Training and Parameter Tuning
The train function can be used to
· evaluate, using resampling, the effect of model tuning parameters on performance
· choose the "optimal" model across these parameters
· estimate model performance from a training set
Model Training and Parameter Tuning
prepare data
data(PimaIndiansDiabetes2, package = 'mlbench')
data <- PimaIndiansDiabetes2
library(caret)
# scale and center
preProcValues <- preProcess(data[, -9], method = c("center", "scale"))
scaleddata <- predict(preProcValues, data[, -9])
# Yeo-Johnson transformation
preProcbox <- preProcess(scaleddata, method = c("YeoJohnson"))
boxdata <- predict(preProcbox, scaleddata)
Model Training and Parameter Tuning
prepare data
# bagged-tree imputation of missing values
preProcimp <- preProcess(boxdata, method = "bagImpute")
procdata <- predict(preProcimp, boxdata)
procdata$class <- data[, 9]
# data splitting
inTrain <- createDataPartition(procdata$class, p = 0.75)[[1]]
length(inTrain)
trainData <- procdata[inTrain, 1:8]
trainClass <- procdata[inTrain, 9]
testData <- procdata[-inTrain, 1:8]
testClass <- procdata[-inTrain, 9]
Model Training and Parameter Tuning
define sets of model parameter values to evaluate
tunedf <- data.frame(.cp = seq(0.001, 0.2, length.out = 10))
Model Training and Parameter Tuning
define the type of resampling method
· k-fold cross-validation (once or repeated)
· leave-one-out cross-validation
· bootstrap (simple estimation or the 632 rule)

fitControl <- trainControl(method = "repeatedcv",
                           # 10-fold cross-validation
                           number = 10,
                           # repeated 3 times
                           repeats = 3)
Model Training and Parameter Tuning
start training
treemodel <- train(x = trainData,
                   y = trainClass,
                   method = 'rpart',
                   trControl = fitControl,
                   tuneGrid = tunedf)
Model Training and Parameter Tuning
look at the final result
treemodel
plot(treemodel)
The trainControl Function
· method: the resampling method.
· number and repeats: number controls the number of folds in K-fold cross-validation or the number of resampling iterations for bootstrapping and leave-group-out cross-validation; repeats applies only to repeated K-fold cross-validation.
· verboseIter: a logical for printing a training log.
· returnData: a logical for saving the data into a slot called trainingData.
· classProbs: a logical value determining whether class probabilities should be computed for held-out samples during resampling.
· summaryFunction: a function to compute alternate performance summaries.
· selectionFunction: a function to choose the optimal tuning parameters.
· returnResamp: a character string containing one of the following values: "all", "final" or "none". This specifies how much of the resampled performance measures to save.
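A sketch of a trainControl call combining several of these arguments (the particular values are only illustrative):

ctrlExample <- trainControl(method = "repeatedcv",
                            number = 10,
                            repeats = 3,
                            # print a training log
                            verboseIter = TRUE,
                            # do not store a copy of the training data
                            returnData = FALSE,
                            # needed for probability-based summaries
                            classProbs = TRUE,
                            summaryFunction = twoClassSummary,
                            # pick the simplest model within one standard error of the best
                            selectionFunction = "oneSE",
                            # keep resampled measures for the final model only
                            returnResamp = "final")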
Alternate Performance Metrics
Performance metrics computed by default:
· regression: RMSE and R2
· classification: accuracy and Kappa
Another built-in function, twoClassSummary, will compute the sensitivity, specificity and area under the ROC curve.

fitControl <- trainControl(method = "repeatedcv",
                           number = 10,
                           repeats = 3,
                           classProbs = TRUE,
                           summaryFunction = twoClassSummary)
treemodel <- train(x = trainData,
                   y = trainClass,
                   method = 'rpart',
                   trControl = fitControl,
                   tuneGrid = tunedf,
                   metric = "ROC")
treemodel
Extracting Predictions
Predictions can be made from these objects as usual.
pre <- predict(treemodel, testData)
pre.prob <- predict(treemodel, testData, type = "prob")
Evaluating Test Sets
caret also contains several functions that can be used to describe the performance of classification models.

testPred <- predict(treemodel, testData)
testPred.prob <- predict(treemodel, testData, type = 'prob')
# overall accuracy and Kappa
postResample(testPred, testClass)
# full confusion matrix with per-class statistics
confusionMatrix(testPred, testClass)
Exploring and Comparing Resampling Distributions
· Within-model comparison
densityplot(treemodel, pch = "|")
Exploring and Comparing Resampling Distributions
· Between-model comparison
· Let's build a neural network model (avNNet) and compare the performance of the two models.

tunedf <- expand.grid(.decay = 0.1,
                      .size = 1:8,
                      .bag = TRUE)
nnetmodel <- train(x = trainData,
                   y = trainClass,
                   method = 'avNNet',
                   trControl = fitControl,
                   trace = FALSE,
                   linout = FALSE,
                   metric = "ROC",
                   tuneGrid = tunedf)
nnetmodel
Exploring and Comparing Resampling Distributions
Given these models, can we make statistical statements about their performance differences? To do this, we first collect the resampling results using resamples.
We can compute the differences, then use a simple t-test to evaluate the null hypothesis that there is no difference between the models.

resamps <- resamples(list(tree = treemodel,
                          nnet = nnetmodel))
bwplot(resamps)
densityplot(resamps, metric = 'ROC')
difValues <- diff(resamps)
summary(difValues)
Variable importance evaluation
Variable importance evaluation functions can be separated into two groups:
· model-based approach
· model-independent approach
  - For classification, ROC curve analysis is conducted on each predictor.
  - For regression, the relationship between each predictor and the outcome is evaluated.

# model-based approach
treeimp <- varImp(treemodel)
plot(treeimp)
# model-independent approach
RocImp <- varImp(treemodel, useModel = FALSE)
plot(RocImp)
# or, equivalently
RocImp <- filterVarImp(x = trainData, y = trainClass)
plot(RocImp)
feature selection
· Many models do not necessarily use all the predictors.
· Feature selection using search algorithms ("wrapper" approach)
· Feature selection using univariate filters ("filter" approach); a sketch of this approach follows the wrapper examples below.
feature selection: wrapper approach
· feature selection based on a random forest model
· pre-defined sets of functions: linear regression (lmFuncs), random forests (rfFuncs), naive Bayes (nbFuncs), bagged trees (treebagFuncs)

ctrl <- rfeControl(functions = rfFuncs,
                   method = "repeatedcv",
                   number = 10,
                   repeats = 3,
                   verbose = FALSE,
                   returnResamp = "final")
Profile <- rfe(x = trainData,
               y = trainClass,
               sizes = 1:8,
               rfeControl = ctrl)
Profile
feature selection: wrapper approach
feature selection based on a custom model

tunedf <- data.frame(.cp = seq(0.001, 0.2, length.out = 5))
fitControl <- trainControl(method = "repeatedcv",
                           number = 10,
                           repeats = 3,
                           classProbs = TRUE,
                           summaryFunction = twoClassSummary)
# use the generic caret wrapper functions, but report ROC-based summaries
customFuncs <- caretFuncs
customFuncs$summary <- twoClassSummary
ctrl <- rfeControl(functions = customFuncs,
                   method = "repeatedcv",
                   number = 10,
                   repeats = 3,
                   verbose = FALSE,
                   returnResamp = "final")
# method, tuneGrid, trControl and metric are passed through to train()
Profile <- rfe(x = trainData,
               y = trainClass,
               sizes = 1:8,
               method = 'rpart',
               tuneGrid = tunedf,
               trControl = fitControl,
               metric = "ROC",
               rfeControl = ctrl)
Profile
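feature selection: filter approach
The univariate filter approach mentioned earlier is available through the sbf function; a minimal sketch using the pre-defined random forest functions rfSBF (the choice of random forests and of the resampling settings here is only illustrative):

filterCtrl <- sbfControl(functions = rfSBF,
                         method = "repeatedcv",
                         number = 10,
                         repeats = 3,
                         verbose = FALSE)
filterProfile <- sbf(x = trainData,
                     y = trainClass,
                     sbfControl = filterCtrl)
filterProfile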
parallel processing
system.time({
  library(doParallel)
  # register a parallel backend with 2 worker processes
  registerDoParallel(cores = 2)
  nnetmodel.para <- train(x = trainData,
                          y = trainClass,
                          method = 'avNNet',
                          trControl = fitControl,
                          trace = FALSE,
                          linout = FALSE,
                          metric = "ROC",
                          tuneGrid = tunedf)
})
# compare the elapsed training times
nnetmodel$times
nnetmodel.para$times
exercise-1
use the knn method to train a model
library(caret)
fitControl <- trainControl(method = "repeatedcv",
                           number = 10,
                           repeats = 3)
# odd values of k from 3 to 19
tunedf <- data.frame(.k = seq(3, 20, by = 2))
knnmodel <- train(x = trainData,
                  y = trainClass,
                  method = 'knn',
                  trControl = fitControl,
                  tuneGrid = tunedf)
plot(knnmodel)
