WLE-classify
HUNG HUO-SU
09/23/2015
Practical Machine Learning Project 09/24/2015
WLE Data analysis
library(caret)
## Loading required package: lattice
## Loading required package: ggplot2
training_csv = read.csv("pml-training.csv")
#Partition the original training data (which includes classe) into two parts:
#70% for training the model and 30% for verification
inTrain <- createDataPartition(y=training_csv$classe, p=0.7, list=FALSE)
training <- training_csv[inTrain,]
testing <- training_csv[-inTrain,]
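#Reproducibility note (an addition; the original run did not fix a seed,
#so the split and every number below will vary between runs). A minimal
#sketch with an arbitrary seed, placed before createDataPartition above:
#    set.seed(1234)
#    inTrain <- createDataPartition(y=training_csv$classe, p=0.7, list=FALSE)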
#We focus only on the accelerometer readings and ignore the other sensors.
#training_accel contains only the accelerometer data, without classe.
#training_accel_classe contains the accelerometer data plus classe.
training_accel <- training[grep("^accel", colnames(training))]
training_accel_classe<-cbind(training_accel, training$classe)
colnames(training_accel_classe)[ncol(training_accel_classe)] <- "classe"
colnames(training_accel_classe)[ncol(training_accel_classe)]
## [1] "classe"
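#An equivalent one-step alternative (a sketch only; not the code that
#produced the results below): select the accelerometer columns by name
#and keep classe in a single subset, avoiding the cbind() and rename.
accel_cols <- grep("^accel", colnames(training), value=TRUE)
training_accel_classe <- training[, c(accel_cols, "classe")]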
#Use the Random Forest method to train a model called modelFit_rf_70
modelFit_rf_70 <- train(classe ~ ., data=training_accel_classe, method="rf", prox=TRUE)
## Loading required package: randomForest
## randomForest 4.6-10
## Type rfNews() to see new features/changes/bug fixes.
#The resampling accuracy is over 90%, and mtry = 2 performs best.
modelFit_rf_70
## Random Forest
##
## 13737 samples
## 12 predictor
## 5 classes: 'A', 'B', 'C', 'D', 'E'
##
## No pre-processing
## Resampling: Bootstrapped (25 reps)
## Summary of sample sizes: 13737, 13737, 13737, 13737, 13737, 13737, ...
## Resampling results across tuning parameters:
##
## mtry Accuracy Kappa Accuracy SD Kappa SD
## 2 0.9305551 0.9121133 0.003224900 0.004081289
## 7 0.9210913 0.9001473 0.003691518 0.004695061
## 12 0.9033667 0.8777115 0.005626041 0.007144838
##
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was mtry = 2.
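#Training with the default bootstrap resampling (25 reps) is slow. A hedged
#sketch (not run here; all results below come from the bootstrap model):
#5-fold cross-validation via trainControl usually cuts training time
#substantially with comparable accuracy.
ctrl <- trainControl(method="cv", number=5)
modelFit_rf_cv <- train(classe ~ ., data=training_accel_classe, method="rf", trControl=ctrl)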
#Use confusionMatrix to verify the model's accuracy
#On the held-out 30%, the Random Forest model's accuracy is above 0.9
confusionMatrix(testing$classe, predict(modelFit_rf_70, testing))
## Confusion Matrix and Statistics
##
## Reference
## Prediction A B C D E
## A 1613 8 26 23 4
## B 43 1039 35 14 8
## C 13 37 969 6 1
## D 19 4 47 889 5
## E 5 15 6 10 1046
##
## Overall Statistics
##
## Accuracy : 0.9441
## 95% CI : (0.9379, 0.9498)
## No Information Rate : 0.2877
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.9293
## Mcnemar's Test P-Value : 9.14e-12
##
## Statistics by Class:
##
## Class: A Class: B Class: C Class: D Class: E
## Sensitivity 0.9527 0.9420 0.8947 0.9437 0.9831
## Specificity 0.9854 0.9791 0.9881 0.9848 0.9925
## Pos Pred Value 0.9636 0.9122 0.9444 0.9222 0.9667
## Neg Pred Value 0.9810 0.9865 0.9765 0.9892 0.9963
## Prevalence 0.2877 0.1874 0.1840 0.1601 0.1808
## Detection Rate 0.2741 0.1766 0.1647 0.1511 0.1777
## Detection Prevalence 0.2845 0.1935 0.1743 0.1638 0.1839
## Balanced Accuracy 0.9691 0.9605 0.9414 0.9643 0.9878
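#A small follow-up sketch (an addition, not in the original report): the
#estimated out-of-sample error can be read straight off the confusionMatrix
#object; for the run shown above it is about 1 - 0.9441 = 0.0559.
cm <- confusionMatrix(testing$classe, predict(modelFit_rf_70, testing))
1 - as.numeric(cm$overall["Accuracy"])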
#Record whether each prediction was correct in a predRight column
pred <- predict(modelFit_rf_70, testing)
testing$predRight <- pred==testing$classe
#Predict the answers for pml-testing.csv
testing_csv = read.csv("pml-testing.csv")
answers <- predict(modelFit_rf_70, testing_csv)
answers
## [1] B A C A A E D D A A B C B A E E A B B B
## Levels: A B C D E
#pml_write_files() writes one answer file per test case (20 files)
pml_write_files = function(x){
  n = length(x)
  for(i in 1:n){
    filename = paste0("problem_id_", i, ".txt")
    # one predicted letter per file, without quotes or row/column names
    write.table(x[i], file=filename, quote=FALSE, row.names=FALSE, col.names=FALSE)
  }
}
pml_write_files(answers)
#Although the testing data (30% of the original) gives over 90% accuracy,
#we still want to know which conditions cause the prediction errors.
#We use the correlation matrix between the accelerometer variables to find
#pairs of weakly correlated variables, so the test results can be shown
#graphically in two dimensions.
#For example, accel_forearm_y turns up most often, and we look for the
#variable least correlated with it.
min(abs(cor(training_accel[which(training_accel_classe$classe == "A"),])))
## [1] 0.01247164
#min_cor_rcname() retrieves the row/column names of the minimum absolute correlation value for each classe
min_cor_rcname <- function(Class)
{
  mdat <- abs(cor(training_accel[which(training_accel_classe$classe == Class),]))
  index <- which.min(mdat)        # position of the smallest absolute correlation
  k <- arrayInd(index, dim(mdat)) # convert the linear index to row/column indices
  rr <- rownames(mdat)[k[,1]]
  cc <- colnames(mdat)[k[,2]]
  print(rr)
  print(cc)
}
min_cor_rcname("A")
## [1] "accel_belt_y"
## [1] "accel_belt_x"
min_cor_rcname("B")
## [1] "accel_forearm_x"
## [1] "accel_dumbbell_y"
min_cor_rcname("C")
## [1] "accel_forearm_y"
## [1] "accel_arm_y"
min_cor_rcname("D")
## [1] "accel_forearm_y"
## [1] "accel_belt_z"
min_cor_rcname("E")
## [1] "accel_forearm_z"
## [1] "accel_dumbbell_y"
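#A complementary sketch using caret's findCorrelation() (an addition, not
#part of the original analysis; the 0.75 cutoff is an assumed threshold):
#it flags highly correlated accelerometer columns that could be dropped to
#reduce the features before training, as point 2 of the summary suggests.
highCor <- findCorrelation(cor(training_accel), cutoff=0.75)  # column indices
training_accel_reduced <- if (length(highCor) > 0) training_accel[, -highCor] else training_accel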
# Split the testing data by classe so we can inspect the errors within each class
testing_A <- testing[which(testing$classe == "A"),]
testing_B <- testing[which(testing$classe == "B"),]
testing_C <- testing[which(testing$classe == "C"),]
testing_D <- testing[which(testing$classe == "D"),]
testing_E <- testing[which(testing$classe == "E"),]
#Plot graphs for each classe A, B, C, D, E
qplot(accel_belt_x, accel_belt_y, colour=predict(modelFit_rf_70, testing_A), data=testing_A, main="Class A")
qplot(accel_dumbbell_x, accel_belt_z, colour=predict(modelFit_rf_70, testing_B), data=testing_B, main="Class B")
qplot(accel_belt_y, accel_belt_x, colour=predict(modelFit_rf_70, testing_C), data=testing_C, main="Class C")
qplot(accel_belt_x, accel_forearm_x, colour=predict(modelFit_rf_70, testing_D), data=testing_D, main="Class D")
qplot(accel_dumbbell_y, accel_forearm_z, colour=predict(modelFit_rf_70, testing_E), data=testing_E, main="Class E")
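#A small sketch reusing the predRight column computed earlier: colouring
#by correctness, rather than by predicted class, highlights exactly which
#Class A points were misclassified in the same plane.
qplot(accel_belt_x, accel_belt_y, colour=predRight, data=testing_A, main="Class A: prediction correctness")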
Summary
• 1. The Random Forest algorithm achieves high accuracy, but its runtime performance is poor: training the model takes a long time.
• 2. Most errors occur near the center of each class's cluster, yet those points are still misclassified. This may be caused by overfitting; it would be better to reduce the features before training the Random Forest model (see the findCorrelation sketch after the correlation analysis above).
• 3. The graphs we generated suggest the following:
– Some class A samples are misclassified as B.
– Some class B samples are misclassified as A or C.
– Some class C samples are misclassified as A.
– Some class D samples are misclassified as A.
– Some class E samples are misclassified as B.
• 4. The "Weight Lifting Exercises Dataset" page at http://groupware.les.inf.puc-rio.br/har defines the classes as:
– Class A - performing the exercise exactly according to the specification
– Class B - throwing the elbows to the front
– Class C - lifting the dumbbell only halfway
– Class D - lowering the dumbbell only halfway
– Class E - throwing the hips to the front
• 5. When performing the specified exercise, the mistake of throwing the hips to the front might push the elbows to the front at the same time, which would explain why some E samples are misclassified as B.
• 6. By Gini importance, the most important variable is accel_belt_z, followed by accel_dumbbell_y (a sketch for reproducing this ranking follows below).
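The Gini importance ranking in point 6 can be read off the fitted model; a minimal sketch (output omitted here; the exact ordering depends on the training run):
varImp(modelFit_rf_70)                               # caret's scaled variable importance
randomForest::importance(modelFit_rf_70$finalModel)  # raw mean decrease in Gini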