SlideShare a Scribd company logo
Hierarchical
Clustering
Using Hclust
Clustering: Hierarchical Clustering
# Read the data ----
# Pick the CSV file interactively and load at most the first 500 rows.
# Text columns are kept as character vectors rather than factors.
reviews <- read.csv(file.choose(), stringsAsFactors = FALSE, nrows = 500)
View(reviews)
names(reviews)

# Fail early with a clear message if the expected text column is missing.
stopifnot("reviews.text" %in% names(reviews))

# Keep only the review text in a one-column data frame named "reviews".
# stringsAsFactors = FALSE is spelled out because R versions before 4.0
# would otherwise convert the text into a factor.
reviews1 <- data.frame(
  reviews = reviews$reviews.text,
  stringsAsFactors = FALSE
)
names(reviews1)
dim(reviews1)

# Optional: strip emojis / non-ASCII characters from the text column.
# (iconv must be applied to the column, not to the whole data frame.)
# reviews1$reviews <- iconv(reviews1$reviews, "UTF-8", "ASCII", sub = "")
Rupak Roy
Clustering: Hierarchical Clustering
# Build a text corpus ----
library(tm)
review.corpus <- Corpus(VectorSource(reviews1$reviews))
summary(review.corpus)
inspect(review.corpus[1:5]) # inspect the first five documents

# Replace bytes that cannot be converted (typically emojis) with strings
# showing their hex codes; such bytes otherwise trigger errors like
# "invalid input in 'utf8towcs'".
# Functions that are not tm transformations are wrapped in
# content_transformer() so that tm_map() keeps returning a valid corpus.
review.corpus <- tm_map(
  review.corpus,
  content_transformer(function(x) iconv(enc2utf8(x), sub = "byte"))
)
# Alternative:
# review.corpus <- tm_map(review.corpus, PlainTextDocument)

# Stop words: the standard English list plus corpus-specific extras.
# removeWords() combines the entries into one regular expression, so the
# "starts with" entries are written as regex (http\w*, www\w*) instead of
# shell-style globs. The last entry is a mis-encoded "it's" token that
# appears in the data; it is kept exactly as found.
my_stopwords <- c(
  stopwords("english"), "@", "http\\w*", "url", "www\\w*",
  "itã¢â‚¬å¡ãƒâ€žãƒâ´s"
)

# Data transformations (cleaning) ----
# Convert to lower case
review.corpus <- tm_map(review.corpus, content_transformer(tolower))
# Remove punctuation
review.corpus <- tm_map(review.corpus, removePunctuation)
# Remove numbers
review.corpus <- tm_map(review.corpus, removeNumbers)
# Remove stop words
review.corpus <- tm_map(review.corpus, removeWords, my_stopwords)
# Collapse extra white space last, because the removals above leave gaps
review.corpus <- tm_map(review.corpus, stripWhitespace)
Rupak Roy
Clustering: Hierarchical Clustering
# Build the term-document matrix ----
review.tdm <- TermDocumentMatrix(review.corpus)
review.tdm
dim(review.tdm) # dimensions: terms x documents
# Inspect the top-left corner (at most 10 x 10) of the matrix
inspect(review.tdm[
  seq_len(min(10, nrow(review.tdm))),
  seq_len(min(10, ncol(review.tdm)))
])

# Remove sparse terms (words that occur infrequently).
# 0.97 is the maximal allowed sparsity: terms that are absent from more
# than about 97% of the documents are dropped.
review.imp <- removeSparseTerms(review.tdm, 0.97)
review.imp
# Fewer than 10 terms may survive, so cap the indices at the matrix size.
inspect(review.imp[
  seq_len(min(10, nrow(review.imp))),
  seq_len(min(10, ncol(review.imp)))
])
review.matrix <- as.matrix(review.imp)

# ----------- Hclust -----------------------------------
# Clustering needs at least two terms to compare.
stopifnot(nrow(review.matrix) >= 2)

# Measure the distance between the words/terms (clustering groups data
# points by the distance between them). Rows of the matrix are terms.
distmatrix <- dist(scale(review.matrix), method = "euclidean")

# Apply hierarchical clustering
review.h <- hclust(distmatrix, method = "ward.D2")
Rupak Roy
Clustering: Hierarchical Clustering
# Plot the dendrogram, which represents the hierarchical structure of the
# clusters; hang = -1 puts all labels at the same height.
plot(review.h, cex = 0.1, hang = -1, main = "Cluster Dendogram Plot")
# Draw rectangles around the 5 clusters
rect.hclust(review.h, 5)

library(ggdendro)
ggdendrogram(
  review.h,
  rotate = TRUE, size = 3, hang = -1, cex = 0.6, theme_dendro = FALSE
)

# Coloured dendrogram using the external A2Rplot helper.
# The helper script lives at a machine-specific path, so this section is
# skipped with a warning (instead of aborting the script) when it is absent.
a2r_script <- "E:/data2dim/Text Mining/datasets/Clustering/A2RplotCode.R"
if (file.exists(a2r_script)) {
  source(a2r_script)
  # Use a grey background for this plot only, then restore the old settings
  op <- par(bg = "#EFEFEF")
  A2Rplot(
    review.h,
    k = 5, hang = -1, cex = 0.5, boxes = FALSE,
    col.up = "grey50",
    col.down = c(
      "green", "blue", "black", "red", "yellow", "orange", "brown"
    )
  )
  par(op)
} else {
  warning(
    "A2Rplot helper not found at '", a2r_script,
    "'; skipping the coloured dendrogram.",
    call. = FALSE
  )
}

# Triangle plot
p <- as.dendrogram(review.h)
plot(p, type = "triangle", ylab = "Height")
rect.hclust(review.h, 5)

# Zoom in on part of the dendrogram:
#   ylim = the height range to show
#   xlim = the range of leaf positions to show; the leaf labels are listed
#          in review.h$labels
plot(p, xlim = c(88, 92), ylim = c(1, 74))
review.h$labels
Rupak Roy
Clustering: Hierarchical Clustering
# Change node and edge appearance
nodePar <- list(lab.cex = 0.6, pch = c(NA, 19), cex = 0.7, col = "blue")
# The dendrogram is drawn vertically (horiz is not set), so the height is
# on the y axis and is labelled there.
plot(
  p,
  ylab = "Height",
  nodePar = nodePar,
  edgePar = list(col = 4:3, lwd = 2:1)
)

# nodePar: a list of plotting parameters to use for the nodes (see
#   ?points). Default value is NULL. The list may contain components named
#   pch, cex, col, xpd, and/or bg, each of which can have length two for
#   specifying separate attributes for inner nodes and leaves.
# edgePar: a list of plotting parameters to use for the edge segments
#   (see ?segments). The list may contain components named col, lty and
#   lwd (for the segments). As with nodePar, each can have length two for
#   differentiating leaves and inner nodes.
# leaflab: a string specifying how leaves are labeled. The default
#   "perpendicular" writes text vertically; "textlike" writes text
#   horizontally (in a rectangle), and "none" suppresses leaf labels.
Rupak Roy
Clustering: Hierarchical Clustering
# Phylogenetic-style plots give a more sophisticated dendrogram.
# install.packages("ape")
library("ape")

# Note carried over from the clustering chapter of the machine learning
# course: there the conversion reportedly failed because the labels were
# stored as a factor, and they were converted to character first:
#   crimeHclust1 <- crimeHclust
#   str(crimeHclust1)
#   crimeHclust1$labels <- as.character(crimeHclust1$labels)

# Convert the clustering to a phylo tree once and draw it three ways.
# local() keeps the temporary tree out of the global workspace.
local({
  tree <- as.phylo(review.h)

  # Default plot
  plot(tree)

  # Smaller labels, moved slightly away from the tips
  plot(tree, cex = 0.6, label.offset = 0.5)

  # Cladogram
  plot(tree, type = "cladogram", cex = 0.6, label.offset = 0.5)
})
Rupak Roy
Clustering: Hierarchical Clustering
# Unrooted
plot(as.phylo(review.h), type = "unrooted", cex = 0.6, no.margin = TRUE)

# Fan
plot(as.phylo(review.h), type = "fan")

# Radial
plot(as.phylo(review.h), type = "radial")

# Fan plot with the tips coloured by cluster (5 clusters).
# One colour per cluster is required: with fewer colours than clusters the
# lookup below yields NA for the last cluster and its labels lose their
# colour.
n_clusters <- 5
cluster_colors <- c("red", "blue", "green", "black", "orange")
stopifnot(length(cluster_colors) >= n_clusters)

# Cluster membership of every term (named to avoid masking base::c)
cluster_id <- cutree(review.h, k = n_clusters)
plot(
  as.phylo(review.h),
  type = "fan",
  tip.color = cluster_colors[cluster_id],
  label.offset = 1,
  cex = 0.7
)
Rupak Roy

More Related Content

Similar to Hierarchical Clustering - Text Mining/NLP

R Programming Reference Card
R Programming Reference CardR Programming Reference Card
R Programming Reference Card
Maurice Dawson
 
Mindmap: Oracle to Couchbase for developers
Mindmap: Oracle to Couchbase for developersMindmap: Oracle to Couchbase for developers
Mindmap: Oracle to Couchbase for developers
Keshav Murthy
 
Apache Cassandra, part 1 – principles, data model
Apache Cassandra, part 1 – principles, data modelApache Cassandra, part 1 – principles, data model
Apache Cassandra, part 1 – principles, data model
Andrey Lomakin
 
Introduction to parallel and distributed computation with spark
Introduction to parallel and distributed computation with sparkIntroduction to parallel and distributed computation with spark
Introduction to parallel and distributed computation with spark
Angelo Leto
 
INFORMATIVE ESSAYThe purpose of the Informative Essay assignme.docx
INFORMATIVE ESSAYThe purpose of the Informative Essay assignme.docxINFORMATIVE ESSAYThe purpose of the Informative Essay assignme.docx
INFORMATIVE ESSAYThe purpose of the Informative Essay assignme.docx
carliotwaycave
 
Sedna XML Database System: Internal Representation
Sedna XML Database System: Internal RepresentationSedna XML Database System: Internal Representation
Sedna XML Database System: Internal Representation
Ivan Shcheklein
 
Lobos Introduction
Lobos IntroductionLobos Introduction
Lobos Introduction
Nicolas Buduroi
 
R programming
R programmingR programming
R programming
Pramodkumar Jha
 
Text processing by Rj
Text processing by RjText processing by Rj
Python - Lecture 11
Python - Lecture 11Python - Lecture 11
Python - Lecture 11
Ravi Kiran Khareedi
 
Perl Basics with Examples
Perl Basics with ExamplesPerl Basics with Examples
Perl Basics with Examples
Nithin Kumar Singani
 
ADVANCE ITT BY PRASAD
ADVANCE ITT BY PRASADADVANCE ITT BY PRASAD
ADVANCE ITT BY PRASAD
PADYALAMAITHILINATHA
 
Data import-cheatsheet
Data import-cheatsheetData import-cheatsheet
Data import-cheatsheet
Dieudonne Nahigombeye
 
CBSE XII COMPUTER SCIENCE STUDY MATERIAL BY KVS
CBSE XII COMPUTER SCIENCE STUDY MATERIAL BY KVSCBSE XII COMPUTER SCIENCE STUDY MATERIAL BY KVS
CBSE XII COMPUTER SCIENCE STUDY MATERIAL BY KVS
Gautham Rajesh
 
Python regular expressions
Python regular expressionsPython regular expressions
Python regular expressions
Krishna Nanda
 
Underscore.js
Underscore.jsUnderscore.js
Underscore.jstimourian
 

Similar to Hierarchical Clustering - Text Mining/NLP (20)

R Programming Reference Card
R Programming Reference CardR Programming Reference Card
R Programming Reference Card
 
Python lecture 05
Python lecture 05Python lecture 05
Python lecture 05
 
Mindmap: Oracle to Couchbase for developers
Mindmap: Oracle to Couchbase for developersMindmap: Oracle to Couchbase for developers
Mindmap: Oracle to Couchbase for developers
 
Mysql1
Mysql1Mysql1
Mysql1
 
Apache Cassandra, part 1 – principles, data model
Apache Cassandra, part 1 – principles, data modelApache Cassandra, part 1 – principles, data model
Apache Cassandra, part 1 – principles, data model
 
Introduction to parallel and distributed computation with spark
Introduction to parallel and distributed computation with sparkIntroduction to parallel and distributed computation with spark
Introduction to parallel and distributed computation with spark
 
Sql server lab_2
Sql server lab_2Sql server lab_2
Sql server lab_2
 
INFORMATIVE ESSAYThe purpose of the Informative Essay assignme.docx
INFORMATIVE ESSAYThe purpose of the Informative Essay assignme.docxINFORMATIVE ESSAYThe purpose of the Informative Essay assignme.docx
INFORMATIVE ESSAYThe purpose of the Informative Essay assignme.docx
 
Sedna XML Database System: Internal Representation
Sedna XML Database System: Internal RepresentationSedna XML Database System: Internal Representation
Sedna XML Database System: Internal Representation
 
R교육1
R교육1R교육1
R교육1
 
Lobos Introduction
Lobos IntroductionLobos Introduction
Lobos Introduction
 
R programming
R programmingR programming
R programming
 
Text processing by Rj
Text processing by RjText processing by Rj
Text processing by Rj
 
Python - Lecture 11
Python - Lecture 11Python - Lecture 11
Python - Lecture 11
 
Perl Basics with Examples
Perl Basics with ExamplesPerl Basics with Examples
Perl Basics with Examples
 
ADVANCE ITT BY PRASAD
ADVANCE ITT BY PRASADADVANCE ITT BY PRASAD
ADVANCE ITT BY PRASAD
 
Data import-cheatsheet
Data import-cheatsheetData import-cheatsheet
Data import-cheatsheet
 
CBSE XII COMPUTER SCIENCE STUDY MATERIAL BY KVS
CBSE XII COMPUTER SCIENCE STUDY MATERIAL BY KVSCBSE XII COMPUTER SCIENCE STUDY MATERIAL BY KVS
CBSE XII COMPUTER SCIENCE STUDY MATERIAL BY KVS
 
Python regular expressions
Python regular expressionsPython regular expressions
Python regular expressions
 
Underscore.js
Underscore.jsUnderscore.js
Underscore.js
 

More from Rupak Roy

Clustering K means and Hierarchical - NLP
Clustering K means and Hierarchical - NLPClustering K means and Hierarchical - NLP
Clustering K means and Hierarchical - NLP
Rupak Roy
 
Network Analysis - NLP
Network Analysis  - NLPNetwork Analysis  - NLP
Network Analysis - NLP
Rupak Roy
 
Topic Modeling - NLP
Topic Modeling - NLPTopic Modeling - NLP
Topic Modeling - NLP
Rupak Roy
 
Sentiment Analysis Practical Steps
Sentiment Analysis Practical StepsSentiment Analysis Practical Steps
Sentiment Analysis Practical Steps
Rupak Roy
 
NLP - Sentiment Analysis
NLP - Sentiment AnalysisNLP - Sentiment Analysis
NLP - Sentiment Analysis
Rupak Roy
 
Text Mining using Regular Expressions
Text Mining using Regular ExpressionsText Mining using Regular Expressions
Text Mining using Regular Expressions
Rupak Roy
 
Introduction to Text Mining
Introduction to Text Mining Introduction to Text Mining
Introduction to Text Mining
Rupak Roy
 
Apache Hbase Architecture
Apache Hbase ArchitectureApache Hbase Architecture
Apache Hbase Architecture
Rupak Roy
 
Introduction to Hbase
Introduction to Hbase Introduction to Hbase
Introduction to Hbase
Rupak Roy
 
Apache Hive Table Partition and HQL
Apache Hive Table Partition and HQLApache Hive Table Partition and HQL
Apache Hive Table Partition and HQL
Rupak Roy
 
Installing Apache Hive, internal and external table, import-export
Installing Apache Hive, internal and external table, import-export Installing Apache Hive, internal and external table, import-export
Installing Apache Hive, internal and external table, import-export
Rupak Roy
 
Introductive to Hive
Introductive to Hive Introductive to Hive
Introductive to Hive
Rupak Roy
 
Scoop Job, import and export to RDBMS
Scoop Job, import and export to RDBMSScoop Job, import and export to RDBMS
Scoop Job, import and export to RDBMS
Rupak Roy
 
Apache Scoop - Import with Append mode and Last Modified mode
Apache Scoop - Import with Append mode and Last Modified mode Apache Scoop - Import with Append mode and Last Modified mode
Apache Scoop - Import with Append mode and Last Modified mode
Rupak Roy
 
Introduction to scoop and its functions
Introduction to scoop and its functionsIntroduction to scoop and its functions
Introduction to scoop and its functions
Rupak Roy
 
Introduction to Flume
Introduction to FlumeIntroduction to Flume
Introduction to Flume
Rupak Roy
 
Apache Pig Relational Operators - II
Apache Pig Relational Operators - II Apache Pig Relational Operators - II
Apache Pig Relational Operators - II
Rupak Roy
 
Passing Parameters using File and Command Line
Passing Parameters using File and Command LinePassing Parameters using File and Command Line
Passing Parameters using File and Command Line
Rupak Roy
 
Apache PIG Relational Operations
Apache PIG Relational Operations Apache PIG Relational Operations
Apache PIG Relational Operations
Rupak Roy
 
Apache PIG casting, reference
Apache PIG casting, referenceApache PIG casting, reference
Apache PIG casting, reference
Rupak Roy
 

More from Rupak Roy (20)

Clustering K means and Hierarchical - NLP
Clustering K means and Hierarchical - NLPClustering K means and Hierarchical - NLP
Clustering K means and Hierarchical - NLP
 
Network Analysis - NLP
Network Analysis  - NLPNetwork Analysis  - NLP
Network Analysis - NLP
 
Topic Modeling - NLP
Topic Modeling - NLPTopic Modeling - NLP
Topic Modeling - NLP
 
Sentiment Analysis Practical Steps
Sentiment Analysis Practical StepsSentiment Analysis Practical Steps
Sentiment Analysis Practical Steps
 
NLP - Sentiment Analysis
NLP - Sentiment AnalysisNLP - Sentiment Analysis
NLP - Sentiment Analysis
 
Text Mining using Regular Expressions
Text Mining using Regular ExpressionsText Mining using Regular Expressions
Text Mining using Regular Expressions
 
Introduction to Text Mining
Introduction to Text Mining Introduction to Text Mining
Introduction to Text Mining
 
Apache Hbase Architecture
Apache Hbase ArchitectureApache Hbase Architecture
Apache Hbase Architecture
 
Introduction to Hbase
Introduction to Hbase Introduction to Hbase
Introduction to Hbase
 
Apache Hive Table Partition and HQL
Apache Hive Table Partition and HQLApache Hive Table Partition and HQL
Apache Hive Table Partition and HQL
 
Installing Apache Hive, internal and external table, import-export
Installing Apache Hive, internal and external table, import-export Installing Apache Hive, internal and external table, import-export
Installing Apache Hive, internal and external table, import-export
 
Introductive to Hive
Introductive to Hive Introductive to Hive
Introductive to Hive
 
Scoop Job, import and export to RDBMS
Scoop Job, import and export to RDBMSScoop Job, import and export to RDBMS
Scoop Job, import and export to RDBMS
 
Apache Scoop - Import with Append mode and Last Modified mode
Apache Scoop - Import with Append mode and Last Modified mode Apache Scoop - Import with Append mode and Last Modified mode
Apache Scoop - Import with Append mode and Last Modified mode
 
Introduction to scoop and its functions
Introduction to scoop and its functionsIntroduction to scoop and its functions
Introduction to scoop and its functions
 
Introduction to Flume
Introduction to FlumeIntroduction to Flume
Introduction to Flume
 
Apache Pig Relational Operators - II
Apache Pig Relational Operators - II Apache Pig Relational Operators - II
Apache Pig Relational Operators - II
 
Passing Parameters using File and Command Line
Passing Parameters using File and Command LinePassing Parameters using File and Command Line
Passing Parameters using File and Command Line
 
Apache PIG Relational Operations
Apache PIG Relational Operations Apache PIG Relational Operations
Apache PIG Relational Operations
 
Apache PIG casting, reference
Apache PIG casting, referenceApache PIG casting, reference
Apache PIG casting, reference
 

Recently uploaded

一比一原版(NYU毕业证)纽约大学毕业证成绩单
一比一原版(NYU毕业证)纽约大学毕业证成绩单一比一原版(NYU毕业证)纽约大学毕业证成绩单
一比一原版(NYU毕业证)纽约大学毕业证成绩单
ewymefz
 
Data Centers - Striving Within A Narrow Range - Research Report - MCG - May 2...
Data Centers - Striving Within A Narrow Range - Research Report - MCG - May 2...Data Centers - Striving Within A Narrow Range - Research Report - MCG - May 2...
Data Centers - Striving Within A Narrow Range - Research Report - MCG - May 2...
pchutichetpong
 
一比一原版(BU毕业证)波士顿大学毕业证成绩单
一比一原版(BU毕业证)波士顿大学毕业证成绩单一比一原版(BU毕业证)波士顿大学毕业证成绩单
一比一原版(BU毕业证)波士顿大学毕业证成绩单
ewymefz
 
My burning issue is homelessness K.C.M.O.
My burning issue is homelessness K.C.M.O.My burning issue is homelessness K.C.M.O.
My burning issue is homelessness K.C.M.O.
rwarrenll
 
Levelwise PageRank with Loop-Based Dead End Handling Strategy : SHORT REPORT ...
Levelwise PageRank with Loop-Based Dead End Handling Strategy : SHORT REPORT ...Levelwise PageRank with Loop-Based Dead End Handling Strategy : SHORT REPORT ...
Levelwise PageRank with Loop-Based Dead End Handling Strategy : SHORT REPORT ...
Subhajit Sahu
 
Empowering Data Analytics Ecosystem.pptx
Empowering Data Analytics Ecosystem.pptxEmpowering Data Analytics Ecosystem.pptx
Empowering Data Analytics Ecosystem.pptx
benishzehra469
 
Machine learning and optimization techniques for electrical drives.pptx
Machine learning and optimization techniques for electrical drives.pptxMachine learning and optimization techniques for electrical drives.pptx
Machine learning and optimization techniques for electrical drives.pptx
balafet
 
一比一原版(CBU毕业证)不列颠海角大学毕业证成绩单
一比一原版(CBU毕业证)不列颠海角大学毕业证成绩单一比一原版(CBU毕业证)不列颠海角大学毕业证成绩单
一比一原版(CBU毕业证)不列颠海角大学毕业证成绩单
nscud
 
一比一原版(TWU毕业证)西三一大学毕业证成绩单
一比一原版(TWU毕业证)西三一大学毕业证成绩单一比一原版(TWU毕业证)西三一大学毕业证成绩单
一比一原版(TWU毕业证)西三一大学毕业证成绩单
ocavb
 
一比一原版(QU毕业证)皇后大学毕业证成绩单
一比一原版(QU毕业证)皇后大学毕业证成绩单一比一原版(QU毕业证)皇后大学毕业证成绩单
一比一原版(QU毕业证)皇后大学毕业证成绩单
enxupq
 
一比一原版(UVic毕业证)维多利亚大学毕业证成绩单
一比一原版(UVic毕业证)维多利亚大学毕业证成绩单一比一原版(UVic毕业证)维多利亚大学毕业证成绩单
一比一原版(UVic毕业证)维多利亚大学毕业证成绩单
ukgaet
 
Quantitative Data AnalysisReliability Analysis (Cronbach Alpha) Common Method...
Quantitative Data AnalysisReliability Analysis (Cronbach Alpha) Common Method...Quantitative Data AnalysisReliability Analysis (Cronbach Alpha) Common Method...
Quantitative Data AnalysisReliability Analysis (Cronbach Alpha) Common Method...
2023240532
 
一比一原版(Adelaide毕业证书)阿德莱德大学毕业证如何办理
一比一原版(Adelaide毕业证书)阿德莱德大学毕业证如何办理一比一原版(Adelaide毕业证书)阿德莱德大学毕业证如何办理
一比一原版(Adelaide毕业证书)阿德莱德大学毕业证如何办理
slg6lamcq
 
一比一原版(ArtEZ毕业证)ArtEZ艺术学院毕业证成绩单
一比一原版(ArtEZ毕业证)ArtEZ艺术学院毕业证成绩单一比一原版(ArtEZ毕业证)ArtEZ艺术学院毕业证成绩单
一比一原版(ArtEZ毕业证)ArtEZ艺术学院毕业证成绩单
vcaxypu
 
Adjusting primitives for graph : SHORT REPORT / NOTES
Adjusting primitives for graph : SHORT REPORT / NOTESAdjusting primitives for graph : SHORT REPORT / NOTES
Adjusting primitives for graph : SHORT REPORT / NOTES
Subhajit Sahu
 
Chatty Kathy - UNC Bootcamp Final Project Presentation - Final Version - 5.23...
Chatty Kathy - UNC Bootcamp Final Project Presentation - Final Version - 5.23...Chatty Kathy - UNC Bootcamp Final Project Presentation - Final Version - 5.23...
Chatty Kathy - UNC Bootcamp Final Project Presentation - Final Version - 5.23...
John Andrews
 
【社内勉強会資料_Octo: An Open-Source Generalist Robot Policy】
【社内勉強会資料_Octo: An Open-Source Generalist Robot Policy】【社内勉強会資料_Octo: An Open-Source Generalist Robot Policy】
【社内勉強会資料_Octo: An Open-Source Generalist Robot Policy】
NABLAS株式会社
 
一比一原版(UniSA毕业证书)南澳大学毕业证如何办理
一比一原版(UniSA毕业证书)南澳大学毕业证如何办理一比一原版(UniSA毕业证书)南澳大学毕业证如何办理
一比一原版(UniSA毕业证书)南澳大学毕业证如何办理
slg6lamcq
 
06-04-2024 - NYC Tech Week - Discussion on Vector Databases, Unstructured Dat...
06-04-2024 - NYC Tech Week - Discussion on Vector Databases, Unstructured Dat...06-04-2024 - NYC Tech Week - Discussion on Vector Databases, Unstructured Dat...
06-04-2024 - NYC Tech Week - Discussion on Vector Databases, Unstructured Dat...
Timothy Spann
 
一比一原版(BCU毕业证书)伯明翰城市大学毕业证如何办理
一比一原版(BCU毕业证书)伯明翰城市大学毕业证如何办理一比一原版(BCU毕业证书)伯明翰城市大学毕业证如何办理
一比一原版(BCU毕业证书)伯明翰城市大学毕业证如何办理
dwreak4tg
 

Recently uploaded (20)

一比一原版(NYU毕业证)纽约大学毕业证成绩单
一比一原版(NYU毕业证)纽约大学毕业证成绩单一比一原版(NYU毕业证)纽约大学毕业证成绩单
一比一原版(NYU毕业证)纽约大学毕业证成绩单
 
Data Centers - Striving Within A Narrow Range - Research Report - MCG - May 2...
Data Centers - Striving Within A Narrow Range - Research Report - MCG - May 2...Data Centers - Striving Within A Narrow Range - Research Report - MCG - May 2...
Data Centers - Striving Within A Narrow Range - Research Report - MCG - May 2...
 
一比一原版(BU毕业证)波士顿大学毕业证成绩单
一比一原版(BU毕业证)波士顿大学毕业证成绩单一比一原版(BU毕业证)波士顿大学毕业证成绩单
一比一原版(BU毕业证)波士顿大学毕业证成绩单
 
My burning issue is homelessness K.C.M.O.
My burning issue is homelessness K.C.M.O.My burning issue is homelessness K.C.M.O.
My burning issue is homelessness K.C.M.O.
 
Levelwise PageRank with Loop-Based Dead End Handling Strategy : SHORT REPORT ...
Levelwise PageRank with Loop-Based Dead End Handling Strategy : SHORT REPORT ...Levelwise PageRank with Loop-Based Dead End Handling Strategy : SHORT REPORT ...
Levelwise PageRank with Loop-Based Dead End Handling Strategy : SHORT REPORT ...
 
Empowering Data Analytics Ecosystem.pptx
Empowering Data Analytics Ecosystem.pptxEmpowering Data Analytics Ecosystem.pptx
Empowering Data Analytics Ecosystem.pptx
 
Machine learning and optimization techniques for electrical drives.pptx
Machine learning and optimization techniques for electrical drives.pptxMachine learning and optimization techniques for electrical drives.pptx
Machine learning and optimization techniques for electrical drives.pptx
 
一比一原版(CBU毕业证)不列颠海角大学毕业证成绩单
一比一原版(CBU毕业证)不列颠海角大学毕业证成绩单一比一原版(CBU毕业证)不列颠海角大学毕业证成绩单
一比一原版(CBU毕业证)不列颠海角大学毕业证成绩单
 
一比一原版(TWU毕业证)西三一大学毕业证成绩单
一比一原版(TWU毕业证)西三一大学毕业证成绩单一比一原版(TWU毕业证)西三一大学毕业证成绩单
一比一原版(TWU毕业证)西三一大学毕业证成绩单
 
一比一原版(QU毕业证)皇后大学毕业证成绩单
一比一原版(QU毕业证)皇后大学毕业证成绩单一比一原版(QU毕业证)皇后大学毕业证成绩单
一比一原版(QU毕业证)皇后大学毕业证成绩单
 
一比一原版(UVic毕业证)维多利亚大学毕业证成绩单
一比一原版(UVic毕业证)维多利亚大学毕业证成绩单一比一原版(UVic毕业证)维多利亚大学毕业证成绩单
一比一原版(UVic毕业证)维多利亚大学毕业证成绩单
 
Quantitative Data AnalysisReliability Analysis (Cronbach Alpha) Common Method...
Quantitative Data AnalysisReliability Analysis (Cronbach Alpha) Common Method...Quantitative Data AnalysisReliability Analysis (Cronbach Alpha) Common Method...
Quantitative Data AnalysisReliability Analysis (Cronbach Alpha) Common Method...
 
一比一原版(Adelaide毕业证书)阿德莱德大学毕业证如何办理
一比一原版(Adelaide毕业证书)阿德莱德大学毕业证如何办理一比一原版(Adelaide毕业证书)阿德莱德大学毕业证如何办理
一比一原版(Adelaide毕业证书)阿德莱德大学毕业证如何办理
 
一比一原版(ArtEZ毕业证)ArtEZ艺术学院毕业证成绩单
一比一原版(ArtEZ毕业证)ArtEZ艺术学院毕业证成绩单一比一原版(ArtEZ毕业证)ArtEZ艺术学院毕业证成绩单
一比一原版(ArtEZ毕业证)ArtEZ艺术学院毕业证成绩单
 
Adjusting primitives for graph : SHORT REPORT / NOTES
Adjusting primitives for graph : SHORT REPORT / NOTESAdjusting primitives for graph : SHORT REPORT / NOTES
Adjusting primitives for graph : SHORT REPORT / NOTES
 
Chatty Kathy - UNC Bootcamp Final Project Presentation - Final Version - 5.23...
Chatty Kathy - UNC Bootcamp Final Project Presentation - Final Version - 5.23...Chatty Kathy - UNC Bootcamp Final Project Presentation - Final Version - 5.23...
Chatty Kathy - UNC Bootcamp Final Project Presentation - Final Version - 5.23...
 
【社内勉強会資料_Octo: An Open-Source Generalist Robot Policy】
【社内勉強会資料_Octo: An Open-Source Generalist Robot Policy】【社内勉強会資料_Octo: An Open-Source Generalist Robot Policy】
【社内勉強会資料_Octo: An Open-Source Generalist Robot Policy】
 
一比一原版(UniSA毕业证书)南澳大学毕业证如何办理
一比一原版(UniSA毕业证书)南澳大学毕业证如何办理一比一原版(UniSA毕业证书)南澳大学毕业证如何办理
一比一原版(UniSA毕业证书)南澳大学毕业证如何办理
 
06-04-2024 - NYC Tech Week - Discussion on Vector Databases, Unstructured Dat...
06-04-2024 - NYC Tech Week - Discussion on Vector Databases, Unstructured Dat...06-04-2024 - NYC Tech Week - Discussion on Vector Databases, Unstructured Dat...
06-04-2024 - NYC Tech Week - Discussion on Vector Databases, Unstructured Dat...
 
一比一原版(BCU毕业证书)伯明翰城市大学毕业证如何办理
一比一原版(BCU毕业证书)伯明翰城市大学毕业证如何办理一比一原版(BCU毕业证书)伯明翰城市大学毕业证如何办理
一比一原版(BCU毕业证书)伯明翰城市大学毕业证如何办理
 

Hierarchical Clustering - Text Mining/NLP

  • 2. Clustering: Hierarchical Clustering #read the data reviews<-read.csv(file.choose(),stringsAsFactors=FALSE,nrows = 500) View(reviews) names(reviews) reviews1<-data.frame(reviews$reviews.text) names(reviews1) dim(reviews1) names(reviews1)[1]<-"reviews" #to remove emojis #reviews1 <- iconv(reviews1, 'UTF-8', 'ASCII') Rupak Roy
  • 3. Clustering: Hierarchical Clustering #Build a Text Corpus library(tm) review.corpus<-Corpus(VectorSource(reviews1$reviews)) summary(review.corpus) inspect(review.corpus[1:5]) #Inspecting elements in Corpus #it will replace non-convertible bytes in the Corpus with strings showing their hex codes #Especially the emojis which throws error like invalid input in 'utf8towcs'. review.corpus<-tm_map(review.corpus, function(x) iconv(enc2utf8(x), sub = "byte")) #or #review.corpus <- tm_map(review.corpus, PlainTextDocument) #or define in the stop words my_stopwords<-c(stopwords('english'),'@','http*','url','www*','itã¢â‚¬å¡ãƒâ€žãƒâ´s') #Data Transformations -Cleaning #Converting to lower case review.corpus<-tm_map(review.corpus,tolower) #Removing extra white space review.corpus<-tm_map(review.corpus,stripWhitespace) #Removing punctuations review.corpus<-tm_map(review.corpus,removePunctuation) #Removing numbers review.corpus<-tm_map(review.corpus,removeNumbers) #Can add more words apart from standard list my_stopwords<-c(stopwords('english'),'@','http*','url','www*','itã¢â‚¬å¡ãƒâ€žãƒâ´s') review.corpus<-tm_map(review.corpus,removeWords,my_stopwords) Rupak Roy
  • 4. Clustering: Hierarchical Clustering #Build term document matrix review.tdm<-TermDocumentMatrix(review.corpus) review.tdm dim(review.tdm) #Dimensions of term document matrix inspect(review.tdm[1:10,1:10]) #Inspecting the term document matrix #Removing sparse terms(Words that occur infrequenctly) #here 97% refers remove at least 97% of sparse review.imp<-removeSparseTerms(review.tdm,0.97) review.imp inspect(review.imp[1:10,1:10]) review.matrix<-as.matrix(review.imp) #-----------Hclust----------------------------------- #Measure the distance between the words/terms(as we know in clustering we need the distance between the data points to group) distmatrix<-dist(scale(review.matrix),method="euclidean") #Apply hierarchcal clustering review.h<-hclust(distmatrix,method="ward.D2") Rupak Roy
  • 5. Clustering: Hierarchical Clustering #plot dendograph which represents the hierarchical structure of clusters plot(review.h,cex=0.1,hang=-1,main="Cluster Dendogram Plot") rect.hclust(review.h,5) library(ggdendro) ggdendrogram(review.h, rotate = TRUE, size = 3,hang=-1,cex=0.6,theme_dendro = FALSE) #where hang=-1 to put the labels at the same height # load code of A2R function source("E:/data2dim/Text Mining/datasets/Clustering/A2RplotCode.R") # colored dendrogram op = par(bg = "#EFEFEF") A2Rplot(review.h, k = 5, hang = -1,cex=0.5,boxes = FALSE, col.up = "grey50", col.down = c("green","blue", "black","red","yellow","orange","brown")) #Triangle plot p<-as.dendrogram(review.h) plot(p, type = "triangle", ylab = "Height") rect.hclust(review.h,5) # Zoom to the first dendrogram plot(p, xlim = c(88, 92), ylim = c(1,74)) #ylim = the height #xlim= is the position values of the labels we can get the values in review.h$labels review.h$labels Rupak Roy
  • 6. Clustering: Hierarchical Clustering # Change edge color nodePar <- list(lab.cex = 0.6, pch = c(NA, 19), cex = 0.7, col = "blue") plot(p, xlab = "Height", nodePar = nodePar, edgePar = list(col = 4:3, lwd = 2:1)) #nodePar: a list of plotting parameters to use for the nodes (see ?points). Default value is NULL. The list may contain components named pch, cex, col, xpd, and/or bg each of which can have length two for specifying separate attributes for inner nodes and leaves. #edgePar: a list of plotting parameters to use for the edge segments (see ?segments). The list may contain components named col, lty and lwd (for the segments). As with nodePar, each can have length two for differentiating leaves and inner nodes. #leaflab: a string specifying how leaves are labeled. The default "perpendicular" write text vertically; "textlike" writes text horizontally (in a rectangle), and "none" suppresses leaf labels. Rupak Roy
  • 7. Clustering: Hierarchical Clustering #Phylogenetic plots can be used to produce a more sophisticated dendrogram. # install.packages("ape") library("ape") # Default plot plot(as.phylo(review.h)) #Same code mentioned over the clustering chapter of machine learning course #it will show error as the label values are stored in factor #crimeHclust1<-crimeHclust #str(cimeHClust1) #crimeHclust1$labels<-as.character(crimeHclust1$labels) plot(as.phylo(review.h), cex = 0.6, label.offset = 0.5) # Cladogram plot(as.phylo(review.h), type = "cladogram", cex = 0.6, label.offset = 0.5) Rupak Roy
  • 8. Clustering: Hierarchical Clustering # Unrooted plot(as.phylo(review.h), type = "unrooted", cex = 0.6, no.margin = TRUE) # Fan plot(as.phylo(review.h), type = "fan") # Radial plot(as.phylo(review.h), type = "radial") # Group the Fan type into 5 clusters colors = c("red", "blue", "green", "black") c = cutree(review.h, 5) plot(as.phylo(review.h), type = "fan", tip.color = colors[c], label.offset = 1, cex = 0.7) Rupak Roy