Documented hierarchical clustering using hclust for text mining and natural language processing.
Thanks for your time. If you enjoyed this short article, there are tons of topics in advanced analytics, data science, and machine learning available in my Medium repo: https://medium.com/@bobrupakroy
2. Clustering: Hierarchical Clustering
# Read the data ----
# Interactively pick the reviews CSV. Only the first 500 rows are loaded and
# text columns are kept as character instead of being converted to factors.
reviews <- read.csv(file.choose(), stringsAsFactors = FALSE, nrows = 500)
View(reviews)
names(reviews)

# Keep only the review text in a one-column data frame named "reviews".
# stringsAsFactors = FALSE keeps the column as character on R < 4.0 as well,
# where data.frame() would otherwise convert it to a factor.
reviews1 <- data.frame(
  reviews = reviews$reviews.text,
  stringsAsFactors = FALSE
)
names(reviews1)
dim(reviews1)

# To remove emojis (optional): convert the text column itself, not the whole
# data frame.
# reviews1$reviews <- iconv(reviews1$reviews, "UTF-8", "ASCII", sub = "")
Rupak Roy
3. Clustering: Hierarchical Clustering
# Build a text corpus ----
library(tm)
review.corpus <- Corpus(VectorSource(reviews1$reviews))
summary(review.corpus)
inspect(review.corpus[1:5]) # Inspecting elements in the corpus

# Replace non-convertible bytes in the corpus with strings showing their hex
# codes. This targets emojis in particular, which otherwise throw errors like
# "invalid input in 'utf8towcs'".
# Plain R functions are wrapped in content_transformer() so that tm_map()
# keeps returning proper tm documents.
review.corpus <- tm_map(
  review.corpus,
  content_transformer(function(x) iconv(enc2utf8(x), sub = "byte"))
)
# or
# review.corpus <- tm_map(review.corpus, PlainTextDocument)

# Data transformations - cleaning ----
# Converting to lower case (base function, hence content_transformer())
review.corpus <- tm_map(review.corpus, content_transformer(tolower))
# Removing extra white space
review.corpus <- tm_map(review.corpus, stripWhitespace)
# Removing punctuation
review.corpus <- tm_map(review.corpus, removePunctuation)
# Removing numbers
review.corpus <- tm_map(review.corpus, removeNumbers)

# Stop words: the standard English list plus custom entries. Mis-encoded
# tokens can also be listed here as another way of dropping them.
# NOTE(review): entries such as 'http*' look like glob patterns -- confirm
# how removeWords() interprets them before relying on them to strip URLs.
my_stopwords <- c(
  stopwords('english'),
  '@', 'http*', 'url', 'www*', 'itã¢â‚¬å¡ãƒâ€žãƒâ´s'
)
review.corpus <- tm_map(review.corpus, removeWords, my_stopwords)
Rupak Roy
4. Clustering: Hierarchical Clustering
# Build the term-document matrix ----
review.tdm <- TermDocumentMatrix(review.corpus)
review.tdm
dim(review.tdm) # Dimensions of the term-document matrix

# Inspect the top-left corner (at most 10 x 10, so that a small matrix does
# not cause a subscript-out-of-bounds error).
tdm_rows <- seq_len(min(10, nrow(review.tdm)))
tdm_cols <- seq_len(min(10, ncol(review.tdm)))
inspect(review.tdm[tdm_rows, tdm_cols])

# Removing sparse terms (words that occur infrequently).
# With sparse = 0.97 only terms whose sparsity is below 97% are kept, i.e.
# terms that are missing from 97% or more of the documents are dropped.
review.imp <- removeSparseTerms(review.tdm, 0.97)
review.imp
imp_rows <- seq_len(min(10, nrow(review.imp)))
imp_cols <- seq_len(min(10, ncol(review.imp)))
inspect(review.imp[imp_rows, imp_cols])

# Dense matrix copy of the reduced term-document matrix for clustering
review.matrix <- as.matrix(review.imp)
#-----------Hclust-----------------------------------
# Measure the distance between the words/terms (as we know, in clustering we
# need the distance between the data points in order to group them).
# dist() compares the rows of the matrix, which are the terms here; scale()
# standardises each column before the distances are computed.
distmatrix <- dist(scale(review.matrix), method = "euclidean")
# Apply hierarchical clustering
review.h <- hclust(distmatrix, method = "ward.D2")
Rupak Roy
5. Clustering: Hierarchical Clustering
# Plot the dendrogram, which represents the hierarchical structure of the
# clusters, and outline 5 clusters on it.
plot(review.h, cex = 0.1, hang = -1, main = "Cluster Dendogram Plot")
rect.hclust(review.h, k = 5)

library(ggdendro)
ggdendrogram(
  review.h,
  rotate = TRUE, size = 3, hang = -1, cex = 0.6, theme_dendro = FALSE
)
# where hang = -1 is meant to put the labels at the same height

# Load the code of the A2Rplot function.
# NOTE(review): absolute, machine-specific path -- adjust to where
# A2RplotCode.R lives on your system.
source("E:/data2dim/Text Mining/datasets/Clustering/A2RplotCode.R")

# Colored dendrogram: switch to a light grey background for this plot only
# and restore the previous graphics settings afterwards.
op <- par(bg = "#EFEFEF")
A2Rplot(
  review.h,
  k = 5, hang = -1, cex = 0.5, boxes = FALSE,
  col.up = "grey50",
  col.down = c("green", "blue", "black", "red", "yellow", "orange", "brown")
)
par(op)

# Triangle plot
p <- as.dendrogram(review.h)
plot(p, type = "triangle", ylab = "Height")
rect.hclust(review.h, k = 5)

# Zoom in on part of the dendrogram
plot(p, xlim = c(88, 92), ylim = c(1, 74))
# ylim = the range of heights to show
# xlim = the range of leaf positions to show along the x-axis; the leaf
#        labels themselves are listed in review.h$labels
review.h$labels
Rupak Roy
6. Clustering: Hierarchical Clustering
# Change node and edge appearance ----
# pch has two values: NA for inner nodes (no symbol) and 19 for leaves.
nodePar <- list(
  lab.cex = 0.6, pch = c(NA, 19),
  cex = 0.7, col = "blue"
)
# The dendrogram is drawn vertically, so the height is on the y-axis.
plot(
  p,
  ylab = "Height", nodePar = nodePar,
  edgePar = list(col = 4:3, lwd = 2:1)
)
# nodePar: a list of plotting parameters to use for the nodes (see
#   ?points). Default value is NULL. The list may contain components named
#   pch, cex, col, xpd, and/or bg, each of which can have length two for
#   specifying separate attributes for inner nodes and leaves.
# edgePar: a list of plotting parameters to use for the edge segments
#   (see ?segments). The list may contain components named col, lty and
#   lwd (for the segments). As with nodePar, each can have length two for
#   differentiating leaves and inner nodes.
# leaflab: a string specifying how leaves are labeled. The default
#   "perpendicular" writes text vertically; "textlike" writes text
#   horizontally (in a rectangle), and "none" suppresses leaf labels.
Rupak Roy
7. Clustering: Hierarchical Clustering
# Phylogenetic-style plots give a more sophisticated dendrogram ----
# install.packages("ape")
library("ape")

# Convert the hclust tree once and reuse it for the plots below.
review.phylo <- as.phylo(review.h)

# Default plot
plot(review.phylo)

# Note: the same code appears in the clustering chapter of the machine
# learning course, where it shows an error because the label values are
# stored as a factor. The workaround used there was:
#   crimeHclust1 <- crimeHclust
#   str(crimeHclust1)
#   crimeHclust1$labels <- as.character(crimeHclust1$labels)

# Smaller labels, moved slightly away from the tips
plot(review.phylo, cex = 0.6, label.offset = 0.5)

# Cladogram
plot(
  review.phylo,
  type = "cladogram",
  cex = 0.6,
  label.offset = 0.5
)
Rupak Roy
8. Clustering: Hierarchical Clustering
# Unrooted
plot(
  as.phylo(review.h),
  type = "unrooted", cex = 0.6, no.margin = TRUE
)
# Fan
plot(as.phylo(review.h), type = "fan")
# Radial
plot(as.phylo(review.h), type = "radial")

# Group the fan type into 5 clusters ----
# One color per cluster: the palette must have at least as many entries as
# there are clusters, otherwise the extra clusters get an NA tip color.
colors <- c("red", "blue", "green", "black", "orange")
# Cluster id (1..5) for every term; named `clusters` so that it does not
# mask base::c().
clusters <- cutree(review.h, k = 5)
plot(
  as.phylo(review.h),
  type = "fan", tip.color = colors[clusters],
  label.offset = 1, cex = 0.7
)
Rupak Roy