SlideShare a Scribd company logo
1 of 8
Download to read offline
Hierarchical
Clustering
Using Hclust
Clustering: Hierarchical Clustering
# Read the data: choose the reviews CSV interactively and load the first
# 500 rows, keeping text columns as character rather than factor.
reviews <- read.csv(file.choose(), stringsAsFactors = FALSE, nrows = 500)
View(reviews)
names(reviews)
# Keep only the review text. stringsAsFactors = FALSE is passed here as well:
# on R < 4.0 data.frame() would otherwise convert the text back into a factor
# even though read.csv() above kept it as character.
reviews1 <- data.frame(reviews$reviews.text, stringsAsFactors = FALSE)
names(reviews1)
dim(reviews1)
names(reviews1)[1] <- "reviews"
# To remove emojis
# reviews1 <- iconv(reviews1, 'UTF-8', 'ASCII')
Rupak Roy
Clustering: Hierarchical Clustering
# Build a text corpus from the review text
library(tm)
review.corpus <- Corpus(VectorSource(reviews1$reviews))
summary(review.corpus)
inspect(review.corpus[1:5]) # Inspecting elements in the corpus

# Replace non-convertible bytes in the corpus with strings showing their hex
# codes. This mainly targets emojis, which otherwise throw errors such as
# "invalid input in 'utf8towcs'".
# Plain functions are wrapped in content_transformer() so that tm_map()
# transforms the document content and the corpus keeps its document structure.
review.corpus <- tm_map(
  review.corpus,
  content_transformer(function(x) iconv(enc2utf8(x), sub = "byte"))
)
# or
# review.corpus <- tm_map(review.corpus, PlainTextDocument)
# or list the offending tokens in the stop words (see my_stopwords below)

# Data transformations - cleaning
# Converting to lower case
review.corpus <- tm_map(review.corpus, content_transformer(tolower))
# Removing extra white space
review.corpus <- tm_map(review.corpus, stripWhitespace)
# Removing punctuation
review.corpus <- tm_map(review.corpus, removePunctuation)
# Removing numbers
review.corpus <- tm_map(review.corpus, removeNumbers)

# Stop words: the standard English list plus corpus-specific tokens.
# Defined once, immediately before its only use.
# NOTE(review): removeWords() treats each entry as a regular expression, so
# 'http*' and 'www*' are not shell-style wildcards - confirm they match the
# intended tokens.
# NOTE(review): punctuation is stripped before this step, so contractions in
# stopwords('english') (e.g. "don't") may no longer match - confirm the order.
my_stopwords <- c(
  stopwords('english'),
  '@', 'http*', 'url', 'www*', 'itã¢â‚¬å¡ãƒâ€žãƒâ´s'
)
review.corpus <- tm_map(review.corpus, removeWords, my_stopwords)
Rupak Roy
Clustering: Hierarchical Clustering
# Build the term-document matrix (rows = terms, columns = documents)
review.tdm <- TermDocumentMatrix(review.corpus)
review.tdm
dim(review.tdm) # Dimensions of the term-document matrix
inspect(review.tdm[1:10, 1:10]) # Inspecting the term-document matrix

# Removing sparse terms (words that occur infrequently).
# The 0.97 threshold is the maximal allowed sparsity: a term is dropped when
# it is absent from at least 97% of the documents.
review.imp <- removeSparseTerms(review.tdm, 0.97)
review.imp
inspect(review.imp[1:10, 1:10])
review.matrix <- as.matrix(review.imp)

# ----------- Hclust -----------------------------------
# Measure the distance between the words/terms (clustering needs the distance
# between the data points in order to group them). dist() works on rows, so
# the objects being clustered here are the terms.
distmatrix <- dist(scale(review.matrix), method = "euclidean")
# Apply hierarchical clustering
review.h <- hclust(distmatrix, method = "ward.D2")
Rupak Roy
Clustering: Hierarchical Clustering
# Draw the dendrogram, which shows the hierarchical structure of the clusters,
# and outline 5 clusters on it.
plot(
  review.h,
  main = "Cluster Dendogram Plot",
  hang = -1, # hang = -1 puts all labels at the same height
  cex = 0.1
)
rect.hclust(review.h, k = 5)

# Same tree drawn with ggdendro, rotated so the labels read horizontally.
library(ggdendro)
ggdendrogram(
  review.h,
  rotate = TRUE,
  size = 3,
  hang = -1,
  cex = 0.6,
  theme_dendro = FALSE
)
# Load the code of the A2Rplot function.
# NOTE(review): absolute, machine-specific path - adjust to where
# A2RplotCode.R lives on your system.
source("E:/data2dim/Text Mining/datasets/Clustering/A2RplotCode.R")
# Coloured dendrogram: set a light grey background for this plot only.
# par() returns the previous settings, which are restored afterwards so the
# background does not leak into later plots.
op <- par(bg = "#EFEFEF")
A2Rplot(
  review.h,
  k = 5, hang = -1, cex = 0.5, boxes = FALSE,
  col.up = "grey50",
  col.down = c("green", "blue", "black", "red", "yellow", "orange", "brown")
)
par(op)
# Triangle-style dendrogram with the 5-cluster outline on top
p <- as.dendrogram(review.h)
plot(p, ylab = "Height", type = "triangle")
rect.hclust(review.h, k = 5)

# Zoom in on part of the dendrogram:
#   ylim = the height range to show
#   xlim = the position range of the labels; the labels themselves are
#          listed in review.h$labels
plot(p, ylim = c(1, 74), xlim = c(88, 92))
review.h$labels
Rupak Roy
Clustering: Hierarchical Clustering
# Change node and edge appearance of the dendrogram `p`
nodePar <- list(
  lab.cex = 0.6, pch = c(NA, 19),
  cex = 0.7, col = "blue"
)
plot(
  p,
  xlab = "Height", nodePar = nodePar,
  edgePar = list(col = 4:3, lwd = 2:1)
)
# nodePar: a list of plotting parameters to use for the nodes (see
#   ?points). Default value is NULL. The list may contain components named
#   pch, cex, col, xpd, and/or bg, each of which can have length two for
#   specifying separate attributes for inner nodes and leaves.
# edgePar: a list of plotting parameters to use for the edge segments
#   (see ?segments). The list may contain components named col, lty and
#   lwd (for the segments). As with nodePar, each can have length two for
#   differentiating leaves and inner nodes.
# leaflab: a string specifying how leaves are labeled. The default
#   "perpendicular" writes text vertically; "textlike" writes text
#   horizontally (in a rectangle), and "none" suppresses leaf labels.
Rupak Roy
Clustering: Hierarchical Clustering
# Phylogenetic-style plots give a more sophisticated dendrogram.
# install.packages("ape")
library("ape")

# Convert the hclust tree once and reuse it for each plot below.
review.phylo <- as.phylo(review.h)

# Default plot
plot(review.phylo)

# The same code appears in the clustering chapter of the machine learning
# course; there it raises an error because the labels are stored as a factor,
# which is worked around like this:
# crimeHclust1 <- crimeHclust
# str(crimeHclust1)
# crimeHclust1$labels <- as.character(crimeHclust1$labels)
plot(review.phylo, label.offset = 0.5, cex = 0.6)

# Cladogram
plot(
  review.phylo,
  type = "cladogram",
  label.offset = 0.5,
  cex = 0.6
)
Rupak Roy
Clustering: Hierarchical Clustering
# Unrooted
plot(
  as.phylo(review.h),
  type = "unrooted", cex = 0.6,
  no.margin = TRUE
)
# Fan
plot(as.phylo(review.h), type = "fan")
# Radial
plot(as.phylo(review.h), type = "radial")

# Group the fan plot into 5 clusters, one colour per cluster.
# The palette must have at least as many entries as clusters; otherwise the
# tips of the last cluster would be indexed past the end and get an NA colour.
n.clusters <- 5
cluster.colors <- c("red", "blue", "green", "black", "orange")
stopifnot(length(cluster.colors) >= n.clusters)
# Cluster id (1..n.clusters) for each term; named to avoid masking base::c().
cluster.id <- cutree(review.h, n.clusters)
plot(
  as.phylo(review.h),
  type = "fan", tip.color = cluster.colors[cluster.id],
  label.offset = 1, cex = 0.7
)
Rupak Roy

More Related Content

Similar to Hierarchical Clustering - Text Mining/NLP

R Programming Reference Card
R Programming Reference CardR Programming Reference Card
R Programming Reference CardMaurice Dawson
 
Mindmap: Oracle to Couchbase for developers
Mindmap: Oracle to Couchbase for developersMindmap: Oracle to Couchbase for developers
Mindmap: Oracle to Couchbase for developersKeshav Murthy
 
Apache Cassandra, part 1 – principles, data model
Apache Cassandra, part 1 – principles, data modelApache Cassandra, part 1 – principles, data model
Apache Cassandra, part 1 – principles, data modelAndrey Lomakin
 
Introduction to parallel and distributed computation with spark
Introduction to parallel and distributed computation with sparkIntroduction to parallel and distributed computation with spark
Introduction to parallel and distributed computation with sparkAngelo Leto
 
INFORMATIVE ESSAYThe purpose of the Informative Essay assignme.docx
INFORMATIVE ESSAYThe purpose of the Informative Essay assignme.docxINFORMATIVE ESSAYThe purpose of the Informative Essay assignme.docx
INFORMATIVE ESSAYThe purpose of the Informative Essay assignme.docxcarliotwaycave
 
Sedna XML Database System: Internal Representation
Sedna XML Database System: Internal RepresentationSedna XML Database System: Internal Representation
Sedna XML Database System: Internal RepresentationIvan Shcheklein
 
CBSE XII COMPUTER SCIENCE STUDY MATERIAL BY KVS
CBSE XII COMPUTER SCIENCE STUDY MATERIAL BY KVSCBSE XII COMPUTER SCIENCE STUDY MATERIAL BY KVS
CBSE XII COMPUTER SCIENCE STUDY MATERIAL BY KVSGautham Rajesh
 
Python regular expressions
Python regular expressionsPython regular expressions
Python regular expressionsKrishna Nanda
 
Underscore.js
Underscore.jsUnderscore.js
Underscore.jstimourian
 

Similar to Hierarchical Clustering - Text Mining/NLP (20)

R Programming Reference Card
R Programming Reference CardR Programming Reference Card
R Programming Reference Card
 
Python lecture 05
Python lecture 05Python lecture 05
Python lecture 05
 
Mindmap: Oracle to Couchbase for developers
Mindmap: Oracle to Couchbase for developersMindmap: Oracle to Couchbase for developers
Mindmap: Oracle to Couchbase for developers
 
Mysql1
Mysql1Mysql1
Mysql1
 
Apache Cassandra, part 1 – principles, data model
Apache Cassandra, part 1 – principles, data modelApache Cassandra, part 1 – principles, data model
Apache Cassandra, part 1 – principles, data model
 
Introduction to parallel and distributed computation with spark
Introduction to parallel and distributed computation with sparkIntroduction to parallel and distributed computation with spark
Introduction to parallel and distributed computation with spark
 
Sql server lab_2
Sql server lab_2Sql server lab_2
Sql server lab_2
 
INFORMATIVE ESSAYThe purpose of the Informative Essay assignme.docx
INFORMATIVE ESSAYThe purpose of the Informative Essay assignme.docxINFORMATIVE ESSAYThe purpose of the Informative Essay assignme.docx
INFORMATIVE ESSAYThe purpose of the Informative Essay assignme.docx
 
Sedna XML Database System: Internal Representation
Sedna XML Database System: Internal RepresentationSedna XML Database System: Internal Representation
Sedna XML Database System: Internal Representation
 
R교육1
R교육1R교육1
R교육1
 
Lobos Introduction
Lobos IntroductionLobos Introduction
Lobos Introduction
 
R programming
R programmingR programming
R programming
 
Text processing by Rj
Text processing by RjText processing by Rj
Text processing by Rj
 
Python - Lecture 11
Python - Lecture 11Python - Lecture 11
Python - Lecture 11
 
Perl Basics with Examples
Perl Basics with ExamplesPerl Basics with Examples
Perl Basics with Examples
 
ADVANCE ITT BY PRASAD
ADVANCE ITT BY PRASADADVANCE ITT BY PRASAD
ADVANCE ITT BY PRASAD
 
Data import-cheatsheet
Data import-cheatsheetData import-cheatsheet
Data import-cheatsheet
 
CBSE XII COMPUTER SCIENCE STUDY MATERIAL BY KVS
CBSE XII COMPUTER SCIENCE STUDY MATERIAL BY KVSCBSE XII COMPUTER SCIENCE STUDY MATERIAL BY KVS
CBSE XII COMPUTER SCIENCE STUDY MATERIAL BY KVS
 
Python regular expressions
Python regular expressionsPython regular expressions
Python regular expressions
 
Underscore.js
Underscore.jsUnderscore.js
Underscore.js
 

More from Rupak Roy

Clustering K means and Hierarchical - NLP
Clustering K means and Hierarchical - NLPClustering K means and Hierarchical - NLP
Clustering K means and Hierarchical - NLPRupak Roy
 
Network Analysis - NLP
Network Analysis  - NLPNetwork Analysis  - NLP
Network Analysis - NLPRupak Roy
 
Topic Modeling - NLP
Topic Modeling - NLPTopic Modeling - NLP
Topic Modeling - NLPRupak Roy
 
Sentiment Analysis Practical Steps
Sentiment Analysis Practical StepsSentiment Analysis Practical Steps
Sentiment Analysis Practical StepsRupak Roy
 
NLP - Sentiment Analysis
NLP - Sentiment AnalysisNLP - Sentiment Analysis
NLP - Sentiment AnalysisRupak Roy
 
Text Mining using Regular Expressions
Text Mining using Regular ExpressionsText Mining using Regular Expressions
Text Mining using Regular ExpressionsRupak Roy
 
Introduction to Text Mining
Introduction to Text Mining Introduction to Text Mining
Introduction to Text Mining Rupak Roy
 
Apache Hbase Architecture
Apache Hbase ArchitectureApache Hbase Architecture
Apache Hbase ArchitectureRupak Roy
 
Introduction to Hbase
Introduction to Hbase Introduction to Hbase
Introduction to Hbase Rupak Roy
 
Apache Hive Table Partition and HQL
Apache Hive Table Partition and HQLApache Hive Table Partition and HQL
Apache Hive Table Partition and HQLRupak Roy
 
Installing Apache Hive, internal and external table, import-export
Installing Apache Hive, internal and external table, import-export Installing Apache Hive, internal and external table, import-export
Installing Apache Hive, internal and external table, import-export Rupak Roy
 
Introductive to Hive
Introductive to Hive Introductive to Hive
Introductive to Hive Rupak Roy
 
Scoop Job, import and export to RDBMS
Scoop Job, import and export to RDBMSScoop Job, import and export to RDBMS
Scoop Job, import and export to RDBMSRupak Roy
 
Apache Scoop - Import with Append mode and Last Modified mode
Apache Scoop - Import with Append mode and Last Modified mode Apache Scoop - Import with Append mode and Last Modified mode
Apache Scoop - Import with Append mode and Last Modified mode Rupak Roy
 
Introduction to scoop and its functions
Introduction to scoop and its functionsIntroduction to scoop and its functions
Introduction to scoop and its functionsRupak Roy
 
Introduction to Flume
Introduction to FlumeIntroduction to Flume
Introduction to FlumeRupak Roy
 
Apache Pig Relational Operators - II
Apache Pig Relational Operators - II Apache Pig Relational Operators - II
Apache Pig Relational Operators - II Rupak Roy
 
Passing Parameters using File and Command Line
Passing Parameters using File and Command LinePassing Parameters using File and Command Line
Passing Parameters using File and Command LineRupak Roy
 
Apache PIG Relational Operations
Apache PIG Relational Operations Apache PIG Relational Operations
Apache PIG Relational Operations Rupak Roy
 
Apache PIG casting, reference
Apache PIG casting, referenceApache PIG casting, reference
Apache PIG casting, referenceRupak Roy
 

More from Rupak Roy (20)

Clustering K means and Hierarchical - NLP
Clustering K means and Hierarchical - NLPClustering K means and Hierarchical - NLP
Clustering K means and Hierarchical - NLP
 
Network Analysis - NLP
Network Analysis  - NLPNetwork Analysis  - NLP
Network Analysis - NLP
 
Topic Modeling - NLP
Topic Modeling - NLPTopic Modeling - NLP
Topic Modeling - NLP
 
Sentiment Analysis Practical Steps
Sentiment Analysis Practical StepsSentiment Analysis Practical Steps
Sentiment Analysis Practical Steps
 
NLP - Sentiment Analysis
NLP - Sentiment AnalysisNLP - Sentiment Analysis
NLP - Sentiment Analysis
 
Text Mining using Regular Expressions
Text Mining using Regular ExpressionsText Mining using Regular Expressions
Text Mining using Regular Expressions
 
Introduction to Text Mining
Introduction to Text Mining Introduction to Text Mining
Introduction to Text Mining
 
Apache Hbase Architecture
Apache Hbase ArchitectureApache Hbase Architecture
Apache Hbase Architecture
 
Introduction to Hbase
Introduction to Hbase Introduction to Hbase
Introduction to Hbase
 
Apache Hive Table Partition and HQL
Apache Hive Table Partition and HQLApache Hive Table Partition and HQL
Apache Hive Table Partition and HQL
 
Installing Apache Hive, internal and external table, import-export
Installing Apache Hive, internal and external table, import-export Installing Apache Hive, internal and external table, import-export
Installing Apache Hive, internal and external table, import-export
 
Introductive to Hive
Introductive to Hive Introductive to Hive
Introductive to Hive
 
Scoop Job, import and export to RDBMS
Scoop Job, import and export to RDBMSScoop Job, import and export to RDBMS
Scoop Job, import and export to RDBMS
 
Apache Scoop - Import with Append mode and Last Modified mode
Apache Scoop - Import with Append mode and Last Modified mode Apache Scoop - Import with Append mode and Last Modified mode
Apache Scoop - Import with Append mode and Last Modified mode
 
Introduction to scoop and its functions
Introduction to scoop and its functionsIntroduction to scoop and its functions
Introduction to scoop and its functions
 
Introduction to Flume
Introduction to FlumeIntroduction to Flume
Introduction to Flume
 
Apache Pig Relational Operators - II
Apache Pig Relational Operators - II Apache Pig Relational Operators - II
Apache Pig Relational Operators - II
 
Passing Parameters using File and Command Line
Passing Parameters using File and Command LinePassing Parameters using File and Command Line
Passing Parameters using File and Command Line
 
Apache PIG Relational Operations
Apache PIG Relational Operations Apache PIG Relational Operations
Apache PIG Relational Operations
 
Apache PIG casting, reference
Apache PIG casting, referenceApache PIG casting, reference
Apache PIG casting, reference
 

Recently uploaded

VIP Call Girls Service Miyapur Hyderabad Call +91-8250192130
VIP Call Girls Service Miyapur Hyderabad Call +91-8250192130VIP Call Girls Service Miyapur Hyderabad Call +91-8250192130
VIP Call Girls Service Miyapur Hyderabad Call +91-8250192130Suhani Kapoor
 
{Pooja: 9892124323 } Call Girl in Mumbai | Jas Kaur Rate 4500 Free Hotel Del...
{Pooja:  9892124323 } Call Girl in Mumbai | Jas Kaur Rate 4500 Free Hotel Del...{Pooja:  9892124323 } Call Girl in Mumbai | Jas Kaur Rate 4500 Free Hotel Del...
{Pooja: 9892124323 } Call Girl in Mumbai | Jas Kaur Rate 4500 Free Hotel Del...Pooja Nehwal
 
EMERCE - 2024 - AMSTERDAM - CROSS-PLATFORM TRACKING WITH GOOGLE ANALYTICS.pptx
EMERCE - 2024 - AMSTERDAM - CROSS-PLATFORM  TRACKING WITH GOOGLE ANALYTICS.pptxEMERCE - 2024 - AMSTERDAM - CROSS-PLATFORM  TRACKING WITH GOOGLE ANALYTICS.pptx
EMERCE - 2024 - AMSTERDAM - CROSS-PLATFORM TRACKING WITH GOOGLE ANALYTICS.pptxthyngster
 
(PARI) Call Girls Wanowrie ( 7001035870 ) HI-Fi Pune Escorts Service
(PARI) Call Girls Wanowrie ( 7001035870 ) HI-Fi Pune Escorts Service(PARI) Call Girls Wanowrie ( 7001035870 ) HI-Fi Pune Escorts Service
(PARI) Call Girls Wanowrie ( 7001035870 ) HI-Fi Pune Escorts Serviceranjana rawat
 
Call Girls in Defence Colony Delhi 💯Call Us 🔝8264348440🔝
Call Girls in Defence Colony Delhi 💯Call Us 🔝8264348440🔝Call Girls in Defence Colony Delhi 💯Call Us 🔝8264348440🔝
Call Girls in Defence Colony Delhi 💯Call Us 🔝8264348440🔝soniya singh
 
Call Us ➥97111√47426🤳Call Girls in Aerocity (Delhi NCR)
Call Us ➥97111√47426🤳Call Girls in Aerocity (Delhi NCR)Call Us ➥97111√47426🤳Call Girls in Aerocity (Delhi NCR)
Call Us ➥97111√47426🤳Call Girls in Aerocity (Delhi NCR)jennyeacort
 
GA4 Without Cookies [Measure Camp AMS]
GA4 Without Cookies [Measure Camp AMS]GA4 Without Cookies [Measure Camp AMS]
GA4 Without Cookies [Measure Camp AMS]📊 Markus Baersch
 
1:1定制(UQ毕业证)昆士兰大学毕业证成绩单修改留信学历认证原版一模一样
1:1定制(UQ毕业证)昆士兰大学毕业证成绩单修改留信学历认证原版一模一样1:1定制(UQ毕业证)昆士兰大学毕业证成绩单修改留信学历认证原版一模一样
1:1定制(UQ毕业证)昆士兰大学毕业证成绩单修改留信学历认证原版一模一样vhwb25kk
 
Beautiful Sapna Vip Call Girls Hauz Khas 9711199012 Call /Whatsapps
Beautiful Sapna Vip  Call Girls Hauz Khas 9711199012 Call /WhatsappsBeautiful Sapna Vip  Call Girls Hauz Khas 9711199012 Call /Whatsapps
Beautiful Sapna Vip Call Girls Hauz Khas 9711199012 Call /Whatsappssapnasaifi408
 
PKS-TGC-1084-630 - Stage 1 Proposal.pptx
PKS-TGC-1084-630 - Stage 1 Proposal.pptxPKS-TGC-1084-630 - Stage 1 Proposal.pptx
PKS-TGC-1084-630 - Stage 1 Proposal.pptxPramod Kumar Srivastava
 
Industrialised data - the key to AI success.pdf
Industrialised data - the key to AI success.pdfIndustrialised data - the key to AI success.pdf
Industrialised data - the key to AI success.pdfLars Albertsson
 
dokumen.tips_chapter-4-transient-heat-conduction-mehmet-kanoglu.ppt
dokumen.tips_chapter-4-transient-heat-conduction-mehmet-kanoglu.pptdokumen.tips_chapter-4-transient-heat-conduction-mehmet-kanoglu.ppt
dokumen.tips_chapter-4-transient-heat-conduction-mehmet-kanoglu.pptSonatrach
 
Schema on read is obsolete. Welcome metaprogramming..pdf
Schema on read is obsolete. Welcome metaprogramming..pdfSchema on read is obsolete. Welcome metaprogramming..pdf
Schema on read is obsolete. Welcome metaprogramming..pdfLars Albertsson
 
Consent & Privacy Signals on Google *Pixels* - MeasureCamp Amsterdam 2024
Consent & Privacy Signals on Google *Pixels* - MeasureCamp Amsterdam 2024Consent & Privacy Signals on Google *Pixels* - MeasureCamp Amsterdam 2024
Consent & Privacy Signals on Google *Pixels* - MeasureCamp Amsterdam 2024thyngster
 
Saket, (-DELHI )+91-9654467111-(=)CHEAP Call Girls in Escorts Service Saket C...
Saket, (-DELHI )+91-9654467111-(=)CHEAP Call Girls in Escorts Service Saket C...Saket, (-DELHI )+91-9654467111-(=)CHEAP Call Girls in Escorts Service Saket C...
Saket, (-DELHI )+91-9654467111-(=)CHEAP Call Girls in Escorts Service Saket C...Sapana Sha
 
Amazon TQM (2) Amazon TQM (2)Amazon TQM (2).pptx
Amazon TQM (2) Amazon TQM (2)Amazon TQM (2).pptxAmazon TQM (2) Amazon TQM (2)Amazon TQM (2).pptx
Amazon TQM (2) Amazon TQM (2)Amazon TQM (2).pptxAbdelrhman abooda
 
INTERNSHIP ON PURBASHA COMPOSITE TEX LTD
INTERNSHIP ON PURBASHA COMPOSITE TEX LTDINTERNSHIP ON PURBASHA COMPOSITE TEX LTD
INTERNSHIP ON PURBASHA COMPOSITE TEX LTDRafezzaman
 
RadioAdProWritingCinderellabyButleri.pdf
RadioAdProWritingCinderellabyButleri.pdfRadioAdProWritingCinderellabyButleri.pdf
RadioAdProWritingCinderellabyButleri.pdfgstagge
 

Recently uploaded (20)

VIP Call Girls Service Miyapur Hyderabad Call +91-8250192130
VIP Call Girls Service Miyapur Hyderabad Call +91-8250192130VIP Call Girls Service Miyapur Hyderabad Call +91-8250192130
VIP Call Girls Service Miyapur Hyderabad Call +91-8250192130
 
{Pooja: 9892124323 } Call Girl in Mumbai | Jas Kaur Rate 4500 Free Hotel Del...
{Pooja:  9892124323 } Call Girl in Mumbai | Jas Kaur Rate 4500 Free Hotel Del...{Pooja:  9892124323 } Call Girl in Mumbai | Jas Kaur Rate 4500 Free Hotel Del...
{Pooja: 9892124323 } Call Girl in Mumbai | Jas Kaur Rate 4500 Free Hotel Del...
 
EMERCE - 2024 - AMSTERDAM - CROSS-PLATFORM TRACKING WITH GOOGLE ANALYTICS.pptx
EMERCE - 2024 - AMSTERDAM - CROSS-PLATFORM  TRACKING WITH GOOGLE ANALYTICS.pptxEMERCE - 2024 - AMSTERDAM - CROSS-PLATFORM  TRACKING WITH GOOGLE ANALYTICS.pptx
EMERCE - 2024 - AMSTERDAM - CROSS-PLATFORM TRACKING WITH GOOGLE ANALYTICS.pptx
 
(PARI) Call Girls Wanowrie ( 7001035870 ) HI-Fi Pune Escorts Service
(PARI) Call Girls Wanowrie ( 7001035870 ) HI-Fi Pune Escorts Service(PARI) Call Girls Wanowrie ( 7001035870 ) HI-Fi Pune Escorts Service
(PARI) Call Girls Wanowrie ( 7001035870 ) HI-Fi Pune Escorts Service
 
Call Girls in Defence Colony Delhi 💯Call Us 🔝8264348440🔝
Call Girls in Defence Colony Delhi 💯Call Us 🔝8264348440🔝Call Girls in Defence Colony Delhi 💯Call Us 🔝8264348440🔝
Call Girls in Defence Colony Delhi 💯Call Us 🔝8264348440🔝
 
Call Us ➥97111√47426🤳Call Girls in Aerocity (Delhi NCR)
Call Us ➥97111√47426🤳Call Girls in Aerocity (Delhi NCR)Call Us ➥97111√47426🤳Call Girls in Aerocity (Delhi NCR)
Call Us ➥97111√47426🤳Call Girls in Aerocity (Delhi NCR)
 
GA4 Without Cookies [Measure Camp AMS]
GA4 Without Cookies [Measure Camp AMS]GA4 Without Cookies [Measure Camp AMS]
GA4 Without Cookies [Measure Camp AMS]
 
1:1定制(UQ毕业证)昆士兰大学毕业证成绩单修改留信学历认证原版一模一样
1:1定制(UQ毕业证)昆士兰大学毕业证成绩单修改留信学历认证原版一模一样1:1定制(UQ毕业证)昆士兰大学毕业证成绩单修改留信学历认证原版一模一样
1:1定制(UQ毕业证)昆士兰大学毕业证成绩单修改留信学历认证原版一模一样
 
Beautiful Sapna Vip Call Girls Hauz Khas 9711199012 Call /Whatsapps
Beautiful Sapna Vip  Call Girls Hauz Khas 9711199012 Call /WhatsappsBeautiful Sapna Vip  Call Girls Hauz Khas 9711199012 Call /Whatsapps
Beautiful Sapna Vip Call Girls Hauz Khas 9711199012 Call /Whatsapps
 
Deep Generative Learning for All - The Gen AI Hype (Spring 2024)
Deep Generative Learning for All - The Gen AI Hype (Spring 2024)Deep Generative Learning for All - The Gen AI Hype (Spring 2024)
Deep Generative Learning for All - The Gen AI Hype (Spring 2024)
 
PKS-TGC-1084-630 - Stage 1 Proposal.pptx
PKS-TGC-1084-630 - Stage 1 Proposal.pptxPKS-TGC-1084-630 - Stage 1 Proposal.pptx
PKS-TGC-1084-630 - Stage 1 Proposal.pptx
 
Decoding Loan Approval: Predictive Modeling in Action
Decoding Loan Approval: Predictive Modeling in ActionDecoding Loan Approval: Predictive Modeling in Action
Decoding Loan Approval: Predictive Modeling in Action
 
Industrialised data - the key to AI success.pdf
Industrialised data - the key to AI success.pdfIndustrialised data - the key to AI success.pdf
Industrialised data - the key to AI success.pdf
 
dokumen.tips_chapter-4-transient-heat-conduction-mehmet-kanoglu.ppt
dokumen.tips_chapter-4-transient-heat-conduction-mehmet-kanoglu.pptdokumen.tips_chapter-4-transient-heat-conduction-mehmet-kanoglu.ppt
dokumen.tips_chapter-4-transient-heat-conduction-mehmet-kanoglu.ppt
 
Schema on read is obsolete. Welcome metaprogramming..pdf
Schema on read is obsolete. Welcome metaprogramming..pdfSchema on read is obsolete. Welcome metaprogramming..pdf
Schema on read is obsolete. Welcome metaprogramming..pdf
 
Consent & Privacy Signals on Google *Pixels* - MeasureCamp Amsterdam 2024
Consent & Privacy Signals on Google *Pixels* - MeasureCamp Amsterdam 2024Consent & Privacy Signals on Google *Pixels* - MeasureCamp Amsterdam 2024
Consent & Privacy Signals on Google *Pixels* - MeasureCamp Amsterdam 2024
 
Saket, (-DELHI )+91-9654467111-(=)CHEAP Call Girls in Escorts Service Saket C...
Saket, (-DELHI )+91-9654467111-(=)CHEAP Call Girls in Escorts Service Saket C...Saket, (-DELHI )+91-9654467111-(=)CHEAP Call Girls in Escorts Service Saket C...
Saket, (-DELHI )+91-9654467111-(=)CHEAP Call Girls in Escorts Service Saket C...
 
Amazon TQM (2) Amazon TQM (2)Amazon TQM (2).pptx
Amazon TQM (2) Amazon TQM (2)Amazon TQM (2).pptxAmazon TQM (2) Amazon TQM (2)Amazon TQM (2).pptx
Amazon TQM (2) Amazon TQM (2)Amazon TQM (2).pptx
 
INTERNSHIP ON PURBASHA COMPOSITE TEX LTD
INTERNSHIP ON PURBASHA COMPOSITE TEX LTDINTERNSHIP ON PURBASHA COMPOSITE TEX LTD
INTERNSHIP ON PURBASHA COMPOSITE TEX LTD
 
RadioAdProWritingCinderellabyButleri.pdf
RadioAdProWritingCinderellabyButleri.pdfRadioAdProWritingCinderellabyButleri.pdf
RadioAdProWritingCinderellabyButleri.pdf
 

Hierarchical Clustering - Text Mining/NLP

  • 2. Clustering: Hierarchical Clustering #read the data reviews<-read.csv(file.choose(),stringsAsFactors=FALSE,nrows = 500) View(reviews) names(reviews) reviews1<-data.frame(reviews$reviews.text) names(reviews1) dim(reviews1) names(reviews1)[1]<-"reviews" #to remove emojis #reviews1 <- iconv(reviews1, 'UTF-8', 'ASCII') Rupak Roy
  • 3. Clustering: Hierarchical Clustering #Build a Text Corpus library(tm) review.corpus<-Corpus(VectorSource(reviews1$reviews)) summary(review.corpus) inspect(review.corpus[1:5]) #Inspecting elements in Corpus #it will replace non-convertible bytes in the Corpus with strings showing their hex codes #Especially the emojis which throws error like invalid input in 'utf8towcs'. review.corpus<-tm_map(review.corpus, function(x) iconv(enc2utf8(x), sub = "byte")) #or #review.corpus <- tm_map(review.corpus, PlainTextDocument) #or define in the stop words my_stopwords<-c(stopwords('english'),'@','http*','url','www*','itã¢â‚¬å¡ãƒâ€žãƒâ´s') #Data Transformations -Cleaning #Converting to lower case review.corpus<-tm_map(review.corpus,tolower) #Removing extra white space review.corpus<-tm_map(review.corpus,stripWhitespace) #Removing punctuations review.corpus<-tm_map(review.corpus,removePunctuation) #Removing numbers review.corpus<-tm_map(review.corpus,removeNumbers) #Can add more words apart from standard list my_stopwords<-c(stopwords('english'),'@','http*','url','www*','itã¢â‚¬å¡ãƒâ€žãƒâ´s') review.corpus<-tm_map(review.corpus,removeWords,my_stopwords) Rupak Roy
  • 4. Clustering: Hierarchical Clustering #Build term document matrix review.tdm<-TermDocumentMatrix(review.corpus) review.tdm dim(review.tdm) #Dimensions of term document matrix inspect(review.tdm[1:10,1:10]) #Inspecting the term document matrix #Removing sparse terms(Words that occur infrequenctly) #here 97% refers remove at least 97% of sparse review.imp<-removeSparseTerms(review.tdm,0.97) review.imp inspect(review.imp[1:10,1:10]) review.matrix<-as.matrix(review.imp) #-----------Hclust----------------------------------- #Measure the distance between the words/terms(as we know in clustering we need the distance between the data points to group) distmatrix<-dist(scale(review.matrix),method="euclidean") #Apply hierarchcal clustering review.h<-hclust(distmatrix,method="ward.D2") Rupak Roy
  • 5. Clustering: Hierarchical Clustering #plot dendograph which represents the hierarchical structure of clusters plot(review.h,cex=0.1,hang=-1,main="Cluster Dendogram Plot") rect.hclust(review.h,5) library(ggdendro) ggdendrogram(review.h, rotate = TRUE, size = 3,hang=-1,cex=0.6,theme_dendro = FALSE) #where hang=-1 to put the labels at the same height # load code of A2R function source("E:/data2dim/Text Mining/datasets/Clustering/A2RplotCode.R") # colored dendrogram op = par(bg = "#EFEFEF") A2Rplot(review.h, k = 5, hang = -1,cex=0.5,boxes = FALSE, col.up = "grey50", col.down = c("green","blue", "black","red","yellow","orange","brown")) #Triangle plot p<-as.dendrogram(review.h) plot(p, type = "triangle", ylab = "Height") rect.hclust(review.h,5) # Zoom to the first dendrogram plot(p, xlim = c(88, 92), ylim = c(1,74)) #ylim = the height #xlim= is the position values of the labels we can get the values in review.h$labels review.h$labels Rupak Roy
  • 6. Clustering: Hierarchical Clustering # Change edge color nodePar <- list(lab.cex = 0.6, pch = c(NA, 19), cex = 0.7, col = "blue") plot(p, xlab = "Height", nodePar = nodePar, edgePar = list(col = 4:3, lwd = 2:1)) #nodePar: a list of plotting parameters to use for the nodes (see ?points). Default value is NULL. The list may contain components named pch, cex, col, xpd, and/or bg each of which can have length two for specifying separate attributes for inner nodes and leaves. #edgePar: a list of plotting parameters to use for the edge segments (see ?segments). The list may contain components named col, lty and lwd (for the segments). As with nodePar, each can have length two for differentiating leaves and inner nodes. #leaflab: a string specifying how leaves are labeled. The default "perpendicular" write text vertically; "textlike" writes text horizontally (in a rectangle), and "none" suppresses leaf labels. Rupak Roy
  • 7. Clustering: Hierarchical Clustering #Phylogenetic plots can be used to produce a more sophisticated dendrogram. # install.packages("ape") library("ape") # Default plot plot(as.phylo(review.h)) #Same code mentioned over the clustering chapter of machine learning course #it will show error as the label values are stored in factor #crimeHclust1<-crimeHclust #str(cimeHClust1) #crimeHclust1$labels<-as.character(crimeHclust1$labels) plot(as.phylo(review.h), cex = 0.6, label.offset = 0.5) # Cladogram plot(as.phylo(review.h), type = "cladogram", cex = 0.6, label.offset = 0.5) Rupak Roy
  • 8. Clustering: Hierarchical Clustering # Unrooted plot(as.phylo(review.h), type = "unrooted", cex = 0.6, no.margin = TRUE) # Fan plot(as.phylo(review.h), type = "fan") # Radial plot(as.phylo(review.h), type = "radial") # Group the Fan type into 5 clusters colors = c("red", "blue", "green", "black") c = cutree(review.h, 5) plot(as.phylo(review.h), type = "fan", tip.color = colors[c], label.offset = 1, cex = 0.7) Rupak Roy