SlideShare a Scribd company logo
1 of 22
Download to read offline
Basic NLP with Python
and NLTK
Bruni Francesco (@brunifrancesco)
Download the original iPython notebook @
https://github.com/brunifrancesco/nltk_base.git
Python
- Programming language
- Multi-paradigm
- Easy to learn
- Suitable for multiple needs
- Multiple implementations, a ton of useful libraries
Basic Python
import random

# Scalars: an integer and a string.
a_number = 1
a_string = "Python rocks!"

# Lists are mutable and may hold mixed types: three strings, then an int.
a_list = ["1", "2", "3"]
a_list.append(4)

# Tuples are fixed-size sequences.
a_tuple = ("Goodfellas", "Kill Bill")

# A dictionary maps keys to values; values() exposes only the values.
a_dict = dict(film="Pulp fiction", francesco="Python")
print(a_dict.values())

# Dictionary values can themselves be containers.
a_dict_of_list = {
    "key": ["Carlito's way", "The godfather"],
    "francesco": 1,
}
print(len(a_dict_of_list["key"]))
Creating functions
def super_function(number):
    """Return *number* multiplied by two.

    :param number: any value supporting ``* 2``
    :return: ``number * 2``
    """
    return number * 2
def factorial(n):
    """Return ``n!`` computed recursively.

    :param n: a non-negative integer
    :return: the product ``n * (n - 1) * ... * 1``; ``1`` when ``n == 0``
    :raises ValueError: if ``n`` is negative (the recursion would otherwise
        never reach the ``n == 0`` base case)
    """
    if n < 0:
        raise ValueError("factorial() is not defined for negative values")
    if n == 0:
        return 1
    return n * factorial(n - 1)
# Lambdas are single-expression anonymous functions; binding one to a name
# makes it callable like any other function.
double = lambda item: item * 2
predicate = lambda item: item > 3

# Each of the three callables maps the input 3 to 6.
assert all(func(3) == 6 for func in (super_function, factorial, double))

# filter() keeps only the items for which the predicate is true.
assert [5] == list(filter(predicate, [1, 2, 5, 3]))
And much more
- Object oriented paradigm --> classes, metaclasses etc. etc.
- Functional programming paradigm --> partials, closures, higher-order
functions etc. etc.
- Scripting paradigm --> shell control, os related functions etc..
- Async ops support --> asyncio
Reading files
# Read the whole file into memory; the context manager closes it afterwards.
# The handle is not called `input` so the builtin input() stays usable.
with open("file", "r") as source_file:
    data = source_file.read()
import csv
def read_csv():
    """Print the second field of every row of ``data.csv``.

    The file is read from the current working directory and parsed with
    ``;`` as the field delimiter.

    :raises FileNotFoundError: if ``data.csv`` does not exist
    :raises IndexError: if a row has fewer than two fields
    """
    # newline='' lets the csv module do its own newline handling, as the
    # csv documentation requires for files passed to csv.reader.
    with open('data.csv', 'r', newline='') as csv_file:
        for row in csv.reader(csv_file, delimiter=';'):
            print(row[1])
# Run the demo: prints the second field of each row found in data.csv.
read_csv()
Make data talk
from collections import Counter
import statistics
# Split the previously loaded text on whitespace.
splitted_chunks = data.split()

# Size of the raw text, of its token list and of its vocabulary.
print("Data lenght: %s" % len(data))
print("Chunks numbers: %s" % len(splitted_chunks))
print("Unique chunks: %s" % len(set(splitted_chunks)))

# Mean and population standard deviation of the token lengths.
print("Avg lenght of chunks: %s"
      % statistics.mean(len(chunk) for chunk in splitted_chunks))
print("Std dev lenght of chunks: %s"
      % statistics.pstdev(len(chunk) for chunk in splitted_chunks))

# Tokens occurring more than 5 times, ordered by ascending count.
print("Frequency distribution: %s" % sorted(
    [pair for pair in Counter(splitted_chunks).items() if pair[1] > 5],
    key=lambda pair: pair[1],
))
NLTK
- tokenization
- stemming
- tagging
- parsing
- semantic reasoning
- classification
Tokenizing
from nltk import word_tokenize
from nltk.tokenize import TweetTokenizer

# General-purpose word tokenization of the previously loaded text.
tokens = word_tokenize(data)

# Tweet-oriented tokenization of a sample message; strip_handles=True is
# passed so the tokenizer handles the leading "@remy" mention itself.
s1 = '@remy: This is waaaaayyyy too much for you!!!!!!'
tokenizer = TweetTokenizer(strip_handles=True)
tw_tokens = tokenizer.tokenize(s1)
print(tw_tokens)
Frequency distribution
# FreqDist is imported from the top-level package: going through nltk.book
# also executes that module's import-time loading of the book's sample
# texts, which this snippet does not need.
from nltk import FreqDist

# Count how often each whitespace-separated token occurs.
fdist1 = FreqDist(splitted_chunks)
most_common = fdist1.most_common(50)

# Plot the 50 most frequent tokens cumulatively, then the top 10 as-is.
fdist1.plot(50, cumulative=True)
fdist1.plot(10)

# max() is the most frequent token; indexing gives a raw count and freq()
# the corresponding relative frequency.
print("Max frequency key: %s" %fdist1.max())
print("Occurrencies of 'Parlamento': %s" %fdist1["Parlamento"])
print("Frequency of 'Parlamento': %s"%fdist1.freq('Parlamento'))
Cleaning data
import string

# FreqDist comes from the top-level package; importing it through nltk.book
# would also run that module's import-time loading of sample texts.
from nltk import FreqDist
from nltk.corpus import stopwords


def remove_stopword(word):
    """Return True when *word* should be kept, i.e. it is not in ``words``."""
    return word not in words


# Italian stopword list consulted by remove_stopword().
words = stopwords.words('italian')

lowered_chunks = [item.lower() for item in splitted_chunks]
print("Chunks lenght %s" %len(lowered_chunks))

# Filter the lower-cased tokens rather than the raw ones, so a capitalised
# token is compared in the same case as the stopword entries (assumed to be
# lower-case -- confirm against the corpus). The original computed
# lowered_chunks but then filtered splitted_chunks, leaving it unused.
clean_chunks = list(filter(remove_stopword, lowered_chunks))
print("Cleaned chunks (without stopwords) lenght: %s" %len(clean_chunks))

# Drop tokens that are a bare punctuation mark.
clean_chunks = [chunk for chunk in clean_chunks
                if chunk not in string.punctuation]
print("Cleaned chunks (without punctuation and stopwords) lenght: %s"
      %len(clean_chunks))

# Frequency distribution of the cleaned tokens.
fdist1 = FreqDist(clean_chunks)
most_common = fdist1.most_common(50)
Stemming
# Import only the stemmer classes used below instead of wildcard imports.
from nltk.stem.lancaster import LancasterStemmer
from nltk.stem.porter import PorterStemmer
from nltk.stem.snowball import SnowballStemmer

# Porter stemmer. The original line opened the string with a typographic
# quote, which is a syntax error; it is a plain ASCII quote here.
stemmer = PorterStemmer()
stemmer.stem("activities")

# Snowball stemmer: list the supported languages, then stem a Spanish word.
available_langs = SnowballStemmer.languages
sn_stemmer = SnowballStemmer("spanish", ignore_stopwords=True)
print(sn_stemmer.stem("ordenador"))

# Lancaster stemmer applied to the same English word, for comparison.
LancasterStemmer().stem("activities")
Custom ngrams finder
def find_and_analyze_ngrams(self, tagged_sent):
    """Yield the stemmed n-gram candidates found in a tagged sentence.

    NOTE(review): the body reads ``self.__leaves``, ``self.__stemmer`` and
    ``self.__evaluate_polarity_ngram``, so this is a method excerpt; the
    ``self`` parameter was missing from the signature and has been added.
    Because of name mangling it must live inside the class that defines
    those private members -- confirm against the original class.

    :param tagged_sent: a sentence as ``(word, pos)`` pairs
    :return: a generator of space-joined, stemmed, lower-cased n-grams that
        pass ``self.__evaluate_polarity_ngram``
    """
    chunker = RegexpParser(CHUNK_RULE)
    tree = chunker.parse(tagged_sent)
    for item in self.__leaves(tree):
        # A chunk equal to the whole sentence is not a useful n-gram.
        if item == tagged_sent:
            continue
        probable_ngram = ' '.join(
            self.__stemmer.stem(word.lower()) for (word, pos) in item
        )
        if self.__evaluate_polarity_ngram(probable_ngram):
            yield probable_ngram
Classifying data
def __get_elements_for_classification(self, lfeats, train_number, classifying=True):
    """Build labelled feature sets and train a Naive Bayes classifier.

    :param lfeats: mapping of ``label -> sequence of features``
    :param train_number: number of tenths of each label's features that go
        to the training set; only used when ``classifying`` is false
    :param classifying: when true, every feature is used for training and
        the test set stays empty; when false, each label's features are
        split into a training head and a test tail
    :return: ``(train_feats, test_feats, nb_classifier)`` where the first
        two are lists of ``(feature, label)`` pairs
    """
    train_feats = []
    test_feats = []
    # items() instead of the Python 2-only iteritems().
    for label, feats in lfeats.items():
        if classifying:
            train_feats.extend([(feat, label) for feat in feats])
        else:
            # Floor division keeps the cutoff an int, so it remains a valid
            # slice index on Python 3 (where ``/`` yields a float).
            cutoff = train_number * len(feats) // 10
            train_feats.extend([(feat, label) for feat in feats[:cutoff]])
            test_feats.extend([(feat, label) for feat in feats[cutoff:]])
    nb_classifier = NaiveBayesClassifier.train(train_feats)
    return train_feats, test_feats, nb_classifier
Pointwise Mutual
Information
$$\mathrm{PMI}(X = x, Y = y) = \log \frac{p(X = x, Y = y)}{p(X = x)\, p(Y = y)}$$
Measure PMI
- Read from csv
- Preprocess data (tokenize, lower, remove stopwords, punctuation)
- Find frequency distribution for unigrams
- Find frequency distribution for bigrams
- Compute PMI via implemented function
- Let NLTK sort bigrams by PMI metric
- Write result to CSV file
Read data
import nltk
from nltk.corpus import stopwords
import string
import random
from itertools import chain
import math
import csv
import time
def read_data():
    """Read ``data.csv`` line by line.

    The file is read from the current working directory and parsed with
    ``,`` as the field delimiter. Rows are produced lazily, so the file
    stays open until the generator is exhausted or closed.

    :return: a generator yielding each row as a list of strings
    :raises FileNotFoundError: if ``data.csv`` does not exist
    """
    # newline='' lets the csv module do its own newline handling, as the
    # csv documentation requires for files passed to csv.reader.
    with open('data.csv', 'r', newline='') as csvfile:
        reader = csv.reader(csvfile, delimiter=',')
        for row in reader:
            yield row
Preprocess
def preprocess(data):
    """Tokenize *data* and drop punctuation tokens and Italian stopwords.

    The text is split on whitespace and every token is lower-cased. Tokens
    made up only of punctuation characters are discarded (this includes
    multi-character runs such as ``...``, which the previous substring
    check let through), and so are tokens found in the Italian stopword
    list.

    :param data: the string data to be processed
    :return: the list of remaining lower-cased tokens, in input order
    """
    # Sets give constant-time membership tests inside the loop.
    italian_stopwords = set(stopwords.words('italian'))
    punctuation = set(string.punctuation)

    kept_chunks = []
    for raw_chunk in data.split():
        chunk = raw_chunk.lower()
        if all(char in punctuation for char in chunk):
            continue
        if chunk in italian_stopwords:
            continue
        kept_chunks.append(chunk)
    return kept_chunks
Find N-Grams
# Minimum number of occurrences an n-gram needs in order to be kept.
FREQUENCY_TRESHOLD = 2


def find_bigrams(splitted_chunks):
    """Find bigrams and filter them by frequency threshold.

    :param splitted_chunks: a list of chunks
    :return: a dict mapping each kept ``(word1, word2)`` bigram to its count
    """
    bigrams = nltk.collocations.BigramCollocationFinder.from_words(splitted_chunks)
    # Discard the bigrams seen fewer than FREQUENCY_TRESHOLD times.
    bigrams.apply_freq_filter(FREQUENCY_TRESHOLD)
    # Copy the finder's frequency distribution into a plain dict.
    return dict(bigrams.ngram_fd)
def find_unigrams(splitted_chunks):
    """Find unigrams and filter them by frequency threshold.

    :param splitted_chunks: a list of chunks
    :return: a dict mapping each chunk occurring at least
        ``FREQUENCY_TRESHOLD`` times to its count
    """
    unigrams = nltk.FreqDist(splitted_chunks)
    return {unigram: freq for unigram, freq in unigrams.items()
            if freq >= FREQUENCY_TRESHOLD}
Compute PMI
def pmi(word1, word2, unigram_freq, bigram_freq):
    """Compute the pointwise mutual information of a bigram, in bits.

    ``PMI = log2( p(word1, word2) / (p(word1) * p(word2)) )`` where the
    probabilities are relative frequencies taken from the two containers.

    :param word1: the first word
    :param word2: the second word
    :param unigram_freq: the unigram frequency container (word -> count)
    :param bigram_freq: the bigram frequency container
        (``(word1, word2)`` -> count)
    :return: the PMI rounded to two decimals
    :raises KeyError: if a word or the bigram is missing from its container
    :raises ZeroDivisionError: if a container is empty or a unigram count is 0
    :raises ValueError: if the bigram count is 0 (logarithm of zero)
    """
    # Totals are invariant, so compute each one only once.
    unigram_total = sum(unigram_freq.values())
    bigram_total = sum(bigram_freq.values())

    prob_word1 = unigram_freq[word1] / unigram_total
    prob_word2 = unigram_freq[word2] / unigram_total
    prob_word1_word2 = bigram_freq[(word1, word2)] / bigram_total

    # The denominator must be parenthesised: ``a / b * c`` evaluates as
    # ``(a / b) * c``, which is not the PMI ratio.
    ratio = prob_word1_word2 / (prob_word1 * prob_word2)
    return round(math.log2(ratio), 2)
Write result to CSV
def write_data(result):
    """Append rows to ``result.csv`` using ``*`` as the field delimiter.

    The file is opened in append mode in the current working directory, so
    repeated calls accumulate rows instead of overwriting them.

    :param result: the list to be written to csv file, one sequence of
        fields per row
    """
    # newline='' stops the text layer from translating the writer's own
    # line terminators, as the csv documentation requires for csv.writer.
    with open("result.csv", "a", newline="") as output:
        writer = csv.writer(output, delimiter='*')
        writer.writerows(result)
Happy coding :)

More Related Content

What's hot

Kernel Recipes 2019 - GNU poke, an extensible editor for structured binary data
Kernel Recipes 2019 - GNU poke, an extensible editor for structured binary dataKernel Recipes 2019 - GNU poke, an extensible editor for structured binary data
Kernel Recipes 2019 - GNU poke, an extensible editor for structured binary data
Anne Nicolas
 
Nltk:a tool for_nlp - py_con-dhaka-2014
Nltk:a tool for_nlp - py_con-dhaka-2014Nltk:a tool for_nlp - py_con-dhaka-2014
Nltk:a tool for_nlp - py_con-dhaka-2014
Fasihul Kabir
 
Airlover 20030324 1
Airlover 20030324 1Airlover 20030324 1
Airlover 20030324 1
Dr.Ravi
 
Using Unix
Using UnixUsing Unix
Using Unix
Dr.Ravi
 
Talk Unix Shell Script
Talk Unix Shell ScriptTalk Unix Shell Script
Talk Unix Shell Script
Dr.Ravi
 

What's hot (20)

Python in 90mins
Python in 90minsPython in 90mins
Python in 90mins
 
Python for Penetration testers
Python for Penetration testersPython for Penetration testers
Python for Penetration testers
 
Procesamiento del lenguaje natural con python
Procesamiento del lenguaje natural con pythonProcesamiento del lenguaje natural con python
Procesamiento del lenguaje natural con python
 
Good Code
Good CodeGood Code
Good Code
 
Kernel Recipes 2019 - GNU poke, an extensible editor for structured binary data
Kernel Recipes 2019 - GNU poke, an extensible editor for structured binary dataKernel Recipes 2019 - GNU poke, an extensible editor for structured binary data
Kernel Recipes 2019 - GNU poke, an extensible editor for structured binary data
 
The Ruby Guide to *nix Plumbing: on the quest for efficiency with Ruby [M|K]RI
The Ruby Guide to *nix Plumbing: on the quest for efficiency with Ruby [M|K]RIThe Ruby Guide to *nix Plumbing: on the quest for efficiency with Ruby [M|K]RI
The Ruby Guide to *nix Plumbing: on the quest for efficiency with Ruby [M|K]RI
 
Snakes for Camels
Snakes for CamelsSnakes for Camels
Snakes for Camels
 
Nltk:a tool for_nlp - py_con-dhaka-2014
Nltk:a tool for_nlp - py_con-dhaka-2014Nltk:a tool for_nlp - py_con-dhaka-2014
Nltk:a tool for_nlp - py_con-dhaka-2014
 
Whispered secrets
Whispered secretsWhispered secrets
Whispered secrets
 
Airlover 20030324 1
Airlover 20030324 1Airlover 20030324 1
Airlover 20030324 1
 
Introduction to source{d} Engine and source{d} Lookout
Introduction to source{d} Engine and source{d} Lookout Introduction to source{d} Engine and source{d} Lookout
Introduction to source{d} Engine and source{d} Lookout
 
Learn 90% of Python in 90 Minutes
Learn 90% of Python in 90 MinutesLearn 90% of Python in 90 Minutes
Learn 90% of Python in 90 Minutes
 
Introduction to advanced python
Introduction to advanced pythonIntroduction to advanced python
Introduction to advanced python
 
Introduction to Python
Introduction to PythonIntroduction to Python
Introduction to Python
 
Twitter Author Prediction from Tweets using Bayesian Network
Twitter Author Prediction from Tweets using Bayesian NetworkTwitter Author Prediction from Tweets using Bayesian Network
Twitter Author Prediction from Tweets using Bayesian Network
 
Implementing Software Machines in Go and C
Implementing Software Machines in Go and CImplementing Software Machines in Go and C
Implementing Software Machines in Go and C
 
Using Unix
Using UnixUsing Unix
Using Unix
 
Encrypt all transports
Encrypt all transportsEncrypt all transports
Encrypt all transports
 
The Ring programming language version 1.7 book - Part 43 of 196
The Ring programming language version 1.7 book - Part 43 of 196The Ring programming language version 1.7 book - Part 43 of 196
The Ring programming language version 1.7 book - Part 43 of 196
 
Talk Unix Shell Script
Talk Unix Shell ScriptTalk Unix Shell Script
Talk Unix Shell Script
 

Viewers also liked

Practical Natural Language Processing
Practical Natural Language ProcessingPractical Natural Language Processing
Practical Natural Language Processing
Jaganadh Gopinadhan
 
Sentiment analysis-by-nltk
Sentiment analysis-by-nltkSentiment analysis-by-nltk
Sentiment analysis-by-nltk
Wei-Ting Kuo
 
The Next Generation SharePoint: Powered by Text Analytics
The Next Generation SharePoint: Powered by Text AnalyticsThe Next Generation SharePoint: Powered by Text Analytics
The Next Generation SharePoint: Powered by Text Analytics
Alyona Medelyan
 

Viewers also liked (20)

NLTK - Natural Language Processing in Python
NLTK - Natural Language Processing in PythonNLTK - Natural Language Processing in Python
NLTK - Natural Language Processing in Python
 
Practical Natural Language Processing
Practical Natural Language ProcessingPractical Natural Language Processing
Practical Natural Language Processing
 
Continuous Integration/Deployment with Docker and Jenkins
Continuous Integration/Deployment with Docker and JenkinsContinuous Integration/Deployment with Docker and Jenkins
Continuous Integration/Deployment with Docker and Jenkins
 
Sentiment analysis-by-nltk
Sentiment analysis-by-nltkSentiment analysis-by-nltk
Sentiment analysis-by-nltk
 
Natural Language Processing with Python
Natural Language Processing with PythonNatural Language Processing with Python
Natural Language Processing with Python
 
Natural Language Processing
Natural Language ProcessingNatural Language Processing
Natural Language Processing
 
Open street map
Open street mapOpen street map
Open street map
 
ZOETWITT in the Press
ZOETWITT in the PressZOETWITT in the Press
ZOETWITT in the Press
 
Python & Stuff
Python & StuffPython & Stuff
Python & Stuff
 
Nd4 j slides.pptx
Nd4 j slides.pptxNd4 j slides.pptx
Nd4 j slides.pptx
 
Introduction to Functional Programming
Introduction to Functional ProgrammingIntroduction to Functional Programming
Introduction to Functional Programming
 
Corpus Bootstrapping with NLTK
Corpus Bootstrapping with NLTKCorpus Bootstrapping with NLTK
Corpus Bootstrapping with NLTK
 
Rethink programming: a functional approach
Rethink programming: a functional approachRethink programming: a functional approach
Rethink programming: a functional approach
 
Yahoo answers
Yahoo answersYahoo answers
Yahoo answers
 
Future of ai on the jvm
Future of ai on the jvmFuture of ai on the jvm
Future of ai on the jvm
 
The Next Generation SharePoint: Powered by Text Analytics
The Next Generation SharePoint: Powered by Text AnalyticsThe Next Generation SharePoint: Powered by Text Analytics
The Next Generation SharePoint: Powered by Text Analytics
 
Nltk natural language toolkit overview and application @ PyHug
Nltk  natural language toolkit overview and application @ PyHugNltk  natural language toolkit overview and application @ PyHug
Nltk natural language toolkit overview and application @ PyHug
 
KiwiPyCon 2014 - NLP with Python tutorial
KiwiPyCon 2014 - NLP with Python tutorialKiwiPyCon 2014 - NLP with Python tutorial
KiwiPyCon 2014 - NLP with Python tutorial
 
Overview of text mining and NLP (+software)
Overview of text mining and NLP (+software)Overview of text mining and NLP (+software)
Overview of text mining and NLP (+software)
 
Natural language processing (NLP) introduction
Natural language processing (NLP) introductionNatural language processing (NLP) introduction
Natural language processing (NLP) introduction
 

Similar to Basic NLP with Python and NLTK

仕事で使うF#
仕事で使うF#仕事で使うF#
仕事で使うF#
bleis tift
 
Profiling and optimization
Profiling and optimizationProfiling and optimization
Profiling and optimization
g3_nittala
 
Lecture 5 – Computing with Numbers (Math Lib).pptx
Lecture 5 – Computing with Numbers (Math Lib).pptxLecture 5 – Computing with Numbers (Math Lib).pptx
Lecture 5 – Computing with Numbers (Math Lib).pptx
jovannyflex
 
import os import matplotlib-pyplot as plt import pandas as pd import r.docx
import os import matplotlib-pyplot as plt import pandas as pd import r.docximport os import matplotlib-pyplot as plt import pandas as pd import r.docx
import os import matplotlib-pyplot as plt import pandas as pd import r.docx
Blake0FxCampbelld
 
PYTHONOr else the work is fine only. Lot to learn buddy.... Improve your basi...
PYTHONOr else the work is fine only. Lot to learn buddy.... Improve your basi...PYTHONOr else the work is fine only. Lot to learn buddy.... Improve your basi...
PYTHONOr else the work is fine only. Lot to learn buddy.... Improve your basi...
Yashpatel821746
 
Python Performance 101
Python Performance 101Python Performance 101
Python Performance 101
Ankur Gupta
 

Similar to Basic NLP with Python and NLTK (20)

GE8151 Problem Solving and Python Programming
GE8151 Problem Solving and Python ProgrammingGE8151 Problem Solving and Python Programming
GE8151 Problem Solving and Python Programming
 
仕事で使うF#
仕事で使うF#仕事で使うF#
仕事で使うF#
 
Python basic
Python basic Python basic
Python basic
 
Class 2: Welcome part 2
Class 2: Welcome part 2Class 2: Welcome part 2
Class 2: Welcome part 2
 
Are we ready to Go?
Are we ready to Go?Are we ready to Go?
Are we ready to Go?
 
The Ring programming language version 1.5.3 book - Part 25 of 184
The Ring programming language version 1.5.3 book - Part 25 of 184The Ring programming language version 1.5.3 book - Part 25 of 184
The Ring programming language version 1.5.3 book - Part 25 of 184
 
Music as data
Music as dataMusic as data
Music as data
 
Profiling and optimization
Profiling and optimizationProfiling and optimization
Profiling and optimization
 
Python crush course
Python crush coursePython crush course
Python crush course
 
Introduction to Python Programming | InsideAIML
Introduction to Python Programming | InsideAIMLIntroduction to Python Programming | InsideAIML
Introduction to Python Programming | InsideAIML
 
Lecture 5 – Computing with Numbers (Math Lib).pptx
Lecture 5 – Computing with Numbers (Math Lib).pptxLecture 5 – Computing with Numbers (Math Lib).pptx
Lecture 5 – Computing with Numbers (Math Lib).pptx
 
Lecture 5 – Computing with Numbers (Math Lib).pptx
Lecture 5 – Computing with Numbers (Math Lib).pptxLecture 5 – Computing with Numbers (Math Lib).pptx
Lecture 5 – Computing with Numbers (Math Lib).pptx
 
Something about Golang
Something about GolangSomething about Golang
Something about Golang
 
Clojure Intro
Clojure IntroClojure Intro
Clojure Intro
 
import os import matplotlib-pyplot as plt import pandas as pd import r.docx
import os import matplotlib-pyplot as plt import pandas as pd import r.docximport os import matplotlib-pyplot as plt import pandas as pd import r.docx
import os import matplotlib-pyplot as plt import pandas as pd import r.docx
 
Python
PythonPython
Python
 
8799.pdfOr else the work is fine only. Lot to learn buddy.... Improve your ba...
8799.pdfOr else the work is fine only. Lot to learn buddy.... Improve your ba...8799.pdfOr else the work is fine only. Lot to learn buddy.... Improve your ba...
8799.pdfOr else the work is fine only. Lot to learn buddy.... Improve your ba...
 
Or else the work is fine only. Lot to learn buddy.... Improve your basics in ...
Or else the work is fine only. Lot to learn buddy.... Improve your basics in ...Or else the work is fine only. Lot to learn buddy.... Improve your basics in ...
Or else the work is fine only. Lot to learn buddy.... Improve your basics in ...
 
PYTHONOr else the work is fine only. Lot to learn buddy.... Improve your basi...
PYTHONOr else the work is fine only. Lot to learn buddy.... Improve your basi...PYTHONOr else the work is fine only. Lot to learn buddy.... Improve your basi...
PYTHONOr else the work is fine only. Lot to learn buddy.... Improve your basi...
 
Python Performance 101
Python Performance 101Python Performance 101
Python Performance 101
 

Recently uploaded

+97470301568>>weed for sale in qatar ,weed for sale in dubai,weed for sale in...
+97470301568>>weed for sale in qatar ,weed for sale in dubai,weed for sale in...+97470301568>>weed for sale in qatar ,weed for sale in dubai,weed for sale in...
+97470301568>>weed for sale in qatar ,weed for sale in dubai,weed for sale in...
Health
 
怎样办理旧金山城市学院毕业证(CCSF毕业证书)成绩单学校原版复制
怎样办理旧金山城市学院毕业证(CCSF毕业证书)成绩单学校原版复制怎样办理旧金山城市学院毕业证(CCSF毕业证书)成绩单学校原版复制
怎样办理旧金山城市学院毕业证(CCSF毕业证书)成绩单学校原版复制
vexqp
 
Top profile Call Girls In Hapur [ 7014168258 ] Call Me For Genuine Models We ...
Top profile Call Girls In Hapur [ 7014168258 ] Call Me For Genuine Models We ...Top profile Call Girls In Hapur [ 7014168258 ] Call Me For Genuine Models We ...
Top profile Call Girls In Hapur [ 7014168258 ] Call Me For Genuine Models We ...
nirzagarg
 
Abortion pills in Jeddah | +966572737505 | Get Cytotec
Abortion pills in Jeddah | +966572737505 | Get CytotecAbortion pills in Jeddah | +966572737505 | Get Cytotec
Abortion pills in Jeddah | +966572737505 | Get Cytotec
Abortion pills in Riyadh +966572737505 get cytotec
 
Top profile Call Girls In Vadodara [ 7014168258 ] Call Me For Genuine Models ...
Top profile Call Girls In Vadodara [ 7014168258 ] Call Me For Genuine Models ...Top profile Call Girls In Vadodara [ 7014168258 ] Call Me For Genuine Models ...
Top profile Call Girls In Vadodara [ 7014168258 ] Call Me For Genuine Models ...
gajnagarg
 
In Riyadh ((+919101817206)) Cytotec kit @ Abortion Pills Saudi Arabia
In Riyadh ((+919101817206)) Cytotec kit @ Abortion Pills Saudi ArabiaIn Riyadh ((+919101817206)) Cytotec kit @ Abortion Pills Saudi Arabia
In Riyadh ((+919101817206)) Cytotec kit @ Abortion Pills Saudi Arabia
ahmedjiabur940
 
怎样办理圣地亚哥州立大学毕业证(SDSU毕业证书)成绩单学校原版复制
怎样办理圣地亚哥州立大学毕业证(SDSU毕业证书)成绩单学校原版复制怎样办理圣地亚哥州立大学毕业证(SDSU毕业证书)成绩单学校原版复制
怎样办理圣地亚哥州立大学毕业证(SDSU毕业证书)成绩单学校原版复制
vexqp
 
Gartner's Data Analytics Maturity Model.pptx
Gartner's Data Analytics Maturity Model.pptxGartner's Data Analytics Maturity Model.pptx
Gartner's Data Analytics Maturity Model.pptx
chadhar227
 
怎样办理伦敦大学城市学院毕业证(CITY毕业证书)成绩单学校原版复制
怎样办理伦敦大学城市学院毕业证(CITY毕业证书)成绩单学校原版复制怎样办理伦敦大学城市学院毕业证(CITY毕业证书)成绩单学校原版复制
怎样办理伦敦大学城市学院毕业证(CITY毕业证书)成绩单学校原版复制
vexqp
 
Top profile Call Girls In Chandrapur [ 7014168258 ] Call Me For Genuine Model...
Top profile Call Girls In Chandrapur [ 7014168258 ] Call Me For Genuine Model...Top profile Call Girls In Chandrapur [ 7014168258 ] Call Me For Genuine Model...
Top profile Call Girls In Chandrapur [ 7014168258 ] Call Me For Genuine Model...
gajnagarg
 
Jual Cytotec Asli Obat Aborsi No. 1 Paling Manjur
Jual Cytotec Asli Obat Aborsi No. 1 Paling ManjurJual Cytotec Asli Obat Aborsi No. 1 Paling Manjur
Jual Cytotec Asli Obat Aborsi No. 1 Paling Manjur
ptikerjasaptiker
 
Top profile Call Girls In bhavnagar [ 7014168258 ] Call Me For Genuine Models...
Top profile Call Girls In bhavnagar [ 7014168258 ] Call Me For Genuine Models...Top profile Call Girls In bhavnagar [ 7014168258 ] Call Me For Genuine Models...
Top profile Call Girls In bhavnagar [ 7014168258 ] Call Me For Genuine Models...
gajnagarg
 
Top profile Call Girls In Begusarai [ 7014168258 ] Call Me For Genuine Models...
Top profile Call Girls In Begusarai [ 7014168258 ] Call Me For Genuine Models...Top profile Call Girls In Begusarai [ 7014168258 ] Call Me For Genuine Models...
Top profile Call Girls In Begusarai [ 7014168258 ] Call Me For Genuine Models...
nirzagarg
 

Recently uploaded (20)

Digital Transformation Playbook by Graham Ware
Digital Transformation Playbook by Graham WareDigital Transformation Playbook by Graham Ware
Digital Transformation Playbook by Graham Ware
 
Digital Advertising Lecture for Advanced Digital & Social Media Strategy at U...
Digital Advertising Lecture for Advanced Digital & Social Media Strategy at U...Digital Advertising Lecture for Advanced Digital & Social Media Strategy at U...
Digital Advertising Lecture for Advanced Digital & Social Media Strategy at U...
 
+97470301568>>weed for sale in qatar ,weed for sale in dubai,weed for sale in...
+97470301568>>weed for sale in qatar ,weed for sale in dubai,weed for sale in...+97470301568>>weed for sale in qatar ,weed for sale in dubai,weed for sale in...
+97470301568>>weed for sale in qatar ,weed for sale in dubai,weed for sale in...
 
怎样办理旧金山城市学院毕业证(CCSF毕业证书)成绩单学校原版复制
怎样办理旧金山城市学院毕业证(CCSF毕业证书)成绩单学校原版复制怎样办理旧金山城市学院毕业证(CCSF毕业证书)成绩单学校原版复制
怎样办理旧金山城市学院毕业证(CCSF毕业证书)成绩单学校原版复制
 
Top profile Call Girls In Hapur [ 7014168258 ] Call Me For Genuine Models We ...
Top profile Call Girls In Hapur [ 7014168258 ] Call Me For Genuine Models We ...Top profile Call Girls In Hapur [ 7014168258 ] Call Me For Genuine Models We ...
Top profile Call Girls In Hapur [ 7014168258 ] Call Me For Genuine Models We ...
 
Ranking and Scoring Exercises for Research
Ranking and Scoring Exercises for ResearchRanking and Scoring Exercises for Research
Ranking and Scoring Exercises for Research
 
Abortion pills in Jeddah | +966572737505 | Get Cytotec
Abortion pills in Jeddah | +966572737505 | Get CytotecAbortion pills in Jeddah | +966572737505 | Get Cytotec
Abortion pills in Jeddah | +966572737505 | Get Cytotec
 
Capstone in Interprofessional Informatic // IMPACT OF COVID 19 ON EDUCATION
Capstone in Interprofessional Informatic  // IMPACT OF COVID 19 ON EDUCATIONCapstone in Interprofessional Informatic  // IMPACT OF COVID 19 ON EDUCATION
Capstone in Interprofessional Informatic // IMPACT OF COVID 19 ON EDUCATION
 
Top profile Call Girls In Vadodara [ 7014168258 ] Call Me For Genuine Models ...
Top profile Call Girls In Vadodara [ 7014168258 ] Call Me For Genuine Models ...Top profile Call Girls In Vadodara [ 7014168258 ] Call Me For Genuine Models ...
Top profile Call Girls In Vadodara [ 7014168258 ] Call Me For Genuine Models ...
 
Discover Why Less is More in B2B Research
Discover Why Less is More in B2B ResearchDiscover Why Less is More in B2B Research
Discover Why Less is More in B2B Research
 
In Riyadh ((+919101817206)) Cytotec kit @ Abortion Pills Saudi Arabia
In Riyadh ((+919101817206)) Cytotec kit @ Abortion Pills Saudi ArabiaIn Riyadh ((+919101817206)) Cytotec kit @ Abortion Pills Saudi Arabia
In Riyadh ((+919101817206)) Cytotec kit @ Abortion Pills Saudi Arabia
 
怎样办理圣地亚哥州立大学毕业证(SDSU毕业证书)成绩单学校原版复制
怎样办理圣地亚哥州立大学毕业证(SDSU毕业证书)成绩单学校原版复制怎样办理圣地亚哥州立大学毕业证(SDSU毕业证书)成绩单学校原版复制
怎样办理圣地亚哥州立大学毕业证(SDSU毕业证书)成绩单学校原版复制
 
Gartner's Data Analytics Maturity Model.pptx
Gartner's Data Analytics Maturity Model.pptxGartner's Data Analytics Maturity Model.pptx
Gartner's Data Analytics Maturity Model.pptx
 
怎样办理伦敦大学城市学院毕业证(CITY毕业证书)成绩单学校原版复制
怎样办理伦敦大学城市学院毕业证(CITY毕业证书)成绩单学校原版复制怎样办理伦敦大学城市学院毕业证(CITY毕业证书)成绩单学校原版复制
怎样办理伦敦大学城市学院毕业证(CITY毕业证书)成绩单学校原版复制
 
Aspirational Block Program Block Syaldey District - Almora
Aspirational Block Program Block Syaldey District - AlmoraAspirational Block Program Block Syaldey District - Almora
Aspirational Block Program Block Syaldey District - Almora
 
Top profile Call Girls In Chandrapur [ 7014168258 ] Call Me For Genuine Model...
Top profile Call Girls In Chandrapur [ 7014168258 ] Call Me For Genuine Model...Top profile Call Girls In Chandrapur [ 7014168258 ] Call Me For Genuine Model...
Top profile Call Girls In Chandrapur [ 7014168258 ] Call Me For Genuine Model...
 
Jual Cytotec Asli Obat Aborsi No. 1 Paling Manjur
Jual Cytotec Asli Obat Aborsi No. 1 Paling ManjurJual Cytotec Asli Obat Aborsi No. 1 Paling Manjur
Jual Cytotec Asli Obat Aborsi No. 1 Paling Manjur
 
Switzerland Constitution 2002.pdf.........
Switzerland Constitution 2002.pdf.........Switzerland Constitution 2002.pdf.........
Switzerland Constitution 2002.pdf.........
 
Top profile Call Girls In bhavnagar [ 7014168258 ] Call Me For Genuine Models...
Top profile Call Girls In bhavnagar [ 7014168258 ] Call Me For Genuine Models...Top profile Call Girls In bhavnagar [ 7014168258 ] Call Me For Genuine Models...
Top profile Call Girls In bhavnagar [ 7014168258 ] Call Me For Genuine Models...
 
Top profile Call Girls In Begusarai [ 7014168258 ] Call Me For Genuine Models...
Top profile Call Girls In Begusarai [ 7014168258 ] Call Me For Genuine Models...Top profile Call Girls In Begusarai [ 7014168258 ] Call Me For Genuine Models...
Top profile Call Girls In Begusarai [ 7014168258 ] Call Me For Genuine Models...
 

Basic NLP with Python and NLTK

  • 1. Basic NLP with Python and NLTK Bruni Francesco (@brunifrancesco) Download the original iPython notebook @ https://github.com/brunifrancesco/nltk_base.git
  • 2. Python - Programming language - Multi-paradigm - Easy to learn - Suitable for multiple needs - Multiple implementations, a ton of useful libraries
  • 3. Basic Python import random a_number = 1 a_string = "Python rocks!" a_list = ["1", "2", "3"] a_dict = {"film":"Pulp fiction", "francesco": "Python"} print(a_dict.values()) a_dict_of_list = {"key":["Carlito's way","The godfather"], "francesco":1} print(len(a_dict_of_list["key"])) a_tuple = ("Goodfellas", "Kill Bill",) a_list.append(4)
  • 4. Creating functions def super_function(number): return number * 2 def factorial(n): if n == 0: return 1 else: return n*factorial(n-1) double = lambda item: item * 2 predicate = lambda item: item > 3 assert super_function(3) == 6 assert factorial(3) == 6 assert double(3) == 6 assert list(filter(predicate, [1,2,5,3])) == [5]
  • 5. And much more - Object oriented paradigm --> classes, metaclasses etc. etc. - Functional programming paradigm --> partials, closures, high order functions etc. etc. - Scripting paradigm --> shell control, os related functions etc.. - Async ops support --> asyncio
  • 6. Reading files with open("file", "r") as input: data = input.read() import csv def read_csv(): with open('data.csv', 'r') as francesco: data = csv.reader(francesco, delimiter=';') for element in data: print(element[1]) read_csv()
  • 7. Make data talk from collections import Counter import statistics splitted_chunks = data.split() print("Data lenght: %s" %len(data)) print("Chunks numbers: %s" %len(splitted_chunks)) print("Unique chunks: %s" %len(set(splitted_chunks))) print("Avg lenght of chunks: %s" %statistics.mean(map(len, splitted_chunks))) print("Std dev lenght of chunks: %s" %statistics.pstdev(map(len, splitted_chunks))) print("Frequency distribution: %s" % sorted(filter(lambda item: item[1] > 5, Counter(splitted_chunks).items()), key=lambda item: item[1]))
  • 8. NLTK - tokenization - stemming - tagging - parsing - semantic reasoning - classification
  • 9. Tokenizing from nltk import word_tokenize tokens = word_tokenize(data) from nltk.tokenize import TweetTokenizer tokenizer = TweetTokenizer(strip_handles=True) s1 = '@remy: This is waaaaayyyy too much for you!!!!!!' tw_tokens = tokenizer.tokenize(s1) print(tw_tokens)
  • 10. Frequency distribution from nltk.book import FreqDist fdist1 = FreqDist(splitted_chunks) most_common = fdist1.most_common(50) fdist1.plot(50, cumulative=True) fdist1.plot(10) print("Max frequency key: %s" %fdist1.max()) print("Occurrencies of 'Parlamento': %s" %fdist1["Parlamento"]) print("Frequency of 'Parlamento': %s"%fdist1.freq('Parlamento'))
  • 11. Cleaning data from nltk.corpus import stopwords def remove_stopword(word): return word not in words import string words = stopwords.words('italian') lowered_chunks = list(map(lambda item: item.lower(), splitted_chunks)) print("Chunks lenght %s" %len(lowered_chunks)) clean_chunks = list(filter(remove_stopword, splitted_chunks)) print("Cleaned chunks (without stopwords) lenght: %s" %len(clean_chunks)) clean_chunks = list(filter(lambda chunk: chunk not in string.punctuation, clean_chunks)) print("Cleaned chunks (without punctuation and stopwords) lenght: %s" %len(clean_chunks)) from nltk.book import FreqDist fdist1 = FreqDist(clean_chunks) most_common = fdist1.most_common(50)
  • 12. Stemming from nltk.stem.porter import * from nltk.stem.snowball import * stemmer = PorterStemmer() stemmer.stem(“activities") available_langs = SnowballStemmer.languages sn_stemmer = SnowballStemmer("spanish", ignore_stopwords=True) print(sn_stemmer.stem("ordenador")) from nltk.stem.lancaster import * LancasterStemmer().stem("activities")
  • 13. Custom ngrams finder def find_and_analyze_ngrams(tagged_sent): chunker = RegexpParser(CHUNK_RULE) tree = chunker.parse(tagged_sent) for item in self.__leaves(tree): if not item == tagged_sent: probable_ngram = ' '.join(self.__stemmer.stem( word.lower()) for (word, pos) in item ) if self.__evaluate_polarity_ngram(probable_ngram): yield probable_ngram
  • 14. Classifying data def __get_elements_for_classification(self, lfeats, train_number, classifying=True): train_feats = [] test_feats = [] for label, feats in lfeats.iteritems(): if classifying: train_feats.extend([(feat, label) for feat in feats]) else: cutoff = train_number * len(feats)/10 train_feats.extend([(feat, label) for feat in feats[:cutoff]]) test_feats.extend([(feat, label) for feat in feats[cutoff:]]) nb_classifier = NaiveBayesClassifier.train(train_feats) return train_feats, test_feats, nb_classifier
  • 15. Pointwise Mutual Information $\mathrm{PMI}(X = x, Y = y) = \log \frac{p(X = x, Y = y)}{p(X = x)\, p(Y = y)}$
  • 16. Measure PMI - Read from csv - Preprocess data (tokenize, lower, remove stopwords, punctuation) - Find frequency distribution for unigrams - Find frequency distribution for bigrams - Compute PMI via implemented function - Let NLTK sort bigrams by PMI metric - Write result to CSV file
  • 17. Read data import nltk from nltk.corpus import stopwords import string import random from itertools import chain import math import csv import time def read_data(): """ Read data 'libe by line'""" with open('data.csv', 'r') as csvfile: reader = csv.reader(csvfile, delimiter=',') for row in reader: yield row
  • 18. Preprocess def preprocess(data): """ Preprocess data, filtering out stopwords, punctuation and lowering all splitted tokens :param data: the string data to be processed """ italian_stopwords = stopwords.words('italian') splitted_chunks = data.split() lowered_chunks = (item.lower() for item in splitted_chunks) chunks_without_punctuation = (chunk for chunk in lowered_chunks if chunk not in string.punctuation) chunks_without_stopwords = (chunk for chunk in chunks_without_punctuation if chunk not in italian_stopwords) return list(chunks_without_stopwords)
  • 19. Find N-Grams FREQUENCY_TRESHOLD = 2 def find_bigrams(splitted_chunks): """ Find bigrams and filter them by frequency threshold :param splitted_chunks: a list of chunks """ bigrams = nltk.collocations.BigramCollocationFinder.from_words(splitted_chunks) bigrams.apply_freq_filter(FREQUENCY_TRESHOLD) return {bigram: freq for bigram, freq in bigrams.ngram_fd.items()} def find_unigrams(splitted_chunks): """ Find unigrams and filter them by frequency threshold :param splitted_chunks: a list of chunks """ unigrams = nltk.FreqDist(splitted_chunks) return {unigram: freq for unigram, freq in unigrams.items() if freq > FREQUENCY_TRESHOLD - 1}
  • 20. Compute PMI def pmi(word1, word2, unigram_freq, bigram_freq): """ Find PMI measure :param word1: the first word :param word2: the second word :param unigram_freq: the unigram frequency container :param bigram_freq: the bigram frequency container """ prob_word1 = unigram_freq[word1] / sum(unigram_freq.values()) prob_word2 = unigram_freq[word2] / sum(unigram_freq.values()) prob_word1_word2 = bigram_freq[(word1, word2)] / sum(bigram_freq.values()) a = prob_word1_word2/prob_word1*prob_word2 return round(math.log(a,2),2)
  • 21. Write result to CSV def write_data(result): """ Write result to CSV file :param result: the list to be written to csv file """ with open("result.csv", "a") as output: writer = csv.writer(output, delimiter='*') for row in result: writer.writerow(row)