SlideShare a Scribd company logo
Basic NLP with Python
and NLTK
Bruni Francesco (@brunifrancesco)
Download the original iPython notebook @
https://github.com/brunifrancesco/nltk_base.git
Python
- Programming language
- Multi-paradigm
- Easy to learn
- Suitable for multiple needs
- Multiple implementations, a ton of useful libraries
Basic Python
import random
# Examples of the built-in value types: int, str, list, dict and tuple.
a_number = 1
a_string = "Python rocks!"
a_list = ["1", "2", "3"]
a_dict = {"film":"Pulp fiction", "francesco": "Python"}
# Prints a view of the dictionary's values.
print(a_dict.values())
# Dictionary values may have different types: here a list and an int.
a_dict_of_list = {"key":["Carlito's way","The godfather"], "francesco":1}
# Prints the number of items in the list stored under "key" (2).
print(len(a_dict_of_list["key"]))
a_tuple = ("Goodfellas", "Kill Bill",)
# Lists are mutable and heterogeneous: an int is appended to a list of strings.
a_list.append(4)
Creating functions
def super_function(number):
    """
    Double the given value

    :param number: the value to be multiplied by 2
    :return: ``number * 2``
    """
    return number * 2
def factorial(n):
    """
    Compute the factorial of n recursively

    :param n: a non-negative integer
    :return: n! (1 when n is 0)
    :raises ValueError: if n is negative
    """
    # Without this guard a negative n never reaches the base case and the
    # recursion only stops with a RecursionError.
    if n < 0:
        raise ValueError("factorial() is not defined for negative values")
    if n == 0:
        return 1
    return n * factorial(n - 1)
# Anonymous functions bound to names: one doubles its argument, the other
# tests whether its argument is greater than 3.
double = lambda item: item * 2
predicate = lambda item: item > 3
# Sanity checks for the functions and lambdas defined in this section.
assert super_function(3) == 6
assert factorial(3) == 6
assert double(3) == 6
# filter() keeps only the items for which the predicate is true.
assert list(filter(predicate, [1,2,5,3])) == [5]
And much more
- Object oriented paradigm --> classes, metaclasses etc. etc.
- Functional programming paradigm --> partials, closures, higher-order
functions etc. etc.
- Scripting paradigm --> shell control, os related functions etc..
- Async ops support --> asyncio
Reading files
# Read the whole content of "file" into ``data``; the ``with`` block closes
# the file on exit. The handle is called ``input_file`` so that the builtin
# ``input`` is not shadowed.
with open("file", "r") as input_file:
    data = input_file.read()
import csv
def read_csv():
    """
    Print the second column of every row of 'data.csv'

    The file is parsed as semicolon-delimited CSV.

    :raises IndexError: if a row has fewer than two columns
    """
    # newline='' is what the csv module documents for files passed to a
    # reader, so that newlines embedded in quoted fields are handled properly.
    with open('data.csv', 'r', newline='') as csv_file:
        for row in csv.reader(csv_file, delimiter=';'):
            print(row[1])
read_csv()
Make data talk
from collections import Counter
import statistics
# Whitespace-delimited chunks of the text held in ``data``.
splitted_chunks = data.split()

# Size of the raw text, number of chunks and number of distinct chunks.
print("Data lenght: %s" % len(data))
print("Chunks numbers: %s" % len(splitted_chunks))
print("Unique chunks: %s" % len(set(splitted_chunks)))

# Mean and population standard deviation of the chunk lengths.
print("Avg lenght of chunks: %s"
      % statistics.mean(len(chunk) for chunk in splitted_chunks))
print("Std dev lenght of chunks: %s"
      % statistics.pstdev(len(chunk) for chunk in splitted_chunks))

# (chunk, count) pairs seen more than 5 times, sorted by ascending count.
print("Frequency distribution: %s" % sorted(
    (pair for pair in Counter(splitted_chunks).items() if pair[1] > 5),
    key=lambda pair: pair[1],
))
NLTK
- tokenization
- stemming
- tagging
- parsing
- semantic reasoning
- classification
Tokenizing
from nltk import word_tokenize
# Tokenize the text held in ``data`` with NLTK's word tokenizer.
tokens = word_tokenize(data)
from nltk.tokenize import TweetTokenizer
# Tokenizer aimed at tweets; strip_handles=True presumably drops @-mentions
# such as "@remy" from the output -- confirm against the NLTK documentation.
tokenizer = TweetTokenizer(strip_handles=True)
s1 = '@remy: This is waaaaayyyy too much for you!!!!!!'
tw_tokens = tokenizer.tokenize(s1)
print(tw_tokens)
Frequency distribution
# FreqDist is imported from the top-level package: importing it from
# nltk.book also loads the example texts of the NLTK book as a side effect.
from nltk import FreqDist
# Frequency distribution over the whitespace-delimited chunks.
fdist1 = FreqDist(splitted_chunks)
# The 50 most frequent chunks as (chunk, count) pairs.
most_common = fdist1.most_common(50)
# Plots for the most frequent samples (the first one cumulative).
fdist1.plot(50, cumulative=True)
fdist1.plot(10)
# The sample with the highest count, then the absolute count and the
# relative frequency of a single word.
print("Max frequency key: %s" %fdist1.max())
print("Occurrencies of 'Parlamento': %s" %fdist1["Parlamento"])
print("Frequency of 'Parlamento': %s"%fdist1.freq('Parlamento'))
Cleaning data
from nltk.corpus import stopwords
def remove_stopword(word):
    """
    Predicate telling whether a word must be kept, i.e. it is not a stopword

    Relies on the module-level ``words`` stopword list.

    :param word: the word to be checked
    :return: True if the word is not contained in ``words``
    """
    return word not in words
import string
# Italian stopword list used by remove_stopword().
words = stopwords.words('italian')
lowered_chunks = list(map(lambda item: item.lower(), splitted_chunks))
print("Chunks lenght %s" %len(lowered_chunks))
# Filter the lower-cased chunks: the comparison in remove_stopword() is
# case-sensitive, so filtering the raw chunks would let capitalised
# stopwords through and leave ``lowered_chunks`` unused.
clean_chunks = list(filter(remove_stopword, lowered_chunks))
print("Cleaned chunks (without stopwords) lenght: %s" %len(clean_chunks))
# Drop chunks that consist of punctuation only.
clean_chunks = list(filter(lambda chunk: chunk not in string.punctuation, clean_chunks))
print("Cleaned chunks (without punctuation and stopwords) lenght: %s"
      %len(clean_chunks))
# Imported from the top-level package: importing from nltk.book also loads
# the example texts of the NLTK book as a side effect.
from nltk import FreqDist
# Frequency distribution recomputed on the cleaned chunks.
fdist1 = FreqDist(clean_chunks)
most_common = fdist1.most_common(50)
Stemming
from nltk.stem.porter import *
from nltk.stem.snowball import *
# Porter stemmer applied to an English word. The argument is written with
# plain ASCII double quotes; a typographic quote is a syntax error.
stemmer = PorterStemmer()
stemmer.stem("activities")
# Languages supported by the Snowball stemmer.
available_langs = SnowballStemmer.languages
# Snowball stemmer for Spanish, configured with ignore_stopwords=True.
sn_stemmer = SnowballStemmer("spanish", ignore_stopwords=True)
print(sn_stemmer.stem("ordenador"))
from nltk.stem.lancaster import *
# Lancaster stemmer applied to the same English word, for comparison.
LancasterStemmer().stem("activities")
Custom ngrams finder
def find_and_analyze_ngrams(self, tagged_sent):
    """
    Yield the candidate n-grams of a tagged sentence that pass the polarity check

    The sentence is chunked with a RegexpParser built from CHUNK_RULE; every
    item returned by ``self.__leaves`` is turned into a space-joined string of
    stemmed, lower-cased words.

    NOTE(review): the body reads ``self`` (and name-mangled private members),
    so this is a method taken out of a class; ``self`` is declared explicitly
    because without it every call fails with a NameError. Confirm against the
    enclosing class.

    :param tagged_sent: the sentence as a list of (word, pos) pairs
    :return: a generator of n-gram strings
    """
    chunker = RegexpParser(CHUNK_RULE)
    tree = chunker.parse(tagged_sent)
    for item in self.__leaves(tree):
        # Skip an item that is just the whole sentence.
        if not item == tagged_sent:
            probable_ngram = ' '.join(
                self.__stemmer.stem(word.lower()) for (word, pos) in item
            )
            if self.__evaluate_polarity_ngram(probable_ngram):
                yield probable_ngram
Classifying data
def __get_elements_for_classification(self, lfeats, train_number, classifying=True):
    """
    Build the train/test feature sets and train a Naive Bayes classifier

    :param lfeats: a mapping from each label to the list of its features
    :param train_number: how many tenths of each label's features go to the
        training set (only used when ``classifying`` is False)
    :param classifying: if True every feature is used for training and the
        test set is left empty
    :return: the (train_feats, test_feats, nb_classifier) tuple
    """
    train_feats = []
    test_feats = []
    # items() exists on both Python 2 and 3; iteritems() is Python 2 only.
    for label, feats in lfeats.items():
        if classifying:
            train_feats.extend([(feat, label) for feat in feats])
        else:
            # Floor division keeps the cutoff an int: a float slice index
            # raises TypeError on Python 3.
            cutoff = train_number * len(feats) // 10
            train_feats.extend([(feat, label) for feat in feats[:cutoff]])
            test_feats.extend([(feat, label) for feat in feats[cutoff:]])
    nb_classifier = NaiveBayesClassifier.train(train_feats)
    return train_feats, test_feats, nb_classifier
Pointwise Mutual
Information
$$\mathrm{PMI}(X = x, Y = y) = \log \frac{p(X = x, Y = y)}{p(X = x)\, p(Y = y)}$$
Measure PMI
- Read from csv
- Preprocess data (tokenize, lower, remove stopwords, punctuation)
- Find frequency distribution for unigrams
- Find frequency distribution for bigrams
- Compute PMI via implemented function
- Let NLTK sort bigrams by PMI metric
- Write result to CSV file
Read data
import nltk
from nltk.corpus import stopwords
import string
import random
from itertools import chain
import math
import csv
import time
def read_data():
    """
    Read data 'line by line'

    Lazily yields every row of the comma-delimited 'data.csv' file as a
    list of strings.
    """
    # newline='' is what the csv module documents for files passed to a
    # reader, so that newlines embedded in quoted fields are handled properly.
    with open('data.csv', 'r', newline='') as csvfile:
        reader = csv.reader(csvfile, delimiter=',')
        for row in reader:
            yield row
Preprocess
def preprocess(data):
    """
    Preprocess data, filtering out stopwords, punctuation and lowering
    all splitted tokens
    :param data: the string data to be processed
    :return: the list of lower-cased chunks that are neither punctuation
        nor Italian stopwords, in their original order
    """
    # A set gives constant-time membership tests instead of scanning the
    # stopword list once per chunk.
    italian_stopwords = set(stopwords.words('italian'))
    lowered_chunks = (item.lower() for item in data.split())
    return [chunk for chunk in lowered_chunks
            if chunk not in string.punctuation
            and chunk not in italian_stopwords]
Find N-Grams
FREQUENCY_TRESHOLD = 2
def find_bigrams(splitted_chunks):
    """
    Find bigrams and filter them by frequency threshold
    :param splitted_chunks: a list of chunks
    :return: a dict mapping each retained bigram to its frequency
    """
    bigrams = nltk.collocations.BigramCollocationFinder.from_words(splitted_chunks)
    # Discards the bigrams occurring fewer than FREQUENCY_TRESHOLD times.
    bigrams.apply_freq_filter(FREQUENCY_TRESHOLD)
    return dict(bigrams.ngram_fd.items())
def find_unigrams(splitted_chunks):
    """
    Find unigrams and filter them by frequency threshold
    :param splitted_chunks: a list of chunks
    :return: a dict mapping each unigram occurring at least
        FREQUENCY_TRESHOLD times to its frequency
    """
    unigrams = nltk.FreqDist(splitted_chunks)
    return {unigram: freq for unigram, freq in unigrams.items()
            if freq >= FREQUENCY_TRESHOLD}
Compute PMI
def pmi(word1, word2, unigram_freq, bigram_freq):
    """
    Find PMI measure, i.e. log2(p(word1, word2) / (p(word1) * p(word2)))
    :param word1: the first word
    :param word2: the second word
    :param unigram_freq: the unigram frequency container
    :param bigram_freq: the bigram frequency container
    :return: the PMI in bits, rounded to two decimal places
    :raises KeyError: if a word or the bigram is missing from its container
    """
    # The totals do not depend on the words, so each is computed only once.
    unigram_total = sum(unigram_freq.values())
    bigram_total = sum(bigram_freq.values())
    prob_word1 = unigram_freq[word1] / unigram_total
    prob_word2 = unigram_freq[word2] / unigram_total
    prob_word1_word2 = bigram_freq[(word1, word2)] / bigram_total
    # The parentheses are essential: "a / b * c" evaluates as "(a / b) * c",
    # which is not the ratio required by the PMI definition.
    ratio = prob_word1_word2 / (prob_word1 * prob_word2)
    return round(math.log(ratio, 2), 2)
Write result to CSV
def write_data(result):
    """
    Write result to CSV file

    The rows are appended to 'result.csv', using '*' as field delimiter.

    :param result: the list to be written to csv file
    """
    # newline='' is what the csv module documents for files passed to a
    # writer; without it extra blank lines can appear on some platforms.
    with open("result.csv", "a", newline='') as output:
        writer = csv.writer(output, delimiter='*')
        for row in result:
            writer.writerow(row)
Happy coding :)

More Related Content

What's hot

Python in 90mins
Python in 90minsPython in 90mins
Python in 90mins
Larry Cai
 
Python for Penetration testers
Python for Penetration testersPython for Penetration testers
Python for Penetration testers
Christian Martorella
 
Procesamiento del lenguaje natural con python
Procesamiento del lenguaje natural con pythonProcesamiento del lenguaje natural con python
Procesamiento del lenguaje natural con python
Facultad de Ciencias y Sistemas
 
Good Code
Good CodeGood Code
Good Code
Kevlin Henney
 
Kernel Recipes 2019 - GNU poke, an extensible editor for structured binary data
Kernel Recipes 2019 - GNU poke, an extensible editor for structured binary dataKernel Recipes 2019 - GNU poke, an extensible editor for structured binary data
Kernel Recipes 2019 - GNU poke, an extensible editor for structured binary data
Anne Nicolas
 
The Ruby Guide to *nix Plumbing: on the quest for efficiency with Ruby [M|K]RI
The Ruby Guide to *nix Plumbing: on the quest for efficiency with Ruby [M|K]RIThe Ruby Guide to *nix Plumbing: on the quest for efficiency with Ruby [M|K]RI
The Ruby Guide to *nix Plumbing: on the quest for efficiency with Ruby [M|K]RI
Eleanor McHugh
 
Snakes for Camels
Snakes for CamelsSnakes for Camels
Snakes for Camels
miquelruizm
 
Nltk:a tool for_nlp - py_con-dhaka-2014
Nltk:a tool for_nlp - py_con-dhaka-2014Nltk:a tool for_nlp - py_con-dhaka-2014
Nltk:a tool for_nlp - py_con-dhaka-2014
Fasihul Kabir
 
Whispered secrets
Whispered secretsWhispered secrets
Whispered secrets
Eleanor McHugh
 
Airlover 20030324 1
Airlover 20030324 1Airlover 20030324 1
Airlover 20030324 1
Dr.Ravi
 
Introduction to source{d} Engine and source{d} Lookout
Introduction to source{d} Engine and source{d} Lookout Introduction to source{d} Engine and source{d} Lookout
Introduction to source{d} Engine and source{d} Lookout
source{d}
 
Learn 90% of Python in 90 Minutes
Learn 90% of Python in 90 MinutesLearn 90% of Python in 90 Minutes
Learn 90% of Python in 90 Minutes
Matt Harrison
 
Introduction to advanced python
Introduction to advanced pythonIntroduction to advanced python
Introduction to advanced python
Charles-Axel Dein
 
Introduction to Python
Introduction to PythonIntroduction to Python
Introduction to Python
KHNOG
 
Twitter Author Prediction from Tweets using Bayesian Network
Twitter Author Prediction from Tweets using Bayesian NetworkTwitter Author Prediction from Tweets using Bayesian Network
Twitter Author Prediction from Tweets using Bayesian Network
Hendy Irawan
 
Implementing Software Machines in Go and C
Implementing Software Machines in Go and CImplementing Software Machines in Go and C
Implementing Software Machines in Go and C
Eleanor McHugh
 
Using Unix
Using UnixUsing Unix
Using Unix
Dr.Ravi
 
Encrypt all transports
Encrypt all transportsEncrypt all transports
Encrypt all transports
Eleanor McHugh
 
The Ring programming language version 1.7 book - Part 43 of 196
The Ring programming language version 1.7 book - Part 43 of 196The Ring programming language version 1.7 book - Part 43 of 196
The Ring programming language version 1.7 book - Part 43 of 196
Mahmoud Samir Fayed
 
Talk Unix Shell Script
Talk Unix Shell ScriptTalk Unix Shell Script
Talk Unix Shell Script
Dr.Ravi
 

What's hot (20)

Python in 90mins
Python in 90minsPython in 90mins
Python in 90mins
 
Python for Penetration testers
Python for Penetration testersPython for Penetration testers
Python for Penetration testers
 
Procesamiento del lenguaje natural con python
Procesamiento del lenguaje natural con pythonProcesamiento del lenguaje natural con python
Procesamiento del lenguaje natural con python
 
Good Code
Good CodeGood Code
Good Code
 
Kernel Recipes 2019 - GNU poke, an extensible editor for structured binary data
Kernel Recipes 2019 - GNU poke, an extensible editor for structured binary dataKernel Recipes 2019 - GNU poke, an extensible editor for structured binary data
Kernel Recipes 2019 - GNU poke, an extensible editor for structured binary data
 
The Ruby Guide to *nix Plumbing: on the quest for efficiency with Ruby [M|K]RI
The Ruby Guide to *nix Plumbing: on the quest for efficiency with Ruby [M|K]RIThe Ruby Guide to *nix Plumbing: on the quest for efficiency with Ruby [M|K]RI
The Ruby Guide to *nix Plumbing: on the quest for efficiency with Ruby [M|K]RI
 
Snakes for Camels
Snakes for CamelsSnakes for Camels
Snakes for Camels
 
Nltk:a tool for_nlp - py_con-dhaka-2014
Nltk:a tool for_nlp - py_con-dhaka-2014Nltk:a tool for_nlp - py_con-dhaka-2014
Nltk:a tool for_nlp - py_con-dhaka-2014
 
Whispered secrets
Whispered secretsWhispered secrets
Whispered secrets
 
Airlover 20030324 1
Airlover 20030324 1Airlover 20030324 1
Airlover 20030324 1
 
Introduction to source{d} Engine and source{d} Lookout
Introduction to source{d} Engine and source{d} Lookout Introduction to source{d} Engine and source{d} Lookout
Introduction to source{d} Engine and source{d} Lookout
 
Learn 90% of Python in 90 Minutes
Learn 90% of Python in 90 MinutesLearn 90% of Python in 90 Minutes
Learn 90% of Python in 90 Minutes
 
Introduction to advanced python
Introduction to advanced pythonIntroduction to advanced python
Introduction to advanced python
 
Introduction to Python
Introduction to PythonIntroduction to Python
Introduction to Python
 
Twitter Author Prediction from Tweets using Bayesian Network
Twitter Author Prediction from Tweets using Bayesian NetworkTwitter Author Prediction from Tweets using Bayesian Network
Twitter Author Prediction from Tweets using Bayesian Network
 
Implementing Software Machines in Go and C
Implementing Software Machines in Go and CImplementing Software Machines in Go and C
Implementing Software Machines in Go and C
 
Using Unix
Using UnixUsing Unix
Using Unix
 
Encrypt all transports
Encrypt all transportsEncrypt all transports
Encrypt all transports
 
The Ring programming language version 1.7 book - Part 43 of 196
The Ring programming language version 1.7 book - Part 43 of 196The Ring programming language version 1.7 book - Part 43 of 196
The Ring programming language version 1.7 book - Part 43 of 196
 
Talk Unix Shell Script
Talk Unix Shell ScriptTalk Unix Shell Script
Talk Unix Shell Script
 

Viewers also liked

NLTK - Natural Language Processing in Python
NLTK - Natural Language Processing in PythonNLTK - Natural Language Processing in Python
NLTK - Natural Language Processing in Python
shanbady
 
Practical Natural Language Processing
Practical Natural Language ProcessingPractical Natural Language Processing
Practical Natural Language Processing
Jaganadh Gopinadhan
 
Continuous Integration/Deployment with Docker and Jenkins
Continuous Integration/Deployment with Docker and JenkinsContinuous Integration/Deployment with Docker and Jenkins
Continuous Integration/Deployment with Docker and Jenkins
Francesco Bruni
 
Sentiment analysis-by-nltk
Sentiment analysis-by-nltkSentiment analysis-by-nltk
Sentiment analysis-by-nltk
Wei-Ting Kuo
 
Natural Language Processing with Python
Natural Language Processing with PythonNatural Language Processing with Python
Natural Language Processing with Python
Benjamin Bengfort
 
Natural Language Processing
Natural Language ProcessingNatural Language Processing
Natural Language Processing
Jaganadh Gopinadhan
 
Open street map
Open street mapOpen street map
Open street map
Michelantonio Trizio
 
ZOETWITT in the Press
ZOETWITT in the PressZOETWITT in the Press
ZOETWITT in the Press
zoetwitt
 
Python & Stuff
Python & StuffPython & Stuff
Python & Stuff
Jacob Perkins
 
Nd4 j slides.pptx
Nd4 j slides.pptxNd4 j slides.pptx
Nd4 j slides.pptx
Adam Gibson
 
Introduction to Functional Programming
Introduction to Functional ProgrammingIntroduction to Functional Programming
Introduction to Functional Programming
Francesco Bruni
 
Corpus Bootstrapping with NLTK
Corpus Bootstrapping with NLTKCorpus Bootstrapping with NLTK
Corpus Bootstrapping with NLTK
Jacob Perkins
 
Rethink programming: a functional approach
Rethink programming: a functional approachRethink programming: a functional approach
Rethink programming: a functional approach
Francesco Bruni
 
Yahoo answers
Yahoo answersYahoo answers
Yahoo answers
discoversudhir
 
Future of ai on the jvm
Future of ai on the jvmFuture of ai on the jvm
Future of ai on the jvm
Adam Gibson
 
The Next Generation SharePoint: Powered by Text Analytics
The Next Generation SharePoint: Powered by Text AnalyticsThe Next Generation SharePoint: Powered by Text Analytics
The Next Generation SharePoint: Powered by Text Analytics
Alyona Medelyan
 
Nltk natural language toolkit overview and application @ PyHug
Nltk  natural language toolkit overview and application @ PyHugNltk  natural language toolkit overview and application @ PyHug
Nltk natural language toolkit overview and application @ PyHug
Jimmy Lai
 
KiwiPyCon 2014 - NLP with Python tutorial
KiwiPyCon 2014 - NLP with Python tutorialKiwiPyCon 2014 - NLP with Python tutorial
KiwiPyCon 2014 - NLP with Python tutorial
Alyona Medelyan
 
Overview of text mining and NLP (+software)
Overview of text mining and NLP (+software)Overview of text mining and NLP (+software)
Overview of text mining and NLP (+software)
Florian Leitner
 
Natural language processing (NLP) introduction
Natural language processing (NLP) introductionNatural language processing (NLP) introduction
Natural language processing (NLP) introduction
Robert Lujo
 

Viewers also liked (20)

NLTK - Natural Language Processing in Python
NLTK - Natural Language Processing in PythonNLTK - Natural Language Processing in Python
NLTK - Natural Language Processing in Python
 
Practical Natural Language Processing
Practical Natural Language ProcessingPractical Natural Language Processing
Practical Natural Language Processing
 
Continuous Integration/Deployment with Docker and Jenkins
Continuous Integration/Deployment with Docker and JenkinsContinuous Integration/Deployment with Docker and Jenkins
Continuous Integration/Deployment with Docker and Jenkins
 
Sentiment analysis-by-nltk
Sentiment analysis-by-nltkSentiment analysis-by-nltk
Sentiment analysis-by-nltk
 
Natural Language Processing with Python
Natural Language Processing with PythonNatural Language Processing with Python
Natural Language Processing with Python
 
Natural Language Processing
Natural Language ProcessingNatural Language Processing
Natural Language Processing
 
Open street map
Open street mapOpen street map
Open street map
 
ZOETWITT in the Press
ZOETWITT in the PressZOETWITT in the Press
ZOETWITT in the Press
 
Python & Stuff
Python & StuffPython & Stuff
Python & Stuff
 
Nd4 j slides.pptx
Nd4 j slides.pptxNd4 j slides.pptx
Nd4 j slides.pptx
 
Introduction to Functional Programming
Introduction to Functional ProgrammingIntroduction to Functional Programming
Introduction to Functional Programming
 
Corpus Bootstrapping with NLTK
Corpus Bootstrapping with NLTKCorpus Bootstrapping with NLTK
Corpus Bootstrapping with NLTK
 
Rethink programming: a functional approach
Rethink programming: a functional approachRethink programming: a functional approach
Rethink programming: a functional approach
 
Yahoo answers
Yahoo answersYahoo answers
Yahoo answers
 
Future of ai on the jvm
Future of ai on the jvmFuture of ai on the jvm
Future of ai on the jvm
 
The Next Generation SharePoint: Powered by Text Analytics
The Next Generation SharePoint: Powered by Text AnalyticsThe Next Generation SharePoint: Powered by Text Analytics
The Next Generation SharePoint: Powered by Text Analytics
 
Nltk natural language toolkit overview and application @ PyHug
Nltk  natural language toolkit overview and application @ PyHugNltk  natural language toolkit overview and application @ PyHug
Nltk natural language toolkit overview and application @ PyHug
 
KiwiPyCon 2014 - NLP with Python tutorial
KiwiPyCon 2014 - NLP with Python tutorialKiwiPyCon 2014 - NLP with Python tutorial
KiwiPyCon 2014 - NLP with Python tutorial
 
Overview of text mining and NLP (+software)
Overview of text mining and NLP (+software)Overview of text mining and NLP (+software)
Overview of text mining and NLP (+software)
 
Natural language processing (NLP) introduction
Natural language processing (NLP) introductionNatural language processing (NLP) introduction
Natural language processing (NLP) introduction
 

Similar to Basic NLP with Python and NLTK

GE8151 Problem Solving and Python Programming
GE8151 Problem Solving and Python ProgrammingGE8151 Problem Solving and Python Programming
GE8151 Problem Solving and Python Programming
Muthu Vinayagam
 
仕事で使うF#
仕事で使うF#仕事で使うF#
仕事で使うF#
bleis tift
 
Python basic
Python basic Python basic
Python basic
sewoo lee
 
Class 2: Welcome part 2
Class 2: Welcome part 2Class 2: Welcome part 2
Class 2: Welcome part 2
Marc Gouw
 
Are we ready to Go?
Are we ready to Go?Are we ready to Go?
Are we ready to Go?
Adam Dudczak
 
The Ring programming language version 1.5.3 book - Part 25 of 184
The Ring programming language version 1.5.3 book - Part 25 of 184The Ring programming language version 1.5.3 book - Part 25 of 184
The Ring programming language version 1.5.3 book - Part 25 of 184
Mahmoud Samir Fayed
 
Music as data
Music as dataMusic as data
Music as data
John Vlachoyiannis
 
Profiling and optimization
Profiling and optimizationProfiling and optimization
Profiling and optimization
g3_nittala
 
Python crush course
Python crush coursePython crush course
Python crush course
Mohammed El Rafie Tarabay
 
Introduction to Python Programming | InsideAIML
Introduction to Python Programming | InsideAIMLIntroduction to Python Programming | InsideAIML
Introduction to Python Programming | InsideAIML
VijaySharma802
 
Lecture 5 – Computing with Numbers (Math Lib).pptx
Lecture 5 – Computing with Numbers (Math Lib).pptxLecture 5 – Computing with Numbers (Math Lib).pptx
Lecture 5 – Computing with Numbers (Math Lib).pptx
jovannyflex
 
Lecture 5 – Computing with Numbers (Math Lib).pptx
Lecture 5 – Computing with Numbers (Math Lib).pptxLecture 5 – Computing with Numbers (Math Lib).pptx
Lecture 5 – Computing with Numbers (Math Lib).pptx
jovannyflex
 
Something about Golang
Something about GolangSomething about Golang
Something about Golang
Anton Arhipov
 
Clojure Intro
Clojure IntroClojure Intro
Clojure Intro
thnetos
 
import os import matplotlib-pyplot as plt import pandas as pd import r.docx
import os import matplotlib-pyplot as plt import pandas as pd import r.docximport os import matplotlib-pyplot as plt import pandas as pd import r.docx
import os import matplotlib-pyplot as plt import pandas as pd import r.docx
Blake0FxCampbelld
 
Python
PythonPython
Python
대갑 김
 
8799.pdfOr else the work is fine only. Lot to learn buddy.... Improve your ba...
8799.pdfOr else the work is fine only. Lot to learn buddy.... Improve your ba...8799.pdfOr else the work is fine only. Lot to learn buddy.... Improve your ba...
8799.pdfOr else the work is fine only. Lot to learn buddy.... Improve your ba...
Yashpatel821746
 
Or else the work is fine only. Lot to learn buddy.... Improve your basics in ...
Or else the work is fine only. Lot to learn buddy.... Improve your basics in ...Or else the work is fine only. Lot to learn buddy.... Improve your basics in ...
Or else the work is fine only. Lot to learn buddy.... Improve your basics in ...
Yashpatel821746
 
PYTHONOr else the work is fine only. Lot to learn buddy.... Improve your basi...
PYTHONOr else the work is fine only. Lot to learn buddy.... Improve your basi...PYTHONOr else the work is fine only. Lot to learn buddy.... Improve your basi...
PYTHONOr else the work is fine only. Lot to learn buddy.... Improve your basi...
Yashpatel821746
 
Python Performance 101
Python Performance 101Python Performance 101
Python Performance 101
Ankur Gupta
 

Similar to Basic NLP with Python and NLTK (20)

GE8151 Problem Solving and Python Programming
GE8151 Problem Solving and Python ProgrammingGE8151 Problem Solving and Python Programming
GE8151 Problem Solving and Python Programming
 
仕事で使うF#
仕事で使うF#仕事で使うF#
仕事で使うF#
 
Python basic
Python basic Python basic
Python basic
 
Class 2: Welcome part 2
Class 2: Welcome part 2Class 2: Welcome part 2
Class 2: Welcome part 2
 
Are we ready to Go?
Are we ready to Go?Are we ready to Go?
Are we ready to Go?
 
The Ring programming language version 1.5.3 book - Part 25 of 184
The Ring programming language version 1.5.3 book - Part 25 of 184The Ring programming language version 1.5.3 book - Part 25 of 184
The Ring programming language version 1.5.3 book - Part 25 of 184
 
Music as data
Music as dataMusic as data
Music as data
 
Profiling and optimization
Profiling and optimizationProfiling and optimization
Profiling and optimization
 
Python crush course
Python crush coursePython crush course
Python crush course
 
Introduction to Python Programming | InsideAIML
Introduction to Python Programming | InsideAIMLIntroduction to Python Programming | InsideAIML
Introduction to Python Programming | InsideAIML
 
Lecture 5 – Computing with Numbers (Math Lib).pptx
Lecture 5 – Computing with Numbers (Math Lib).pptxLecture 5 – Computing with Numbers (Math Lib).pptx
Lecture 5 – Computing with Numbers (Math Lib).pptx
 
Lecture 5 – Computing with Numbers (Math Lib).pptx
Lecture 5 – Computing with Numbers (Math Lib).pptxLecture 5 – Computing with Numbers (Math Lib).pptx
Lecture 5 – Computing with Numbers (Math Lib).pptx
 
Something about Golang
Something about GolangSomething about Golang
Something about Golang
 
Clojure Intro
Clojure IntroClojure Intro
Clojure Intro
 
import os import matplotlib-pyplot as plt import pandas as pd import r.docx
import os import matplotlib-pyplot as plt import pandas as pd import r.docximport os import matplotlib-pyplot as plt import pandas as pd import r.docx
import os import matplotlib-pyplot as plt import pandas as pd import r.docx
 
Python
PythonPython
Python
 
8799.pdfOr else the work is fine only. Lot to learn buddy.... Improve your ba...
8799.pdfOr else the work is fine only. Lot to learn buddy.... Improve your ba...8799.pdfOr else the work is fine only. Lot to learn buddy.... Improve your ba...
8799.pdfOr else the work is fine only. Lot to learn buddy.... Improve your ba...
 
Or else the work is fine only. Lot to learn buddy.... Improve your basics in ...
Or else the work is fine only. Lot to learn buddy.... Improve your basics in ...Or else the work is fine only. Lot to learn buddy.... Improve your basics in ...
Or else the work is fine only. Lot to learn buddy.... Improve your basics in ...
 
PYTHONOr else the work is fine only. Lot to learn buddy.... Improve your basi...
PYTHONOr else the work is fine only. Lot to learn buddy.... Improve your basi...PYTHONOr else the work is fine only. Lot to learn buddy.... Improve your basi...
PYTHONOr else the work is fine only. Lot to learn buddy.... Improve your basi...
 
Python Performance 101
Python Performance 101Python Performance 101
Python Performance 101
 

Recently uploaded

原版一比一利兹贝克特大学毕业证(LeedsBeckett毕业证书)如何办理
原版一比一利兹贝克特大学毕业证(LeedsBeckett毕业证书)如何办理原版一比一利兹贝克特大学毕业证(LeedsBeckett毕业证书)如何办理
原版一比一利兹贝克特大学毕业证(LeedsBeckett毕业证书)如何办理
wyddcwye1
 
End-to-end pipeline agility - Berlin Buzzwords 2024
End-to-end pipeline agility - Berlin Buzzwords 2024End-to-end pipeline agility - Berlin Buzzwords 2024
End-to-end pipeline agility - Berlin Buzzwords 2024
Lars Albertsson
 
Challenges of Nation Building-1.pptx with more important
Challenges of Nation Building-1.pptx with more importantChallenges of Nation Building-1.pptx with more important
Challenges of Nation Building-1.pptx with more important
Sm321
 
一比一原版(GWU,GW文凭证书)乔治·华盛顿大学毕业证如何办理
一比一原版(GWU,GW文凭证书)乔治·华盛顿大学毕业证如何办理一比一原版(GWU,GW文凭证书)乔治·华盛顿大学毕业证如何办理
一比一原版(GWU,GW文凭证书)乔治·华盛顿大学毕业证如何办理
bopyb
 
06-12-2024-BudapestDataForum-BuildingReal-timePipelineswithFLaNK AIM
06-12-2024-BudapestDataForum-BuildingReal-timePipelineswithFLaNK AIM06-12-2024-BudapestDataForum-BuildingReal-timePipelineswithFLaNK AIM
06-12-2024-BudapestDataForum-BuildingReal-timePipelineswithFLaNK AIM
Timothy Spann
 
一比一原版(Unimelb毕业证书)墨尔本大学毕业证如何办理
一比一原版(Unimelb毕业证书)墨尔本大学毕业证如何办理一比一原版(Unimelb毕业证书)墨尔本大学毕业证如何办理
一比一原版(Unimelb毕业证书)墨尔本大学毕业证如何办理
xclpvhuk
 
ViewShift: Hassle-free Dynamic Policy Enforcement for Every Data Lake
ViewShift: Hassle-free Dynamic Policy Enforcement for Every Data LakeViewShift: Hassle-free Dynamic Policy Enforcement for Every Data Lake
ViewShift: Hassle-free Dynamic Policy Enforcement for Every Data Lake
Walaa Eldin Moustafa
 
DSSML24_tspann_CodelessGenerativeAIPipelines
DSSML24_tspann_CodelessGenerativeAIPipelinesDSSML24_tspann_CodelessGenerativeAIPipelines
DSSML24_tspann_CodelessGenerativeAIPipelines
Timothy Spann
 
Palo Alto Cortex XDR presentation .......
Palo Alto Cortex XDR presentation .......Palo Alto Cortex XDR presentation .......
Palo Alto Cortex XDR presentation .......
Sachin Paul
 
Global Situational Awareness of A.I. and where its headed
Global Situational Awareness of A.I. and where its headedGlobal Situational Awareness of A.I. and where its headed
Global Situational Awareness of A.I. and where its headed
vikram sood
 
writing report business partner b1+ .pdf
writing report business partner b1+ .pdfwriting report business partner b1+ .pdf
writing report business partner b1+ .pdf
VyNguyen709676
 
Experts live - Improving user adoption with AI
Experts live - Improving user adoption with AIExperts live - Improving user adoption with AI
Experts live - Improving user adoption with AI
jitskeb
 
A presentation that explain the Power BI Licensing
A presentation that explain the Power BI LicensingA presentation that explain the Power BI Licensing
A presentation that explain the Power BI Licensing
AlessioFois2
 
Population Growth in Bataan: The effects of population growth around rural pl...
Population Growth in Bataan: The effects of population growth around rural pl...Population Growth in Bataan: The effects of population growth around rural pl...
Population Growth in Bataan: The effects of population growth around rural pl...
Bill641377
 
"Financial Odyssey: Navigating Past Performance Through Diverse Analytical Lens"
"Financial Odyssey: Navigating Past Performance Through Diverse Analytical Lens""Financial Odyssey: Navigating Past Performance Through Diverse Analytical Lens"
"Financial Odyssey: Navigating Past Performance Through Diverse Analytical Lens"
sameer shah
 
原版制作(unimelb毕业证书)墨尔本大学毕业证Offer一模一样
原版制作(unimelb毕业证书)墨尔本大学毕业证Offer一模一样原版制作(unimelb毕业证书)墨尔本大学毕业证Offer一模一样
原版制作(unimelb毕业证书)墨尔本大学毕业证Offer一模一样
ihavuls
 
一比一原版巴斯大学毕业证(Bath毕业证书)学历如何办理
一比一原版巴斯大学毕业证(Bath毕业证书)学历如何办理一比一原版巴斯大学毕业证(Bath毕业证书)学历如何办理
一比一原版巴斯大学毕业证(Bath毕业证书)学历如何办理
y3i0qsdzb
 
在线办理(英国UCA毕业证书)创意艺术大学毕业证在读证明一模一样
在线办理(英国UCA毕业证书)创意艺术大学毕业证在读证明一模一样在线办理(英国UCA毕业证书)创意艺术大学毕业证在读证明一模一样
在线办理(英国UCA毕业证书)创意艺术大学毕业证在读证明一模一样
v7oacc3l
 
原版一比一多伦多大学毕业证(UofT毕业证书)如何办理
原版一比一多伦多大学毕业证(UofT毕业证书)如何办理原版一比一多伦多大学毕业证(UofT毕业证书)如何办理
原版一比一多伦多大学毕业证(UofT毕业证书)如何办理
mkkikqvo
 
Intelligence supported media monitoring in veterinary medicine
Intelligence supported media monitoring in veterinary medicineIntelligence supported media monitoring in veterinary medicine
Intelligence supported media monitoring in veterinary medicine
AndrzejJarynowski
 

Recently uploaded (20)

原版一比一利兹贝克特大学毕业证(LeedsBeckett毕业证书)如何办理
原版一比一利兹贝克特大学毕业证(LeedsBeckett毕业证书)如何办理原版一比一利兹贝克特大学毕业证(LeedsBeckett毕业证书)如何办理
原版一比一利兹贝克特大学毕业证(LeedsBeckett毕业证书)如何办理
 
End-to-end pipeline agility - Berlin Buzzwords 2024
End-to-end pipeline agility - Berlin Buzzwords 2024End-to-end pipeline agility - Berlin Buzzwords 2024
End-to-end pipeline agility - Berlin Buzzwords 2024
 
Challenges of Nation Building-1.pptx with more important
Challenges of Nation Building-1.pptx with more importantChallenges of Nation Building-1.pptx with more important
Challenges of Nation Building-1.pptx with more important
 
一比一原版(GWU,GW文凭证书)乔治·华盛顿大学毕业证如何办理
一比一原版(GWU,GW文凭证书)乔治·华盛顿大学毕业证如何办理一比一原版(GWU,GW文凭证书)乔治·华盛顿大学毕业证如何办理
一比一原版(GWU,GW文凭证书)乔治·华盛顿大学毕业证如何办理
 
06-12-2024-BudapestDataForum-BuildingReal-timePipelineswithFLaNK AIM
06-12-2024-BudapestDataForum-BuildingReal-timePipelineswithFLaNK AIM06-12-2024-BudapestDataForum-BuildingReal-timePipelineswithFLaNK AIM
06-12-2024-BudapestDataForum-BuildingReal-timePipelineswithFLaNK AIM
 
一比一原版(Unimelb毕业证书)墨尔本大学毕业证如何办理
一比一原版(Unimelb毕业证书)墨尔本大学毕业证如何办理一比一原版(Unimelb毕业证书)墨尔本大学毕业证如何办理
一比一原版(Unimelb毕业证书)墨尔本大学毕业证如何办理
 
ViewShift: Hassle-free Dynamic Policy Enforcement for Every Data Lake
ViewShift: Hassle-free Dynamic Policy Enforcement for Every Data LakeViewShift: Hassle-free Dynamic Policy Enforcement for Every Data Lake
ViewShift: Hassle-free Dynamic Policy Enforcement for Every Data Lake
 
DSSML24_tspann_CodelessGenerativeAIPipelines
DSSML24_tspann_CodelessGenerativeAIPipelinesDSSML24_tspann_CodelessGenerativeAIPipelines
DSSML24_tspann_CodelessGenerativeAIPipelines
 
Palo Alto Cortex XDR presentation .......
Palo Alto Cortex XDR presentation .......Palo Alto Cortex XDR presentation .......
Palo Alto Cortex XDR presentation .......
 
Global Situational Awareness of A.I. and where its headed
Global Situational Awareness of A.I. and where its headedGlobal Situational Awareness of A.I. and where its headed
Global Situational Awareness of A.I. and where its headed
 
writing report business partner b1+ .pdf
writing report business partner b1+ .pdfwriting report business partner b1+ .pdf
writing report business partner b1+ .pdf
 
Experts live - Improving user adoption with AI
Experts live - Improving user adoption with AIExperts live - Improving user adoption with AI
Experts live - Improving user adoption with AI
 
A presentation that explain the Power BI Licensing
A presentation that explain the Power BI LicensingA presentation that explain the Power BI Licensing
A presentation that explain the Power BI Licensing
 
Population Growth in Bataan: The effects of population growth around rural pl...
Population Growth in Bataan: The effects of population growth around rural pl...Population Growth in Bataan: The effects of population growth around rural pl...
Population Growth in Bataan: The effects of population growth around rural pl...
 
"Financial Odyssey: Navigating Past Performance Through Diverse Analytical Lens"
"Financial Odyssey: Navigating Past Performance Through Diverse Analytical Lens""Financial Odyssey: Navigating Past Performance Through Diverse Analytical Lens"
"Financial Odyssey: Navigating Past Performance Through Diverse Analytical Lens"
 
原版制作(unimelb毕业证书)墨尔本大学毕业证Offer一模一样
原版制作(unimelb毕业证书)墨尔本大学毕业证Offer一模一样原版制作(unimelb毕业证书)墨尔本大学毕业证Offer一模一样
原版制作(unimelb毕业证书)墨尔本大学毕业证Offer一模一样
 
一比一原版巴斯大学毕业证(Bath毕业证书)学历如何办理
一比一原版巴斯大学毕业证(Bath毕业证书)学历如何办理一比一原版巴斯大学毕业证(Bath毕业证书)学历如何办理
一比一原版巴斯大学毕业证(Bath毕业证书)学历如何办理
 
在线办理(英国UCA毕业证书)创意艺术大学毕业证在读证明一模一样
在线办理(英国UCA毕业证书)创意艺术大学毕业证在读证明一模一样在线办理(英国UCA毕业证书)创意艺术大学毕业证在读证明一模一样
在线办理(英国UCA毕业证书)创意艺术大学毕业证在读证明一模一样
 
原版一比一多伦多大学毕业证(UofT毕业证书)如何办理
原版一比一多伦多大学毕业证(UofT毕业证书)如何办理原版一比一多伦多大学毕业证(UofT毕业证书)如何办理
原版一比一多伦多大学毕业证(UofT毕业证书)如何办理
 
Intelligence supported media monitoring in veterinary medicine
Intelligence supported media monitoring in veterinary medicineIntelligence supported media monitoring in veterinary medicine
Intelligence supported media monitoring in veterinary medicine
 

Basic NLP with Python and NLTK

  • 1. Basic NLP with Python and NLTK Bruni Francesco (@brunifrancesco) Download the original iPython notebook @ https://github.com/brunifrancesco/nltk_base.git
  • 2. Python - Programming language - Multi-paradigm - Easy to learn - Suitable for multiple needs - Multiple implementations, a ton of useful libraries
  • 3. Basic Python import random a_number = 1 a_string = "Python rocks!" a_list = ["1", "2", "3"] a_dict = {"film":"Pulp fiction", "francesco": "Python"} print(a_dict.values()) a_dict_of_list = {"key":["Carlito's way","The godfather"], "francesco":1} print(len(a_dict_of_list["key"])) a_tuple = ("Goodfellas", "Kill Bill",) a_list.append(4)
  • 4. Creating functions def super_function(number): return number * 2 def factorial(n): if n == 0: return 1 else: return n*factorial(n-1) double = lambda item: item * 2 predicate = lambda item: item > 3 assert super_function(3) == 6 assert factorial(3) == 6 assert double(3) == 6 assert list(filter(predicate, [1,2,5,3])) == [5]
  • 5. And much more - Object oriented paradigm --> classes, metaclasses etc. etc. - Functional programming paradigm --> partials, closures, high order functions etc. etc. - Scripting paradigm --> shell control, os related functions etc.. - Async ops support --> asyncio
  • 6. Reading files with open("file", "r") as input: data = input.read() import csv def read_csv(): with open('data.csv', 'r') as francesco: data = csv.reader(francesco, delimiter=';') for element in data: print(element[1]) read_csv()
  • 7. Make data talk from collections import Counter import statistics splitted_chunks = data.split() print("Data lenght: %s" %len(data)) print("Chunks numbers: %s" %len(splitted_chunks)) print("Unique chunks: %s" %len(set(splitted_chunks))) print("Avg lenght of chunks: %s" %statistics.mean(map(len, splitted_chunks))) print("Std dev lenght of chunks: %s" %statistics.pstdev(map(len, splitted_chunks))) print("Frequency distribution: %s" % sorted(filter(lambda item: item[1] > 5, Counter(splitted_chunks).items()), key=lambda item: item[1]))
  • 8. NLTK - tokenization - stemming - tagging - parsing - semantic reasoning - classification
  • 9. Tokenizing from nltk import word_tokenize tokens = word_tokenize(data) from nltk.tokenize import TweetTokenizer tokenizer = TweetTokenizer(strip_handles=True) s1 = '@remy: This is waaaaayyyy too much for you!!!!!!' tw_tokens = tokenizer.tokenize(s1) print(tw_tokens)
  • 10. Frequency distribution from nltk.book import FreqDist fdist1 = FreqDist(splitted_chunks) most_common = fdist1.most_common(50) fdist1.plot(50, cumulative=True) fdist1.plot(10) print("Max frequency key: %s" %fdist1.max()) print("Occurrencies of 'Parlamento': %s" %fdist1["Parlamento"]) print("Frequency of 'Parlamento': %s"%fdist1.freq('Parlamento'))
  • 11. Cleaning data from nltk.corpus import stopwords def remove_stopword(word): return word not in words import string words = stopwords.words('italian') lowered_chunks = list(map(lambda item: item.lower(), splitted_chunks)) print("Chunks lenght %s" %len(lowered_chunks)) clean_chunks = list(filter(remove_stopword, splitted_chunks)) print("Cleaned chunks (without stopwords) lenght: %s" %len(clean_chunks)) clean_chunks = list(filter(lambda chunk: chunk not in string.punctuation, clean_chunks)) print("Cleaned chunks (without punctuation and stopwords) lenght: %s" %len(clean_chunks)) from nltk.book import FreqDist fdist1 = FreqDist(clean_chunks) most_common = fdist1.most_common(50)
  • 12. Stemming from nltk.stem.porter import * from nltk.stem.snowball import * stemmer = PorterStemmer() stemmer.stem("activities") available_langs = SnowballStemmer.languages sn_stemmer = SnowballStemmer("spanish", ignore_stopwords=True) print(sn_stemmer.stem("ordenador")) from nltk.stem.lancaster import * LancasterStemmer().stem("activities")
  • 13. Custom ngrams finder def find_and_analyze_ngrams(tagged_sent): chunker = RegexpParser(CHUNK_RULE) tree = chunker.parse(tagged_sent) for item in self.__leaves(tree): if not item == tagged_sent: probable_ngram = ' '.join(self.__stemmer.stem( word.lower()) for (word, pos) in item ) if self.__evaluate_polarity_ngram(probable_ngram): yield probable_ngram
  • 14. Classifying data def __get_elements_for_classification(self, lfeats, train_number, classifying=True): train_feats = [] test_feats = [] for label, feats in lfeats.iteritems(): if classifying: train_feats.extend([(feat, label) for feat in feats]) else: cutoff = train_number * len(feats)/10 train_feats.extend([(feat, label) for feat in feats[:cutoff]]) test_feats.extend([(feat, label) for feat in feats[cutoff:]]) nb_classifier = NaiveBayesClassifier.train(train_feats) return train_feats, test_feats, nb_classifier
  • 15. Pointwise Mutual Information $\mathrm{PMI}(X = x, Y = y) = \log \frac{p(X = x, Y = y)}{p(X = x)\,p(Y = y)}$
  • 16. Measure PMI - Read from csv - Preprocess data (tokenize, lower, remove stopwords, punctuation) - Find frequency distribution for unigrams - Find frequency distribution for bigrams - Compute PMI via implemented function - Let NLTK sort bigrams by PMI metric - Write result to CSV file
  • 17. Read data import nltk from nltk.corpus import stopwords import string import random from itertools import chain import math import csv import time def read_data(): """ Read data 'line by line'""" with open('data.csv', 'r') as csvfile: reader = csv.reader(csvfile, delimiter=',') for row in reader: yield row
  • 18. Preprocess def preprocess(data): """ Preprocess data, filtering out stopwords, punctuation and lowering all splitted tokens :param data: the string data to be processed """ italian_stopwords = stopwords.words('italian') splitted_chunks = data.split() lowered_chunks = (item.lower() for item in splitted_chunks) chunks_without_punctuation = (chunk for chunk in lowered_chunks if chunk not in string.punctuation) chunks_without_stopwords = (chunk for chunk in chunks_without_punctuation if chunk not in italian_stopwords) return list(chunks_without_stopwords)
  • 19. Find N-Grams FREQUENCY_TRESHOLD = 2 def find_bigrams(splitted_chunks): """ Find bigrams and filter them by frequency threshold :param splitted_chunks: a list of chunks """ bigrams = nltk.collocations.BigramCollocationFinder.from_words(splitted_chunks) bigrams.apply_freq_filter(FREQUENCY_TRESHOLD) return {bigram: freq for bigram, freq in bigrams.ngram_fd.items()} def find_unigrams(splitted_chunks): """ Find unigrams and filter them by frequency threshold :param splitted_chunks: a list of chunks """ unigrams = nltk.FreqDist(splitted_chunks) return {unigram: freq for unigram, freq in unigrams.items() if freq > FREQUENCY_TRESHOLD - 1}
  • 20. Compute PMI def pmi(word1, word2, unigram_freq, bigram_freq): """ Find PMI measure :param word1: the first word :param word2: the second word :param unigram_freq: the unigram frequency container :param bigram_freq: the bigram frequency container """ prob_word1 = unigram_freq[word1] / sum(unigram_freq.values()) prob_word2 = unigram_freq[word2] / sum(unigram_freq.values()) prob_word1_word2 = bigram_freq[(word1, word2)] / sum(bigram_freq.values()) a = prob_word1_word2/(prob_word1*prob_word2) return round(math.log(a,2),2)
  • 21. Write result to CSV def write_data(result): """ Write result to CSV file :param result: the list to be written to csv file """ with open("result.csv", "a") as output: writer = csv.writer(output, delimiter='*') for row in result: writer.writerow(row)