Basic NLP with Python and NLTK
Bruni Francesco (@brunifrancesco)
Download the original IPython notebook @
https://github.com/brunifrancesco/nltk_base.git
Python
- Programming language
- Multi-paradigm
- Easy to learn
- Suitable for multiple needs
- Multiple implementations, a ton of useful libraries
Basic Python
import random
a_number = 1
a_string = "Python rocks!"
a_list = ["1", "2", "3"]
a_dict = {"film":"Pulp fiction", "francesco": "Python"}
print(a_dict.values())
a_dict_of_list = {"key":["Carlito's way","The godfather"], "francesco":1}
print(len(a_dict_of_list["key"]))
a_tuple = ("Goodfellas", "Kill Bill",)
a_list.append(4)
Creating functions
def super_function(number):
    return number * 2

def factorial(n):
    if n == 0: return 1
    else: return n * factorial(n-1)
double = lambda item: item * 2
predicate = lambda item: item > 3
assert super_function(3) == 6
assert factorial(3) == 6
assert double(3) == 6
assert list(filter(predicate, [1,2,5,3])) == [5]
And much more
- Object oriented paradigm --> classes, metaclasses etc. etc.
- Functional programming paradigm --> partials, closures, higher-order functions etc. etc. (a short sketch follows this list)
- Scripting paradigm --> shell control, os related functions etc..
- Async ops support --> asyncio
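A quick taste of the functional side mentioned above; a minimal sketch (not part of the original deck), showing a partial application and a closure:

from functools import partial

def power(base, exponent):
    return base ** exponent

square = partial(power, exponent=2)   # partial application: exponent is fixed
assert square(4) == 16

def make_counter():
    # a closure: increment() keeps count alive between calls
    count = 0
    def increment():
        nonlocal count
        count += 1
        return count
    return increment

counter = make_counter()
assert counter() == 1 and counter() == 2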
Reading files
with open("file", "r") as input:
    data = input.read()
import csv
def read_csv():
    with open('data.csv', 'r') as francesco:
        data = csv.reader(francesco, delimiter=';')
        for element in data:
            print(element[1])
read_csv()
Make data talk
from collections import Counter
import statistics
splitted_chunks = data.split()
print("Data lenght: %s" %len(data))
print("Chunks numbers: %s" %len(splitted_chunks))
print("Unique chunks: %s" %len(set(splitted_chunks)))
print("Avg lenght of chunks: %s" %statistics.mean(map(len, splitted_chunks)))
print("Std dev lenght of chunks: %s" %statistics.pstdev(map(len, splitted_chunks)))
print("Frequency distribution: %s" %
sorted(filter(lambda item: item[1] > 5,
Counter(splitted_chunks).items()), key=lambda item: item[1]))
NLTK
- tokenization
- stemming
- tagging (a short example follows this list)
- parsing
- semantic reasoning
- classification
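Tokenizing and stemming are covered in the next slides; as a taste of tagging, a minimal sketch (it assumes the standard punkt and averaged_perceptron_tagger models have been downloaded):

import nltk

tokens = nltk.word_tokenize("NLTK makes part-of-speech tagging easy")
print(nltk.pos_tag(tokens))   # a list of (token, POS tag) pairs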
Tokenizing
from nltk import word_tokenize
tokens = word_tokenize(data)
from nltk.tokenize import TweetTokenizer
tokenizer = TweetTokenizer(strip_handles=True)
s1 = '@remy: This is waaaaayyyy too much for you!!!!!!'
tw_tokens = tokenizer.tokenize(s1)
print(tw_tokens)
Frequency distribution
from nltk import FreqDist
fdist1 = FreqDist(splitted_chunks)
most_common = fdist1.most_common(50)
fdist1.plot(50, cumulative=True)
fdist1.plot(10)
print("Max frequency key: %s" %fdist1.max())
print("Occurrencies of 'Parlamento': %s" %fdist1["Parlamento"])
print("Frequency of 'Parlamento': %s"%fdist1.freq('Parlamento'))
Cleaning data
from nltk.corpus import stopwords
import string

words = stopwords.words('italian')

def remove_stopword(word):
    return word not in words

lowered_chunks = list(map(lambda item: item.lower(), splitted_chunks))
print("Chunks length: %s" % len(lowered_chunks))
clean_chunks = list(filter(remove_stopword, lowered_chunks))
print("Cleaned chunks (without stopwords) length: %s" % len(clean_chunks))
clean_chunks = list(filter(lambda chunk: chunk not in string.punctuation, clean_chunks))
print("Cleaned chunks (without punctuation and stopwords) length: %s"
      % len(clean_chunks))
from nltk import FreqDist
fdist1 = FreqDist(clean_chunks)
most_common = fdist1.most_common(50)
Stemming
from nltk.stem.porter import *
from nltk.stem.snowball import *
stemmer = PorterStemmer()
stemmer.stem("activities")
available_langs = SnowballStemmer.languages
sn_stemmer = SnowballStemmer("spanish", ignore_stopwords=True)
print(sn_stemmer.stem("ordenador"))
from nltk.stem.lancaster import *
LancasterStemmer().stem("activities")
Custom ngrams finder
def find_and_analyze_ngrams(self, tagged_sent):
    # method of a larger class: CHUNK_RULE, self.__stemmer, self.__leaves
    # and self.__evaluate_polarity_ngram are defined elsewhere in that class
    chunker = RegexpParser(CHUNK_RULE)
    tree = chunker.parse(tagged_sent)
    for item in self.__leaves(tree):
        if not item == tagged_sent:
            probable_ngram = ' '.join(self.__stemmer.stem(word.lower())
                                      for (word, pos) in item)
            if self.__evaluate_polarity_ngram(probable_ngram):
                yield probable_ngram
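CHUNK_RULE is defined elsewhere in the original class; a hypothetical grammar for RegexpParser could look like this (a minimal sketch, not the author's actual rule):

from nltk import RegexpParser, pos_tag, word_tokenize

CHUNK_RULE = "NP: {<JJ>*<NN.*>+}"   # hypothetical rule: optional adjectives followed by nouns

chunker = RegexpParser(CHUNK_RULE)
tagged = pos_tag(word_tokenize("the quick brown fox jumps over the lazy dog"))
print(chunker.parse(tagged))   # a Tree with NP chunks marked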
Classifying data
def __get_elements_for_classification(self, lfeats, train_number, classifying=True):
    train_feats = []
    test_feats = []
    for label, feats in lfeats.items():  # iteritems() is Python 2 only
        if classifying:
            train_feats.extend([(feat, label) for feat in feats])
        else:
            cutoff = train_number * len(feats) // 10
            train_feats.extend([(feat, label) for feat in feats[:cutoff]])
            test_feats.extend([(feat, label) for feat in feats[cutoff:]])
    nb_classifier = NaiveBayesClassifier.train(train_feats)
    return train_feats, test_feats, nb_classifier
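A minimal sketch of how such a classifier could be fed, assuming lfeats maps each label to a list of bag-of-words feature dicts (labels and sentences here are made up):

from nltk.classify import NaiveBayesClassifier

def bag_of_words(words):
    return {word: True for word in words}

lfeats = {
    "pos": [bag_of_words(s.split()) for s in ("great movie", "loved it")],
    "neg": [bag_of_words(s.split()) for s in ("terrible movie", "hated it")],
}
train_feats = [(feat, label) for label, feats in lfeats.items() for feat in feats]
classifier = NaiveBayesClassifier.train(train_feats)
print(classifier.classify(bag_of_words("great great movie".split())))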
Pointwise Mutual Information
PMI(X = x, Y = y) = log( p(X = x, Y = y) / (p(X = x) * p(Y = y)) )
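For instance, with p(X = x) = 0.2, p(Y = y) = 0.25 and p(X = x, Y = y) = 0.1, PMI = log2(0.1 / (0.2 * 0.25)) = log2(2) = 1 (using a base-2 log, as the pmi() implementation below does): the pair co-occurs twice as often as independence would predict.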
Measure PMI
- Read from csv
- Preprocess data (tokenize, lower, remove stopwords, punctuation)
- Find frequency distribution for unigrams
- Find frequency distribution for bigrams
- Compute PMI via implemented function
- Let NLTK sort bigrams by PMI metric (see the sketch after this list)
- Write result to CSV file
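The "let NLTK sort bigrams by PMI" step can be sketched like this on a toy token list (in the real pipeline the tokens would come from preprocess()):

from nltk.collocations import BigramAssocMeasures, BigramCollocationFinder

tokens = "il gatto dorme il gatto mangia il gatto dorme".split()
bigram_measures = BigramAssocMeasures()
finder = BigramCollocationFinder.from_words(tokens)
finder.apply_freq_filter(2)
print(finder.nbest(bigram_measures.pmi, 5))        # best bigrams by PMI
print(finder.score_ngrams(bigram_measures.pmi))    # [((w1, w2), score), ...]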
Read data
import nltk
from nltk.corpus import stopwords
import string
import random
from itertools import chain
import math
import csv
import time
def read_data():
    """
    Read data line by line
    """
    with open('data.csv', 'r') as csvfile:
        reader = csv.reader(csvfile, delimiter=',')
        for row in reader:
            yield row
Preprocess
def preprocess(data):
    """
    Preprocess data, filtering out stopwords and punctuation and
    lowercasing all split tokens
    :param data: the string data to be processed
    """
    italian_stopwords = stopwords.words('italian')
    splitted_chunks = data.split()
    lowered_chunks = (item.lower() for item in splitted_chunks)
    chunks_without_punctuation = (chunk for chunk in lowered_chunks
                                  if chunk not in string.punctuation)
    chunks_without_stopwords = (chunk for chunk in chunks_without_punctuation
                                if chunk not in italian_stopwords)
    return list(chunks_without_stopwords)
Find N-Grams
FREQUENCY_THRESHOLD = 2

def find_bigrams(splitted_chunks):
    """
    Find bigrams and filter them by frequency threshold
    :param splitted_chunks: a list of chunks
    """
    bigrams = nltk.collocations.BigramCollocationFinder.from_words(splitted_chunks)
    bigrams.apply_freq_filter(FREQUENCY_THRESHOLD)
    return {bigram: freq for bigram, freq in bigrams.ngram_fd.items()}

def find_unigrams(splitted_chunks):
    """
    Find unigrams and filter them by frequency threshold
    :param splitted_chunks: a list of chunks
    """
    unigrams = nltk.FreqDist(splitted_chunks)
    return {unigram: freq for unigram, freq in unigrams.items()
            if freq > FREQUENCY_THRESHOLD - 1}
Compute PMI
def pmi(word1, word2, unigram_freq, bigram_freq):
    """
    Find PMI measure
    :param word1: the first word
    :param word2: the second word
    :param unigram_freq: the unigram frequency container
    :param bigram_freq: the bigram frequency container
    """
    prob_word1 = unigram_freq[word1] / sum(unigram_freq.values())
    prob_word2 = unigram_freq[word2] / sum(unigram_freq.values())
    prob_word1_word2 = bigram_freq[(word1, word2)] / sum(bigram_freq.values())
    a = prob_word1_word2 / (prob_word1 * prob_word2)
    return round(math.log(a, 2), 2)
Write result to CSV
def write_data(result):
    """
    Write result to CSV file
    :param result: the list to be written to csv file
    """
    with open("result.csv", "a") as output:
        writer = csv.writer(output, delimiter='*')
        for row in result:
            writer.writerow(row)
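Putting the pieces together, a minimal sketch of the full pipeline (it assumes each CSV row carries its text in the first column, which may differ from the original data layout):

from itertools import chain

def run():
    rows = read_data()
    # flatten the preprocessed tokens of every row into a single token list
    chunks = list(chain.from_iterable(preprocess(row[0]) for row in rows))
    unigrams = find_unigrams(chunks)
    bigrams = find_bigrams(chunks)
    result = [(word1, word2, pmi(word1, word2, unigrams, bigrams))
              for (word1, word2) in bigrams]
    write_data(result)

run()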
Happy coding :)
