Basic NLP with Python and NLTK
Bruni Francesco (@brunifrancesco)
Download the original IPython notebook @
https://github.com/brunifrancesco/nltk_base.git
Python
- Programming language
- Multi-paradigm
- Easy to learn
- Suitable for multiple needs
- Multiple implementations, a ton of useful libraries
Basic Python
import random
a_number = 1
a_string = "Python rocks!"
a_list = ["1", "2", "3"]
a_dict = {"film":"Pulp fiction", "francesco": "Python"}
print(a_dict.values())
a_dict_of_list = {"key":["Carlito's way","The godfather"], "francesco":1}
print(len(a_dict_of_list["key"]))
a_tuple = ("Goodfellas", "Kill Bill",)
a_list.append(4)
Creating functions
def super_function(number):
    return number * 2

def factorial(n):
    if n == 0: return 1
    else: return n * factorial(n-1)
double = lambda item: item * 2
predicate = lambda item: item > 3
assert super_function(3) == 6
assert factorial(3) == 6
assert double(3) == 6
assert list(filter(predicate, [1,2,5,3])) == [5]
And much more
- Object oriented paradigm --> classes, metaclasses etc. etc.
- Functional programming paradigm --> partials, closures, higher-order functions etc. etc. (a short sketch follows this list)
- Scripting paradigm --> shell control, os related functions etc..
- Async ops support --> asyncio
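A quick taste of the functional side mentioned above; a minimal sketch (not part of the original deck), showing a partial application and a closure:

from functools import partial

def power(base, exponent):
    return base ** exponent

square = partial(power, exponent=2)   # partial application: exponent is fixed
assert square(4) == 16

def make_counter():
    # a closure: increment() keeps count alive between calls
    count = 0
    def increment():
        nonlocal count
        count += 1
        return count
    return increment

counter = make_counter()
assert counter() == 1 and counter() == 2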
Reading files
with open("file", "r") as input:
    data = input.read()
import csv
def read_csv():
    with open('data.csv', 'r') as francesco:
        data = csv.reader(francesco, delimiter=';')
        for element in data:
            print(element[1])
read_csv()
Make data talk
from collections import Counter
import statistics
splitted_chunks = data.split()
print("Data lenght: %s" %len(data))
print("Chunks numbers: %s" %len(splitted_chunks))
print("Unique chunks: %s" %len(set(splitted_chunks)))
print("Avg lenght of chunks: %s" %statistics.mean(map(len, splitted_chunks)))
print("Std dev lenght of chunks: %s" %statistics.pstdev(map(len, splitted_chunks)))
print("Frequency distribution: %s" %
sorted(filter(lambda item: item[1] > 5,
Counter(splitted_chunks).items()), key=lambda item: item[1]))
NLTK
- tokenization
- stemming
- tagging (a short example follows this list)
- parsing
- semantic reasoning
- classification
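Tokenizing and stemming are covered in the next slides; as a taste of tagging, a minimal sketch (it assumes the standard punkt and averaged_perceptron_tagger models have been downloaded):

import nltk

tokens = nltk.word_tokenize("NLTK makes part-of-speech tagging easy")
print(nltk.pos_tag(tokens))   # a list of (token, POS tag) pairs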
Tokenizing
from nltk import word_tokenize
tokens = word_tokenize(data)
from nltk.tokenize import TweetTokenizer
tokenizer = TweetTokenizer(strip_handles=True)
s1 = '@remy: This is waaaaayyyy too much for you!!!!!!'
tw_tokens = tokenizer.tokenize(s1)
print(tw_tokens)
Frequency distribution
from nltk import FreqDist
fdist1 = FreqDist(splitted_chunks)
most_common = fdist1.most_common(50)
fdist1.plot(50, cumulative=True)
fdist1.plot(10)
print("Max frequency key: %s" %fdist1.max())
print("Occurrencies of 'Parlamento': %s" %fdist1["Parlamento"])
print("Frequency of 'Parlamento': %s"%fdist1.freq('Parlamento'))
Cleaning data
from nltk.corpus import stopwords
import string

words = stopwords.words('italian')

def remove_stopword(word):
    return word not in words

lowered_chunks = list(map(lambda item: item.lower(), splitted_chunks))
print("Chunks length: %s" % len(lowered_chunks))
clean_chunks = list(filter(remove_stopword, lowered_chunks))
print("Cleaned chunks (without stopwords) length: %s" % len(clean_chunks))
clean_chunks = list(filter(lambda chunk: chunk not in string.punctuation, clean_chunks))
print("Cleaned chunks (without punctuation and stopwords) length: %s"
      % len(clean_chunks))
from nltk import FreqDist
fdist1 = FreqDist(clean_chunks)
most_common = fdist1.most_common(50)
Stemming
from nltk.stem.porter import *
from nltk.stem.snowball import *
stemmer = PorterStemmer()
stemmer.stem("activities")
available_langs = SnowballStemmer.languages
sn_stemmer = SnowballStemmer("spanish", ignore_stopwords=True)
print(sn_stemmer.stem("ordenador"))
from nltk.stem.lancaster import *
LancasterStemmer().stem("activities")
Custom ngrams finder
def find_and_analyze_ngrams(self, tagged_sent):
    # method of a larger class: CHUNK_RULE, self.__stemmer, self.__leaves
    # and self.__evaluate_polarity_ngram are defined elsewhere in that class
    chunker = RegexpParser(CHUNK_RULE)
    tree = chunker.parse(tagged_sent)
    for item in self.__leaves(tree):
        if not item == tagged_sent:
            probable_ngram = ' '.join(self.__stemmer.stem(word.lower())
                                      for (word, pos) in item)
            if self.__evaluate_polarity_ngram(probable_ngram):
                yield probable_ngram
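CHUNK_RULE is defined elsewhere in the original class; a hypothetical grammar for RegexpParser could look like this (a minimal sketch, not the author's actual rule):

from nltk import RegexpParser, pos_tag, word_tokenize

CHUNK_RULE = "NP: {<JJ>*<NN.*>+}"   # hypothetical rule: optional adjectives followed by nouns

chunker = RegexpParser(CHUNK_RULE)
tagged = pos_tag(word_tokenize("the quick brown fox jumps over the lazy dog"))
print(chunker.parse(tagged))   # a Tree with NP chunks marked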
Classifying data
def __get_elements_for_classification(self, lfeats, train_number, classifying=True):
    train_feats = []
    test_feats = []
    for label, feats in lfeats.items():  # iteritems() is Python 2 only
        if classifying:
            train_feats.extend([(feat, label) for feat in feats])
        else:
            cutoff = train_number * len(feats) // 10
            train_feats.extend([(feat, label) for feat in feats[:cutoff]])
            test_feats.extend([(feat, label) for feat in feats[cutoff:]])
    nb_classifier = NaiveBayesClassifier.train(train_feats)
    return train_feats, test_feats, nb_classifier
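A minimal sketch of how such a classifier could be fed, assuming lfeats maps each label to a list of bag-of-words feature dicts (labels and sentences here are made up):

from nltk.classify import NaiveBayesClassifier

def bag_of_words(words):
    return {word: True for word in words}

lfeats = {
    "pos": [bag_of_words(s.split()) for s in ("great movie", "loved it")],
    "neg": [bag_of_words(s.split()) for s in ("terrible movie", "hated it")],
}
train_feats = [(feat, label) for label, feats in lfeats.items() for feat in feats]
classifier = NaiveBayesClassifier.train(train_feats)
print(classifier.classify(bag_of_words("great great movie".split())))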
Pointwise Mutual Information
PMI(X = x, Y = y) = log( p(X = x, Y = y) / (p(X = x) * p(Y = y)) )
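For instance, with p(X = x) = 0.2, p(Y = y) = 0.25 and p(X = x, Y = y) = 0.1, PMI = log2(0.1 / (0.2 * 0.25)) = log2(2) = 1 (using a base-2 log, as the pmi() implementation below does): the pair co-occurs twice as often as independence would predict.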
Measure PMI
- Read from csv
- Preprocess data (tokenize, lower, remove stopwords, punctuation)
- Find frequency distribution for unigrams
- Find frequency distribution for bigrams
- Compute PMI via implemented function
- Let NLTK sort bigrams by PMI metric (see the sketch after this list)
- Write result to CSV file
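The "let NLTK sort bigrams by PMI" step can be sketched like this on a toy token list (in the real pipeline the tokens would come from preprocess()):

from nltk.collocations import BigramAssocMeasures, BigramCollocationFinder

tokens = "il gatto dorme il gatto mangia il gatto dorme".split()
bigram_measures = BigramAssocMeasures()
finder = BigramCollocationFinder.from_words(tokens)
finder.apply_freq_filter(2)
print(finder.nbest(bigram_measures.pmi, 5))        # best bigrams by PMI
print(finder.score_ngrams(bigram_measures.pmi))    # [((w1, w2), score), ...]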
Read data
import nltk
from nltk.corpus import stopwords
import string
import random
from itertools import chain
import math
import csv
import time
def read_data():
    """
    Read data line by line
    """
    with open('data.csv', 'r') as csvfile:
        reader = csv.reader(csvfile, delimiter=',')
        for row in reader:
            yield row
Preprocess
def preprocess(data):
    """
    Preprocess data, filtering out stopwords and punctuation and
    lowercasing all split tokens
    :param data: the string data to be processed
    """
    italian_stopwords = stopwords.words('italian')
    splitted_chunks = data.split()
    lowered_chunks = (item.lower() for item in splitted_chunks)
    chunks_without_punctuation = (chunk for chunk in lowered_chunks
                                  if chunk not in string.punctuation)
    chunks_without_stopwords = (chunk for chunk in chunks_without_punctuation
                                if chunk not in italian_stopwords)
    return list(chunks_without_stopwords)
Find N-Grams
FREQUENCY_THRESHOLD = 2

def find_bigrams(splitted_chunks):
    """
    Find bigrams and filter them by frequency threshold
    :param splitted_chunks: a list of chunks
    """
    bigrams = nltk.collocations.BigramCollocationFinder.from_words(splitted_chunks)
    bigrams.apply_freq_filter(FREQUENCY_THRESHOLD)
    return {bigram: freq for bigram, freq in bigrams.ngram_fd.items()}

def find_unigrams(splitted_chunks):
    """
    Find unigrams and filter them by frequency threshold
    :param splitted_chunks: a list of chunks
    """
    unigrams = nltk.FreqDist(splitted_chunks)
    return {unigram: freq for unigram, freq in unigrams.items()
            if freq > FREQUENCY_THRESHOLD - 1}
Compute PMI
def pmi(word1, word2, unigram_freq, bigram_freq):
    """
    Find PMI measure
    :param word1: the first word
    :param word2: the second word
    :param unigram_freq: the unigram frequency container
    :param bigram_freq: the bigram frequency container
    """
    prob_word1 = unigram_freq[word1] / sum(unigram_freq.values())
    prob_word2 = unigram_freq[word2] / sum(unigram_freq.values())
    prob_word1_word2 = bigram_freq[(word1, word2)] / sum(bigram_freq.values())
    a = prob_word1_word2 / (prob_word1 * prob_word2)
    return round(math.log(a, 2), 2)
Write result to CSV
def write_data(result):
    """
    Write result to CSV file
    :param result: the list to be written to csv file
    """
    with open("result.csv", "a") as output:
        writer = csv.writer(output, delimiter='*')
        for row in result:
            writer.writerow(row)
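Putting the pieces together, a minimal sketch of the full pipeline (it assumes each CSV row carries its text in the first column, which may differ from the original data layout):

from itertools import chain

def run():
    rows = read_data()
    # flatten the preprocessed tokens of every row into a single token list
    chunks = list(chain.from_iterable(preprocess(row[0]) for row in rows))
    unigrams = find_unigrams(chunks)
    bigrams = find_bigrams(chunks)
    result = [(word1, word2, pmi(word1, word2, unigrams, bigrams))
              for (word1, word2) in bigrams]
    write_data(result)

run()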
Happy coding :)
