# -*- coding: utf-8 -*-
"""
Created on Thu Mar 9 22:26:54 2017
@author: Virat Bhandari
"""
import urllib.request
from bs4 import BeautifulSoup
import re
url = "https://www.bloomberg.com/politics/articles/2017-03-09/u-k-s-johnson-
says-vast-brexit-bill-would-be-not-reasonable"
html = urllib.request.urlopen(url).read()
soup = BeautifulSoup(html, "html.parser")  # explicit parser avoids the bs4 warning
# kill all script and style elements
for script in soup(["script", "style"]):
    script.extract()  # rip it out
# get text
text = soup.get_text()
# break into lines and remove leading and trailing space on each
lines = (line.strip() for line in text.splitlines())
# break multi-headlines into a line each
chunks = (phrase.strip() for line in lines for phrase in line.split("  "))
# drop blank lines
text = '\n'.join(chunk for chunk in chunks if chunk)
print(text)
###################################Text Blob
from textblob import TextBlob
opinion = TextBlob(text)
opinion.sentiment
opinion.tags
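# A quick look at what TextBlob returns here: sentiment is a namedtuple of
# (polarity, subjectivity) and tags is a list of (word, POS-tag) pairs.
print(opinion.sentiment.polarity, opinion.sentiment.subjectivity)
print(opinion.tags[:10])  # first few tagged words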
###############################################
##Removing Noise########################
noise_list = ["Bloomberg", "to", "for", "will", "be", "and", "of", "the", "our", "in", "is", "a",
              "this", "that", "we", ",", "\xa0", "#", "/", ".", "+", "__________________________", "\n"]
def _remove_noise(input_text):
    words = input_text.split()
    noise_free_words = [word for word in words if word not in noise_list]
    noise_free_text = " ".join(noise_free_words)
    return noise_free_text
updated_text = _remove_noise(text)
##Remove via Regex
def _remove_regex(updated_text, regex_pattern):
    urls = re.finditer(regex_pattern, updated_text)
    for i in urls:
        updated_text = re.sub(i.group().strip(), '', updated_text)
    return updated_text
regex_pattern = "#[w]*"
_remove_regex(updated_text, regex_pattern)
cleaned = " ".join(re.findall('[A-Z][^A-Z]*', updated_text))
#####Stemming and Lemmatization
from nltk.stem.wordnet import WordNetLemmatizer
lem = WordNetLemmatizer()
# lemmatize works on single words, so apply it token by token
updated_text = " ".join(lem.lemmatize(w) for w in updated_text.split())
from nltk.stem.porter import PorterStemmer
stem = PorterStemmer()
# Porter stemming also works per word (note: it lowercases tokens)
updated_text = " ".join(stem.stem(w) for w in updated_text.split())
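# A small illustration of the difference on a single word: the lemmatizer
# returns a dictionary form while the stemmer just chops the suffix.
print(lem.lemmatize("studies"), stem.stem("studies"))  # -> study studi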
#from textblob import Word
#w= Word(updated_text)
#w_lem = w.lemmatize()
#w_lem=TextBlob(w_lem)
##POS Tagging
from nltk import word_tokenize, pos_tag
tokens = word_tokenize(updated_text)
print (pos_tag(tokens))
##### Topic Modeling with LDA
import gensim
import gensim.corpora as cp
# Creating the term dictionary of our corpus, where every unique term is assigned an index.
dictionary = cp.Dictionary([updated_text.split()])
# Converting the list of documents (corpus) into a Document-Term Matrix using the dictionary prepared above.
doc_term_matrix = [dictionary.doc2bow(doc) for doc in [updated_text.split()]]
# Creating the object for LDA model using gensim library
Lda = gensim.models.ldamodel.LdaModel
# Running and Training LDA model on the document term matrix
ldamodel = Lda(doc_term_matrix, num_topics=3, id2word = dictionary, passes=50)
# Results
print(ldamodel.print_topics())
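# Optional closer look (a sketch): show_topic returns the top (word, weight)
# pairs for one topic, which is easier to read than the raw print_topics string.
for topic_id in range(3):
    print(topic_id, ldamodel.show_topic(topic_id, topn=5))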
# Creating n-grams, here with N = 2
def generate_ngrams(text, n):
    words = text.split()
    output = []
    for i in range(len(words) - n + 1):
        output.append(words[i:i + n])
    return output
generate_ngrams(updated_text, 2)
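# Example on a toy sentence (not from the article) to show the output shape:
# each n-gram is returned as a list of n consecutive words.
print(generate_ngrams("the quick brown fox", 2))
# -> [['the', 'quick'], ['quick', 'brown'], ['brown', 'fox']]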
# Statistics
# TF - IDF
from sklearn.feature_extraction.text import TfidfVectorizer
obj = TfidfVectorizer()
corpus = updated_text.split()
X = obj.fit_transform(corpus)
print (X)
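# Peek at the result: the learned vocabulary and the matrix shape. Note that
# each "document" here is a single word of updated_text, so the matrix is
# essentially one-hot; normally whole documents are passed to fit_transform.
print(obj.get_feature_names_out()[:10])  # get_feature_names() on older scikit-learn
print(X.shape)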
# TEXT STATISTICS
from textstat.textstat import textstat
if __name__ == '__main__':
    test_data = updated_text
    print(textstat.flesch_reading_ease(test_data))
    print(textstat.smog_index(test_data))
    print(textstat.flesch_kincaid_grade(test_data))
    print(textstat.coleman_liau_index(test_data))
    print(textstat.automated_readability_index(test_data))
    print(textstat.dale_chall_readability_score(test_data))
    print(textstat.difficult_words(test_data))
    print(textstat.linsear_write_formula(test_data))
    print(textstat.gunning_fog(test_data))
    print(textstat.text_standard(test_data))
#Word Embedding
from gensim.models import Word2Vec
#sentences = [['data', 'science'], ['vidhya', 'science', 'data', 'analytics'], ['machine', 'learning'], ['deep', 'learning']]
# train the model on your corpus (Word2Vec expects a list of tokenized sentences)
sentences = [updated_text.split()]
# min_count=1 because the corpus here is a single short document; a higher
# threshold would drop most of the vocabulary
model = Word2Vec(sentences, min_count=1)
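# Hedged sanity check: even so, the words of interest may be missing from the
# vocabulary (tokens were lowercased by the stemming step above).
for w in ("brexit", "eu"):
    if w not in model.wv:
        print(w, "is not in the Word2Vec vocabulary")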
print(model.wv.similarity('brexit', 'eu'))  # word vectors live on model.wv; tokens are lowercase after stemming
##Machine Learning
## Here is code that uses the Naive Bayes classifier from the TextBlob library (built on top of NLTK).
from textblob.classifiers import NaiveBayesClassifier as NBC
from textblob import TextBlob
training_corpus = [
('Brexit EU London Borris.', 'Politics'),
("Theresa May is doing bad politics", 'Politics'),
('He is my badest enemy!', 'Children'),
('My management is poor.', 'Managers'),
('I love this burger.', 'Children'),
('This is an brilliant place!', 'Tourism'),
('I feel very good about these dates.', 'Tourism'),
('This is my best visit to any country.', 'Politics'),
("What an awesome view", 'Tourism'),
('I do not like this dish', 'Children')]
test_corpus = [
("Brexit is not going to be good", 'Politics'),
("I feel brilliant!", 'Tourism'),
('Gary is a friend of mine.', 'Children'),
("I can't believe I'm doing this, Borris.", 'Politics'),
('The date was good.', 'Tourism'),
('I do not enjoy London', 'Politics')]
model = NBC(training_corpus)
print(model.classify("Brexit Borris"))
print(model.classify("badest management poor burger"))
print(model.classify("Brilliant view feel very good"))
## Text Matching / Similarity
def levenshtein(s1, s2):
    if len(s1) > len(s2):
        s1, s2 = s2, s1
    distances = range(len(s1) + 1)
    for index2, char2 in enumerate(s2):
        newDistances = [index2 + 1]
        for index1, char1 in enumerate(s1):
            if char1 == char2:
                newDistances.append(distances[index1])
            else:
                newDistances.append(1 + min((distances[index1],
                                             distances[index1 + 1],
                                             newDistances[-1])))
        distances = newDistances
    return distances[-1]
print(levenshtein("analyze","analyse"))
## Cosine Similarity
## This lets us check how similar two articles are
import math
from collections import Counter
def get_cosine(vec1, vec2):
    common = set(vec1.keys()) & set(vec2.keys())
    numerator = sum(vec1[x] * vec2[x] for x in common)
    sum1 = sum(vec1[x] ** 2 for x in vec1.keys())
    sum2 = sum(vec2[x] ** 2 for x in vec2.keys())
    denominator = math.sqrt(sum1) * math.sqrt(sum2)
    if not denominator:
        return 0.0
    else:
        return float(numerator) / denominator

def text_to_vector(text):
    words = text.split()
    return Counter(words)
text1 = updated_text
text2 = 'article on analytics vidhya is about natural language processing'
vector1 = text_to_vector(text1)
vector2 = text_to_vector(text2)
cosine = get_cosine(vector1, vector2)
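# Print the score; values close to 1.0 mean the two texts share most of their words.
print(cosine)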
