SlideShare a Scribd company logo
import pandas as pd
d = pd.read_csv("YouTube-Spam-Collection-v1/Youtube01-Psy.csv")
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer()
dvec = vectorizer.fit_transform(d['CONTENT'])
dshuf = d.sample(frac=1)
d_train = dshuf[:300]
d_test = dshuf[300:]
d_train_att = vectorizer.fit_transform(d_train['CONTENT']) # fit bag-of-words on training set
d_test_att = vectorizer.transform(d_test['CONTENT']) # reuse on testing set
d_train_label = d_train['CLASS']
d_test_label = d_test['CLASS']
from sklearn.ensemble import RandomForestClassifier
clf = RandomForestClassifier(n_estimators=80)
clf.fit(d_train_att, d_train_label)
print(clf.score(d_test_att, d_test_label))
from sklearn.metrics import confusion_matrix
pred_labels = clf.predict(d_test_att)
print(confusion_matrix(d_test_label, pred_labels))
from sklearn.model_selection import cross_val_score
scores = cross_val_score(clf, d_train_att, d_train_label, cv=5)
# show average score and +/- two standard deviations away (covering 95% of scores)
print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))
# load all datasets and combine them
d = pd.concat([pd.read_csv("YouTube-Spam-Collection-v1/Youtube01-Psy.csv"),
pd.read_csv("YouTube-Spam-Collection-v1/Youtube02-KatyPerry.csv"),
pd.read_csv("YouTube-Spam-Collection-v1/Youtube03-LMFAO.csv"),
pd.read_csv("YouTube-Spam-Collection-v1/Youtube04-Eminem.csv"),
pd.read_csv("YouTube-Spam-Collection-v1/Youtube05-Shakira.csv")])
dshuf = d.sample(frac=1)
d_content = dshuf['CONTENT']
d_label = dshuf['CLASS']
# set up a pipeline
from sklearn.pipeline import Pipeline, make_pipeline
pipeline = Pipeline([
('bag-of-words', CountVectorizer()),
('random forest', RandomForestClassifier()),
])
pipeline
pipeline.fit(d_content[:1500],d_label[:1500])
print(pipeline.score(d_content[1500:], d_label[1500:]))
print(pipeline.predict(["what a neat video!"]))
print(pipeline.predict(["plz subscribe to my channel"]))
scores = cross_val_score(pipeline, d_content, d_label, cv=5)
print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))
# add tfidf
from sklearn.feature_extraction.text import TfidfTransformer
pipeline2 = make_pipeline(CountVectorizer(),
TfidfTransformer(norm=None),
RandomForestClassifier())
scores = cross_val_score(pipeline2, d_content, d_label, cv=5)
print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))
# parameter search
parameters = {
'countvectorizer__max_features': (None, 1000, 2000),
'countvectorizer__ngram_range': ((1, 1), (1, 2)), # unigrams or bigrams
'countvectorizer__stop_words': ('english', None),
'tfidftransformer__use_idf': (True, False), # effectively turn on/off tfidf
'randomforestclassifier__n_estimators': (20, 50, 100)
}
from sklearn.model_selection import GridSearchCV
grid_search = GridSearchCV(pipeline2, parameters, n_jobs=-1, verbose=1)
grid_search.fit(d_content, d_label)
print("Best score: %0.3f" % grid_search.best_score_)
print("Best parameters set:")
best_parameters = grid_search.best_estimator_.get_params()
for param_name in sorted(parameters.keys()):
print("t%s: %r" % (param_name, best_parameters[param_name]))
Detect spam comments youtube videos and app store reviews

More Related Content

Similar to Detect spam comments youtube videos and app store reviews

Assignment 5.2.pdf
Assignment 5.2.pdfAssignment 5.2.pdf
Assignment 5.2.pdf
dash41
 
Assignment 6.2a.pdf
Assignment 6.2a.pdfAssignment 6.2a.pdf
Assignment 6.2a.pdf
dash41
 
Testing My Patience
Testing My PatienceTesting My Patience
Testing My Patience
Adam Lowry
 
I want you to add the output of the F1 score- Precision- ROC AUC- and.pdf
I want you to add the output of the F1 score- Precision- ROC AUC- and.pdfI want you to add the output of the F1 score- Precision- ROC AUC- and.pdf
I want you to add the output of the F1 score- Precision- ROC AUC- and.pdf
MattU5mLambertq
 
Baby Steps to Machine Learning at DevFest Lagos 2019
Baby Steps to Machine Learning at DevFest Lagos 2019Baby Steps to Machine Learning at DevFest Lagos 2019
Baby Steps to Machine Learning at DevFest Lagos 2019
Robert John
 
Python Unit Test
Python Unit TestPython Unit Test
Python Unit TestDavid Xie
 
Grid search (parameter tuning)
Grid search (parameter tuning)Grid search (parameter tuning)
Grid search (parameter tuning)
Akhilesh Joshi
 
K fold
K foldK fold
Naive application of Machine Learning to Software Development
Naive application of Machine Learning to Software DevelopmentNaive application of Machine Learning to Software Development
Naive application of Machine Learning to Software Development
Andriy Khavryuchenko
 
Creating and Maintaining WordPress Plugins
Creating and Maintaining WordPress PluginsCreating and Maintaining WordPress Plugins
Creating and Maintaining WordPress PluginsMark Jaquith
 
Python Ireland Nov 2010 Talk: Unit Testing
Python Ireland Nov 2010 Talk: Unit TestingPython Ireland Nov 2010 Talk: Unit Testing
Python Ireland Nov 2010 Talk: Unit Testing
Python Ireland
 
Hello- I hope you are doing well- I am doing my project- which is Rans (1).pdf
Hello- I hope you are doing well- I am doing my project- which is Rans (1).pdfHello- I hope you are doing well- I am doing my project- which is Rans (1).pdf
Hello- I hope you are doing well- I am doing my project- which is Rans (1).pdf
Ian0J2Bondo
 
Classification examp
Classification exampClassification examp
Classification exampRyan Hong
 
"Subclassing and Composition – A Pythonic Tour of Trade-Offs", Hynek Schlawack
"Subclassing and Composition – A Pythonic Tour of Trade-Offs", Hynek Schlawack"Subclassing and Composition – A Pythonic Tour of Trade-Offs", Hynek Schlawack
"Subclassing and Composition – A Pythonic Tour of Trade-Offs", Hynek Schlawack
Fwdays
 
Ml all programs
Ml all programsMl all programs
Ml all programs
adnaanmohamed
 
I want you to add the output of the F1 score- Precision- ROC AUC- and.pdf
I want you to add the output of the F1 score- Precision- ROC AUC- and.pdfI want you to add the output of the F1 score- Precision- ROC AUC- and.pdf
I want you to add the output of the F1 score- Precision- ROC AUC- and.pdf
GordonF2XPatersonh
 
Python programming : Classes objects
Python programming : Classes objectsPython programming : Classes objects
Python programming : Classes objects
Emertxe Information Technologies Pvt Ltd
 
Machine Learning Algorithms
Machine Learning AlgorithmsMachine Learning Algorithms
Machine Learning Algorithms
Hichem Felouat
 

Similar to Detect spam comments youtube videos and app store reviews (20)

Assignment 5.2.pdf
Assignment 5.2.pdfAssignment 5.2.pdf
Assignment 5.2.pdf
 
Assignment 6.2a.pdf
Assignment 6.2a.pdfAssignment 6.2a.pdf
Assignment 6.2a.pdf
 
Testing My Patience
Testing My PatienceTesting My Patience
Testing My Patience
 
I want you to add the output of the F1 score- Precision- ROC AUC- and.pdf
I want you to add the output of the F1 score- Precision- ROC AUC- and.pdfI want you to add the output of the F1 score- Precision- ROC AUC- and.pdf
I want you to add the output of the F1 score- Precision- ROC AUC- and.pdf
 
Baby Steps to Machine Learning at DevFest Lagos 2019
Baby Steps to Machine Learning at DevFest Lagos 2019Baby Steps to Machine Learning at DevFest Lagos 2019
Baby Steps to Machine Learning at DevFest Lagos 2019
 
Python Unit Test
Python Unit TestPython Unit Test
Python Unit Test
 
Django (Web Konferencia 2009)
Django (Web Konferencia 2009)Django (Web Konferencia 2009)
Django (Web Konferencia 2009)
 
Grid search (parameter tuning)
Grid search (parameter tuning)Grid search (parameter tuning)
Grid search (parameter tuning)
 
K fold
K foldK fold
K fold
 
Naive application of Machine Learning to Software Development
Naive application of Machine Learning to Software DevelopmentNaive application of Machine Learning to Software Development
Naive application of Machine Learning to Software Development
 
Unit test
Unit testUnit test
Unit test
 
Creating and Maintaining WordPress Plugins
Creating and Maintaining WordPress PluginsCreating and Maintaining WordPress Plugins
Creating and Maintaining WordPress Plugins
 
Python Ireland Nov 2010 Talk: Unit Testing
Python Ireland Nov 2010 Talk: Unit TestingPython Ireland Nov 2010 Talk: Unit Testing
Python Ireland Nov 2010 Talk: Unit Testing
 
Hello- I hope you are doing well- I am doing my project- which is Rans (1).pdf
Hello- I hope you are doing well- I am doing my project- which is Rans (1).pdfHello- I hope you are doing well- I am doing my project- which is Rans (1).pdf
Hello- I hope you are doing well- I am doing my project- which is Rans (1).pdf
 
Classification examp
Classification exampClassification examp
Classification examp
 
"Subclassing and Composition – A Pythonic Tour of Trade-Offs", Hynek Schlawack
"Subclassing and Composition – A Pythonic Tour of Trade-Offs", Hynek Schlawack"Subclassing and Composition – A Pythonic Tour of Trade-Offs", Hynek Schlawack
"Subclassing and Composition – A Pythonic Tour of Trade-Offs", Hynek Schlawack
 
Ml all programs
Ml all programsMl all programs
Ml all programs
 
I want you to add the output of the F1 score- Precision- ROC AUC- and.pdf
I want you to add the output of the F1 score- Precision- ROC AUC- and.pdfI want you to add the output of the F1 score- Precision- ROC AUC- and.pdf
I want you to add the output of the F1 score- Precision- ROC AUC- and.pdf
 
Python programming : Classes objects
Python programming : Classes objectsPython programming : Classes objects
Python programming : Classes objects
 
Machine Learning Algorithms
Machine Learning AlgorithmsMachine Learning Algorithms
Machine Learning Algorithms
 

More from Mamoon Ismail Khalid

REMOTE SOLAR MONITORING SYSTEM - A solution to make battery life extend by 300%
REMOTE SOLAR MONITORING SYSTEM - A solution to make battery life extend by 300%REMOTE SOLAR MONITORING SYSTEM - A solution to make battery life extend by 300%
REMOTE SOLAR MONITORING SYSTEM - A solution to make battery life extend by 300%
Mamoon Ismail Khalid
 
Network Traffic Adaptable Image Codec - A solution to make streaming faster
Network Traffic Adaptable Image Codec - A solution to make streaming fasterNetwork Traffic Adaptable Image Codec - A solution to make streaming faster
Network Traffic Adaptable Image Codec - A solution to make streaming faster
Mamoon Ismail Khalid
 
Hospital Management and Inventory Control Solution for Public Hospitals in De...
Hospital Management and Inventory Control Solution for Public Hospitals in De...Hospital Management and Inventory Control Solution for Public Hospitals in De...
Hospital Management and Inventory Control Solution for Public Hospitals in De...
Mamoon Ismail Khalid
 
ATLAS - Product Requirement Document.pdf
ATLAS - Product Requirement Document.pdfATLAS - Product Requirement Document.pdf
ATLAS - Product Requirement Document.pdf
Mamoon Ismail Khalid
 
T(X) Innoway - Prediction Algorithm design.pdf
T(X) Innoway - Prediction Algorithm design.pdfT(X) Innoway - Prediction Algorithm design.pdf
T(X) Innoway - Prediction Algorithm design.pdf
Mamoon Ismail Khalid
 
Joint3DShapeMatching - a fast approach to 3D model matching using MatchALS 3...
Joint3DShapeMatching  - a fast approach to 3D model matching using MatchALS 3...Joint3DShapeMatching  - a fast approach to 3D model matching using MatchALS 3...
Joint3DShapeMatching - a fast approach to 3D model matching using MatchALS 3...
Mamoon Ismail Khalid
 
Golf Swing Analysis and Posture Correction System
Golf Swing Analysis and Posture Correction SystemGolf Swing Analysis and Posture Correction System
Golf Swing Analysis and Posture Correction System
Mamoon Ismail Khalid
 
24 ideas to revive any developing country.pdf
24 ideas to revive any developing country.pdf24 ideas to revive any developing country.pdf
24 ideas to revive any developing country.pdf
Mamoon Ismail Khalid
 
#2 - Smart Bins - Returnable Plastic Ecosystem.pdf
#2 - Smart Bins - Returnable Plastic Ecosystem.pdf#2 - Smart Bins - Returnable Plastic Ecosystem.pdf
#2 - Smart Bins - Returnable Plastic Ecosystem.pdf
Mamoon Ismail Khalid
 
PyTorch to detect Humans Eating Food.pdf
PyTorch to detect Humans Eating Food.pdfPyTorch to detect Humans Eating Food.pdf
PyTorch to detect Humans Eating Food.pdf
Mamoon Ismail Khalid
 
Future of agriculture agriculture - technology is a necessity in 2020 and beyond
Future of agriculture agriculture - technology is a necessity in 2020 and beyondFuture of agriculture agriculture - technology is a necessity in 2020 and beyond
Future of agriculture agriculture - technology is a necessity in 2020 and beyond
Mamoon Ismail Khalid
 
Nano mos25
Nano mos25Nano mos25
Real estate in blockchain (2)
Real estate in blockchain (2)Real estate in blockchain (2)
Real estate in blockchain (2)
Mamoon Ismail Khalid
 
Cohort analysis saa s (1)
Cohort analysis saa s (1)Cohort analysis saa s (1)
Cohort analysis saa s (1)
Mamoon Ismail Khalid
 
ISA backed technology skills platform
ISA backed technology skills platformISA backed technology skills platform
ISA backed technology skills platform
Mamoon Ismail Khalid
 
Start up valuation methods
Start up valuation methodsStart up valuation methods
Start up valuation methods
Mamoon Ismail Khalid
 
Analysis mvp factory
Analysis mvp factoryAnalysis mvp factory
Analysis mvp factory
Mamoon Ismail Khalid
 
Start Up deal/interaction management workflow
Start Up deal/interaction management workflowStart Up deal/interaction management workflow
Start Up deal/interaction management workflow
Mamoon Ismail Khalid
 
Hunter.io scraper
Hunter.io scraperHunter.io scraper
Hunter.io scraper
Mamoon Ismail Khalid
 
Interact with information on largest 100 btc accounts ( python skeleton code)
Interact with information on largest 100 btc accounts ( python skeleton code)Interact with information on largest 100 btc accounts ( python skeleton code)
Interact with information on largest 100 btc accounts ( python skeleton code)
Mamoon Ismail Khalid
 

More from Mamoon Ismail Khalid (20)

REMOTE SOLAR MONITORING SYSTEM - A solution to make battery life extend by 300%
REMOTE SOLAR MONITORING SYSTEM - A solution to make battery life extend by 300%REMOTE SOLAR MONITORING SYSTEM - A solution to make battery life extend by 300%
REMOTE SOLAR MONITORING SYSTEM - A solution to make battery life extend by 300%
 
Network Traffic Adaptable Image Codec - A solution to make streaming faster
Network Traffic Adaptable Image Codec - A solution to make streaming fasterNetwork Traffic Adaptable Image Codec - A solution to make streaming faster
Network Traffic Adaptable Image Codec - A solution to make streaming faster
 
Hospital Management and Inventory Control Solution for Public Hospitals in De...
Hospital Management and Inventory Control Solution for Public Hospitals in De...Hospital Management and Inventory Control Solution for Public Hospitals in De...
Hospital Management and Inventory Control Solution for Public Hospitals in De...
 
ATLAS - Product Requirement Document.pdf
ATLAS - Product Requirement Document.pdfATLAS - Product Requirement Document.pdf
ATLAS - Product Requirement Document.pdf
 
T(X) Innoway - Prediction Algorithm design.pdf
T(X) Innoway - Prediction Algorithm design.pdfT(X) Innoway - Prediction Algorithm design.pdf
T(X) Innoway - Prediction Algorithm design.pdf
 
Joint3DShapeMatching - a fast approach to 3D model matching using MatchALS 3...
Joint3DShapeMatching  - a fast approach to 3D model matching using MatchALS 3...Joint3DShapeMatching  - a fast approach to 3D model matching using MatchALS 3...
Joint3DShapeMatching - a fast approach to 3D model matching using MatchALS 3...
 
Golf Swing Analysis and Posture Correction System
Golf Swing Analysis and Posture Correction SystemGolf Swing Analysis and Posture Correction System
Golf Swing Analysis and Posture Correction System
 
24 ideas to revive any developing country.pdf
24 ideas to revive any developing country.pdf24 ideas to revive any developing country.pdf
24 ideas to revive any developing country.pdf
 
#2 - Smart Bins - Returnable Plastic Ecosystem.pdf
#2 - Smart Bins - Returnable Plastic Ecosystem.pdf#2 - Smart Bins - Returnable Plastic Ecosystem.pdf
#2 - Smart Bins - Returnable Plastic Ecosystem.pdf
 
PyTorch to detect Humans Eating Food.pdf
PyTorch to detect Humans Eating Food.pdfPyTorch to detect Humans Eating Food.pdf
PyTorch to detect Humans Eating Food.pdf
 
Future of agriculture agriculture - technology is a necessity in 2020 and beyond
Future of agriculture agriculture - technology is a necessity in 2020 and beyondFuture of agriculture agriculture - technology is a necessity in 2020 and beyond
Future of agriculture agriculture - technology is a necessity in 2020 and beyond
 
Nano mos25
Nano mos25Nano mos25
Nano mos25
 
Real estate in blockchain (2)
Real estate in blockchain (2)Real estate in blockchain (2)
Real estate in blockchain (2)
 
Cohort analysis saa s (1)
Cohort analysis saa s (1)Cohort analysis saa s (1)
Cohort analysis saa s (1)
 
ISA backed technology skills platform
ISA backed technology skills platformISA backed technology skills platform
ISA backed technology skills platform
 
Start up valuation methods
Start up valuation methodsStart up valuation methods
Start up valuation methods
 
Analysis mvp factory
Analysis mvp factoryAnalysis mvp factory
Analysis mvp factory
 
Start Up deal/interaction management workflow
Start Up deal/interaction management workflowStart Up deal/interaction management workflow
Start Up deal/interaction management workflow
 
Hunter.io scraper
Hunter.io scraperHunter.io scraper
Hunter.io scraper
 
Interact with information on largest 100 btc accounts ( python skeleton code)
Interact with information on largest 100 btc accounts ( python skeleton code)Interact with information on largest 100 btc accounts ( python skeleton code)
Interact with information on largest 100 btc accounts ( python skeleton code)
 

Recently uploaded

Elevating Tactical DDD Patterns Through Object Calisthenics
Elevating Tactical DDD Patterns Through Object CalisthenicsElevating Tactical DDD Patterns Through Object Calisthenics
Elevating Tactical DDD Patterns Through Object Calisthenics
Dorra BARTAGUIZ
 
Introduction to CHERI technology - Cybersecurity
Introduction to CHERI technology - CybersecurityIntroduction to CHERI technology - Cybersecurity
Introduction to CHERI technology - Cybersecurity
mikeeftimakis1
 
Leading Change strategies and insights for effective change management pdf 1.pdf
Leading Change strategies and insights for effective change management pdf 1.pdfLeading Change strategies and insights for effective change management pdf 1.pdf
Leading Change strategies and insights for effective change management pdf 1.pdf
OnBoard
 
Alt. GDG Cloud Southlake #33: Boule & Rebala: Effective AppSec in SDLC using ...
Alt. GDG Cloud Southlake #33: Boule & Rebala: Effective AppSec in SDLC using ...Alt. GDG Cloud Southlake #33: Boule & Rebala: Effective AppSec in SDLC using ...
Alt. GDG Cloud Southlake #33: Boule & Rebala: Effective AppSec in SDLC using ...
James Anderson
 
FIDO Alliance Osaka Seminar: Overview.pdf
FIDO Alliance Osaka Seminar: Overview.pdfFIDO Alliance Osaka Seminar: Overview.pdf
FIDO Alliance Osaka Seminar: Overview.pdf
FIDO Alliance
 
State of ICS and IoT Cyber Threat Landscape Report 2024 preview
State of ICS and IoT Cyber Threat Landscape Report 2024 previewState of ICS and IoT Cyber Threat Landscape Report 2024 preview
State of ICS and IoT Cyber Threat Landscape Report 2024 preview
Prayukth K V
 
RESUME BUILDER APPLICATION Project for students
RESUME BUILDER APPLICATION Project for studentsRESUME BUILDER APPLICATION Project for students
RESUME BUILDER APPLICATION Project for students
KAMESHS29
 
Enhancing Performance with Globus and the Science DMZ
Enhancing Performance with Globus and the Science DMZEnhancing Performance with Globus and the Science DMZ
Enhancing Performance with Globus and the Science DMZ
Globus
 
UiPath Community Day Dubai: AI at Work..
UiPath Community Day Dubai: AI at Work..UiPath Community Day Dubai: AI at Work..
UiPath Community Day Dubai: AI at Work..
UiPathCommunity
 
GraphRAG is All You need? LLM & Knowledge Graph
GraphRAG is All You need? LLM & Knowledge GraphGraphRAG is All You need? LLM & Knowledge Graph
GraphRAG is All You need? LLM & Knowledge Graph
Guy Korland
 
Secstrike : Reverse Engineering & Pwnable tools for CTF.pptx
Secstrike : Reverse Engineering & Pwnable tools for CTF.pptxSecstrike : Reverse Engineering & Pwnable tools for CTF.pptx
Secstrike : Reverse Engineering & Pwnable tools for CTF.pptx
nkrafacyberclub
 
PHP Frameworks: I want to break free (IPC Berlin 2024)
PHP Frameworks: I want to break free (IPC Berlin 2024)PHP Frameworks: I want to break free (IPC Berlin 2024)
PHP Frameworks: I want to break free (IPC Berlin 2024)
Ralf Eggert
 
FIDO Alliance Osaka Seminar: FIDO Security Aspects.pdf
FIDO Alliance Osaka Seminar: FIDO Security Aspects.pdfFIDO Alliance Osaka Seminar: FIDO Security Aspects.pdf
FIDO Alliance Osaka Seminar: FIDO Security Aspects.pdf
FIDO Alliance
 
Accelerate your Kubernetes clusters with Varnish Caching
Accelerate your Kubernetes clusters with Varnish CachingAccelerate your Kubernetes clusters with Varnish Caching
Accelerate your Kubernetes clusters with Varnish Caching
Thijs Feryn
 
DevOps and Testing slides at DASA Connect
DevOps and Testing slides at DASA ConnectDevOps and Testing slides at DASA Connect
DevOps and Testing slides at DASA Connect
Kari Kakkonen
 
Transcript: Selling digital books in 2024: Insights from industry leaders - T...
Transcript: Selling digital books in 2024: Insights from industry leaders - T...Transcript: Selling digital books in 2024: Insights from industry leaders - T...
Transcript: Selling digital books in 2024: Insights from industry leaders - T...
BookNet Canada
 
Securing your Kubernetes cluster_ a step-by-step guide to success !
Securing your Kubernetes cluster_ a step-by-step guide to success !Securing your Kubernetes cluster_ a step-by-step guide to success !
Securing your Kubernetes cluster_ a step-by-step guide to success !
KatiaHIMEUR1
 
Le nuove frontiere dell'AI nell'RPA con UiPath Autopilot™
Le nuove frontiere dell'AI nell'RPA con UiPath Autopilot™Le nuove frontiere dell'AI nell'RPA con UiPath Autopilot™
Le nuove frontiere dell'AI nell'RPA con UiPath Autopilot™
UiPathCommunity
 
Free Complete Python - A step towards Data Science
Free Complete Python - A step towards Data ScienceFree Complete Python - A step towards Data Science
Free Complete Python - A step towards Data Science
RinaMondal9
 
GDG Cloud Southlake #33: Boule & Rebala: Effective AppSec in SDLC using Deplo...
GDG Cloud Southlake #33: Boule & Rebala: Effective AppSec in SDLC using Deplo...GDG Cloud Southlake #33: Boule & Rebala: Effective AppSec in SDLC using Deplo...
GDG Cloud Southlake #33: Boule & Rebala: Effective AppSec in SDLC using Deplo...
James Anderson
 

Recently uploaded (20)

Elevating Tactical DDD Patterns Through Object Calisthenics
Elevating Tactical DDD Patterns Through Object CalisthenicsElevating Tactical DDD Patterns Through Object Calisthenics
Elevating Tactical DDD Patterns Through Object Calisthenics
 
Introduction to CHERI technology - Cybersecurity
Introduction to CHERI technology - CybersecurityIntroduction to CHERI technology - Cybersecurity
Introduction to CHERI technology - Cybersecurity
 
Leading Change strategies and insights for effective change management pdf 1.pdf
Leading Change strategies and insights for effective change management pdf 1.pdfLeading Change strategies and insights for effective change management pdf 1.pdf
Leading Change strategies and insights for effective change management pdf 1.pdf
 
Alt. GDG Cloud Southlake #33: Boule & Rebala: Effective AppSec in SDLC using ...
Alt. GDG Cloud Southlake #33: Boule & Rebala: Effective AppSec in SDLC using ...Alt. GDG Cloud Southlake #33: Boule & Rebala: Effective AppSec in SDLC using ...
Alt. GDG Cloud Southlake #33: Boule & Rebala: Effective AppSec in SDLC using ...
 
FIDO Alliance Osaka Seminar: Overview.pdf
FIDO Alliance Osaka Seminar: Overview.pdfFIDO Alliance Osaka Seminar: Overview.pdf
FIDO Alliance Osaka Seminar: Overview.pdf
 
State of ICS and IoT Cyber Threat Landscape Report 2024 preview
State of ICS and IoT Cyber Threat Landscape Report 2024 previewState of ICS and IoT Cyber Threat Landscape Report 2024 preview
State of ICS and IoT Cyber Threat Landscape Report 2024 preview
 
RESUME BUILDER APPLICATION Project for students
RESUME BUILDER APPLICATION Project for studentsRESUME BUILDER APPLICATION Project for students
RESUME BUILDER APPLICATION Project for students
 
Enhancing Performance with Globus and the Science DMZ
Enhancing Performance with Globus and the Science DMZEnhancing Performance with Globus and the Science DMZ
Enhancing Performance with Globus and the Science DMZ
 
UiPath Community Day Dubai: AI at Work..
UiPath Community Day Dubai: AI at Work..UiPath Community Day Dubai: AI at Work..
UiPath Community Day Dubai: AI at Work..
 
GraphRAG is All You need? LLM & Knowledge Graph
GraphRAG is All You need? LLM & Knowledge GraphGraphRAG is All You need? LLM & Knowledge Graph
GraphRAG is All You need? LLM & Knowledge Graph
 
Secstrike : Reverse Engineering & Pwnable tools for CTF.pptx
Secstrike : Reverse Engineering & Pwnable tools for CTF.pptxSecstrike : Reverse Engineering & Pwnable tools for CTF.pptx
Secstrike : Reverse Engineering & Pwnable tools for CTF.pptx
 
PHP Frameworks: I want to break free (IPC Berlin 2024)
PHP Frameworks: I want to break free (IPC Berlin 2024)PHP Frameworks: I want to break free (IPC Berlin 2024)
PHP Frameworks: I want to break free (IPC Berlin 2024)
 
FIDO Alliance Osaka Seminar: FIDO Security Aspects.pdf
FIDO Alliance Osaka Seminar: FIDO Security Aspects.pdfFIDO Alliance Osaka Seminar: FIDO Security Aspects.pdf
FIDO Alliance Osaka Seminar: FIDO Security Aspects.pdf
 
Accelerate your Kubernetes clusters with Varnish Caching
Accelerate your Kubernetes clusters with Varnish CachingAccelerate your Kubernetes clusters with Varnish Caching
Accelerate your Kubernetes clusters with Varnish Caching
 
DevOps and Testing slides at DASA Connect
DevOps and Testing slides at DASA ConnectDevOps and Testing slides at DASA Connect
DevOps and Testing slides at DASA Connect
 
Transcript: Selling digital books in 2024: Insights from industry leaders - T...
Transcript: Selling digital books in 2024: Insights from industry leaders - T...Transcript: Selling digital books in 2024: Insights from industry leaders - T...
Transcript: Selling digital books in 2024: Insights from industry leaders - T...
 
Securing your Kubernetes cluster_ a step-by-step guide to success !
Securing your Kubernetes cluster_ a step-by-step guide to success !Securing your Kubernetes cluster_ a step-by-step guide to success !
Securing your Kubernetes cluster_ a step-by-step guide to success !
 
Le nuove frontiere dell'AI nell'RPA con UiPath Autopilot™
Le nuove frontiere dell'AI nell'RPA con UiPath Autopilot™Le nuove frontiere dell'AI nell'RPA con UiPath Autopilot™
Le nuove frontiere dell'AI nell'RPA con UiPath Autopilot™
 
Free Complete Python - A step towards Data Science
Free Complete Python - A step towards Data ScienceFree Complete Python - A step towards Data Science
Free Complete Python - A step towards Data Science
 
GDG Cloud Southlake #33: Boule & Rebala: Effective AppSec in SDLC using Deplo...
GDG Cloud Southlake #33: Boule & Rebala: Effective AppSec in SDLC using Deplo...GDG Cloud Southlake #33: Boule & Rebala: Effective AppSec in SDLC using Deplo...
GDG Cloud Southlake #33: Boule & Rebala: Effective AppSec in SDLC using Deplo...
 

Detect spam comments youtube videos and app store reviews

  • 1. import pandas as pd d = pd.read_csv("YouTube-Spam-Collection-v1/Youtube01-Psy.csv") from sklearn.feature_extraction.text import CountVectorizer vectorizer = CountVectorizer() dvec = vectorizer.fit_transform(d['CONTENT']) dshuf = d.sample(frac=1) d_train = dshuf[:300] d_test = dshuf[300:] d_train_att = vectorizer.fit_transform(d_train['CONTENT']) # fit bag-of-words on training set d_test_att = vectorizer.transform(d_test['CONTENT']) # reuse on testing set d_train_label = d_train['CLASS'] d_test_label = d_test['CLASS'] from sklearn.ensemble import RandomForestClassifier clf = RandomForestClassifier(n_estimators=80) clf.fit(d_train_att, d_train_label) print(clf.score(d_test_att, d_test_label)) from sklearn.metrics import confusion_matrix pred_labels = clf.predict(d_test_att) print(confusion_matrix(d_test_label, pred_labels)) from sklearn.model_selection import cross_val_score scores = cross_val_score(clf, d_train_att, d_train_label, cv=5)
  • 2. # show average score and +/- two standard deviations away (covering 95% of scores) print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2)) # load all datasets and combine them d = pd.concat([pd.read_csv("YouTube-Spam-Collection-v1/Youtube01-Psy.csv"), pd.read_csv("YouTube-Spam-Collection-v1/Youtube02-KatyPerry.csv"), pd.read_csv("YouTube-Spam-Collection-v1/Youtube03-LMFAO.csv"), pd.read_csv("YouTube-Spam-Collection-v1/Youtube04-Eminem.csv"), pd.read_csv("YouTube-Spam-Collection-v1/Youtube05-Shakira.csv")]) dshuf = d.sample(frac=1) d_content = dshuf['CONTENT'] d_label = dshuf['CLASS'] # set up a pipeline from sklearn.pipeline import Pipeline, make_pipeline pipeline = Pipeline([ ('bag-of-words', CountVectorizer()), ('random forest', RandomForestClassifier()), ]) pipeline pipeline.fit(d_content[:1500],d_label[:1500]) print(pipeline.score(d_content[1500:], d_label[1500:])) print(pipeline.predict(["what a neat video!"])) print(pipeline.predict(["plz subscribe to my channel"]))
  • 3. scores = cross_val_score(pipeline, d_content, d_label, cv=5) print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2)) # add tfidf from sklearn.feature_extraction.text import TfidfTransformer pipeline2 = make_pipeline(CountVectorizer(), TfidfTransformer(norm=None), RandomForestClassifier()) scores = cross_val_score(pipeline2, d_content, d_label, cv=5) print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2)) # parameter search parameters = { 'countvectorizer__max_features': (None, 1000, 2000), 'countvectorizer__ngram_range': ((1, 1), (1, 2)), # unigrams or bigrams 'countvectorizer__stop_words': ('english', None), 'tfidftransformer__use_idf': (True, False), # effectively turn on/off tfidf 'randomforestclassifier__n_estimators': (20, 50, 100) } from sklearn.model_selection import GridSearchCV grid_search = GridSearchCV(pipeline2, parameters, n_jobs=-1, verbose=1) grid_search.fit(d_content, d_label) print("Best score: %0.3f" % grid_search.best_score_) print("Best parameters set:") best_parameters = grid_search.best_estimator_.get_params() for param_name in sorted(parameters.keys()): print("t%s: %r" % (param_name, best_parameters[param_name]))