
Python - BeautifulSoup bs4
Eueung Mulyana
http://eueung.github.io/python/bs4
Python CodeLabs | Attribution-ShareAlike CC BY-SA
1 / 20
Agenda
bs4 Basics
Easy Web Scraping
2 / 20
 Basics
3 / 20
Example #1
html_doc="""
<html><head><title>TheDormouse'sstory</title></head>
<body>
<pclass="title"><b>TheDormouse'sstory</b></p>
<pclass="story">Onceuponatimetherewerethreelittlesisters;andtheirnameswere
<ahref="http://example.com/elsie"class="sister"id="link1">Elsie</a>,
<ahref="http://example.com/lacie"class="sister"id="link2">Lacie</a>and
<ahref="http://example.com/tillie"class="sister"id="link3">Tillie</a>;
andtheylivedatthebottomofawell.</p>
<pclass="story">...</p>
"""
frombs4importBeautifulSoup
soup=BeautifulSoup(html_doc,'html.parser')
print(soup.prettify())
<html>
<head>
<title>
TheDormouse'sstory
</title>
</head>
<body>
<pclass="title">
<b>
TheDormouse'sstory
</b>
</p>
<pclass="story">
Onceuponatimetherewerethreelittlesisters;andtheir
<aclass="sister"href="http://example.com/elsie"id="link1
<aclass="sister"href="http://example.com/lacie"id="link2
<aclass="sister"href="http://example.com/tillie"id="link
andtheylivedatthebottomofawell.
</p>
<pclass="story">... </p>
</body>
</html>
4 / 20
Example #2
forlinkinsoup.find_all('a'):
print(link.get('href'))
print(soup.get_text())
printsoup.title #<title>TheDormouse'sstory</title>
printsoup.title.name #u'title'
printsoup.title.string #u'TheDormouse'sstory'
printsoup.title.parent.name#u'head'
printsoup.p #<pclass="title"><b>TheDormouse'sstory</b></p>
printsoup.p['class'] #u'title'
printsoup.a #<aclass="sister"href="http://example.com/elsie"id="link1">Elsie</a>
printsoup.find(id="link3") #<aclass="sister"href="http://example.com/tillie"id="link3">Tillie</a>
printsoup.find_all('a') #[<aclass="sister"href="http://example.com/elsie"id="link1">Elsie</a>,
# <aclass="sister"href="http://example.com/lacie"id="link2">Lacie</a>,
# <aclass="sister"href="http://example.com/tillie"id="link3">Tillie</a>]
<title>TheDormouse'sstory</title>
title
TheDormouse'sstory
head
<pclass="title"><b>TheDormouse'sstory</b></p>
[u'title']
<aclass="sister"href="http://example.com/elsie"id="link1"
<aclass="sister"href="http://example.com/tillie"id="link3"
[<aclass="sister"href="http://example.com/elsie"id="link1"
http://example.com/elsie
http://example.com/lacie
http://example.com/tillie
TheDormouse'sstory
TheDormouse'sstory
Onceuponatimetherewerethreelittlesisters;andtheirna
Elsie,
Lacieand
Tillie;
andtheylivedatthebottomofawell.
...
5 / 20
Example #3
head_tag=soup.head
printhead_tag #<head><title>TheDormouse'sstory</title></head>
printhead_tag.contents #[<title>TheDormouse'sstory</title>]
printhead_tag.string #u'TheDormouse'sstory'
forchildinhead_tag.descendants: #<title>TheDormouse'sstory</title>
print(child) #TheDormouse'sstory
#-------
title_tag=head_tag.contents[0]
printtitle_tag #<title>TheDormouse'sstory</title>
printtitle_tag.contents#[u'TheDormouse'sstory']
printtitle_tag.string #u'TheDormouse'sstory'
text=title_tag.contents[0]
#printtext.contents #AttributeError:'NavigableString'objecthasnoattribute'contents'
forchildintitle_tag.children:
print(child) #TheDormouse'sstory
<head><title>TheDormouse'sstory</title></head>
[<title>TheDormouse'sstory</title>]
TheDormouse'sstory
<title>TheDormouse'sstory</title>
TheDormouse'sstory
#-------
<title>TheDormouse'sstory</title>
[u"TheDormouse'sstory"]
TheDormouse'sstory
TheDormouse'sstory
6 / 20
Example #4
#printsoup.contents
printlen(soup.contents) #1
printsoup.contents[0].name #u'html'
printlen(list(soup.children)) #1
printlen(list(soup.descendants))#25
#--------
print(soup.html.string)#None
#forstringinsoup.strings:
#print(repr(string))
forstringinsoup.stripped_strings:
print(repr(string))
1
html
1
25
None
#--------
u"TheDormouse'sstory"
u"TheDormouse'sstory"
u'Onceuponatimetherewerethreelittlesisters;andtheir
u'Elsie'
u','
u'Lacie'
u'and'
u'Tillie'
u';nandtheylivedatthebottomofawell.'
u'...'
7 / 20
Example #5
soup=BeautifulSoup('<bclass="boldest">Extremelybold</b>'
tag=soup.b
printtype(tag)
printtag.name #u'b'
tag.name="blockquote"
printtag #<blockquoteclass="boldest">Extremelybold</blockquote>
printtag['class'] #u'boldest'
printtag.attrs #{u'class':u'boldest'}
tag['class']='verybold'
tag['id']=1
printtag #<blockquoteclass="verybold"id="1">Extremelybold</blockquote>
#-------------
deltag['class']
deltag['id']
printtag #<blockquote>Extremelybold</blockquote>
#printtag['class'] #KeyError:'class'
print(tag.get('class'))#None
<class'bs4.element.Tag'>
b
<blockquoteclass="boldest">Extremelybold</blockquote>
['boldest']
{'class':['boldest']}
<blockquoteclass="verybold"id="1">Extremelybold</blockquote
#-------------
<blockquote>Extremelybold</blockquote>
None
8 / 20
Example #6
class_soup=BeautifulSoup('<pclass="bodystrikeout"></p>'
printclass_soup.p['class'] #["body","strikeout"]
class_soup=BeautifulSoup('<pclass="body"></p>')
printclass_soup.p['class'] #["body"]
id_soup=BeautifulSoup('<pid="myid"></p>')
printid_soup.p['id'] #'myid'
#----------
rel_soup=BeautifulSoup('<p>Backtothe<arel="index">homepage</a></p>'
printrel_soup.a['rel'] #['index']
rel_soup.a['rel']=['index','contents']
print(rel_soup.p) #<p>Backtothe<arel="indexcontents">homepage</a></p>
#----------
xml_soup=BeautifulSoup('<pclass="bodystrikeout"></p>',
printxml_soup.p['class'] #u'bodystrikeout'
['body','strikeout']
['body']
myid
#----------
['index']
<p>Backtothe<arel="indexcontents">homepage</a></p>
#----------
bodystrikeout
When you turn a tag back into a string, multiple attribute
values are consolidated.
If you parse a document as XML, there are no multi-
valued attributes.
The relattribute specifies the relationship between the
current document and the linked document (Only used if
the hrefattribute is present).
9 / 20
 Easy Web Scraping
@miguelgrinberg
10 / 20
PyCon 2014 - Page Index
11 / 20
PyCon 2014 - Page Details
12 / 20
PyCon 2014 - YouTube
13 / 20
Test #1
importrequests
importbs4
response=requests.get('http://pyvideo.org/category/50/pycon-us-2014'
soup=bs4.BeautifulSoup(response.text)
#atags=soup.select('div#video-summary-contenta[href^=/video]')
links=[a.attrs.get('href')forainsoup.select('div#video-summary-contenta[href^=/video]'
printlinks[1:5]
['/video/2676/2d3d-graphics-with-python-on-mobile-platforms'
links=[a.attrs.get('href')forainsoup.select('div#video-summary-contentstronga[href^=/video]'
pycon-scraper.py #1
importbs4
importre
importrequests
importargparse
frommultiprocessing.poolimportThreadPoolasPool
root_url='http://pyvideo.org'
index_url=root_url+'/category/50/pycon-us-2014'
defget_video_page_urls():
response=requests.get(index_url)
soup=bs4.BeautifulSoup(response.text)
allvids=[a.attrs.get('href')forainsoup.select('div#v
returnallvids[0:10]
14 / 20
Test #2
root_url='http://pyvideo.org'
video_page_url='/video/2668/writing-restful-web-services-with-flask'
video_data={}
response=requests.get(root_url+video_page_url)
soup=bs4.BeautifulSoup(response.text)
video_data['title']=soup.select('div#videoboxh3')[0].get_text()
video_data['speakers']=[a.get_text()forainsoup.select(
video_data['youtube_url']=soup.select('div#sidebara[href^=http://www.youtube.com]'
printvideo_data
{'speakers':[u'MiguelGrinberg'],'youtube_url':u'http://www.youtube.com/watch?v=px_vg9Far1Y'
pycon-scraper.py #2
defget_video_data(video_page_url):
video_data={}
response=requests.get(root_url+video_page_url)
soup=bs4.BeautifulSoup(response.text)
video_data['title']=soup.select('div#videoboxh3')[0].ge
video_data['speakers']=[a.get_text()forainsoup.selec
video_data['youtube_url']=soup.select('div#sidebara[hre
#...
returnvideo_data
15 / 20
Test #3
importrequests
importbs4
importre
response=requests.get('https://www.youtube.com/watch?v=px_vg9Far1Y'
soup=bs4.BeautifulSoup(response.text)
video_data['views']=int(re.sub('[^0-9]','',soup.select('.watch-view-count'
video_data['likes']=int(re.sub('[^0-9]','',soup.select('.like-button-renderer-like-buttonspan.yt-uix-button-content'
video_data['dislikes']=int(re.sub('[^0-9]','',soup.select(
printvideo_data
{'speakers':[u'MiguelGrinberg'],'views':11908,'title':
pycon-scraper.py #3
defget_video_data(video_page_url):
#...
#initializecounters
video_data['views']=0
video_data['likes']=0
video_data['dislikes']=0
try:
response=requests.get(video_data['youtube_url'],hea
soup=bs4.BeautifulSoup(response.text)
video_data['views']=int(re.sub('[^0-9]','',soup.se
video_data['likes']=int(re.sub('[^0-9]','',soup.sel
video_data['dislikes']=int(re.sub('[^0-9]','',soup.
except:
#someorallofthecounterscouldnotbescraped
pass
returnvideo_data
16 / 20
pycon-scraper.py #4
Notes
defshow_video_stats(options):
pool=Pool(8)
video_page_urls=get_video_page_urls()
results=pool.map(get_video_data,video_page_urls)
#-----
func(video):returnvideo[options.sort]
defparse_args():
parser=argparse.ArgumentParser(description='ShowPyCon2014videostatistics.'
parser.add_argument('--sort',metavar='FIELD',choices=[
parser.add_argument('--max',metavar='MAX',type=int,help=
parser.add_argument('--csv',action='store_true',default=
parser.add_argument('--workers',type=int,default=8,help=
returnparser.parse_args()
#ex:pythonpycon-scraper.py--sortviews--max25--workers8
defshow_video_stats(options):
pool=Pool(options.workers)
video_page_urls=get_video_page_urls()
results=sorted(pool.map(get_video_data,video_page_urls)
printlen(results)
max=options.max
ifmaxisNoneormax>len(results):
max=len(results)
ifoptions.csv:
print(u'"title","speakers","views","likes","dislikes"
else:
print(u'Views +1 -1Title(Speakers)')
foriinrange(max):
ifoptions.csv:
print(u'"{0}","{1}",{2},{3},{4}'.format(results[i]
else:
print(u'{0:5d}{1:3d}{2:3d}{3}({4})'.format(res
if__name__=='__main__':
show_video_stats(parse_args())
17 / 20
$> python pycon-scraper.py --sort views
Views +1 -1Title(Speakers)
8608 80 10AnalyzingRapLyricswithPython(JulieLavoie)
7302 28 1BuildingtheApp(MikeBayer)
4603 25 32D/3DgraphicswithPythononmobileplatforms(NikoSkrypnik)
4056 30 0DesigningDjango'sMigrations(AndrewGodwin)
3923 41 0CheapHelicoptersInMyLivingRoom(NedJacksonLovely)
3407 36 2AScenicDrivethroughtheDjangoRequest-ResponseCycle(DanLanger)
3343 33 0AdvancedtechniquesforWebfunctionaltesting(JulienPhalip)
1598 28 0Dataintensivebiologyinthecloud:instrumentingALLthethings(C.TitusBrown)
941 8 0DeliverYourSoftwareInAnEnvelope(AugieFackler,NathanielManista)
751 1 0Closingaddress-PyCon2014(2014/04/13)()
18 / 20
References
1. Beautiful Soup Documentation
2. Easy Web Scraping with Python
3. Generate statistics about PyCon 2014 videos
19 / 20

END
Eueung Mulyana
http://eueung.github.io/python/bs4
Python CodeLabs | Attribution-ShareAlike CC BY-SA
20 / 20

Python beautiful soup - bs4