SlideShare a Scribd company logo
1 of 47
Download to read offline
Ruby
Robots

Daniel Cukier
@danicuki
                http://www.flickr.com/photos/flysi/183272970
Relatives


• spiders
• crawlers
• bots
Why robot?
http://www.flickr.com/photos/nhankamer/5016628611
require 'anemone'

Anemone.crawl(url) do |anemone|
  anemone.on_every_page do |page|
      puts page.url
  end
end                 http://www.cantora.mus.br/
                           http://www.cantora.mus.br/fotos
                           http://www.cantora.mus.br/?locale=en
                           http://www.cantora.mus.br/?locale=pt-BR
                           http://www.cantora.mus.br/musicas
                           http://www.cantora.mus.br/videos
                           http://www.cantora.mus.br/agenda
                           http://www.cantora.mus.br/novidades
                           http://www.cantora.mus.br/musicas/baixar
                           http://www.cantora.mus.br/visitors/baixar
                           http://www.cantora.mus.br/social
                           http://www.cantora.mus.br/fotos?locale=pt-BR
                           http://www.cantora.mus.br/musicas?locale=en
                           http://www.cantora.mus.br/fotos?locale=en
XPath
<html>
...
<div class="bla">
  <a>legal</a>
</div>
...
</html>




html_doc = Nokogiri::HTML(html)
info = html_doc.xpath(
  "//div[@class='bla']/a")
info.text
=> legal
XPath
<table id="super">   >> html_doc = Nokogiri::HTML(html)
  <tr>               >> info = html_doc.xpath(
    <td>L1C1</td>      "//table[@id='super']/tr")
    <td>L1C2</td>    >> info.size
                     => 3
  </tr>
  <tr>
                     >> info
    <td>L2C1</td>    => legal
    <td>L2C2</td>
  </tr>              >> info[0].xpath("td").size
  <tr>               => 2
    <td>L3C1</td>
    <td>L3C2</td>    >> info[2].xpath("td")[1].text
  </tr>              => "L3C2"
</table>
rest-client
GET

http://www.flickr.com/photos/amortize/766738216
http://www.flickr.com/photos/abbeychristine/223898960
Good bot

                                                /robots.txt


                                                User-agent: *
                                                Disallow:




http://www.flickr.com/photos/temily/5645585162
Ruby
Robots

Daniel Cukier
@danicuki
                http://www.flickr.com/photos/flysi/183272970
http://www.flickr.com/photos/nephelim/5632618462
maxRowsList=16
>>   body = RestClient.get(url)
 >>   json = JSON.parse(body)
 >>   content = json["Content"]
 >>   content.size
 =>   16
      AHA!!!
 http://.../artistas?maxRowsList=1600&filter=Recentes
 >>   body = RestClient.get(url)
 >>   json = JSON.parse(body)
 >>   content = json["Content"]
 >>   content.size
 =>   1600

http://.../artistas?maxRowsList=1600000&filter=Recentes
 >> content.size
 => 9154

       Bingo!!!
>> b["Content"].map {|c| c["ProfileUrl"]}
["caravella", "tomleite", "jeffersontavares", "rodrigoaraujo",
"jorgemendes", "bossapunk", "daviddepiro", "freetools", "ironia",
"tiagorosa", "outprofile", "lucianokoscky",
"bandateatraldecarona", "tlounge", "almanaque", "razzyoficial",
"cretinosecanalhas", "cincorios", "ninoantunes", "caiocorsalette",
"alinedelima", "thelio", "grupodomdesamba", "ladoz",
"alexandrepontes", "poeiradgua", "betimalu", "leonardobessa",
"kamaross", "marcusdocavaco", "atividadeinformal", "angelkeys",
"locojohn", "forcamusic", "tiaguinhoabreu", "marcelonegrao",
"jstonemghiphop", "uniaoglobal", "bandaefex", "severarock",
"manitu", "sasso", "kakka", "xsopretty", "belepoke", "caixaazul",
"wknd", "bandastarven", "bleiamusic", "3porcentoaocubo",
"lucianoterra", "hipnoia", "influencianegra", "bandaursamaior",
"mariafreitas", "jessejames", "vagnerrockxe", "stageo3",
"lemoneight", "innocence", "dinda", "marcelocapela",
"paulocamoeseoslusiadas", "magnussrock", "bandatheburk",
"mercantes", "bandaturnerock", "flaviasaolli", "tonysagga",
"thiagoponde", "centeio", "grupodeubranco", "bocadeleao",
"eusoueliascardan", "notoriaoficial", "planomasterrock", "rofgod",
"dreemonphc", "chicobrant", "osz", "bandalightspeed",
"cavernadenarnia", "sergiobenevenuto", "viniciusdeoliveira", ...]
email?
phone?
>> html = RestClient.get("http://.../robomacaco")
>> html_doc = Nokogiri::HTML(html)
>> info = html_doc.xpath("//span[@class='name']")
>> info.text
=> "robo-macaco@hotmail.com
RIO DE JANEIRO - RJ - Brasil
21 9675-0199
cookies



cookies = {}
c = "s_nr=12954999; s_v19=12978609471; ... __utmc=206845458"
cook = c.split(";").map {|i| i.strip.split("=")}
cook.each {|u| cookies[u[0]] = u[1]}

RestClient.get(url, :cookies => cookies)
Proxies
http://www.ip-adress.com/proxy_list
>> response = RestClient.get(url)
>> html_doc = Nokogiri::HTML(response)
>> table = html_doc.xpath("//table[@class='proxylist']")
>> lines = table.children
>> lines.shift   # drop the header row
                     Text

        IP
>> lines[1].text
=> "208.52.144.55 document.write(":"+i+r+i+r)
anonymous proxy server-2 minutes ago United States"
<script type="text/javascript">
  z=5;i=8;x=4;l=1;o=9;q=6;n=3;u=2;k=7;r=0;
</script>
JAVASCRIPT
     =
   RUBY



     http://www.flickr.com/photos/drics/4266471776/
<script type="text/javascript">
       z=5;i=8;x=4;l=1;o=9;q=6;n=3;u=2;k=7;r=0;
     </script>


>>   script = html_doc.xpath("//script")[1]
>>   eval script.text
>>   z
=>   5
>>   i
=>   8
>> lines[1].text
=> "208.52.144.55 document.write(":"+i+r+i+r) anonymous
proxy server-2 minutes ago United States"


>> server = lines[1].text.split[0]
=> "208.52.144.55"


>>   digits = lines[1].text.split(")")[0].split("+")
=>   ["208.52.144.55document.write(":"", "i", "r", "i", "r"]
>>   digits.shift
>>   digits
=>   ["i", "r", "i", "r"]
>>   port = digits.map {|c| eval(c)}.join("")
=>   "8080"
                Voilà

RestClient.proxy = "http://#{server}:#{port}"
mechanize
agent = Mechanize.new
site = "http://www.cantora.mus.br"
page = agent.get("#{site}/baixar")
form = page.form
form['visitor[name]'] = 'daniel'
form['visitor[email]'] = "danicuki@gmail.com"
page = agent.submit(form)
tracks = page.links.select { |l| l.href =~ /track/ }
tracks.each do |t|
  file = agent.get("#{site}#{t.href}")
  file.save
end
protection techniques




                     javascript
                  text as image
                        captcha
              don’t be ingenuous
captcha
prove you are not a robot




      YES you can!
3 steps

1. Download Image
2. filter image
3. run OCR software
scaling




http://www.flickr.com/photos/liquene/3330714590
clouds


$ knife ec2 server create
threads
   +
queues
Nessa vida de programador maluco
Me aparece cada situação
De repente um cliente, uma proposta bruta
Pra pegar de um site informação
Você tá louco, esse tipo de crime eu não faço
Se quiser tenho uns amigos lá do sul
Faz pra mim que eu te pago com essa jóia cool

Te dou um ruby
Pra você roubar
Com o seu robô

Quer fazer robô?
É só usar ruby
É só usar ruby
Pra fazer robô

                                http://www.flickr.com/photos/jobafunky/5572503988
Thank you




Daniel Cukier
@danicuki

More Related Content

What's hot

Mojolicious: what works and what doesn't
Mojolicious: what works and what doesn'tMojolicious: what works and what doesn't
Mojolicious: what works and what doesn'tCosimo Streppone
 
H3 경쟁력있는 웹앱 개발을 위한 모바일 js 프레임웍
H3 경쟁력있는 웹앱 개발을 위한 모바일 js 프레임웍H3 경쟁력있는 웹앱 개발을 위한 모바일 js 프레임웍
H3 경쟁력있는 웹앱 개발을 위한 모바일 js 프레임웍민태 김
 
PerlでWeb API入門
PerlでWeb API入門PerlでWeb API入門
PerlでWeb API入門Yusuke Wada
 
Java & Script ─ 清羽
Java & Script ─ 清羽Java & Script ─ 清羽
Java & Script ─ 清羽taobao.com
 
URL Resources
URL ResourcesURL Resources
URL Resourcestombecky
 
Token Based Authentication Systems
Token Based Authentication SystemsToken Based Authentication Systems
Token Based Authentication SystemsHüseyin BABAL
 
Massive device deployment - EclipseCon 2011
Massive device deployment - EclipseCon 2011Massive device deployment - EclipseCon 2011
Massive device deployment - EclipseCon 2011Angelo van der Sijpt
 
R57php 1231677414471772-2
R57php 1231677414471772-2R57php 1231677414471772-2
R57php 1231677414471772-2ady36
 
The Web beyond "usernames & passwords" (OSDC12)
The Web beyond "usernames & passwords" (OSDC12)The Web beyond "usernames & passwords" (OSDC12)
The Web beyond "usernames & passwords" (OSDC12)Francois Marier
 
Persona: in your browsers, killing your passwords
Persona: in your browsers, killing your passwordsPersona: in your browsers, killing your passwords
Persona: in your browsers, killing your passwordsFrancois Marier
 
Keep It Simple Security (Symfony cafe 28-01-2016)
Keep It Simple Security (Symfony cafe 28-01-2016)Keep It Simple Security (Symfony cafe 28-01-2016)
Keep It Simple Security (Symfony cafe 28-01-2016)Oleg Zinchenko
 
Web+GISという視点から見たGISの方向性
Web+GISという視点から見たGISの方向性Web+GISという視点から見たGISの方向性
Web+GISという視点から見たGISの方向性Hidenori Fujimura
 
Angular js活用事例:filydoc
Angular js活用事例:filydocAngular js活用事例:filydoc
Angular js活用事例:filydocKeiichi Kobayashi
 
6.Conocimiento cliente Cuenta Pagos en Linea. (Interlat Group
6.Conocimiento cliente Cuenta Pagos en Linea. (Interlat Group6.Conocimiento cliente Cuenta Pagos en Linea. (Interlat Group
6.Conocimiento cliente Cuenta Pagos en Linea. (Interlat GroupInterlat
 
Beyond Posts & Pages - Structured Content in WordPress
Beyond Posts & Pages - Structured Content in WordPressBeyond Posts & Pages - Structured Content in WordPress
Beyond Posts & Pages - Structured Content in WordPressJohn Eckman
 

What's hot (20)

Mojolicious: what works and what doesn't
Mojolicious: what works and what doesn'tMojolicious: what works and what doesn't
Mojolicious: what works and what doesn't
 
H3 경쟁력있는 웹앱 개발을 위한 모바일 js 프레임웍
H3 경쟁력있는 웹앱 개발을 위한 모바일 js 프레임웍H3 경쟁력있는 웹앱 개발을 위한 모바일 js 프레임웍
H3 경쟁력있는 웹앱 개발을 위한 모바일 js 프레임웍
 
PerlでWeb API入門
PerlでWeb API入門PerlでWeb API入門
PerlでWeb API入門
 
Device deployment
Device deploymentDevice deployment
Device deployment
 
Java & Script ─ 清羽
Java & Script ─ 清羽Java & Script ─ 清羽
Java & Script ─ 清羽
 
URL Resources
URL ResourcesURL Resources
URL Resources
 
Token Based Authentication Systems
Token Based Authentication SystemsToken Based Authentication Systems
Token Based Authentication Systems
 
Massive device deployment - EclipseCon 2011
Massive device deployment - EclipseCon 2011Massive device deployment - EclipseCon 2011
Massive device deployment - EclipseCon 2011
 
R57php 1231677414471772-2
R57php 1231677414471772-2R57php 1231677414471772-2
R57php 1231677414471772-2
 
Rails by example
Rails by exampleRails by example
Rails by example
 
Pecha Kucha
Pecha KuchaPecha Kucha
Pecha Kucha
 
The Web beyond "usernames & passwords" (OSDC12)
The Web beyond "usernames & passwords" (OSDC12)The Web beyond "usernames & passwords" (OSDC12)
The Web beyond "usernames & passwords" (OSDC12)
 
Persona: in your browsers, killing your passwords
Persona: in your browsers, killing your passwordsPersona: in your browsers, killing your passwords
Persona: in your browsers, killing your passwords
 
Keep It Simple Security (Symfony cafe 28-01-2016)
Keep It Simple Security (Symfony cafe 28-01-2016)Keep It Simple Security (Symfony cafe 28-01-2016)
Keep It Simple Security (Symfony cafe 28-01-2016)
 
Blog Hacks 2011
Blog Hacks 2011Blog Hacks 2011
Blog Hacks 2011
 
Web+GISという視点から見たGISの方向性
Web+GISという視点から見たGISの方向性Web+GISという視点から見たGISの方向性
Web+GISという視点から見たGISの方向性
 
Angular js活用事例:filydoc
Angular js活用事例:filydocAngular js活用事例:filydoc
Angular js活用事例:filydoc
 
6.Conocimiento cliente Cuenta Pagos en Linea. (Interlat Group
6.Conocimiento cliente Cuenta Pagos en Linea. (Interlat Group6.Conocimiento cliente Cuenta Pagos en Linea. (Interlat Group
6.Conocimiento cliente Cuenta Pagos en Linea. (Interlat Group
 
API Design
API DesignAPI Design
API Design
 
Beyond Posts & Pages - Structured Content in WordPress
Beyond Posts & Pages - Structured Content in WordPressBeyond Posts & Pages - Structured Content in WordPress
Beyond Posts & Pages - Structured Content in WordPress
 

Similar to Ruby Robots

Velocity EU 2012 - Third party scripts and you
Velocity EU 2012 - Third party scripts and youVelocity EU 2012 - Third party scripts and you
Velocity EU 2012 - Third party scripts and youPatrick Meenan
 
Smashing the stats for fun (and profit)
Smashing the stats for fun (and profit)Smashing the stats for fun (and profit)
Smashing the stats for fun (and profit)Security B-Sides
 
HTTP For the Good or the Bad - FSEC Edition
HTTP For the Good or the Bad - FSEC EditionHTTP For the Good or the Bad - FSEC Edition
HTTP For the Good or the Bad - FSEC EditionXavier Mertens
 
Inspec one tool to rule them all
Inspec one tool to rule them allInspec one tool to rule them all
Inspec one tool to rule them allKimball Johnson
 
Log files: The Overlooked Source of SEO Opportunities
Log files: The Overlooked Source of SEO OpportunitiesLog files: The Overlooked Source of SEO Opportunities
Log files: The Overlooked Source of SEO OpportunitiesRobin Rozhon
 
Using HTML5 For a Great Open Web - Valtech Tech Days
Using HTML5 For a Great Open Web - Valtech Tech DaysUsing HTML5 For a Great Open Web - Valtech Tech Days
Using HTML5 For a Great Open Web - Valtech Tech DaysRobert Nyman
 
Illuminated Hacks -- Where 2.0 101 Tutorial
Illuminated Hacks -- Where 2.0 101 TutorialIlluminated Hacks -- Where 2.0 101 Tutorial
Illuminated Hacks -- Where 2.0 101 Tutorialmikel_maron
 
#NewMeetup Performance
#NewMeetup Performance#NewMeetup Performance
#NewMeetup PerformanceJustin Cataldo
 
Microformats: what are they and why do I care?
Microformats: what are they and why do I care?Microformats: what are they and why do I care?
Microformats: what are they and why do I care?adactio
 
Py conkr 20150829_docker-python
Py conkr 20150829_docker-pythonPy conkr 20150829_docker-python
Py conkr 20150829_docker-pythonEric Ahn
 
Py conkr 20150829_docker-python
Py conkr 20150829_docker-pythonPy conkr 20150829_docker-python
Py conkr 20150829_docker-pythonEric Ahn
 
20111014 mu me_html5
20111014 mu me_html520111014 mu me_html5
20111014 mu me_html5Erik Duval
 
Keep it simple web development stack
Keep it simple web development stackKeep it simple web development stack
Keep it simple web development stackEric Ahn
 

Similar to Ruby Robots (20)

Velocity EU 2012 - Third party scripts and you
Velocity EU 2012 - Third party scripts and youVelocity EU 2012 - Third party scripts and you
Velocity EU 2012 - Third party scripts and you
 
Smashing the stats for fun (and profit)
Smashing the stats for fun (and profit)Smashing the stats for fun (and profit)
Smashing the stats for fun (and profit)
 
Seti 09
Seti 09Seti 09
Seti 09
 
HTTP For the Good or the Bad - FSEC Edition
HTTP For the Good or the Bad - FSEC EditionHTTP For the Good or the Bad - FSEC Edition
HTTP For the Good or the Bad - FSEC Edition
 
Inspec one tool to rule them all
Inspec one tool to rule them allInspec one tool to rule them all
Inspec one tool to rule them all
 
Log files: The Overlooked Source of SEO Opportunities
Log files: The Overlooked Source of SEO OpportunitiesLog files: The Overlooked Source of SEO Opportunities
Log files: The Overlooked Source of SEO Opportunities
 
Using HTML5 For a Great Open Web - Valtech Tech Days
Using HTML5 For a Great Open Web - Valtech Tech DaysUsing HTML5 For a Great Open Web - Valtech Tech Days
Using HTML5 For a Great Open Web - Valtech Tech Days
 
The Devil and HTML5
The Devil and HTML5The Devil and HTML5
The Devil and HTML5
 
Mume HTML5 Intro
Mume HTML5 IntroMume HTML5 Intro
Mume HTML5 Intro
 
Illuminated Hacks -- Where 2.0 101 Tutorial
Illuminated Hacks -- Where 2.0 101 TutorialIlluminated Hacks -- Where 2.0 101 Tutorial
Illuminated Hacks -- Where 2.0 101 Tutorial
 
ApacheCon 2005
ApacheCon 2005ApacheCon 2005
ApacheCon 2005
 
Jabber Bot
Jabber BotJabber Bot
Jabber Bot
 
Api
ApiApi
Api
 
#NewMeetup Performance
#NewMeetup Performance#NewMeetup Performance
#NewMeetup Performance
 
Microformats: what are they and why do I care?
Microformats: what are they and why do I care?Microformats: what are they and why do I care?
Microformats: what are they and why do I care?
 
Py conkr 20150829_docker-python
Py conkr 20150829_docker-pythonPy conkr 20150829_docker-python
Py conkr 20150829_docker-python
 
Py conkr 20150829_docker-python
Py conkr 20150829_docker-pythonPy conkr 20150829_docker-python
Py conkr 20150829_docker-python
 
CEI Email 3.14.03
CEI Email 3.14.03CEI Email 3.14.03
CEI Email 3.14.03
 
20111014 mu me_html5
20111014 mu me_html520111014 mu me_html5
20111014 mu me_html5
 
Keep it simple web development stack
Keep it simple web development stackKeep it simple web development stack
Keep it simple web development stack
 

More from Daniel Cukier

Solidity: Zero to Hero Corporate Training
Solidity: Zero to Hero Corporate TrainingSolidity: Zero to Hero Corporate Training
Solidity: Zero to Hero Corporate TrainingDaniel Cukier
 
Spring e Injeção de Dependência
Spring e Injeção de DependênciaSpring e Injeção de Dependência
Spring e Injeção de DependênciaDaniel Cukier
 
Eficiency and Low Cost: Pro Tips for you to save 50% of your money with Googl...
Eficiency and Low Cost: Pro Tips for you to save 50% of your money with Googl...Eficiency and Low Cost: Pro Tips for you to save 50% of your money with Googl...
Eficiency and Low Cost: Pro Tips for you to save 50% of your money with Googl...Daniel Cukier
 
Startup Communities: From Nascence to Maturity
Startup Communities: From Nascence to MaturityStartup Communities: From Nascence to Maturity
Startup Communities: From Nascence to MaturityDaniel Cukier
 
Technology Startups Ecosystem in China - Lessons to other ecosystems
Technology Startups  Ecosystem in China - Lessons to other ecosystemsTechnology Startups  Ecosystem in China - Lessons to other ecosystems
Technology Startups Ecosystem in China - Lessons to other ecosystemsDaniel Cukier
 
Software Startup Ecosystems Evolution - The New York City Case Study
Software Startup Ecosystems Evolution - The New York City Case StudySoftware Startup Ecosystems Evolution - The New York City Case Study
Software Startup Ecosystems Evolution - The New York City Case StudyDaniel Cukier
 
Maturity model for Startup Ecosystems
Maturity model for Startup EcosystemsMaturity model for Startup Ecosystems
Maturity model for Startup EcosystemsDaniel Cukier
 
Why Google Cloud is so special? Stories from a cloud user
Why Google Cloud is so special?  Stories from a cloud userWhy Google Cloud is so special?  Stories from a cloud user
Why Google Cloud is so special? Stories from a cloud userDaniel Cukier
 
Software Architectures for a Single Person Team
Software Architectures for a Single Person TeamSoftware Architectures for a Single Person Team
Software Architectures for a Single Person TeamDaniel Cukier
 
Introduction to Functional Programming with Scala
Introduction to Functional Programming with ScalaIntroduction to Functional Programming with Scala
Introduction to Functional Programming with ScalaDaniel Cukier
 
O dia a dia de uma Startup
O dia a dia de uma StartupO dia a dia de uma Startup
O dia a dia de uma StartupDaniel Cukier
 
Injeção de Dependência e Testes com Dublês
Injeção de Dependência e Testes com DublêsInjeção de Dependência e Testes com Dublês
Injeção de Dependência e Testes com DublêsDaniel Cukier
 
Selecting Empirical Methods for Software Engineering
Selecting Empirical Methods for Software EngineeringSelecting Empirical Methods for Software Engineering
Selecting Empirical Methods for Software EngineeringDaniel Cukier
 
Is Computer Science Science?
Is Computer Science Science?Is Computer Science Science?
Is Computer Science Science?Daniel Cukier
 
Better Science Through Art
Better Science Through ArtBetter Science Through Art
Better Science Through ArtDaniel Cukier
 
Designed as Designer
Designed as DesignerDesigned as Designer
Designed as DesignerDaniel Cukier
 
When Should You Consider Meta Architectures
When Should You Consider Meta ArchitecturesWhen Should You Consider Meta Architectures
When Should You Consider Meta ArchitecturesDaniel Cukier
 

More from Daniel Cukier (20)

Solidity: Zero to Hero Corporate Training
Solidity: Zero to Hero Corporate TrainingSolidity: Zero to Hero Corporate Training
Solidity: Zero to Hero Corporate Training
 
Spring e Injeção de Dependência
Spring e Injeção de DependênciaSpring e Injeção de Dependência
Spring e Injeção de Dependência
 
Pair programming
Pair programmingPair programming
Pair programming
 
Eficiency and Low Cost: Pro Tips for you to save 50% of your money with Googl...
Eficiency and Low Cost: Pro Tips for you to save 50% of your money with Googl...Eficiency and Low Cost: Pro Tips for you to save 50% of your money with Googl...
Eficiency and Low Cost: Pro Tips for you to save 50% of your money with Googl...
 
Startup Communities: From Nascence to Maturity
Startup Communities: From Nascence to MaturityStartup Communities: From Nascence to Maturity
Startup Communities: From Nascence to Maturity
 
Technology Startups Ecosystem in China - Lessons to other ecosystems
Technology Startups  Ecosystem in China - Lessons to other ecosystemsTechnology Startups  Ecosystem in China - Lessons to other ecosystems
Technology Startups Ecosystem in China - Lessons to other ecosystems
 
Software Startup Ecosystems Evolution - The New York City Case Study
Software Startup Ecosystems Evolution - The New York City Case StudySoftware Startup Ecosystems Evolution - The New York City Case Study
Software Startup Ecosystems Evolution - The New York City Case Study
 
Maturity model for Startup Ecosystems
Maturity model for Startup EcosystemsMaturity model for Startup Ecosystems
Maturity model for Startup Ecosystems
 
Why Google Cloud is so special? Stories from a cloud user
Why Google Cloud is so special?  Stories from a cloud userWhy Google Cloud is so special?  Stories from a cloud user
Why Google Cloud is so special? Stories from a cloud user
 
Software Architectures for a Single Person Team
Software Architectures for a Single Person TeamSoftware Architectures for a Single Person Team
Software Architectures for a Single Person Team
 
Startup Communities
Startup CommunitiesStartup Communities
Startup Communities
 
Introduction to Functional Programming with Scala
Introduction to Functional Programming with ScalaIntroduction to Functional Programming with Scala
Introduction to Functional Programming with Scala
 
Play vs Rails
Play vs RailsPlay vs Rails
Play vs Rails
 
O dia a dia de uma Startup
O dia a dia de uma StartupO dia a dia de uma Startup
O dia a dia de uma Startup
 
Injeção de Dependência e Testes com Dublês
Injeção de Dependência e Testes com DublêsInjeção de Dependência e Testes com Dublês
Injeção de Dependência e Testes com Dublês
 
Selecting Empirical Methods for Software Engineering
Selecting Empirical Methods for Software EngineeringSelecting Empirical Methods for Software Engineering
Selecting Empirical Methods for Software Engineering
 
Is Computer Science Science?
Is Computer Science Science?Is Computer Science Science?
Is Computer Science Science?
 
Better Science Through Art
Better Science Through ArtBetter Science Through Art
Better Science Through Art
 
Designed as Designer
Designed as DesignerDesigned as Designer
Designed as Designer
 
When Should You Consider Meta Architectures
When Should You Consider Meta ArchitecturesWhen Should You Consider Meta Architectures
When Should You Consider Meta Architectures
 

Recently uploaded

Architecting Cloud Native Applications
Architecting Cloud Native ApplicationsArchitecting Cloud Native Applications
Architecting Cloud Native ApplicationsWSO2
 
DBX First Quarter 2024 Investor Presentation
DBX First Quarter 2024 Investor PresentationDBX First Quarter 2024 Investor Presentation
DBX First Quarter 2024 Investor PresentationDropbox
 
Corporate and higher education May webinar.pptx
Corporate and higher education May webinar.pptxCorporate and higher education May webinar.pptx
Corporate and higher education May webinar.pptxRustici Software
 
FWD Group - Insurer Innovation Award 2024
FWD Group - Insurer Innovation Award 2024FWD Group - Insurer Innovation Award 2024
FWD Group - Insurer Innovation Award 2024The Digital Insurer
 
Platformless Horizons for Digital Adaptability
Platformless Horizons for Digital AdaptabilityPlatformless Horizons for Digital Adaptability
Platformless Horizons for Digital AdaptabilityWSO2
 
[BuildWithAI] Introduction to Gemini.pdf
[BuildWithAI] Introduction to Gemini.pdf[BuildWithAI] Introduction to Gemini.pdf
[BuildWithAI] Introduction to Gemini.pdfSandro Moreira
 
Boost Fertility New Invention Ups Success Rates.pdf
Boost Fertility New Invention Ups Success Rates.pdfBoost Fertility New Invention Ups Success Rates.pdf
Boost Fertility New Invention Ups Success Rates.pdfsudhanshuwaghmare1
 
Strategize a Smooth Tenant-to-tenant Migration and Copilot Takeoff
Strategize a Smooth Tenant-to-tenant Migration and Copilot TakeoffStrategize a Smooth Tenant-to-tenant Migration and Copilot Takeoff
Strategize a Smooth Tenant-to-tenant Migration and Copilot Takeoffsammart93
 
WSO2's API Vision: Unifying Control, Empowering Developers
WSO2's API Vision: Unifying Control, Empowering DevelopersWSO2's API Vision: Unifying Control, Empowering Developers
WSO2's API Vision: Unifying Control, Empowering DevelopersWSO2
 
Apidays New York 2024 - The value of a flexible API Management solution for O...
Apidays New York 2024 - The value of a flexible API Management solution for O...Apidays New York 2024 - The value of a flexible API Management solution for O...
Apidays New York 2024 - The value of a flexible API Management solution for O...apidays
 
Finding Java's Hidden Performance Traps @ DevoxxUK 2024
Finding Java's Hidden Performance Traps @ DevoxxUK 2024Finding Java's Hidden Performance Traps @ DevoxxUK 2024
Finding Java's Hidden Performance Traps @ DevoxxUK 2024Victor Rentea
 
ProductAnonymous-April2024-WinProductDiscovery-MelissaKlemke
ProductAnonymous-April2024-WinProductDiscovery-MelissaKlemkeProductAnonymous-April2024-WinProductDiscovery-MelissaKlemke
ProductAnonymous-April2024-WinProductDiscovery-MelissaKlemkeProduct Anonymous
 
Mcleodganj Call Girls 🥰 8617370543 Service Offer VIP Hot Model
Mcleodganj Call Girls 🥰 8617370543 Service Offer VIP Hot ModelMcleodganj Call Girls 🥰 8617370543 Service Offer VIP Hot Model
Mcleodganj Call Girls 🥰 8617370543 Service Offer VIP Hot ModelDeepika Singh
 
Repurposing LNG terminals for Hydrogen Ammonia: Feasibility and Cost Saving
Repurposing LNG terminals for Hydrogen Ammonia: Feasibility and Cost SavingRepurposing LNG terminals for Hydrogen Ammonia: Feasibility and Cost Saving
Repurposing LNG terminals for Hydrogen Ammonia: Feasibility and Cost SavingEdi Saputra
 
Six Myths about Ontologies: The Basics of Formal Ontology
Six Myths about Ontologies: The Basics of Formal OntologySix Myths about Ontologies: The Basics of Formal Ontology
Six Myths about Ontologies: The Basics of Formal Ontologyjohnbeverley2021
 
Exploring Multimodal Embeddings with Milvus
Exploring Multimodal Embeddings with MilvusExploring Multimodal Embeddings with Milvus
Exploring Multimodal Embeddings with MilvusZilliz
 
Vector Search -An Introduction in Oracle Database 23ai.pptx
Vector Search -An Introduction in Oracle Database 23ai.pptxVector Search -An Introduction in Oracle Database 23ai.pptx
Vector Search -An Introduction in Oracle Database 23ai.pptxRemote DBA Services
 
EMPOWERMENT TECHNOLOGY GRADE 11 QUARTER 2 REVIEWER
EMPOWERMENT TECHNOLOGY GRADE 11 QUARTER 2 REVIEWEREMPOWERMENT TECHNOLOGY GRADE 11 QUARTER 2 REVIEWER
EMPOWERMENT TECHNOLOGY GRADE 11 QUARTER 2 REVIEWERMadyBayot
 

Recently uploaded (20)

Architecting Cloud Native Applications
Architecting Cloud Native ApplicationsArchitecting Cloud Native Applications
Architecting Cloud Native Applications
 
DBX First Quarter 2024 Investor Presentation
DBX First Quarter 2024 Investor PresentationDBX First Quarter 2024 Investor Presentation
DBX First Quarter 2024 Investor Presentation
 
Corporate and higher education May webinar.pptx
Corporate and higher education May webinar.pptxCorporate and higher education May webinar.pptx
Corporate and higher education May webinar.pptx
 
FWD Group - Insurer Innovation Award 2024
FWD Group - Insurer Innovation Award 2024FWD Group - Insurer Innovation Award 2024
FWD Group - Insurer Innovation Award 2024
 
Platformless Horizons for Digital Adaptability
Platformless Horizons for Digital AdaptabilityPlatformless Horizons for Digital Adaptability
Platformless Horizons for Digital Adaptability
 
Understanding the FAA Part 107 License ..
Understanding the FAA Part 107 License ..Understanding the FAA Part 107 License ..
Understanding the FAA Part 107 License ..
 
+971581248768>> SAFE AND ORIGINAL ABORTION PILLS FOR SALE IN DUBAI AND ABUDHA...
+971581248768>> SAFE AND ORIGINAL ABORTION PILLS FOR SALE IN DUBAI AND ABUDHA...+971581248768>> SAFE AND ORIGINAL ABORTION PILLS FOR SALE IN DUBAI AND ABUDHA...
+971581248768>> SAFE AND ORIGINAL ABORTION PILLS FOR SALE IN DUBAI AND ABUDHA...
 
[BuildWithAI] Introduction to Gemini.pdf
[BuildWithAI] Introduction to Gemini.pdf[BuildWithAI] Introduction to Gemini.pdf
[BuildWithAI] Introduction to Gemini.pdf
 
Boost Fertility New Invention Ups Success Rates.pdf
Boost Fertility New Invention Ups Success Rates.pdfBoost Fertility New Invention Ups Success Rates.pdf
Boost Fertility New Invention Ups Success Rates.pdf
 
Strategize a Smooth Tenant-to-tenant Migration and Copilot Takeoff
Strategize a Smooth Tenant-to-tenant Migration and Copilot TakeoffStrategize a Smooth Tenant-to-tenant Migration and Copilot Takeoff
Strategize a Smooth Tenant-to-tenant Migration and Copilot Takeoff
 
WSO2's API Vision: Unifying Control, Empowering Developers
WSO2's API Vision: Unifying Control, Empowering DevelopersWSO2's API Vision: Unifying Control, Empowering Developers
WSO2's API Vision: Unifying Control, Empowering Developers
 
Apidays New York 2024 - The value of a flexible API Management solution for O...
Apidays New York 2024 - The value of a flexible API Management solution for O...Apidays New York 2024 - The value of a flexible API Management solution for O...
Apidays New York 2024 - The value of a flexible API Management solution for O...
 
Finding Java's Hidden Performance Traps @ DevoxxUK 2024
Finding Java's Hidden Performance Traps @ DevoxxUK 2024Finding Java's Hidden Performance Traps @ DevoxxUK 2024
Finding Java's Hidden Performance Traps @ DevoxxUK 2024
 
ProductAnonymous-April2024-WinProductDiscovery-MelissaKlemke
ProductAnonymous-April2024-WinProductDiscovery-MelissaKlemkeProductAnonymous-April2024-WinProductDiscovery-MelissaKlemke
ProductAnonymous-April2024-WinProductDiscovery-MelissaKlemke
 
Mcleodganj Call Girls 🥰 8617370543 Service Offer VIP Hot Model
Mcleodganj Call Girls 🥰 8617370543 Service Offer VIP Hot ModelMcleodganj Call Girls 🥰 8617370543 Service Offer VIP Hot Model
Mcleodganj Call Girls 🥰 8617370543 Service Offer VIP Hot Model
 
Repurposing LNG terminals for Hydrogen Ammonia: Feasibility and Cost Saving
Repurposing LNG terminals for Hydrogen Ammonia: Feasibility and Cost SavingRepurposing LNG terminals for Hydrogen Ammonia: Feasibility and Cost Saving
Repurposing LNG terminals for Hydrogen Ammonia: Feasibility and Cost Saving
 
Six Myths about Ontologies: The Basics of Formal Ontology
Six Myths about Ontologies: The Basics of Formal OntologySix Myths about Ontologies: The Basics of Formal Ontology
Six Myths about Ontologies: The Basics of Formal Ontology
 
Exploring Multimodal Embeddings with Milvus
Exploring Multimodal Embeddings with MilvusExploring Multimodal Embeddings with Milvus
Exploring Multimodal Embeddings with Milvus
 
Vector Search -An Introduction in Oracle Database 23ai.pptx
Vector Search -An Introduction in Oracle Database 23ai.pptxVector Search -An Introduction in Oracle Database 23ai.pptx
Vector Search -An Introduction in Oracle Database 23ai.pptx
 
EMPOWERMENT TECHNOLOGY GRADE 11 QUARTER 2 REVIEWER
EMPOWERMENT TECHNOLOGY GRADE 11 QUARTER 2 REVIEWEREMPOWERMENT TECHNOLOGY GRADE 11 QUARTER 2 REVIEWER
EMPOWERMENT TECHNOLOGY GRADE 11 QUARTER 2 REVIEWER
 

Ruby Robots

  • 1. Ruby Robots Daniel Cukier @danicuki http://www.flickr.com/photos/flysi/183272970
  • 2.
  • 6. require 'anemone' Anemone.crawl(url) do |anemone| anemone.on_every_page do |page| puts page.url end end http://www.cantora.mus.br/ http://www.cantora.mus.br/fotos http://www.cantora.mus.br/?locale=en http://www.cantora.mus.br/?locale=pt-BR http://www.cantora.mus.br/musicas http://www.cantora.mus.br/videos http://www.cantora.mus.br/agenda http://www.cantora.mus.br/novidades http://www.cantora.mus.br/musicas/baixar http://www.cantora.mus.br/visitors/baixar http://www.cantora.mus.br/social http://www.cantora.mus.br/fotos?locale=pt-BR http://www.cantora.mus.br/musicas?locale=en http://www.cantora.mus.br/fotos?locale=en
  • 7.
  • 8. XPath <html> ... <div class="bla"> <a>legal</a> </div> ... </html> html_doc = Nokogiri::HTML(html) info = html_doc.xpath( "//div[@class='bla']/a") info.text => legal
  • 9. XPath <table id="super"> >> html_doc = Nokogiri::HTML(html) <tr> >> info = html_doc.xpath( <td>L1C1</td> "//table[@id='super']/tr") <td>L1C2</td> >> info.size => 3 </tr> <tr> >> info <td>L2C1</td> => legal <td>L2C2</td> </tr> >> info[0].xpath("td").size <tr> => 2 <td>L3C1</td> <td>L3C2</td> >> info[2].xpath("td")[1].text </tr> => "L3C2" </table>
  • 13. Good bot /robots.txt User-agent: * Disallow: http://www.flickr.com/photos/temily/5645585162
  • 14. Ruby Robots Daniel Cukier @danicuki http://www.flickr.com/photos/flysi/183272970
  • 16.
  • 18.
  • 19.
  • 20. >> body = RestClient.get(url) >> json = JSON.parse(body) >> content = json["Content"] >> content.size => 16 AHA!!! http://.../artistas?maxRowsList=1600&filter=Recentes >> body = RestClient.get(url) >> json = JSON.parse(body) >> content = json["Content"] >> content.size => 1600 http://.../artistas?maxRowsList=1600000&filter=Recentes >> content.size => 9154 Bingo!!!
  • 21. >> b["Content"].map {|c| c["ProfileUrl"]} ["caravella", "tomleite", "jeffersontavares", "rodrigoaraujo", "jorgemendes", "bossapunk", "daviddepiro", "freetools", "ironia", "tiagorosa", "outprofile", "lucianokoscky", "bandateatraldecarona", "tlounge", "almanaque", "razzyoficial", "cretinosecanalhas", "cincorios", "ninoantunes", "caiocorsalette", "alinedelima", "thelio", "grupodomdesamba", "ladoz", "alexandrepontes", "poeiradgua", "betimalu", "leonardobessa", "kamaross", "marcusdocavaco", "atividadeinformal", "angelkeys", "locojohn", "forcamusic", "tiaguinhoabreu", "marcelonegrao", "jstonemghiphop", "uniaoglobal", "bandaefex", "severarock", "manitu", "sasso", "kakka", "xsopretty", "belepoke", "caixaazul", "wknd", "bandastarven", "bleiamusic", "3porcentoaocubo", "lucianoterra", "hipnoia", "influencianegra", "bandaursamaior", "mariafreitas", "jessejames", "vagnerrockxe", "stageo3", "lemoneight", "innocence", "dinda", "marcelocapela", "paulocamoeseoslusiadas", "magnussrock", "bandatheburk", "mercantes", "bandaturnerock", "flaviasaolli", "tonysagga", "thiagoponde", "centeio", "grupodeubranco", "bocadeleao", "eusoueliascardan", "notoriaoficial", "planomasterrock", "rofgod", "dreemonphc", "chicobrant", "osz", "bandalightspeed", "cavernadenarnia", "sergiobenevenuto", "viniciusdeoliveira", ...]
  • 23.
  • 24. >> html = RestClient.get("http://.../robomacaco") >> html_doc = Nokogiri::HTML(html) >> info = html_doc.xpath("//span[@class='name']") >> info.text => "robo-macaco@hotmail.com RIO DE JANEIRO - RJ - Brasil 21 9675-0199
  • 25.
  • 26. cookies cookies = {} c = "s_nr=12954999; s_v19=12978609471; ... __utmc=206845458" cook = c.split(";").map {|i| i.strip.split("=")} cook.each {|u| cookies[u[0]] = u[1]} RestClient.get(url, :cookies => cookies)
  • 29.
  • 30.
  • 31.
  • 32. >> response = RestClient.get(url) >> html_doc = Nokogiri::HTML(response) >> table = html_doc.xpath("//table [@class='proxylist']") >> lines = table.children >> lines.shift # tira o cabeçalho Text IP >> lines[1].text => "208.52.144.55 document.write(":"+i+r+i+r) anonymous proxy server-2 minutes ago United States"
  • 34. JAVASCRIPT = RUBY http://www.flickr.com/photos/drics/4266471776/
  • 35. <script type="text/javascript"> z=5;i=8;x=4;l=1;o=9;q=6;n=3;u=2;k=7;r=0; </script> >> script = html_doc.xpath("//script")[1] >> eval script.text >> z => 5 >> i => 8
  • 36. >> lines[1].text => "208.52.144.55 document.write(":"+i+r+i+r) anonymous proxy server-2 minutes ago United States" >> server = lines[1].text.split[0] => "208.52.144.55" >> digits = lines[1].text.split(")")[0].split("+") => ["208.52.144.55document.write(":"", "i", "r", "i", "r"] >> digits.shift >> digits => ["i", "r", "i", "r"] >> port = digits.map {|c| eval(c)}.join("") => "8080" Voilà RestClient.proxy = "http://#{server}:#{port}"
  • 37. mechanize agent = Mechanize.new site = "http://www.cantora.mus.br" page = agent.get("#{site}/baixar") form = page.form form['visitor[name]'] = 'daniel' form['visitor[email]'] = "danicuki@gmail.com" page = agent.submit(form) tracks = page.links.select { |l| l.href =~ /track/ } tracks.each do |t| file = agent.get("#{site}#{t}") file.save end
  • 38. protection techniques javascript text as image captcha don’t be ingenuous
  • 39. captcha prove you are not a robot YES you can!
  • 40. 3 steps 1. Download Image 2. filter image 3. run OCR software
  • 41.
  • 43. clouds $ knife ec2 server create
  • 44. threads + queues
  • 45.
  • 46. Nessa vida de programador maluco Me aparece cada situação De repente um cliente, uma proposta bruta Pra pegar de um site informação Você tá louco, esse tipo de crime eu não faço Se quiser tenho uns amigos lá do sul Faz pra mim que eu te pago com essa jóia cool Te dou um ruby Pra você roubar Com o seu robô Quer fazer robô? É só usar ruby É só usar ruby Pra fazer robô http://www.flickr.com/photos/jobafunky/5572503988