7. enter mechanize + nokogiri
require "mechanize"
# Log in through the form named "login" and turn every table whose class
# starts with "boundaries" on the resulting page into a hash holding the
# caption text nodes ("name") and the third cell's text nodes ("credits").
# NOTE(review): `username` and `password` are not defined in this snippet;
# presumably they are supplied by the surrounding code — confirm.
agent = Mechanize.new do |mechanize|
  mechanize.user_agent_alias = "Linux Mozilla"
end
agent.get("http://example.com/login") do |login_page|
  result_page = login_page.form_with(:name => "login") do |login_form|
    # Fill the form through the block parameter; the earlier version
    # referenced an undefined `login` local here.
    login_form["username"] = username
    login_form["password"] = password
  end.submit
  result_page.search("//table[starts-with(@class,'boundaries')]").map do |option_table|
    {
      "name" => option_table.search("./caption/child::text()"),
      "credits" => option_table.search("./descendant::td[position()=3]/child::text()")
    }
  end
end
8. and then...
good, but
we’d like to
extract more
information
from a few
different
pages
9. enter commander
describe arguments
# Declares the `is_registered` sub-command: its syntax, description and
# options, plus the action that runs when it is invoked.
command :is_registered do |cmd|
  cmd.syntax = "is_registered --username TELEPHONE_NUMBER [ --without-cache ]"
  cmd.description = "Check if user is registered"
  cmd.option("-u", "--username TELEPHONE_NUMBER", String, "user's telephone number")
  cmd.option("-n", "--without-cache", "bypass user's profile informations cache")
  cmd.when_called do |_arguments, options|
    # Defaults applied when the switches are not given on the command line.
    options.default(:username => "", :without_cache => false)
    registered = is_registered(options.username, options.without_cache)
    ok(registered)
  end
end
extract code into functions
10. use page object pattern
# Load the configured login page and ask its page object whether
# +username+ is registered.
#
# Returns whatever the `browse` block evaluates to, i.e. the result of
# LoginPage#is_registered?.
def is_registered(username)
  browse do |agent, configuration|
    login_url = configuration["login_page_url"]
    page = agent.get(login_url)
    LoginPage.new(page).is_registered?(username)
  end
end
11. use page object pattern
class LoginPage < PageToScrub
# Probe registration by attempting a login with a fake password.
#
# Returns true when the attempt raises WrongPassword, false when it raises
# NotRegistered, WrongUsername or WrongArea, and the value of `login` when
# no error is raised.
def is_registered?(username)
  login(username, "fake password")
rescue WrongPassword
  true
rescue NotRegistered, WrongUsername, WrongArea
  false
end
# Fill in the login form, submit it and run the resulting page through
# check_page.
#
# Returns whatever check_page returns for the submitted form's result.
def login(username, password)
  form = use_element(:login_form) do |login|
    login["username"] = username
    login["password"] = password
    # Hand the form back explicitly: without this the block evaluates to the
    # password string, and a use_element that returns the block's value
    # would make `submit` run on that String.
    login
  end
  check_page(form.submit)
end
# Look up the form named "login" on the wrapped page (@page).
def login_form
@page.form_with(:name => "login")
end
12. use page object pattern
class LoginPage < PageToScrub
# Probe registration by attempting a login with a fake password.
#
# Returns true when the attempt raises WrongPassword, false when it raises
# NotRegistered, WrongUsername or WrongArea, and the value of `login` when
# no error is raised.
def is_registered?(username)
  login(username, "fake password")
rescue WrongPassword
  true
rescue NotRegistered, WrongUsername, WrongArea
  false
end # slide note: useful abstractions
# Fill in the login form, submit it and run the resulting page through
# check_page.
#
# Returns whatever check_page returns for the submitted form's result.
def login(username, password)
  form = use_element(:login_form) do |login|
    login["username"] = username
    login["password"] = password
    # Hand the form back explicitly: without this the block evaluates to the
    # password string, and a use_element that returns the block's value
    # would make `submit` run on that String.
    login
  end
  check_page(form.submit)
end
# Look up the form named "login" on the wrapped page (@page).
def login_form
@page.form_with(:name => "login")
end
13. use page object pattern
class PageToScrub
...
# Fetch the page element produced by the method named +element_name+ and
# make sure it is usable before handing it out.
#
# element_name - name of a method on this object that locates the element.
#
# Returns the block's value when a block is given, otherwise the element.
# Raises MalformedPage when the element is nil or reports itself as empty.
def use_element(element_name)
  element = send(element_name)
  # Only objects that implement `empty?` can be empty. The earlier
  # `element.empty? rescue true` treated every element without `empty?` as
  # missing and silently swallowed any other error raised by the call.
  missing = element.nil? || (element.respond_to?(:empty?) && element.empty?)
  raise MalformedPage.new(@page, "unable to locate #{element_name}") if missing
  return yield(element) if block_given?
  element
end
...
end
14. after a while...
...few pages
my A@@
45 pages and
93 different
pieces of
data
16. rspec is your friend :-)
# Spec for the is_registered command, run with username XXX3760593.
describe "is_registered" do
  context "XXX3760593" do
    it "should be a consumer registered" do
      outcome = command(:is_registered, :username => "XXX3760593")

      outcome.should_not be_an_error
      outcome["area"].should == "consumer"
      outcome["registered"].should == true
    end
  end
end
17. and then...
obviously
not all the
requests
can be live
on our
systems
18. enter the cache
# Set up everything a scrub needs and hand it to the caller's block.
#
# Yields a Mechanize agent (user agent alias "Linux Mozilla"), the
# configuration parsed from the YAML file at configuration_path, and a
# CommandCache built from database_path.
#
# Returns the block's value; when an error is rescued, returns whatever
# `failure` returns for the matching error object.
def browse
  cache = CommandCache.new(database_path)
  # Block form closes the configuration file as soon as it has been parsed.
  # NOTE(review): YAML.load may instantiate arbitrary objects on older
  # Ruby/Psych versions; the file is presumably trusted — confirm.
  configuration = File.open(configuration_path) { |file| YAML.load(file) }
  agent = Mechanize.new do |mechanize|
    mechanize.user_agent_alias = "Linux Mozilla"
  end
  yield(agent, configuration, cache)
rescue Mechanize::ResponseCodeError => error
  failure(LoadPageError.new(error))
rescue Timeout::Error
  failure(TimeoutPageError.new)
rescue ScrubError => error
  failure(error)
rescue => error
  failure(UnknownError.new(error.to_s))
ensure
  # `cache` is still nil when CommandCache.new itself raised; closing it
  # unconditionally would replace the reported failure with a NoMethodError.
  cache.close! if cache
end
19. enter the cache
single line change
# Tell whether +username+ is registered, going through the command cache
# unless the caller asks to bypass it.
#
# username      - value passed to LoginPage#is_registered? and used in the
#                 cache key.
# without_cache - when truthy the check is always performed against the login
#                 page and the cache is neither read nor written; previously
#                 this argument was accepted but ignored.
#
# Returns whatever the `browse` block evaluates to.
def is_registered(username, without_cache)
  browse do |agent, configuration, cache|
    # Proc.new (not lambda) so the check tolerates any arguments the cache
    # might yield, exactly like a literal block would.
    live_check = Proc.new do
      LoginPage.new(
        agent.get(configuration["login_page_url"])
      ).is_registered?(username)
    end
    if without_cache
      live_check.call
    else
      cache.command([username, "is_registered"], &live_check)
    end
  end
end
20. enter the cache
# Cache of command results kept in the database built by create_database.
class CommandCache
  # database_path - location handed to create_database.
  def initialize(database_path)
    @database = create_database(database_path)
  end

  # Look up the result stored under +keys+; on a miss, compute it with the
  # block and store it.
  #
  # Better ask forgiveness than permission: the cache is read first and a
  # miss is handled by rescuing NotInCache.
  #
  # Returns the cached value, or whatever to_cache returns after storing the
  # block's value. Raises NotInCache when the entry is missing and no block
  # was given.
  def command(keys)
    from_cache(keys)
  rescue NotInCache
    raise unless block_given?
    to_cache(keys, yield)
  end
end
21. and then...
our systems
cannot take
more than 25
concurrent
requests...
make sure of
it!!!
26. god bless ruby
class Mechanize
alias real_fetch_page fetch_page
# Wrapper around the original fetch (real_fetch_page) that retries while the
# server reports being overloaded.
#
# A Net::HTTPServerException recognised by is_overloaded? is retried after
# sleeping wait_for_seconds, as long as fewer than retry_for_times attempts
# have been made; after that SystemError("SystemOverloaded") is raised. Any
# other Net::HTTPServerException is re-raised unchanged.
def fetch_page(params)
  # ... (code elided on the original slide)
  attempts = 0
  begin
    attempts += 1
    real_fetch_page(params)
  rescue Net::HTTPServerException => error
    raise error unless is_overloaded?(error)
    raise SystemError.new("SystemOverloaded") unless attempts < retry_for_times
    # look at this line!!! (slide note) — `retry` re-runs the begin block,
    # so the attempt counter above keeps the loop bounded.
    sleep wait_for_seconds
    retry
  end
end
# True when the response attached to +error+ carries the status code "403",
# which fetch_page treats as the overload signal.
def is_overloaded?(error)
  status = error.response.code
  status == "403"
end
end
27. we can also test it :-)
# Helpers added to WEBrick's response class so a test proxy can answer with
# a canned 403 page.
class WEBrick::HTTPResponse
  # Use +content+ as the response body and advertise its size.
  def serve(content)
    self.body = content
    # Content-Length counts bytes, not characters, so multi-byte content
    # must be measured with bytesize.
    self["Content-Length"] = content.bytesize
  end

  # Answer with a small HTML page and status 403.
  def overloaded
    serve("<html><body>squid</body></html>")
    self.status = 403
  end
end
# Start a WEBrick proxy on port 2200 whose :ProxyContentHandler calls
# `response.overloaded`; an INT signal shuts the proxy down.
always_overloaded = Proc.new { |_request, response| response.overloaded }
proxy = WEBrick::HTTPProxyServer.new(
  :Port => 2200,
  :ProxyContentHandler => always_overloaded
)
trap("INT") do
  proxy.shutdown
end
proxy.start
28. finally ;-)
well... i
guess we can
release it...
29. the unexpected
but... wait...
our i.t.
department
said that
sometimes it
crashes
32. If you want something done, do it yourself
how to transform a command line program
into a web application
class ScrubsHandler < Mongrel::HttpHandler
def process(request, response)
command = request.params["PATH_INFO"].tr("/", "")
elements = Mongrel::HttpRequest.query_parse(request.params["QUERY_STRING"])
parameters = elements.inject([]) do | parameters, parameter |
name, value = parameter
parameters << if value.nil?
"--#{name}"
else
"--#{name}='#{value}'"
end
end.join(" ")
almost a single line change
response.start(200) do | head, out |
head["Content-Type"] = "application/json"
out.write(scrubs.execute(command, parameters))
end
end
...
end
33. can this be true ?!?!?
well... i
guess we can
release it...
34. after a while...
all the
requests
are live!!!
our systems
are melting
down!!! fix
it!!! now!!!
38. maintenance page detection
class PageToScrub
# Wrap +page+ and validate it right away: check_page_errors runs first,
# then check_for_maintenance.
def initialize(page)
@page = page
check_page_errors
check_for_maintenance
end
# Raise OnMaintenancePage when the text of any <td class="txtbig"> cell on
# the page matches the maintenance notice pattern.
def check_for_maintenance
  maintenance_notice = /^.+?area.+?clienti.+?non.+?disponibile.+?stiamo.+?lavorando/im
  @page.search("//td[@class='txtbig']").each do |node|
    cell_text = extract_text_from(node.search("./descendant::text()"))
    next unless cell_text =~ maintenance_notice
    raise OnMaintenancePage.new(@page, "??? is on maintenance")
  end
end
...
end
39. after a few days
good job
gabriele, it’s
working
beyond our
expectations
40. after a few days
tell me,
these
“robots” of
yours can be
used to
check our
systems
42. in the end...
• almost all self care’s
features are replicated
• ~500.000 unique users/day
• ~12.000.000 requests/day
• ~4gb of cached data
• specs are used to
monitor the entire
system
43. but in the beginning was...
it’s very
easy, we
need
something
quick and
dirty...