the
magic
 of
ruby
gabriele lana
gabriele.lana@cleancode.it
  twitter: @gabrielelana
once upon
a time there
    was a
 developer
  working
 for a big
  company
in the beginning...



                   it’s very
                   easy, we
                      need
                  something
                  quick and
                    dirty...
in the beginning...



                1.   login
                2. go to a
                     page
                3. scrape a
                   number
                4. that’s it!!!
ok!
enter mechanize + nokogiri

require "mechanize"

# Quick-and-dirty scraper: log in, then read the name and the credits out of
# every table whose class attribute starts with "boundaries".
# NOTE(review): `username` and `password` are not defined in this snippet and
# are assumed to be provided by the surrounding code.
agent = Mechanize.new do | mechanize |
  mechanize.user_agent_alias = "Linux Mozilla"
end

agent.get("http://example.com/login") do | login_page |

  # Fill the form through the block parameter (`login_form`); the original
  # wrote to an undefined `login` variable.
  result_page = login_page.form_with(:name => "login") do | login_form |
    login_form["username"] = username
    login_form["password"] = password
  end.submit

  # One hash per matching table (the entries need a separating comma).
  result_page.search("//table[starts-with(@class,'boundaries')]").map do | option_table |
    { "name" => option_table.search("./caption/child::text()"),
      "credits" => option_table.search("./descendant::td[position()=3]/child::text()")
    }
  end

end
and then...


                good, but
               we’d like to
              extract more
              information
               from a few
                different
                  pages
enter commander



      describe arguments

# Declares the `is_registered` sub-command: its syntax, description and
# options, plus the handler that delegates to is_registered and passes the
# outcome to ok.
command :is_registered do | cmd |
  cmd.syntax = "is_registered --username TELEPHONE_NUMBER [ --without-cache ]"
  cmd.description = "Check if user is registered"
  cmd.option("-u", "--username TELEPHONE_NUMBER", String, "user's telephone number")
  cmd.option("-n", "--without-cache", "bypass user's profile informations cache")

  cmd.when_called do | _arguments, options |
    options.default(:username => "", :without_cache => false)
    outcome = is_registered(options.username, options.without_cache)
    ok(outcome)
  end
end

       extract code into functions
use page object pattern




# Loads the login page (URL taken from configuration["login_page_url"]) inside
# a browse session and asks the LoginPage object whether +username+ is
# registered. Returns whatever browse returns for the block.
def is_registered(username)
  browse do | agent, configuration |
    page = agent.get(configuration["login_page_url"])
    LoginPage.new(page).is_registered?(username)
  end
end
use page object pattern
class LoginPage < PageToScrub

  # Probes registration by attempting a login with a deliberately wrong
  # password: WrongPassword yields true, while NotRegistered, WrongUsername
  # or WrongArea yield false. Any other outcome of login (including its
  # return value when nothing is raised) is passed through unchanged.
  def is_registered?(username)
    login(username, "fake password")
  rescue WrongPassword
    true
  rescue NotRegistered, WrongUsername, WrongArea
    false
  end

  # Fills the login form with the given credentials, submits it and hands the
  # resulting page to check_page; returns check_page's result.
  def login(username, password)
    # Fetch the form without a block: when use_element is given a block it
    # returns the block's value, which here would be the password string
    # rather than the form, so chaining `.submit` on it cannot work.
    form = use_element(:login_form)
    form["username"] = username
    form["password"] = password
    check_page(form.submit)
  end

  # Looks up the form named "login" on the wrapped page (@page).
  def login_form
    @page.form_with(:name => "login")
  end
use page object pattern
  class LoginPage < PageToScrub

    # Slide callout: "useful abstractions".
    #
    # Probes registration by attempting a login with a deliberately wrong
    # password: WrongPassword yields true, while NotRegistered, WrongUsername
    # or WrongArea yield false. Any other outcome of login is passed through
    # unchanged.
    def is_registered?(username)
      login(username, "fake password")
    rescue WrongPassword
      true
    rescue NotRegistered, WrongUsername, WrongArea
      false
    end
    # Fills the login form with the given credentials, submits it and hands
    # the resulting page to check_page; returns check_page's result.
    def login(username, password)
      # Fetch the form without a block: when use_element is given a block it
      # returns the block's value, which here would be the password string
      # rather than the form, so chaining `.submit` on it cannot work.
      form = use_element(:login_form)
      form["username"] = username
      form["password"] = password
      check_page(form.submit)
    end

    # Looks up the form named "login" on the wrapped page (@page).
    def login_form
      @page.form_with(:name => "login")
    end
use page object pattern


 class PageToScrub

   ...

   # Resolves a page element by calling the method named +element_name+ on
   # this object. Raises MalformedPage when the element is nil, reports itself
   # as empty, or fails when asked `empty?`. With a block, yields the element
   # and returns the block's value; otherwise returns the element.
   def use_element(element_name)
     element = send(element_name)
     unusable = element.nil? || begin
       element.empty?
     rescue
       # An element that cannot answer `empty?` is treated as missing.
       true
     end
     raise MalformedPage.new(@page, "unable to locate #{element_name}") if unusable
     block_given? ? yield(element) : element
   end

   ...

 end
after a while...



   ...few pages
       my A@@
  45 pages and
  93 different
     pieces of
        data
i need to
feel more
confident
with this...
rspec is your friend :-)


# Spec for the is_registered command.
describe "is_registered" do

  # The context is named after the (masked) username used in the example.
  context "XXX3760593" do

      it "should be a consumer registered" do
        # `command` is a helper defined elsewhere; presumably it runs the
        # named command with the given options and returns its result.
        result = command(:is_registered, :username => "XXX3760593")
        result.should_not be_an_error
        result["area"].should == "consumer"
        result["registered"].should == true
      end

  end

end
and then...



              obviously
              not all the
              requests
              can be live
               on our
               systems
enter the cache

# Runs the given block inside a scraping session: opens the command cache,
# loads the YAML configuration, builds a Mechanize agent and yields
# (agent, configuration, cache). Returns the block's value, or the value of
# failure(...) when an error is rescued. The cache is closed on the way out.
def browse
  cache = CommandCache.new(database_path)
  # Block form of File.open so the configuration file handle is closed.
  configuration = File.open(configuration_path) { | file | YAML.load(file) }
  agent = Mechanize.new do | mechanize |
    mechanize.user_agent_alias = "Linux Mozilla"
  end
  yield(agent, configuration, cache)
rescue Mechanize::ResponseCodeError => error
  failure(LoadPageError.new(error))
rescue Timeout::Error
  failure(TimeoutPageError.new)
rescue ScrubError => error
  failure(error)
rescue => error
  failure(UnknownError.new(error.to_s))
ensure
  # cache is still nil when CommandCache.new itself failed; calling close!
  # on nil would replace the handled failure with a NoMethodError.
  cache.close! if cache
end
enter the cache


                                        single line change


# Cached variant of the registration check: the lookup is keyed on
# [username, "is_registered"] and the login page is only fetched when
# cache.command runs the block.
# NOTE(review): +without_cache+ is accepted but not used anywhere in this
# snippet -- confirm where the cache bypass is meant to happen.
def is_registered(username, without_cache)
  browse do | agent, configuration, cache |
    cache_keys = [ username, "is_registered" ]
    cache.command(cache_keys) do
      login_page = agent.get(configuration["login_page_url"])
      LoginPage.new(login_page).is_registered?(username)
    end
  end
end
enter the cache


# Cache of command results, looked up by a list of keys.
class CommandCache

  def initialize(database_path)
    @database = create_database(database_path)
  end

  # Returns the cached result for +keys+. On a miss (NotInCache) the given
  # block is evaluated and its value is stored through to_cache, whose result
  # is returned; without a block the NotInCache error is re-raised.
  #
  # Slide callout: "better ask forgiveness than permission" -- the cache is
  # read first and a miss is handled as an exception.
  def command(keys)
    from_cache(keys)
  rescue NotInCache
    raise unless block_given?
    to_cache(keys, yield)
  end

end
and then...


              our systems
               cannot take
              more than 25
              concurrent
               requests...
              make sure of
                   it!!!
@!#$$@&#
     ...
maybe we can
use a proxy
     ...
god bless mechanize

# Runs the given block inside a scraping session: opens the command cache,
# loads the YAML configuration, builds a Mechanize agent (routed through the
# configured proxy, if any) and yields (agent, configuration, cache). Returns
# the block's value, or the value of failure(...) when an error is rescued.
# The cache is closed on the way out.
def browse
  cache = CommandCache.new(database_path)
  # Block form of File.open so the configuration file handle is closed.
  configuration = File.open(configuration_path) { | file | YAML.load(file) }
  proxy = configuration["proxy"]
  agent = Mechanize.new do | mechanize |
    mechanize.user_agent_alias = "Linux Mozilla"
    # Slide callout: "single line change" (presumably this proxy line).
    mechanize.set_proxy(proxy["host"], proxy["port"]) if proxy
  end
  yield(agent, configuration, cache)
rescue Mechanize::ResponseCodeError => error
  failure(LoadPageError.new(error))
rescue Timeout::Error
  failure(TimeoutPageError.new)
rescue ScrubError => error
  failure(error)
rescue => error
  failure(UnknownError.new(error.to_s))
ensure
  # cache is still nil when CommandCache.new itself failed; calling close!
  # on nil would replace the handled failure with a NoMethodError.
  cache.close! if cache
end
and then...



               well, you know,
               we have a lot of
               users, so when
                the proxy says it’s
               overloaded you
              must retry a few
              times before giving
                      up
@!#$$@&#
god bless ruby
class Mechanize

  alias real_fetch_page fetch_page

  def fetch_page(params)
    ...
    attempts = 0
    begin
      attempts += 1
      real_fetch_page(params)
      # look at this line!!! (slide callout, presumably about the `retry` line below)
    rescue Net::HTTPServerException => error
      if is_overloaded?(error)
        sleep wait_for_seconds and retry if attempts < retry_for_times
        raise SystemError.new("SystemOverloaded")
      end
      raise error
    end
  end

  # True when the response attached to +error+ has the status code "403",
  # which fetch_page treats as the signal to wait and retry.
  def is_overloaded?(error)
    error.response.code == "403"
  end

end
we can also test it :-)
class WEBrick::HTTPResponse

  def serve(content)
    self.body = content
    self["Content-Length"] = content.length
  end

  def overloaded
    serve("<html><body>squid</body></html>")
    self.status = 403
  end

end



# Fake proxy on port 2200 whose content handler marks every response as
# overloaded; Ctrl-C (INT) shuts it down.
always_overloaded = Proc.new { | _request, response | response.overloaded }

proxy = WEBrick::HTTPProxyServer.new(
  :Port => 2200,
  :ProxyContentHandler => always_overloaded
)

trap("INT") do
  proxy.shutdown
end
proxy.start
finally ;-)




                 well... i
              guess we can
               release it...
the unexpected



                 but... wait...
                   our i.t.
                 department
                  said that
                 sometimes it
                   crashes
the unexpected




             you need to
               fix it by
              tomorrow
                   !!!
@!#$$@&#
@!#$$@&#
@!#$$@&#
@!#$$@&#
@!#$$@&#
@!#$$@&#
If you want something done, do it yourself
how to transform a command line program
into a web application

class ScrubsHandler < Mongrel::HttpHandler

  # Translates an HTTP request into a command invocation: the request path
  # (slashes removed) names the command and every query-string pair becomes a
  # `--name` or `--name='value'` option. The output of scrubs.execute is
  # written back with a JSON content type.
  #
  # NOTE(review): query values come from the client and are interpolated into
  # the option string unescaped; a value containing a single quote can break
  # out of the quoting. Confirm how scrubs.execute parses this string and
  # escape accordingly.
  def process(request, response)
    command = request.params["PATH_INFO"].tr("/", "")
    elements = Mongrel::HttpRequest.query_parse(request.params["QUERY_STRING"])
    parameters = elements.map do | name, value |
      value.nil? ? "--#{name}" : "--#{name}='#{value}'"
    end.join(" ")

    # Slide callout: "almost a single line change".
    response.start(200) do | head, out |
      head["Content-Type"] = "application/json"
      out.write(scrubs.execute(command, parameters))
    end
  end

  ...
end
can this be true ?!?!?




                  well... i
               guess we can
                release it...
after a while...


                       all the
                     requests
                      are live!!!
                   our systems
                    are melting
                    down!!! fix
                     it!!! now!!!
@!#$$@&#
@!#$$@&#
@!#$$@&#
@!#$$@&#
@!#$$@&#
@!#$$@&#
change the cache implementation
use the file system luke...

# Removes the cache file for +keys+ (a missing file is not an error) and,
# when a +result+ is given, returns a copy of it flagged "from_cache" => false;
# returns nil otherwise.
def expire(keys, result = nil)
  FileUtils.rm(path(keys), :force => true)
  return if result.nil?
  result.merge("from_cache" => false)
end

# Expires the entry for +keys+ once its "cached_at" stamp is at least
# +seconds+ old (compared against now). Best effort: any StandardError raised
# along the way (for example a cache miss) is swallowed and nil is returned.
def expire_after(keys, seconds, result = nil)
  begin
    deadline = from_cache(keys)["cached_at"] + seconds
    expire(keys, result) if deadline <= now
  rescue
    nil
  end
end

# Reads the cached JSON document for +keys+ and returns it flagged with
# "from_cache" => true. Raises NotInCache when no cache file exists.
def from_cache(keys)
  # Read first and treat a missing file as the miss: File.exists? is gone in
  # Ruby 3.2+, and a separate existence check can race with expire.
  cached = JSON.parse(File.read(path(keys)))
  cached.merge("from_cache" => true)
rescue Errno::ENOENT
  raise NotInCache.new(keys)
end

# Stamps +result+ with "cached_at" => now, writes it as JSON to the cache
# file for +keys+ and returns the stamped result flagged
# "from_cache" => false.
def to_cache(keys, result)
  stamped = result.merge("cached_at" => now)
  File.write(path(keys), JSON.generate(stamped))
  stamped.merge("from_cache" => false)
end
after a while...




                     are you
                   handling the
                   maintenance
                   page right?
maintenance page detection


class PageToScrub

  # Wraps +page+ and validates it straight away by calling check_page_errors
  # and then check_for_maintenance.
  def initialize(page)
    @page = page
    check_page_errors
    check_for_maintenance
  end

   # Scans every td with class "txtbig" on the page and raises
   # OnMaintenancePage when the text extracted from one of them matches the
   # maintenance notice pattern.
   def check_for_maintenance
    maintenance_notice = /^.+?area.+?clienti.+?non.+?disponibile.+?stiamo.+?lavorando/im
    @page.search("//td[@class='txtbig']").each do | cell |
      cell_text = extract_text_from(cell.search("./descendant::text()"))
      raise OnMaintenancePage.new(@page, "??? is on maintenance") if cell_text =~ maintenance_notice
    end
  end

  ...

end
after few days




              good job
             gabriele, it’s
               working
             beyond our
             expectations
after few days


               tell me,
                these
             “robots” of
             yours can be
               used to
              check our
               systems
yes!




 ...
in the end...



  • almost all self care’s
      features are replicated
  •   ~500.000 unique users/day
  •   ~12.000.000 requests/day
  •   ~4gb of cached data
  •   specs are used to
      monitor the entire
      system
but in the beginning was...



                 it’s very
                 easy, we
                    need
                something
                quick and
                  dirty...
that is for
me the ruby
  magic :-)
questions?
gabriele lana
gabriele.lana@cleancode.it
  twitter: @gabrielelana

Magic of Ruby

  • 1.
  • 2.
  • 3.
    once upon a time there was a developer working for a big company
  • 4.
    in the beginning... it’s very easy, we need something quick and dirty...
  • 5.
    in the beginning... 1. login 2. go to a page 3. scrap a number 4. that’s it!!!
  • 6.
  • 7.
    enter mechanize +nokogiri require "mechanize" agent = Mechanize.new do | agent | agent.user_agent_alias = "Linux Mozilla" end agent.get("http://example.com/login") do | login_page | result_page = login_page.form_with(:name => "login") do | login_form | login["username"] = username login["password"] = password end.submit result_page.search("//table[starts-with(@class,'boundaries')]").map do | option_table | { "name" => option_table.search("./caption/child::text()") "credits" => option_table.search("./descendant::td[position()=3]/child::text()") } end end
  • 8.
    and then... good, but we’d like to extract more information from a few different pages
  • 9.
    enter commander describe arguments command :is_registered do | command | command.syntax = "is_registered --username TELEPHONE_NUMBER [ --without-cache ]" command.description = "Check if user is registered" command.option "-u", "--username TELEPHONE_NUMBER", String, "user's telephone number" command.option "-n", "--without-cache", "bypass user's profile informations cache" command.when_called do | arguments, options | options.default :username => "", :without_cache => false ok(is_registered(options.username, options.without_cache)) end end extract code into functions
  • 10.
    use page objectpattern def is_registered(username) browse do | agent, configuration | LoginPage.new( agent.get(configuration["login_page_url"]) ).is_registered?(username) end end
  • 11.
    use page objectpattern class LoginPage < PageToScrub def is_registered?(username) begin login(username, "fake password") rescue WrongPassword true rescue NotRegistered, WrongUsername, WrongArea false end end def login(username, password) check_page( use_element(:login_form) do | login | login["username"] = username login["password"] = password end.submit ) end def login_form @page.form_with(:name => "login") end
  • 12.
    use page objectpattern class LoginPage < PageToScrub def is_registered?(username) begin login(username, "fake password") rescue WrongPassword true rescue NotRegistered, WrongUsername, WrongArea false end end useful abstractions def login(username, password) check_page( use_element(:login_form) do | login | login["username"] = username login["password"] = password end.submit ) end def login_form @page.form_with(:name => "login") end
  • 13.
    use page objectpattern class PageToScrub ... def use_element(element_name) element = self.send(element_name) raise MalformedPage.new(@page, "unable to locate #{element_name}") if ( element.nil? || (element.empty? rescue true) ) return yield(element) if block_given? element end ... end
  • 14.
    after a while... ...few pages my A@@ 45 pages and 93 different pieces of data
  • 15.
    i need to feel more confident with this...
  • 16.
    rspec is yourfriend :-) describe "is_registered" do context "XXX3760593" do it "should be a consumer registered" do result = command(:is_registered, :username => "XXX3760593") result.should_not be_an_error result["area"].should == "consumer" result["registered"].should == true end end end
  • 17.
    and then... obviously not all the requests can be live on our systems
  • 18.
    enter the cache defbrowse begin cache = CommandCache.new(database_path) configuration = YAML::load(File.open(configuration_path)) agent = Mechanize.new do | agent | agent.user_agent_alias = "Linux Mozilla" end yield(agent, configuration, cache) rescue Mechanize::ResponseCodeError => error failure(LoadPageError.new(error)) rescue Timeout::Error failure(TimeoutPageError.new) rescue ScrubError => error failure(error) rescue => error failure(UnknownError.new(error.to_s)) ensure cache.close! end end
  • 19.
    enter the cache single line change def is_registered(username, without_cache) browse do | agent, configuration, cache | cache.command([ username, "is_registered" ]) do LoginPage.new( agent.get(configuration["login_page_url"]) ).is_registered?(username) end end end
  • 20.
    enter the cache classCommandCache def initialize(database_path) @database = create_database(database_path) end better ask def command(keys) forgiveness than begin permission from_cache(keys) rescue NotInCache => e raise e if not block_given? to_cache(keys, yield) end end end
  • 21.
    and then... our systems cannot take more than 25 concurrent requests... make sure of it!!!
  • 22.
    @!#$$@&# ... maybe we can use a proxy ...
  • 23.
    god bless mechanize defbrowse begin cache = CommandCache.new(database_path) configuration = YAML::load(File.open(configuration_path)) proxy = configuration["proxy"] agent = Mechanize.new do | agent | agent.user_agent_alias = "Linux Mozilla" agent.set_proxy(proxy["host"], proxy["port"]) if proxy end yield(agent, configuration, cache) rescue Mechanize::ResponseCodeError => error single line change failure(LoadPageError.new(error)) rescue Timeout::Error failure(TimeoutPageError.new) rescue ScrubError => error failure(error) rescue => error failure(UnknownError.new(error.to_s)) ensure cache.close! end end
  • 24.
    and then... well, you know, we have a lot of users, so when the proxy says it’s overloaded you must retry a few times before giving up
  • 25.
  • 26.
    god bless ruby classMechanize alias real_fetch_page fetch_page def fetch_page(params) ... attempts = 0 begin attempts += 1 real_fetch_page(params) look at this line!!! rescue Net::HTTPServerException => error if is_overloaded?(error) sleep wait_for_seconds and retry if attempts < retry_for_times raise SystemError.new("SystemOverloaded") end raise error end end def is_overloaded?(error) error.response.code == "403" end end
  • 27.
    we can alsotest it :-) class WEBrick::HTTPResponse def serve(content) self.body = content self["Content-Length"] = content.length end def overloaded serve("<html><body>squid</body></html>") self.status = 403 end end proxy = WEBrick::HTTPProxyServer.new( :Port => 2200, :ProxyContentHandler => Proc.new do | request, response | response.overloaded end ) trap("INT") { proxy.shutdown } proxy.start
  • 28.
    finally ;-) well... i guess we can release it...
  • 29.
    the unexpected but... wait... our i.t. department said that sometimes it crashes
  • 30.
    the unexpected you need to fix it by tomorrow !!!
  • 31.
  • 32.
    If you wantsomething done, do it yourself how to transform a command line program into a web application class ScrubsHandler < Mongrel::HttpHandler def process(request, response) command = request.params["PATH_INFO"].tr("/", "") elements = Mongrel::HttpRequest.query_parse(request.params["QUERY_STRING"]) parameters = elements.inject([]) do | parameters, parameter | name, value = parameter parameters << if value.nil? "--#{name}" else "--#{name}='#{value}'" end end.join(" ") almost a single line change response.start(200) do | head, out | head["Content-Type"] = "application/json" out.write(scrubs.execute(command, parameters)) end end ... end
  • 33.
    can this be true ?!?!? well... i guess we can release it...
  • 34.
    after a while... all the requests are live!!! our systems are melting down!!! fix it!!! now!!!
  • 35.
  • 36.
    change the cacheimplementation use the file system luke... def expire(keys, result = nil) FileUtils.rm path(keys), :force => true result.merge({ "from_cache" => false }) unless result.nil? end def expire_after(keys, seconds, result = nil) expire(keys, result) if (from_cache(keys)["cached_at"] + seconds) <= now rescue nil end def from_cache(keys) cache_file_path = path(keys) raise NotInCache.new(keys) unless File.exists?(cache_file_path) JSON.parse(File.read(cache_file_path)).merge({ "from_cache" => true }) end def to_cache(keys, result) result = result.merge({ "cached_at" => now }) File.write(path(keys), JSON.generate(result)) result.merge({ "from_cache" => false }) end
  • 37.
    after a while... are you handling the maintenance page right?
  • 38.
    maintenance page detection classPageToScrub def initialize(page) @page = page check_page_errors check_for_maintenance end def check_for_maintenance @page.search("//td[@class='txtbig']").each do | node | if extract_text_from(node.search("./descendant::text()")) =~ /^.+?area.+?clienti.+?non.+?disponibile.+?stiamo.+?lavorando/im raise OnMaintenancePage.new(@page, "??? is on maintenance") end end end ... end
  • 39.
    after few days good job gabriele, it’s working beyond our expectations
  • 40.
    after few days tell me, these “robots” of yours can be used to check our systems
  • 41.
  • 42.
    in the end... • almost all self care’s features are replicated • ~500.000 unique users/day • ~12.000.000 requests/day • ~4gb of cached data • specs are used to monitor the entire system
  • 43.
    but in the beginning was... it’s very easy, we need something quick and dirty...
  • 44.
    that is for me the ruby magic :-)
  • 45.
  • 46.