Method: Arachnid2#crawl

Defined in:
lib/arachnid2.rb

#crawl(opts = {}, with_watir = false) ⇒ Object

Visits a URL, gathers its links, and visits those in turn, until the crawl runs out of time, memory, or attempts.

Examples:

url = "https://daringfireball.net"
spider = Arachnid2.new(url)

opts = {
  :followlocation => true,
  :timeout => 25000,
  :time_box => 30,
  :headers => {
    'Accept-Language' => "en-GB",
    'User-Agent' => "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:47.0) Gecko/20100101 Firefox/47.0",
  },
  :memory_limit => 89.99,
  :proxy => {
    :ip => "1.2.3.4",
    :port => "1234",
    :username => "sam",
    :password => "coolcoolcool",
  },
  :non_html_extensions => {
    3 => [".abc", ".xyz"],
    4 => [".abcd"],
    6 => [".abcdef"],
    11 => [".abcdefghijk"]
  }
}
responses = []
spider.crawl(opts) { |response|
  responses << response
}
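
To crawl with a real browser instead of the default Typhoeus client, pass true as the second argument. The following is a minimal sketch, assuming Watir and a compatible browser driver are installed and that each visited page yields a browser-like object exposing the rendered page (e.g. #title); the exact interface of the yielded object is an assumption, not documented behavior.

url = "https://daringfireball.net"
spider = Arachnid2.new(url)

# Assumption: in Watir mode the block receives a Watir::Browser positioned
# on each visited page rather than an HTTP response.
titles = []
spider.crawl({ :time_box => 30 }, true) { |browser|
  titles << browser.title
}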


# File 'lib/arachnid2.rb', line 108

def crawl(opts = {}, with_watir = false)
  if with_watir
    # Crawl with a Watir-driven browser (renders JavaScript)
    crawl_watir(opts, &Proc.new)
  else
    # Default: crawl with Typhoeus, yielding each response to the caller's block
    Arachnid2::Typhoeus.new(@url).crawl(opts, &Proc.new)
  end
end
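
In the default path the block runs once per fetched page. Below is a minimal sketch of consuming those responses, assuming they behave like Typhoeus::Response objects (responding to #code, #effective_url, and #body) and that Nokogiri is available for parsing; adjust it to whatever your handler actually needs.

require 'arachnid2'
require 'nokogiri'

spider = Arachnid2.new("https://daringfireball.net")

opts = { :time_box => 20, :memory_limit => 80.0 }
titles_by_url = {}
spider.crawl(opts) { |response|
  next unless response.code == 200
  doc = Nokogiri::HTML(response.body)
  titles_by_url[response.effective_url] = doc.title
}

titles_by_url.each { |url, title| puts "#{url} => #{title}" }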