Class: Arachnid2::Typhoeus
- Inherits:
- Object
- Object
- Arachnid2::Typhoeus
- Includes:
- Exoskeleton, CachedArachnidResponses
- Defined in:
- lib/arachnid2/typhoeus.rb
Constant Summary
Constants included from CachedArachnidResponses
CachedArachnidResponses::CACHE_SERVICE_URL
Instance Method Summary
- #crawl(opts = {}) ⇒ Object
- #initialize(url) ⇒ Typhoeus
constructor
A new instance of Typhoeus.
Methods included from Exoskeleton
#bound_time, #bound_urls, #browser_type, #crawl_options, #extension_ignored?, #extract_hrefs, #in_docker?, #internal_link?, #make_absolute, #maximum_load_rate, #memory_danger?, #non_html_extensions, #preflight, #process, #proxy, #skip_link?, #timeout, #vacuum
Methods included from CachedArachnidResponses
#check_config, #load_data, #put_cached_data
Constructor Details
#initialize(url) ⇒ Typhoeus
Returns a new instance of Typhoeus.
# File 'lib/arachnid2/typhoeus.rb', lines 6-10

def initialize(url)
  @url = url
  @domain = Adomain[@url]
  @cached_data = []
end
Instance Method Details
#crawl(opts = {}) ⇒ Object
# File 'lib/arachnid2/typhoeus.rb', lines 12-50

def crawl(opts = {})
  preflight(opts)
  typhoeus_preflight

  until @global_queue.empty?
    max_concurrency.times do
      q = @global_queue.shift

      break if @global_visited.size >= crawl_options[:max_urls] || \
               Time.now > crawl_options[:time_limit] || \
               memory_danger?

      @global_visited.insert(q)

      request = ::Typhoeus::Request.new(q, request_options)

      data = load_data(@url, opts)
      data.each { |response| yield response } and return unless data.nil?
      request.on_complete do |response|
        @cached_data.push(response)
        links = process(response.effective_url, response.body)
        next unless links

        yield response

        vacuum(links, response.effective_url)
      end

      @hydra.queue(request)
    end # max_concurrency.times do

    @hydra.run
  end # until @global_queue.empty?
  put_cached_data(@url, opts, @cached_data) unless @cached_data.empty?
ensure
  @cookie_file.close! if @cookie_file
end