Class: NewsCrawler::Downloader

Inherits:
Object
Includes:
Celluloid
Defined in:
lib/news_crawler/downloader.rb

Overview

This class implements a parallel downloader, built on Typhoeus, that fetches URLs from a given queue.
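
Since the class includes Celluloid, NewsCrawler::Downloader.new returns an actor. A minimal usage sketch (the require path comes from the "Defined in" note above; the concurrency value is an arbitrary illustration):

require 'news_crawler/downloader'

# Build a downloader actor that starts fetching immediately,
# overriding the default concurrency (CONCURRENT_DOWNLOAD).
downloader = NewsCrawler::Downloader.new(true, concurrent: 8)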

Constant Summary

CONCURRENT_DOWNLOAD = 4

Instance Method Summary

Constructor Details

#initialize(start_on_create = true, queue = NewsCrawler::Storage::URLQueue, **opts) ⇒ Downloader

Constructs a downloader with a URLQueue.

Parameters:

  • start_on_create (Boolean) (defaults to: true)

    whether to start the downloader immediately

  • queue (NewsCrawler::URLQueue) (defaults to: NewsCrawler::Storage::URLQueue)

    the URL queue to draw unvisited links from



# File 'lib/news_crawler/downloader.rb', line 43

def initialize(start_on_create = true, queue = NewsCrawler::Storage::URLQueue, **opts)
  @queue = queue
  @urls = queue.find_unvisited                              # seed with not-yet-fetched URLs
  @concurrent_download = opts[:concurrent] || CONCURRENT_DOWNLOAD
  @wait_time = 1
  @status = :running
  @stoping = false                                          # flipped by #graceful_terminate
  wait_for_url if start_on_create
end
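
Because Celluloid is included, a downloader constructed with start_on_create = false can be launched later without blocking the caller, via Celluloid's standard async proxy. A hedged sketch (only the constructor arguments are taken from this page):

# Construct without starting, then kick off fetching asynchronously.
dl = NewsCrawler::Downloader.new(false, NewsCrawler::Storage::URLQueue,
                                 concurrent: 2)
dl.async.run   # Celluloid async proxy; returns immediately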

Instance Method Details

#graceful_terminate ⇒ Object

Gracefully terminates this downloader.



# File 'lib/news_crawler/downloader.rb', line 81

def graceful_terminate
  @stoping = true                 # ask the download loop to stop
  while @status == :running       # wait until it acknowledges
    sleep(1)
  end
end
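
The method implements a cooperative stop: one side raises a flag, the other polls a status variable until the worker acknowledges. A self-contained sketch of the same pattern outside Celluloid (all names here are illustrative, not part of the gem):

class Worker
  def initialize
    @stoping = false
    @status  = :running
  end

  def run
    sleep(0.1) until @stoping   # stand-in for one unit of download work
    @status = :stopped          # acknowledge the stop request
  end

  def graceful_terminate
    @stoping = true             # signal the worker to stop
    sleep(1) while @status == :running
  end
end

w = Worker.new
t = Thread.new { w.run }
w.graceful_terminate            # returns once the worker has stopped
t.join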

#run ⇒ Object

Starts the downloader on the current queue. Successfully fetched URLs are marked as visited and their contents are stored in the database.



# File 'lib/news_crawler/downloader.rb', line 55

def run
  @status = :running
  hydra = Typhoeus::Hydra.new(max_concurrency: @concurrent_download)
  # TODO Log here
  @urls = @urls.keep_if do |url|
    Robots.instance.allowed? url          # honour robots.txt
  end
  requests = @urls.map do |url|
    re = Typhoeus::Request.new(url, followlocation: true)
    re.on_complete do |response|
      if response.success?
        Storage::RawData.add(url, response.response_body)
        @queue.mark_visited url
      else
        NCLogger.get_logger.warn("[WARNING] Fetch error [#{url}]")
      end
    end
    hydra.queue re
    re
  end
  hydra.run                               # blocks until all requests finish
  @urls = []
  wait_for_url
end
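
The method is built around Typhoeus::Hydra's queue-then-run pattern. A standalone sketch of just that pattern (the URLs are placeholders; the Typhoeus calls match those used above):

require 'typhoeus'

hydra = Typhoeus::Hydra.new(max_concurrency: 4)
%w[https://example.com https://example.org].each do |url|
  req = Typhoeus::Request.new(url, followlocation: true)
  req.on_complete do |response|
    if response.success?
      puts "#{url}: #{response.response_body.bytesize} bytes"
    else
      warn "fetch error [#{url}]"
    end
  end
  hydra.queue req
end
hydra.run   # blocks until every queued request has completed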