Class: NewsCrawler::Downloader

Inherits:
Object
Includes:
Celluloid
Defined in:
lib/news_crawler/downloader.rb

Overview

This class implements a parallel downloader, built on Typhoeus, that fetches URLs from a given queue.
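
Since the class includes Celluloid, NewsCrawler::Downloader.new returns an actor. A minimal usage sketch (the require path comes from the "Defined in" note above; the concurrency value is an arbitrary illustration):

require 'news_crawler/downloader'

# Build a downloader actor that starts fetching immediately,
# overriding the default concurrency (CONCURRENT_DOWNLOAD).
downloader = NewsCrawler::Downloader.new(true, concurrent: 8)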

Constant Summary

CONCURRENT_DOWNLOAD = 4

Instance Method Summary

Constructor Details

#initialize(start_on_create = true, queue = NewsCrawler::Storage::URLQueue, **opts) ⇒ Downloader

Constructs a downloader with a URLQueue.

Parameters:

  • start_on_create (Boolean) (defaults to: true)

    whether to start the downloader immediately

  • queue (NewsCrawler::URLQueue) (defaults to: NewsCrawler::Storage::URLQueue)

    the URL queue to draw unvisited links from



# File 'lib/news_crawler/downloader.rb', line 43

def initialize(start_on_create = true, queue = NewsCrawler::Storage::URLQueue, **opts)
  @queue = queue
  @urls = queue.find_unvisited                              # seed with not-yet-fetched URLs
  @concurrent_download = opts[:concurrent] || CONCURRENT_DOWNLOAD
  @wait_time = 1
  @status = :running
  @stoping = false                                          # flipped by #graceful_terminate
  wait_for_url if start_on_create
end
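
Because Celluloid is included, a downloader constructed with start_on_create = false can be launched later without blocking the caller, via Celluloid's standard async proxy. A hedged sketch (only the constructor arguments are taken from this page):

# Construct without starting, then kick off fetching asynchronously.
dl = NewsCrawler::Downloader.new(false, NewsCrawler::Storage::URLQueue,
                                 concurrent: 2)
dl.async.run   # Celluloid async proxy; returns immediately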

Instance Method Details

#graceful_terminate ⇒ Object

Gracefully terminates this downloader.



# File 'lib/news_crawler/downloader.rb', line 81

def graceful_terminate
  @stoping = true                 # ask the download loop to stop
  while @status == :running       # wait until it acknowledges
    sleep(1)
  end
end
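
The method implements a cooperative stop: one side raises a flag, the other polls a status variable until the worker acknowledges. A self-contained sketch of the same pattern outside Celluloid (all names here are illustrative, not part of the gem):

class Worker
  def initialize
    @stoping = false
    @status  = :running
  end

  def run
    sleep(0.1) until @stoping   # stand-in for one unit of download work
    @status = :stopped          # acknowledge the stop request
  end

  def graceful_terminate
    @stoping = true             # signal the worker to stop
    sleep(1) while @status == :running
  end
end

w = Worker.new
t = Thread.new { w.run }
w.graceful_terminate            # returns once the worker has stopped
t.join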

#run ⇒ Object

Starts the downloader on the current queue. Successfully fetched URLs are marked as visited and their contents are stored in the database.



# File 'lib/news_crawler/downloader.rb', line 55

def run
  @status = :running
  hydra = Typhoeus::Hydra.new(max_concurrency: @concurrent_download)
  # TODO Log here
  @urls = @urls.keep_if do |url|
    Robots.instance.allowed? url          # honour robots.txt
  end
  requests = @urls.map do |url|
    re = Typhoeus::Request.new(url, followlocation: true)
    re.on_complete do |response|
      if response.success?
        Storage::RawData.add(url, response.response_body)
        @queue.mark_visited url
      else
        NCLogger.get_logger.warn("[WARNING] Fetch error [#{url}]")
      end
    end
    hydra.queue re
    re
  end
  hydra.run                               # blocks until all requests finish
  @urls = []
  wait_for_url
end
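
The method is built around Typhoeus::Hydra's queue-then-run pattern. A standalone sketch of just that pattern (the URLs are placeholders; the Typhoeus calls match those used above):

require 'typhoeus'

hydra = Typhoeus::Hydra.new(max_concurrency: 4)
%w[https://example.com https://example.org].each do |url|
  req = Typhoeus::Request.new(url, followlocation: true)
  req.on_complete do |response|
    if response.success?
      puts "#{url}: #{response.response_body.bytesize} bytes"
    else
      warn "fetch error [#{url}]"
    end
  end
  hydra.queue req
end
hydra.run   # blocks until every queued request has completed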