Class: Biblionet::Crawlers::PublisherCrawler

Inherits:
Base
  • Object
show all
Defined in:
lib/bookshark/crawlers/publisher_crawler.rb

Instance Method Summary collapse

Methods inherited from Base

#spider

Constructor Details

#initialize(options = {}) ⇒ PublisherCrawler

Returns a new instance of PublisherCrawler.



7
8
9
10
11
12
13
14
15
16
# File 'lib/bookshark/crawlers/publisher_crawler.rb', line 7

# Builds a crawler whose settings default to the Biblionet publisher pages.
#
# Every key listed in +defaults+ below is filled in when the caller did not
# supply it (or supplied nil/false); any other keys are passed through
# untouched. The resulting hash is handed to the superclass initializer.
#
# The caller's hash is never modified: defaults are applied to a shallow
# copy, so a frozen or shared options hash can be passed in safely.
#
# @param options [Hash] optional overrides for :folder, :base_url,
#   :page_type, :extension, :start, :finish and :step
# @return [PublisherCrawler] a new instance of PublisherCrawler
def initialize(options = {})
  defaults = {
    folder:    'lib/bookshark/storage/html_publisher_pages',
    base_url:  'http://www.biblionet.gr/com/',
    page_type: 'publisher',
    extension: '.html',
    start:     1,
    finish:    800,
    step:      100
  }

  # Work on a copy so the defaults do not leak back into the caller's hash
  # (and so a frozen hash does not raise FrozenError).
  options = options.dup
  # `||=` keeps the original fallback rule: nil and false count as "not set".
  defaults.each { |key, value| options[key] ||= value }

  super(options)
end

Instance Method Details

#crawl_and_save ⇒ Object



18
19
20
21
22
23
24
25
26
27
28
29
30
31
# File 'lib/bookshark/crawlers/publisher_crawler.rb', line 18

# Downloads each page handed out by #spider and stores it on disk.
#
# For every (url, file path) pair yielded by #spider the page is loaded
# through a single shared Extractors::Base instance, the destination
# directory is created when it does not exist yet, and the page is written
# to the file path. Pages that failed to load (nil) or whose length is
# below 1024 are not written; their directory is still created.
#
# @return [Object] whatever #spider returns
def crawl_and_save
  page_fetcher = Extractors::Base.new

  spider do |url_to_download, file_to_save|
    page_fetcher.load_page(url_to_download)

    # Make sure the destination directory is there before writing.
    target_dir = File.dirname(file_to_save)
    FileUtils.mkdir_p(target_dir) unless File.directory?(target_dir)

    # Skip missing pages and pages shorter than 1024.
    skip_save = page_fetcher.page.nil? || page_fetcher.page.length < 1024
    page_fetcher.save_page(file_to_save) unless skip_save
  end
end