Class: Biblionet::Crawlers::PublisherCrawler
- Defined in: lib/bookshark/crawlers/publisher_crawler.rb
Instance Method Summary collapse
- #crawl_and_save ⇒ Object
- #initialize(options = {}) ⇒ PublisherCrawler (constructor): A new instance of PublisherCrawler.
Methods inherited from Base
Constructor Details
#initialize(options = {}) ⇒ PublisherCrawler
Returns a new instance of PublisherCrawler.
7 8 9 10 11 12 13 14 15 16 |
# File 'lib/bookshark/crawlers/publisher_crawler.rb', line 7
#
# Fills in the publisher-specific defaults for any option the caller did not
# supply and hands the resulting hash to the Base constructor.
#
# The caller's hash is left untouched; a merged copy is passed on instead.
#
# @param options [Hash] crawler settings keyed by symbol. Recognised keys and
#   their defaults:
#   - :folder    => 'lib/bookshark/storage/html_publisher_pages'
#   - :base_url  => 'http://www.biblionet.gr/com/'
#   - :page_type => 'publisher'
#   - :extension => '.html'
#   - :start     => 1
#   - :finish    => 800
#   - :step      => 100
#   NOTE(review): :start/:finish/:step are presumably the id range and batch
#   size consumed by Base — confirm against Base#initialize.
def initialize(options = {})
  defaults = {
    folder: 'lib/bookshark/storage/html_publisher_pages',
    base_url: 'http://www.biblionet.gr/com/',
    page_type: 'publisher',
    extension: '.html',
    start: 1,
    finish: 800,
    step: 100
  }

  # Same rule as `||=`: a key that is present but nil/false still falls back
  # to the default; any other supplied value wins.
  super(defaults.merge(options) { |_key, default, given| given || default })
end
Instance Method Details
#crawl_and_save ⇒ Object
18 19 20 21 22 23 24 25 26 27 28 29 30 31 |
# File 'lib/bookshark/crawlers/publisher_crawler.rb', line 18
#
# Loads every URL yielded by +spider+ and saves the loaded page to the file
# path yielded alongside it.
#
# A page is skipped (nothing is written and no directory is created) when the
# downloader has no page after loading, or when the page's length is below
# 1024.
#
# @return [Object] whatever +spider+ returns
def crawl_and_save
  # Pages whose length is below this are not saved.
  # NOTE(review): presumably this filters out error/placeholder pages — confirm.
  min_page_length = 1024
  downloader = Extractors::Base.new

  spider do |url_to_download, file_to_save|
    downloader.load_page(url_to_download)

    page = downloader.page
    next if page.nil? || page.length < min_page_length

    # Only create the target directory once we know the page will be saved.
    # mkdir_p does nothing if the directory already exists.
    FileUtils.mkdir_p(File.dirname(file_to_save))
    downloader.save_page(file_to_save)
  end
end