Module: HashSpidey::Strategies::HashStore
- Included in: AbstractSpider
- Defined in: lib/hash_spidey/strategies/hash_store_strategy.rb
Instance Method Summary
- #crawl(options = {}) ⇒ Object
- #crawls ⇒ Object
  Convenience method; returns the crawled entries of @url_collection.
- #each_url(&block) ⇒ Object
- #handle(url, handler, handle_data = {}) ⇒ Object
- #initialize(attrs = {}) ⇒ Object
- #process_crawl(url, page) ⇒ Object
- #record(url, data_hashie) ⇒ Object
  Expects @url_collection to contain url; if not, creates a new HashUrlRecord. data_hashie should have :content and/or :parsed_data.
- #record_data(page, data) ⇒ Object
- #record_page(page) ⇒ Object
  Convenience method; expects page to be a Mechanize::Page (or any object responding to #uri and #content).
- #records ⇒ Object
- #uncrawled ⇒ Object
Instance Method Details
#crawl(options = {}) ⇒ Object
# File 'lib/hash_spidey/strategies/hash_store_strategy.rb', line 36

def crawl(options = {})
  @crawl_started_at = Time.now
  @until = Time.now + options[:crawl_for] if options[:crawl_for]
  i = 0
  each_url do |url, handler, default_data|
    break if options[:max_urls] && i >= options[:max_urls]
    begin
      page = agent.get(url)
      Spidey.logger.info "Handling #{url.inspect}"
      send handler, page, default_data
      process_crawl(url, page)
    rescue => ex
      add_error url: url, handler: handler, error: ex
    end
    sleep request_interval if request_interval > 0
    i += 1
  end
end
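Both options are optional. A minimal usage sketch (the spider instance and any queued URLs are assumed to exist; see #handle below):

# Stop fetching once 100 URLs have been handled; :crawl_for sets @until,
# a deadline presumably consulted elsewhere when picking the next URL.
spider.crawl(crawl_for: 60, max_urls: 100)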
#crawls ⇒ Object
Convenience method; returns the crawled entries of @url_collection.

# File 'lib/hash_spidey/strategies/hash_store_strategy.rb', line 17

def crawls
  @url_collection.select{|k,v| v.crawled?}
end
#each_url(&block) ⇒ Object
# File 'lib/hash_spidey/strategies/hash_store_strategy.rb', line 89

def each_url(&block)
  while h_url = get_next_url_hash
    yield h_url.url, h_url.handler, h_url.handle_data
  end
end
#handle(url, handler, handle_data = {}) ⇒ Object
# File 'lib/hash_spidey/strategies/hash_store_strategy.rb', line 57

def handle(url, handler, handle_data = {})
  Spidey.logger.info "Queueing #{url.inspect[0..200]}..."
  spider_name = self.class.name

  @url_collection[url] ||= HashUrlRecord.spidey_handle( url, handler, spider_name, handle_data )
end
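For context, a hedged sketch of queueing a URL with a handler; the class name MySpider, the handler name :process_index, and the choice of HashSpidey::AbstractSpider as the base class (listed above under "Included in") are illustrative assumptions, not part of this page:

class MySpider < HashSpidey::AbstractSpider
  # Handlers are invoked as `send handler, page, default_data` during #crawl.
  def process_index(page, default_data = {})
    record_page(page)   # see #record_page below
  end
end

spider = MySpider.new
spider.handle "http://example.com/", :process_index, category: "index"
spider.crawl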
#initialize(attrs = {}) ⇒ Object
# File 'lib/hash_spidey/strategies/hash_store_strategy.rb', line 5

def initialize(attrs = {})
  @url_collection = {}
  @error_collection = []

  agent.user_agent = "Abstract Spider"
  super(attrs)
end
#process_crawl(url, page) ⇒ Object
# File 'lib/hash_spidey/strategies/hash_store_strategy.rb', line 30

def process_crawl(url, page)
  h_url = @url_collection[url]
  h_url.mark_as_crawled(page)
end
#record(url, data_hashie) ⇒ Object
Expects @url_collection to contain url; if not, creates a new HashUrlRecord. data_hashie should have :content and/or :parsed_data.

# File 'lib/hash_spidey/strategies/hash_store_strategy.rb', line 66

def record(url, data_hashie)
  h_url = @url_collection[url] || HashUrlRecord.new(url)

  # set the content and record_timestamp of the HashUrlRecord
  h_url.mark_record(data_hashie)

  # reassign, update collection
  @url_collection[url] = h_url
end
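A hedged example of calling #record directly; the URL and hash values are illustrative, and the hash mirrors what #record_page and #record_data (below) pass along:

spider.record "http://example.com/item/1",
              content: "<html>...</html>",
              parsed_data: { title: "Item 1" }

Calling it again for the same URL reuses the existing HashUrlRecord and updates it via mark_record.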
#record_data(page, data) ⇒ Object
# File 'lib/hash_spidey/strategies/hash_store_strategy.rb', line 82

def record_data(page, data)
  url = page.uri.to_s
  record(url, parsed_data: data)
end
#record_page(page) ⇒ Object
Convenience method; expects page to be a Mechanize::Page (or any object responding to #uri and #content).
# File 'lib/hash_spidey/strategies/hash_store_strategy.rb', line 77

def record_page(page)
  url = page.uri.to_s
  record(url, content: page.content)
end
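Within a handler, these two wrappers cover the common cases; a short sketch (the handler name and the CSS selector are illustrative, and page is assumed to be a Mechanize::Page, which delegates #at to Nokogiri):

def process_item(page, default_data = {})
  # store the raw response body under the page's URL
  record_page(page)

  # or store structured data extracted from the page
  record_data(page, title: page.at("h1")&.text)
end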
#records ⇒ Object
# File 'lib/hash_spidey/strategies/hash_store_strategy.rb', line 26

def records
  @url_collection.select{|k,v| v.recorded?}
end
#uncrawled ⇒ Object
# File 'lib/hash_spidey/strategies/hash_store_strategy.rb', line 22

def uncrawled
  @url_collection.reject{|k,v| v.crawled?}
end
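Taken together, #crawls, #records, and #uncrawled slice @url_collection by state; a quick inspection sketch after a crawl (the spider variable is carried over from the earlier illustrative examples):

spider.crawl

puts "crawled:   #{spider.crawls.size}"     # entries fetched during the crawl
puts "recorded:  #{spider.records.size}"    # entries with stored content or parsed_data
puts "uncrawled: #{spider.uncrawled.size}"  # entries still waiting to be fetched

# Each filter returns a Hash of url => HashUrlRecord.
spider.records.each_key { |url| puts url }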