Module: HashSpidey::Strategies::HashStore

Included in:
AbstractSpider
Defined in:
lib/hash_spidey/strategies/hash_store_strategy.rb

Instance Method Summary

Instance Method Details

#crawl(options = {}) ⇒ Object



# File 'lib/hash_spidey/strategies/hash_store_strategy.rb', line 36

def crawl(options = {})
  @crawl_started_at = Time.now
  @until = Time.now + options[:crawl_for] if options[:crawl_for]

  i = 0
  each_url do |url, handler, default_data|
    # stop early once the optional :max_urls cap is reached
    break if options[:max_urls] && i >= options[:max_urls]
    begin
      page = agent.get(url)
      Spidey.logger.info "Handling #{url.inspect}"
      # dispatch to the handler registered for this URL, then mark it as crawled
      send handler, page, default_data
      process_crawl(url, page)
    rescue => ex
      add_error url: url, handler: handler, error: ex
    end
    sleep request_interval if request_interval > 0
    i += 1
  end
end
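
A minimal usage sketch of the crawl loop, assuming a spider built on the AbstractSpider this module is included in; the class name, seed URL, and process_index handler below are illustrative, not part of the library:

class ExampleSpider < HashSpidey::AbstractSpider  # assumed superclass, per "Included in" above
  def process_index(page, default_data = {})
    record_page(page)  # keep the fetched page's content
  end
end

spider = ExampleSpider.new
spider.handle "http://example.com/", :process_index  # queue the seed URL
spider.crawl(max_urls: 10, crawl_for: 60)            # :max_urls caps the loop; :crawl_for is in seconds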

#crawls ⇒ Object

Convenience method: returns the entries of @url_collection that have been crawled.



# File 'lib/hash_spidey/strategies/hash_store_strategy.rb', line 17

def crawls
  @url_collection.select{|k,v| v.crawled?}
end

#each_url(&block) ⇒ Object



# File 'lib/hash_spidey/strategies/hash_store_strategy.rb', line 89

def each_url(&block)
  while h_url = get_next_url_hash
    yield h_url.url, h_url.handler, h_url.handle_data
  end
end

#handle(url, handler, handle_data = {}) ⇒ Object



# File 'lib/hash_spidey/strategies/hash_store_strategy.rb', line 57

def handle(url, handler, handle_data = {})
  Spidey.logger.info "Queueing #{url.inspect[0..200]}..."

  spider_name = self.class.name
  @url_collection[url] ||= HashUrlRecord.spidey_handle( url, handler, spider_name, handle_data )
end
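
For example, a handler can queue follow-on URLs as it discovers them, threading extra data through handle_data; the CSS selector, handler name, and :category key below are hypothetical:

def process_index(page, default_data = {})
  page.search("a.item").each do |link|
    # each URL is queued at most once; the ||= in #handle ignores duplicates
    handle page.uri.merge(link["href"]).to_s, :process_item,
           category: default_data[:category]
  end
end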

#initialize(attrs = {}) ⇒ Object



# File 'lib/hash_spidey/strategies/hash_store_strategy.rb', line 5

def initialize(attrs = {})
  @url_collection = {}
  @error_collection = []
  agent.user_agent = "Abstract Spider"

  super(attrs)
end

#process_crawl(url, page) ⇒ Object



# File 'lib/hash_spidey/strategies/hash_store_strategy.rb', line 30

def process_crawl(url, page)
  h_url = @url_collection[url]
  h_url.mark_as_crawled(page)
end

#record(url, data_hashie) ⇒ Object

Expects @url_collection to already have an entry for url; if it does not, a new HashUrlRecord is created.

data_hashie should contain :content and/or :parsed_data.



# File 'lib/hash_spidey/strategies/hash_store_strategy.rb', line 66

def record(url, data_hashie)
  h_url = @url_collection[url] || HashUrlRecord.new(url)

  # set the content and record_timestamp of the HashUrlRecord
  h_url.mark_record(data_hashie)

  # reassign, update collection
  @url_collection[url] = h_url
end
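
A handler could also call #record directly to store raw content and parsed fields together; the example passes a plain Hash, as record_data and record_page do, and the field names here are illustrative:

def process_item(page, default_data = {})
  title = page.at("h1")
  record page.uri.to_s,
         content:     page.content,
         parsed_data: { title: (title && title.text) }
end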

#record_data(page, data) ⇒ Object



# File 'lib/hash_spidey/strategies/hash_store_strategy.rb', line 82

def record_data(page, data)
  url = page.uri.to_s
  record(url, parsed_data: data)
end

#record_page(page) ⇒ Object

Convenience method; expects page to respond to #uri and #content (e.g. the Mechanize::Page returned by agent.get).



# File 'lib/hash_spidey/strategies/hash_store_strategy.rb', line 77

def record_page(page)
  url = page.uri.to_s
  record(url, content: page.content)
end
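
In a handler, the two convenience methods cover the common cases: record_page keeps the raw response body, record_data keeps only extracted fields. The handler name and :keep_html flag below are hypothetical:

def process_article(page, default_data = {})
  if default_data[:keep_html]
    record_page page                      # stores page.content under the page's URL
  else
    record_data page, title: page.title   # stores { title: ... } as :parsed_data
  end
end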

#records ⇒ Object



# File 'lib/hash_spidey/strategies/hash_store_strategy.rb', line 26

def records
  @url_collection.select{|k,v| v.recorded?}
end

#uncrawled ⇒ Object



# File 'lib/hash_spidey/strategies/hash_store_strategy.rb', line 22

def uncrawled
  @url_collection.reject{|k,v| v.crawled?}
end
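
After a crawl, these filters can be used to inspect the collection; continuing the hypothetical spider from the #crawl example above:

spider.crawl(max_urls: 10)

spider.crawls.keys      # URLs that were fetched and marked as crawled
spider.records.keys     # URLs with recorded content or parsed data
spider.uncrawled.keys   # URLs still queued when the crawl stopped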