Module: HashSpidey::Strategies::HashStore

Included in:
AbstractSpider
Defined in:
lib/hash_spidey/strategies/hash_store_strategy.rb

Instance Method Summary

Instance Method Details

#crawl(options = {}) ⇒ Object



# File 'lib/hash_spidey/strategies/hash_store_strategy.rb', line 36

def crawl(options = {})
  @crawl_started_at = Time.now
  @until = Time.now + options[:crawl_for] if options[:crawl_for]

  i = 0
  each_url do |url, handler, default_data|
    # stop early once the optional :max_urls cap is reached
    break if options[:max_urls] && i >= options[:max_urls]
    begin
      page = agent.get(url)
      Spidey.logger.info "Handling #{url.inspect}"
      # dispatch to the handler registered for this URL, then mark it as crawled
      send handler, page, default_data
      process_crawl(url, page)
    rescue => ex
      add_error url: url, handler: handler, error: ex
    end
    sleep request_interval if request_interval > 0
    i += 1
  end
end
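
A minimal usage sketch of the crawl loop, assuming a spider built on the AbstractSpider this module is included in; the class name, seed URL, and process_index handler below are illustrative, not part of the library:

class ExampleSpider < HashSpidey::AbstractSpider  # assumed superclass, per "Included in" above
  def process_index(page, default_data = {})
    record_page(page)  # keep the fetched page's content
  end
end

spider = ExampleSpider.new
spider.handle "http://example.com/", :process_index  # queue the seed URL
spider.crawl(max_urls: 10, crawl_for: 60)            # :max_urls caps the loop; :crawl_for is in seconds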

#crawls ⇒ Object

Convenience method: returns the entries of @url_collection that have been crawled.



# File 'lib/hash_spidey/strategies/hash_store_strategy.rb', line 17

def crawls
  @url_collection.select{|k,v| v.crawled?}
end

#each_url(&block) ⇒ Object



# File 'lib/hash_spidey/strategies/hash_store_strategy.rb', line 89

def each_url(&block)
  while h_url = get_next_url_hash
    yield h_url.url, h_url.handler, h_url.handle_data
  end
end

#handle(url, handler, handle_data = {}) ⇒ Object



# File 'lib/hash_spidey/strategies/hash_store_strategy.rb', line 57

def handle(url, handler, handle_data = {})
  Spidey.logger.info "Queueing #{url.inspect[0..200]}..."

  spider_name = self.class.name
  @url_collection[url] ||= HashUrlRecord.spidey_handle( url, handler, spider_name, handle_data )
end
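
For example, a handler can queue follow-on URLs as it discovers them, threading extra data through handle_data; the CSS selector, handler name, and :category key below are hypothetical:

def process_index(page, default_data = {})
  page.search("a.item").each do |link|
    # each URL is queued at most once; the ||= in #handle ignores duplicates
    handle page.uri.merge(link["href"]).to_s, :process_item,
           category: default_data[:category]
  end
end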

#initialize(attrs = {}) ⇒ Object



# File 'lib/hash_spidey/strategies/hash_store_strategy.rb', line 5

def initialize(attrs = {})
  @url_collection = {}
  @error_collection = []
  agent.user_agent = "Abstract Spider"

  super(attrs)
end

#process_crawl(url, page) ⇒ Object



# File 'lib/hash_spidey/strategies/hash_store_strategy.rb', line 30

def process_crawl(url, page)
  h_url = @url_collection[url]
  h_url.mark_as_crawled(page)
end

#record(url, data_hashie) ⇒ Object

Expects @url_collection to already have an entry for url; if it does not, a new HashUrlRecord is created.

data_hashie should contain :content and/or :parsed_data.



# File 'lib/hash_spidey/strategies/hash_store_strategy.rb', line 66

def record(url, data_hashie)
  h_url = @url_collection[url] || HashUrlRecord.new(url)

  # set the content and record_timestamp of the HashUrlRecord
  h_url.mark_record(data_hashie)

  # reassign, update collection
  @url_collection[url] = h_url
end
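
A handler could also call #record directly to store raw content and parsed fields together; the example passes a plain Hash, as record_data and record_page do, and the field names here are illustrative:

def process_item(page, default_data = {})
  title = page.at("h1")
  record page.uri.to_s,
         content:     page.content,
         parsed_data: { title: (title && title.text) }
end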

#record_data(page, data) ⇒ Object



# File 'lib/hash_spidey/strategies/hash_store_strategy.rb', line 82

def record_data(page, data)
  url = page.uri.to_s
  record(url, parsed_data: data)
end

#record_page(page) ⇒ Object

Convenience method; expects page to respond to #uri and #content (e.g. the Mechanize::Page returned by agent.get).



# File 'lib/hash_spidey/strategies/hash_store_strategy.rb', line 77

def record_page(page)
  url = page.uri.to_s
  record(url, content: page.content)
end
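
In a handler, the two convenience methods cover the common cases: record_page keeps the raw response body, record_data keeps only extracted fields. The handler name and :keep_html flag below are hypothetical:

def process_article(page, default_data = {})
  if default_data[:keep_html]
    record_page page                      # stores page.content under the page's URL
  else
    record_data page, title: page.title   # stores { title: ... } as :parsed_data
  end
end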

#records ⇒ Object



# File 'lib/hash_spidey/strategies/hash_store_strategy.rb', line 26

def records
  @url_collection.select{|k,v| v.recorded?}
end

#uncrawled ⇒ Object



# File 'lib/hash_spidey/strategies/hash_store_strategy.rb', line 22

def uncrawled
  @url_collection.reject{|k,v| v.crawled?}
end
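
After a crawl, these filters can be used to inspect the collection; continuing the hypothetical spider from the #crawl example above:

spider.crawl(max_urls: 10)

spider.crawls.keys      # URLs that were fetched and marked as crawled
spider.records.keys     # URLs with recorded content or parsed data
spider.uncrawled.keys   # URLs still queued when the crawl stopped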