Class: Kabutops::Crawler

Direct Known Subclasses

Spider

Class Method Summary

.<<(resource) ⇒ Object
.adapters ⇒ Object
.crawl(collection = nil) ⇒ Object
.crawl!(collection = nil) ⇒ Object
.reset! ⇒ Object

Instance Method Summary

#<<(resource) ⇒ Object
#agent ⇒ Object
#crawl(resource) ⇒ Object
#get_cache_or_hit(resource) ⇒ Object
#get_page(url) ⇒ Object
#params ⇒ Object
#perform(resource) ⇒ Object

Methods included from Extensions::CallbackSupport

callbacks, manager, notify

Methods included from Extensions::Includable

#append_features, #included

Methods included from Kabutops::CrawlerExtensions::PStoreStorage

#storage

Methods included from Extensions::Logging

#logger

Class Method Details

.<<(resource) ⇒ Object



# File 'lib/kabutops/crawler.rb', line 44

def << resource
  if debug
    # debug mode buffers resources locally instead of enqueueing jobs
    params[:collection] ||= []
    params[:collection] << resource
    return
  end

  key = resource[:id] || resource[:url]

  if key.nil?
    raise "id or url must be specified for resource"
  else
    # enqueue a background job that will call #perform with this resource
    perform_async(resource.to_hash)
  end
end
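A resource is any hash-like object carrying an :id or :url; in debug mode it is buffered in params[:collection] instead of being enqueued. A minimal usage sketch, assuming a hypothetical subclass named SiteCrawler and a running job backend to pick up the perform_async jobs:

# SiteCrawler is a hypothetical subclass; the resource fields are illustrative.
class SiteCrawler < Kabutops::Crawler
end

SiteCrawler << { id: 42, url: 'http://example.com/items/42' } # enqueues a job
SiteCrawler << { title: 'no id or url' }                      # raises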

.adapters ⇒ Object



# File 'lib/kabutops/crawler.rb', line 20

def adapters
  @adapters ||= []
end

.crawl(collection = nil) ⇒ Object



# File 'lib/kabutops/crawler.rb', line 33

def crawl collection=nil
  if storage[:status].nil?
    (collection || params[:collection] || []).each do |resource|
      self << resource
    end
    storage[:status] = :in_progress
  elsif storage[:status] == :in_progress
    # already seeded; repeated calls are a no-op until reset!
  end
end

.crawl!(collection = nil) ⇒ Object



# File 'lib/kabutops/crawler.rb', line 28

def crawl! collection=nil
  reset!
  crawl(collection)
end

.reset! ⇒ Object



# File 'lib/kabutops/crawler.rb', line 24

def reset!
  storage[:status] = nil
end
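Since crawl seeds the queue only when storage[:status] is nil and then marks it :in_progress, repeated calls are no-ops until reset! clears the flag; crawl! is the shortcut that resets first. A sketch of the lifecycle, reusing the hypothetical SiteCrawler from above:

resources = [
  { url: 'http://example.com/a' },
  { url: 'http://example.com/b' },
]

SiteCrawler.crawl(resources)   # enqueues both, sets storage[:status] = :in_progress
SiteCrawler.crawl(resources)   # no-op: status is already :in_progress
SiteCrawler.reset!             # clears storage[:status]
SiteCrawler.crawl!(resources)  # reset! + crawl, forcing a fresh run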

Instance Method Details

#<<(resource) ⇒ Object



# File 'lib/kabutops/crawler.rb', line 86

def << resource
  self.class << resource
end

#agent ⇒ Object



# File 'lib/kabutops/crawler.rb', line 127

def agent
  if params[:agent].is_a?(Proc)
    # a Proc yields a fresh agent on every access
    @agent = params[:agent].call
  elsif @agent.nil?
    # otherwise memoize the given (or default) agent and apply the proxy once
    @agent = params[:agent] || Mechanize.new
    @agent.set_proxy(*params[:proxy]) if params[:proxy]
  end

  @agent
end
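params[:agent] may be a prebuilt Mechanize instance or a Proc: a Proc is called on every access and so yields a fresh agent each time, while a plain value is memoized with params[:proxy] applied once. Note that :proxy is only applied in the memoized branch, so a Proc must configure its own proxy. A sketch assuming params is writable directly in the class body (the gem may instead expose dedicated DSL setters for these keys):

class ProxiedCrawler < Kabutops::Crawler
  # Assumption: direct params assignment; the gem's DSL may differ.
  params[:proxy] = ['localhost', 8118]  # splatted into Mechanize#set_proxy
end

class RotatingCrawler < Kabutops::Crawler
  # A Proc yields a fresh agent per access and must set its own proxy,
  # since params[:proxy] is only applied in the memoized branch.
  params[:agent] = lambda do
    agent = Mechanize.new
    agent.set_proxy('localhost', 8118)
    agent
  end
end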

#crawl(resource) ⇒ Object



# File 'lib/kabutops/crawler.rb', line 94

def crawl resource
  page = get_cache_or_hit(resource)
  self.class.notify(:after_crawl, resource, page)
  page
rescue Mechanize::ResponseCodeError => e
  if e.response_code.to_i == 404
    nil # missing pages are skipped silently
  else
    logger.error(e.response_code)
    raise
  end
end
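On success the :after_crawl callbacks receive the resource and the parsed page; a 404 yields nil, and any other HTTP error is logged and re-raised. A sketch assuming Extensions::CallbackSupport exposes after_crawl as a class-level hook (the name is inferred from the notify(:after_crawl, ...) call above):

class AuditedCrawler < Kabutops::Crawler
  # Assumed hook name, inferred from self.class.notify(:after_crawl, ...).
  after_crawl do |resource, page|
    puts "fetched #{resource[:url]}: #{page.css('title').text}"
  end
end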

#get_cache_or_hit(resource) ⇒ Object



# File 'lib/kabutops/crawler.rb', line 107

def get_cache_or_hit resource
  cache_key = (resource[:id] || Digest::SHA256.hexdigest(resource[:url])).to_s
  page = nil

  content = Cachy.cache_if(params.cache, cache_key) do
    sleep params[:wait] || 0 # throttle only when the value is not cached
    page = get_page(resource[:url])
    self.class.notify(:before_cache, resource, page)
    page.to_s
  end

  # on a cache hit the block never ran, so parse the cached HTML
  page || Nokogiri::HTML(content)
end
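The cache key prefers the resource's :id and falls back to a SHA-256 digest of its URL, so identical URLs always map to the same cache slot, and the throttling sleep runs only on a cache miss (inside the Cachy block). The key derivation in isolation:

require 'digest'

def cache_key_for(resource)
  (resource[:id] || Digest::SHA256.hexdigest(resource[:url])).to_s
end

cache_key_for(id: 42, url: 'http://example.com/a')  # => "42"
cache_key_for(url: 'http://example.com/a')          # => 64-char hex digest of the URL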

#get_page(url) ⇒ Object



# File 'lib/kabutops/crawler.rb', line 121

def get_page url
  body = agent.get(url).body
  # transcode to UTF-8 when a source encoding is configured
  body.encode!('utf-8', params[:encoding]) if params[:encoding]
  Nokogiri::HTML(body)
end
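The raw body is transcoded to UTF-8 when params[:encoding] names the source encoding, which matters for sites serving legacy charsets. The same steps standalone (the URL and encoding are illustrative):

require 'mechanize'
require 'nokogiri'

body = Mechanize.new.get('http://example.com').body
body.encode!('utf-8', 'windows-1250') # only when the source encoding is known
doc = Nokogiri::HTML(body)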

#params ⇒ Object



# File 'lib/kabutops/crawler.rb', line 90

def params
  self.class.params
end

#perform(resource) ⇒ Object



# File 'lib/kabutops/crawler.rb', line 61

def perform resource
  resource = Hashie::Mash.new(resource)

  # with skip_existing, drop adapters that already hold this resource
  adapters = self.class.adapters.select do |adapter|
    params.skip_existing ? adapter.find(resource).nil? : true
  end

  return if adapters.empty? # select never returns nil, so test for emptiness
  page = crawl(resource)
  return if page.nil?
  return unless (self.class.notify(:store_if, resource, page) || []).all?

  adapters.each do |adapter|
    adapter.process(resource, page)
  end
rescue Exception => e
  unless self.class.debug
    logger.error(e.message)
    logger.error(e.backtrace.join("\n"))
  end

  # throttle before re-raising so a retrying worker does not hammer the site
  sleep params[:wait] || 0
  raise e
end
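Taken together, perform wraps the raw hash in a Hashie::Mash, filters adapters (dropping those that already hold the resource when params.skip_existing is set), crawls, asks the :store_if callbacks for a unanimous go-ahead, and hands the page to each remaining adapter. A sketch of an adapter shape inferred from the find/process calls above; the gem's real adapter base class and registration API may differ:

# Shape inferred from adapter.find(resource) and adapter.process(resource, page).
class MemoryAdapter
  def initialize
    @store = {}
  end

  # Used by skip_existing: a non-nil result means "already stored, skip me".
  def find(resource)
    @store[resource[:id] || resource[:url]]
  end

  def process(resource, page)
    @store[resource[:id] || resource[:url]] = page.css('title').text
  end
end

SiteCrawler.adapters << MemoryAdapter.new  # .adapters is the registration point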