Class: Kabutops::Crawler
Direct Known Subclasses
Spider
Class Method Summary
collapse
Instance Method Summary
collapse
callbacks, manager, notify
#append_features, #included
#storage
#logger
Class Method Details
.<<(resource) ⇒ Object
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
|
# File 'lib/kabutops/crawler.rb', line 44
# Enqueue a resource for asynchronous crawling.
#
# In debug mode the resource is only collected into params[:collection]
# instead of being scheduled as a background job. A resource must carry
# either an :id or a :url so it can be identified.
def << resource
  if debug
    params[:collection] ||= []
    params[:collection] << resource
    return
  end

  if resource[:id] || resource[:url]
    perform_async(resource.to_hash)
  else
    raise "url must be specified for resource"
  end
end
|
.adapters ⇒ Object
20
21
22
|
# File 'lib/kabutops/crawler.rb', line 20
# Lazily-initialized list of storage adapters registered on this crawler.
def adapters
  @adapters = [] if @adapters.nil?
  @adapters
end
|
.crawl(collection = nil) ⇒ Object
33
34
35
36
37
38
39
40
41
42
|
# File 'lib/kabutops/crawler.rb', line 33
# Enqueue every resource from +collection+ (falling back to the
# configured params[:collection]) and mark the crawl as started.
# Does nothing when a status is already recorded (e.g. :in_progress).
def crawl collection=nil
  return unless storage[:status].nil?

  resources = collection || params[:collection] || []
  resources.each { |resource| self << resource }
  storage[:status] = :in_progress
end
|
.crawl!(collection = nil) ⇒ Object
28
29
30
31
|
# File 'lib/kabutops/crawler.rb', line 28
# Force a crawl from scratch: wipes the stored status first so that
# #crawl treats the run as brand new.
def crawl! collection=nil
  reset!
  crawl collection
end
|
.reset! ⇒ Object
24
25
26
|
# File 'lib/kabutops/crawler.rb', line 24
# Forget the persisted crawl status so the next #crawl starts over.
def reset!
  storage[:status] = nil
end
|
Instance Method Details
#<<(resource) ⇒ Object
86
87
88
|
# File 'lib/kabutops/crawler.rb', line 86
# Delegate resource enqueueing to the class-level #<<.
def << resource
  klass = self.class
  klass << resource
end
|
#agent ⇒ Object
127
128
129
130
131
132
133
134
135
136
|
# File 'lib/kabutops/crawler.rb', line 127
# HTTP agent used for page fetches.
#
# A Proc in params[:agent] is invoked on every call (deliberately not
# memoized); otherwise the agent is built once — either the configured
# one or a fresh Mechanize — with an optional proxy from params[:proxy].
def agent
  case
  when params[:agent].is_a?(Proc)
    @agent = params[:agent].call
  when @agent.nil?
    @agent = params[:agent] || Mechanize.new
    @agent.set_proxy(*params[:proxy]) if params[:proxy]
  end
  @agent
end
|
#crawl(resource) ⇒ Object
94
95
96
97
98
99
100
101
102
103
104
105
|
# File 'lib/kabutops/crawler.rb', line 94
# Fetch the page for +resource+ (served from cache when configured) and
# fire the :after_crawl callbacks.
#
# Returns nil when the server responds 404; every other HTTP error is
# logged and re-raised.
def crawl resource
  page = get_cache_or_hit(resource)
  self.class.notify(:after_crawl, resource, page)
  page
rescue Mechanize::ResponseCodeError => e
  return nil if e.response_code.to_i == 404

  logger.error(e.response_code)
  raise
end
|
#get_cache_or_hit(resource) ⇒ Object
107
108
109
110
111
112
113
114
115
116
117
118
119
|
# File 'lib/kabutops/crawler.rb', line 107
# Return the parsed page for +resource+, serving it from cache when
# caching is enabled (params.cache).
#
# The cache key prefers the resource :id, falling back to a SHA-256
# digest of its :url. On a cache miss the crawler sleeps params[:wait]
# seconds (politeness delay), fetches the page, fires :before_cache
# callbacks and stores the serialized HTML; on a hit the stored HTML is
# re-parsed with Nokogiri.
def get_cache_or_hit resource
  cache_key = (resource[:id] || Digest::SHA256.hexdigest(resource[:url])).to_s
  page = nil
  content = Cachy.cache_if(params.cache, cache_key) do
    # These two statements were fused onto a single line in the original
    # (`sleep params[:wait] || 0 page = ...`), which is a syntax error.
    sleep params[:wait] || 0
    page = get_page(resource[:url])
    self.class.notify(:before_cache, resource, page)
    page.to_s
  end
  # On a miss `page` holds the live Nokogiri doc; on a hit re-parse the
  # cached HTML string.
  page || Nokogiri::HTML(content)
end
|
#get_page(url) ⇒ Object
121
122
123
124
125
|
# File 'lib/kabutops/crawler.rb', line 121
# Download +url+ and return it as a parsed Nokogiri HTML document,
# transcoding the body from params[:encoding] to UTF-8 when configured.
def get_page url
  body = agent.get(url).body
  if params[:encoding]
    body.encode!('utf-8', params[:encoding])
  end
  Nokogiri::HTML(body)
end
|
#params ⇒ Object
90
91
92
|
# File 'lib/kabutops/crawler.rb', line 90
def params
self.class.params
end
|
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
|
# File 'lib/kabutops/crawler.rb', line 61
# Background-job entry point: crawl a single resource and hand it to
# every interested adapter.
#
# When params.skip_existing is set, adapters that already contain the
# resource are filtered out; if none remain the resource is not fetched
# at all. Failures are logged (unless debugging), throttled by the
# optional params[:wait] delay, and re-raised so the job system can
# retry.
def perform resource
  resource = Hashie::Mash.new(resource)

  adapters = self.class.adapters.select do |adapter|
    params.skip_existing ? adapter.find(resource).nil? : true
  end
  # BUG FIX: Array#select never returns nil, so the original
  # `return if adapters.nil?` was dead code — bail out (and skip the
  # HTTP request entirely) when no adapter wants this resource.
  return if adapters.empty?

  page = crawl(resource)
  return if page.nil?
  return unless (self.class.notify(:store_if, resource, page) || []).all?

  adapters.each do |adapter|
    adapter.process(resource, page)
  end
rescue StandardError => e
  # Narrowed from `rescue Exception`: signals and system-exit must pass
  # through untouched; the error is still re-raised for retry handling.
  unless self.class.debug
    logger.error(e.message)
    logger.error(e.backtrace.join("\n"))
  end
  sleep params[:wait] || 0
  raise e
end
|