Class: Relevance::Tarantula::Crawler
Defined Under Namespace
Classes: CrawlTimeout
Constant Summary
VERSION
Instance Attribute Summary collapse
Instance Method Summary
collapse
-
#append_to_queue(request) ⇒ Object
-
#blip(number = 0) ⇒ Object
-
#crawl(url = "/") ⇒ Object
-
#crawl_the_queue(number = 0) ⇒ Object
-
#do_crawl(number) ⇒ Object
-
#elasped_time_for_pass(num) ⇒ Object
-
#finished? ⇒ Boolean
-
#follow(method, url, data = nil) ⇒ Object
-
#generate_reports ⇒ Object
-
#grab_log! ⇒ Object
-
#handle_form_results(form, response) ⇒ Object
-
#handle_link_results(link, result) ⇒ Object
-
#index_to_insert(request) ⇒ Object
Appends GET requests before other queued requests, DELETE requests at the end of the queue, and all other requests just before the first DELETE request.
-
#initialize ⇒ Crawler
constructor
A new instance of Crawler.
-
#links_completed_count ⇒ Object
-
#links_remaining_count ⇒ Object
-
#make_result(options) ⇒ Object
-
#method_missing(meth, *args) ⇒ Object
-
#queue_form(form, referrer = nil) ⇒ Object
-
#queue_link(dest, referrer = nil) ⇒ Object
-
#report_dir ⇒ Object
-
#report_results ⇒ Object
-
#save_result(result) ⇒ Object
-
#should_skip_form_submission?(fs) ⇒ Boolean
-
#should_skip_link?(link) ⇒ Boolean
-
#should_skip_url?(url) ⇒ Boolean
-
#submit(method, action, data) ⇒ Object
-
#timeout_if_too_long(number = 0) ⇒ Object
-
#total_links_count ⇒ Object
-
#transform_url(url) ⇒ Object
#log, #rails_root, #tarantula_home, #verbose
Constructor Details
Returns a new instance of Crawler.
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
|
# File 'lib/relevance/tarantula/crawler.rb', line 22
# Set up a crawler with conservative defaults: a single crawl pass, a
# 20-minute per-pass timeout, stderr reporting, and skip patterns for
# schemes that cannot be crawled in-process (javascript:, mailto:, and
# absolute http(s) URLs).
def initialize
  @max_url_length = 1024                      # URLs longer than this are skipped
  @successes = []
  @failures = []
  # Result doubles as the default response-code handler (see #method_missing).
  @handlers = [@response_code_handler = Result]
  @links_queued = Set.new                     # every Link ever queued (dedupe)
  @form_signatures_queued = Set.new           # every form signature ever queued (dedupe)
  @crawl_queue = []                           # pending requests, ordered by #append_to_queue
  @crawl_start_times, @crawl_end_times = [], []
  @crawl_timeout = 20.minutes                 # per-pass budget (ActiveSupport duration)
  @referrers = {}                             # url/action => referrer (see #queue_link/#queue_form)
  @skip_uri_patterns = [
    /^javascript/,
    /^mailto/,
    /^http/,
  ]
  # Strip fragment identifiers before queueing (see #transform_url).
  self.transform_url_patterns = [
    [/#.*$/, '']
  ]
  @reporters = [Relevance::Tarantula::IOReporter.new($stderr)]
  @decoder = HTMLEntities.new                 # decodes HTML entities in hrefs/actions
  @times_to_crawl = 1
  @fuzzers = [Relevance::Tarantula::FormSubmission]
  @stdout_tty = $stdout.tty?                  # gates the progress line in #blip
end
|
Dynamic Method Handling
This class handles dynamic method calls through its #method_missing implementation.
#method_missing(meth, *args) ⇒ Object
49
50
51
52
|
# File 'lib/relevance/tarantula/crawler.rb', line 49
# Delegate response-code helper calls (method names matching
# Result::ALLOW_NNN_FOR, e.g. allow_404_for) to the current
# response_code_handler; any other missing method falls through to
# super and raises NoMethodError as usual.
def method_missing(meth, *args)
  super unless Result::ALLOW_NNN_FOR =~ meth.to_s
  @response_code_handler.send(meth, *args)
end

# Keep respond_to? truthful for the dynamic methods handled above.
def respond_to_missing?(meth, include_private = false)
  Result::ALLOW_NNN_FOR =~ meth.to_s ? true : super
end
|
Instance Attribute Details
#crawl_end_times ⇒ Object
Returns the value of attribute crawl_end_times.
20
21
22
|
# File 'lib/relevance/tarantula/crawler.rb', line 20
# Times at which each drain of the crawl queue finished (appended in #do_crawl).
def crawl_end_times
  @crawl_end_times
end
|
#crawl_queue ⇒ Object
Returns the value of attribute crawl_queue.
16
17
18
|
# File 'lib/relevance/tarantula/crawler.rb', line 16
# Pending requests waiting to be crawled, priority-ordered by #append_to_queue.
def crawl_queue
  @crawl_queue
end
|
#crawl_start_times ⇒ Object
Returns the value of attribute crawl_start_times.
20
21
22
|
# File 'lib/relevance/tarantula/crawler.rb', line 20
# Times at which each drain of the crawl queue started (appended in #do_crawl).
def crawl_start_times
  @crawl_start_times
end
|
#crawl_timeout ⇒ Object
Returns the value of attribute crawl_timeout.
16
17
18
|
# File 'lib/relevance/tarantula/crawler.rb', line 16
# Per-pass time budget enforced by #timeout_if_too_long (default 20.minutes).
def crawl_timeout
  @crawl_timeout
end
|
Returns the value of attribute failures.
20
21
22
|
# File 'lib/relevance/tarantula/crawler.rb', line 20
# Collected failing results (initialized to [] in #initialize; populated
# elsewhere in the reporting pipeline — not visible here).
def failures
  @failures
end
|
Returns the value of attribute form_signatures_queued.
16
17
18
|
# File 'lib/relevance/tarantula/crawler.rb', line 16
# Set of form-submission signatures already queued, used for dedupe in
# #should_skip_form_submission?.
def form_signatures_queued
  @form_signatures_queued
end
|
Returns the value of attribute fuzzers.
16
17
18
|
# File 'lib/relevance/tarantula/crawler.rb', line 16
# Objects that mutate forms into concrete submissions (default:
# [Relevance::Tarantula::FormSubmission]); used by #queue_form.
def fuzzers
  @fuzzers
end
|
Returns the value of attribute handlers.
16
17
18
|
# File 'lib/relevance/tarantula/crawler.rb', line 16
# Result handlers applied to each crawled link/form (default: [Result]).
def handlers
  @handlers
end
|
#links_queued ⇒ Object
Returns the value of attribute links_queued.
16
17
18
|
# File 'lib/relevance/tarantula/crawler.rb', line 16
# Set of links already queued, used for dedupe in #should_skip_link?.
def links_queued
  @links_queued
end
|
#log_grabber ⇒ Object
Returns the value of attribute log_grabber.
16
17
18
|
# File 'lib/relevance/tarantula/crawler.rb', line 16
# Optional log capturer used by #grab_log!; may be nil (not set in #initialize).
def log_grabber
  @log_grabber
end
|
#max_url_length ⇒ Object
Returns the value of attribute max_url_length.
16
17
18
|
# File 'lib/relevance/tarantula/crawler.rb', line 16
# Maximum URL length crawled; longer URLs are skipped by #should_skip_url?
# (default 1024).
def max_url_length
  @max_url_length
end
|
Returns the value of attribute proxy.
16
17
18
|
# File 'lib/relevance/tarantula/crawler.rb', line 16
# Transport used by #follow and #submit to issue requests. Not assigned in
# #initialize — presumably injected by the test harness; confirm at call site.
def proxy
  @proxy
end
|
#referrers ⇒ Object
Returns the value of attribute referrers.
20
21
22
|
# File 'lib/relevance/tarantula/crawler.rb', line 20
# Map of queued url/action => referrer, recorded in #queue_form (and reset
# between passes in #crawl).
def referrers
  @referrers
end
|
#reporters ⇒ Object
Returns the value of attribute reporters.
16
17
18
|
# File 'lib/relevance/tarantula/crawler.rb', line 16
# Reporters that receive every result (#save_result) and finalize output
# (#generate_reports). Default: an IOReporter on $stderr.
def reporters
  @reporters
end
|
#response_code_handler ⇒ Object
Returns the value of attribute response_code_handler.
16
17
18
|
# File 'lib/relevance/tarantula/crawler.rb', line 16
# Target for dynamic allow_NNN_for-style calls (see #method_missing);
# defaults to Result.
def response_code_handler
  @response_code_handler
end
|
#skip_uri_patterns ⇒ Object
Returns the value of attribute skip_uri_patterns.
16
17
18
|
# File 'lib/relevance/tarantula/crawler.rb', line 16
# Regexps matched against candidate URLs in #should_skip_url?; a match
# means the URL is not crawled.
def skip_uri_patterns
  @skip_uri_patterns
end
|
#successes ⇒ Object
Returns the value of attribute successes.
20
21
22
|
# File 'lib/relevance/tarantula/crawler.rb', line 20
# Collected successful results (initialized to [] in #initialize; populated
# elsewhere in the reporting pipeline — not visible here).
def successes
  @successes
end
|
#test_name ⇒ Object
Returns the value of attribute test_name.
16
17
18
|
# File 'lib/relevance/tarantula/crawler.rb', line 16
# Name of the current test run, stamped onto results and passed to
# reporters. Not set in #initialize — presumably assigned externally.
def test_name
  @test_name
end
|
#times_to_crawl ⇒ Object
Returns the value of attribute times_to_crawl.
16
17
18
|
# File 'lib/relevance/tarantula/crawler.rb', line 16
# Number of full crawl passes to run (default 1; see #crawl).
def times_to_crawl
  @times_to_crawl
end
|
Returns the value of attribute transform_url_patterns.
20
21
22
|
# File 'lib/relevance/tarantula/crawler.rb', line 20
# URL transforms applied by #transform_url. Set through the
# transform_url_patterns= writer in #initialize; each entry responds to #[]
# (see note in #transform_url).
def transform_url_patterns
  @transform_url_patterns
end
|
Instance Method Details
#append_to_queue(request) ⇒ Object
211
212
213
|
# File 'lib/relevance/tarantula/crawler.rb', line 211
# Insert +request+ into the crawl queue at the priority slot chosen by
# #index_to_insert (GETs first, DELETEs last, everything else between).
def append_to_queue(request)
  slot = index_to_insert(request)
  @crawl_queue.insert(slot, request)
end
|
#blip(number = 0) ⇒ Object
264
265
266
267
268
269
|
# File 'lib/relevance/tarantula/crawler.rb', line 264
# Progress heartbeat called between requests: rewrite the status line
# (only when stdout is a TTY) and enforce the per-pass crawl timeout.
# Does nothing when verbose logging is on.
def blip(number = 0)
  return if verbose
  print "\r #{links_completed_count} of #{total_links_count} links completed " if @stdout_tty
  timeout_if_too_long(number)
end
|
#crawl(url = "/") ⇒ Object
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
|
# File 'lib/relevance/tarantula/crawler.rb', line 60
# Run the configured number of crawl passes starting from +url+.
#
# State (queued links, form signatures, crawl queue) is snapshotted up
# front so each pass after the first starts from the same baseline. A
# CrawlTimeout aborts only the current pass; Ctrl-C aborts the crawl but
# results are still reported (ensure).
def crawl(url = "/")
  # Snapshot bookkeeping so later passes can start fresh.
  orig_links_queued = @links_queued.dup
  orig_form_signatures_queued = @form_signatures_queued.dup
  orig_crawl_queue = @crawl_queue.dup
  @times_to_crawl.times do |num|
    queue_link url
    begin
      do_crawl num
    rescue CrawlTimeout => e
      puts
      puts e.message
    end
    puts "#{ActiveSupport::Inflector.ordinalize((num+1))} crawl" if @times_to_crawl > 1
    if num + 1 < @times_to_crawl
      # Restore *copies* of the snapshots. Assigning the snapshot objects
      # directly (as before) let the next pass mutate them in place, so the
      # third and later passes saw an already-drained queue and
      # already-seen links.
      @links_queued = orig_links_queued.dup
      @form_signatures_queued = orig_form_signatures_queued.dup
      @crawl_queue = orig_crawl_queue.dup
      @referrers = {}
    end
  end
rescue Interrupt
  $stderr.puts "CTRL-C"
ensure
  report_results
end
|
#crawl_the_queue(number = 0) ⇒ Object
101
102
103
104
105
106
|
# File 'lib/relevance/tarantula/crawler.rb', line 101
# Drain the crawl queue front-to-back. Requests queued while crawling
# (links/forms discovered in responses) are picked up by the same loop.
def crawl_the_queue(number = 0)
  until @crawl_queue.empty?
    request = @crawl_queue.shift
    request.crawl
    blip(number)
  end
end
|
#do_crawl(number) ⇒ Object
93
94
95
96
97
98
99
|
# File 'lib/relevance/tarantula/crawler.rb', line 93
# Run drain cycles until no requests remain, timing each cycle so
# #timeout_if_too_long can enforce the per-pass budget.
def do_crawl(number)
  until finished?
    @crawl_start_times << Time.now
    crawl_the_queue(number)
    @crawl_end_times << Time.now
  end
end
|
#elasped_time_for_pass(num) ⇒ Object
133
134
135
|
# File 'lib/relevance/tarantula/crawler.rb', line 133
# Seconds elapsed since crawl pass +num+ started.
# NOTE: "elasped" is a historical typo kept for backward compatibility;
# new code should use the correctly spelled alias below.
def elasped_time_for_pass(num)
  Time.now - crawl_start_times[num]
end
alias elapsed_time_for_pass elasped_time_for_pass
|
#finished? ⇒ Boolean
89
90
91
|
# File 'lib/relevance/tarantula/crawler.rb', line 89
# The crawl pass is finished once the queue has been fully drained.
def finished?
  @crawl_queue.size.zero?
end
|
#follow(method, url, data = nil) ⇒ Object
125
126
127
|
# File 'lib/relevance/tarantula/crawler.rb', line 125
# Replay a crawled link through the proxy (e.g. proxy.get(url, data)).
def follow(method, url, data = nil)
  request = [method, url, data]
  proxy.send(*request)
end
|
#generate_reports ⇒ Object
233
234
235
236
237
238
239
240
241
242
243
244
245
|
# File 'lib/relevance/tarantula/crawler.rb', line 233
# Ask every reporter to finalize its report for this test run. Errors
# from individual reporters are collected so one bad reporter cannot
# stop the others; they are re-raised at the end as one combined failure.
def generate_reports
  failed = reporters.each_with_object([]) do |reporter, errs|
    begin
      reporter.finish_report(test_name)
    rescue RuntimeError => e
      errs << e
    end
  end
  raise failed.map(&:message).join("\n") unless failed.empty?
end
|
#grab_log! ⇒ Object
137
138
139
|
# File 'lib/relevance/tarantula/crawler.rb', line 137
# Capture the current log via the configured log grabber. When no grabber
# is configured, returns the falsy @log_grabber value itself (nil).
def grab_log!
  return @log_grabber unless @log_grabber
  @log_grabber.grab!
end
|
149
150
151
152
153
154
155
156
157
158
159
|
# File 'lib/relevance/tarantula/crawler.rb', line 149
# Feed a form-submission response through every handler and record each
# handler's verdict. Builds the Result via #make_result so the :log and
# :test_name defaults live in one place (the previous version duplicated
# them inline).
#
# form     - the submitted form (supplies meth/action/data).
# response - the HTTP response from submitting it.
def handle_form_results(form, response)
  handlers.each do |h|
    result = make_result(
      :method   => form.meth,
      :url      => form.action,
      :response => response,
      :referrer => form.action,
      :data     => form.data.inspect
    )
    save_result h.handle(result)
  end
end
|
#handle_link_results(link, result) ⇒ Object
114
115
116
117
118
119
120
121
122
123
|
# File 'lib/relevance/tarantula/crawler.rb', line 114
# Run a crawled link's result through every registered handler and record
# what each handler returns.
#
# link   - the crawled link (used only in the error log message).
# result - the Result produced by crawling it.
#
# Handlers are isolated: a failure in one is logged and the rest still
# run. Rescues StandardError rather than Exception so signals,
# SystemExit, and out-of-memory errors are not swallowed.
def handle_link_results(link, result)
  handlers.each do |h|
    begin
      save_result h.handle(result)
    rescue StandardError => e
      log "error handling #{link} #{e.message}"
    end
  end
end
|
#index_to_insert(request) ⇒ Object
Appends GET requests before other queued requests, DELETE requests at the end of the queue, and all other requests just before the first DELETE request.
217
218
219
220
221
222
223
224
225
226
227
|
# File 'lib/relevance/tarantula/crawler.rb', line 217
# Choose the queue slot for +request+ so crawls happen in a safe order:
# GETs are grouped at the front (right after the last queued GET),
# DELETEs go to the back (index -1 appends), and everything else lands
# just before the first queued DELETE. A DELETE whose URL falls under an
# already-queued DELETE's URL takes that DELETE's slot, so the broader
# delete still runs last.
def index_to_insert(request)
  case request.meth
  when 'get'
    last_get = @crawl_queue.rindex { |queued| queued.meth == 'get' }
    last_get ? last_get + 1 : 0
  when 'delete'
    slot = @crawl_queue.index do |queued|
      queued.meth == 'delete' && request.url.start_with?(queued.url)
    end
    slot || -1
  else
    @crawl_queue.index { |queued| queued.meth == 'delete' } || -1
  end
end
|
#links_completed_count ⇒ Object
260
261
262
|
# File 'lib/relevance/tarantula/crawler.rb', line 260
# Requests already crawled: everything ever queued minus what is pending.
def links_completed_count
  remaining = links_remaining_count
  total_links_count - remaining
end
|
#links_remaining_count ⇒ Object
256
257
258
|
# File 'lib/relevance/tarantula/crawler.rb', line 256
# Requests still waiting in the crawl queue.
def links_remaining_count
  @crawl_queue.length
end
|
#make_result(options) ⇒ Object
141
142
143
144
145
146
147
|
# File 'lib/relevance/tarantula/crawler.rb', line 141
# Build a frozen Result from +options+, filling in the current log
# capture and test name unless the caller supplied them.
def make_result(options)
  base = { :log => grab_log!, :test_name => test_name }
  Result.new(base.merge(options)).freeze
end
|
198
199
200
201
202
203
204
205
206
207
208
209
|
# File 'lib/relevance/tarantula/crawler.rb', line 198
# Queue every fuzzed submission of +form+: each fuzzer mutates the form
# into concrete submissions, whose actions are normalized and then queued
# (with referrer tracking and signature dedupe).
def queue_form(form, referrer = nil)
  fuzzers.each do |fuzzer|
    fuzzer.mutate(Form.new(form, self, referrer)).each do |fs|
      fs.action = transform_url(fs.action)
      # `next`, not `return`: skipping one submission (bad URL or
      # already-seen signature) must not abort queueing the remaining
      # mutations and fuzzers, which `return` did.
      next if should_skip_form_submission?(fs)
      @referrers[fs.action] = referrer if referrer
      append_to_queue(fs)
      @form_signatures_queued << fs.signature
    end
  end
end
|
#queue_link(dest, referrer = nil) ⇒ Object
190
191
192
193
194
195
196
|
# File 'lib/relevance/tarantula/crawler.rb', line 190
# Wrap +dest+ in a Link and queue it unless it should be skipped.
# Records it in @links_queued for dedupe. Returns the Link, or nil when
# skipped.
def queue_link(dest, referrer = nil)
  link = Link.new(dest, self, referrer)
  return if should_skip_link?(link)
  append_to_queue(link)
  @links_queued << link
  link
end
|
#report_dir ⇒ Object
229
230
231
|
# File 'lib/relevance/tarantula/crawler.rb', line 229
# Directory where crawl reports are written: <rails_root>/tmp/tarantula.
def report_dir
  segments = [rails_root, "tmp", "tarantula"]
  File.join(*segments)
end
|
#report_results ⇒ Object
247
248
249
250
|
# File 'lib/relevance/tarantula/crawler.rb', line 247
# Print a one-line crawl summary, then finalize all reporters' reports.
def report_results
  crawled = total_links_count
  puts "Crawled #{crawled} links and forms."
  generate_reports
end
|
#save_result(result) ⇒ Object
108
109
110
111
112
|
# File 'lib/relevance/tarantula/crawler.rb', line 108
# Broadcast one handled result to every configured reporter.
def save_result(result)
  reporters.each { |reporter| reporter.report(result) }
end
|
177
178
179
|
# File 'lib/relevance/tarantula/crawler.rb', line 177
# Skip a form submission when its action URL is skippable or an identical
# submission (same signature) has already been queued.
def should_skip_form_submission?(fs)
  return true if should_skip_url?(fs.action)
  @form_signatures_queued.member?(fs.signature)
end
|
#should_skip_link?(link) ⇒ Boolean
173
174
175
|
# File 'lib/relevance/tarantula/crawler.rb', line 173
# Skip a link when its href is skippable or it has already been queued.
def should_skip_link?(link)
  return true if should_skip_url?(link.href)
  @links_queued.member?(link)
end
|
#should_skip_url?(url) ⇒ Boolean
161
162
163
164
165
166
167
168
169
170
171
|
# File 'lib/relevance/tarantula/crawler.rb', line 161
# True when +url+ should not be crawled: blank, matching one of the
# configured skip patterns, or longer than max_url_length. Skips (except
# blanks) are logged. Now returns an explicit false on the crawlable
# path instead of falling through with nil, so the predicate always
# yields a boolean.
def should_skip_url?(url)
  return true if url.blank?
  if @skip_uri_patterns.any? {|pattern| pattern =~ url}
    log "Skipping #{url}"
    return true
  end
  if url.length > max_url_length
    log "Skipping long url #{url}"
    return true
  end
  false
end
|
#submit(method, action, data) ⇒ Object
129
130
131
|
# File 'lib/relevance/tarantula/crawler.rb', line 129
# Submit a form through the proxy with the given payload
# (e.g. proxy.post(action, data)).
def submit(method, action, data)
  proxy.__send__(method, action, data)
end
|
#timeout_if_too_long(number = 0) ⇒ Object
271
272
273
274
275
|
# File 'lib/relevance/tarantula/crawler.rb', line 271
# Abort the current pass with CrawlTimeout once it has run longer than
# crawl_timeout. Called from #blip between requests.
def timeout_if_too_long(number = 0)
  return if elasped_time_for_pass(number) <= crawl_timeout
  raise CrawlTimeout, "Exceeded crawl timeout of #{crawl_timeout} seconds - skipping to the next crawl..."
end
|
#total_links_count ⇒ Object
252
253
254
|
# File 'lib/relevance/tarantula/crawler.rb', line 252
# Everything ever queued during this pass: links plus form signatures.
def total_links_count
  [@links_queued, @form_signatures_queued].map(&:size).sum
end
|
181
182
183
184
185
186
187
188
|
# File 'lib/relevance/tarantula/crawler.rb', line 181
# Normalize a URL before queueing: HTML-entity-decode it, then run it
# through each configured transform (e.g. stripping "#fragment" per the
# default set in #initialize). Returns nil when +url+ is nil.
def transform_url(url)
  return unless url
  # @decoder is an HTMLEntities instance (see #initialize).
  url = @decoder.decode(url)
  @transform_url_patterns.each do |pattern|
    # NOTE(review): each entry is applied via #[] — the raw
    # [regexp, replacement] pairs assigned in #initialize are presumably
    # wrapped into callable transforms by the transform_url_patterns=
    # writer (not visible here); confirm against that writer.
    url = pattern[url]
  end
  url
end
|