Class: Relevance::Tarantula::Crawler
Defined Under Namespace
Classes: CrawlTimeout
Constant Summary
VERSION
Instance Attribute Summary collapse
Instance Method Summary
collapse
#log, #rails_root, #tarantula_home, #verbose
Constructor Details
Returns a new instance of Crawler.
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
|
# File 'lib/relevance/tarantula/crawler.rb', line 21
def initialize
  # Seed the crawler with its defaults. Most of these are exposed through
  # accessors so a test can override them before calling #crawl.
  @max_url_length = 1024
  # Result doubles as the default response-code handler (see #method_missing).
  @successes = []
  @failures = []
  @handlers = [@response_code_handler = Result]
  @links_queued = Set.new
  @form_signatures_queued = Set.new
  @crawl_queue = []
  @crawl_start_times, @crawl_end_times = [], []
  # ActiveSupport duration; compared against elapsed seconds in #timeout_if_too_long.
  @crawl_timeout = 20.minutes
  @referrers = {}
  # URLs matching any of these are never queued (see #should_skip_url?).
  # NOTE(review): /^http/ skips every absolute http(s) link, which keeps the
  # crawl inside the app under test.
  @skip_uri_patterns = [
    /^javascript/,
    /^mailto/,
    /^http/,
  ]
  # Default transform strips fragment identifiers; goes through the custom
  # writer (presumably wrapping each pair so it responds to #[] -- see
  # #transform_url) rather than plain ivar assignment.
  self.transform_url_patterns = [
    [/#.*$/, '']
  ]
  @reporters = [Relevance::Tarantula::IOReporter.new($stderr)]
  @decoder = HTMLEntities.new
  @times_to_crawl = 1
  @fuzzers = [Relevance::Tarantula::FormSubmission]
  # Cache TTY-ness once so #blip can decide whether to print progress.
  @stdout_tty = $stdout.tty?
end
|
Dynamic Method Handling
This class handles dynamic methods through the method_missing method
#method_missing(meth, *args) ⇒ Object
48
49
50
51
|
# File 'lib/relevance/tarantula/crawler.rb', line 48
def method_missing(meth, *args)
  # Forward NNN-style response-code queries (names matching
  # Result::ALLOW_NNN_FOR) to the response code handler; any other name
  # raises NoMethodError via super.
  # NOTE(review): no respond_to_missing? override is visible for this
  # method_missing -- worth confirming against the full class.
  super unless Result::ALLOW_NNN_FOR =~ meth.to_s
  @response_code_handler.send(meth, *args)
end
|
Instance Attribute Details
#crawl_end_times ⇒ Object
Returns the value of attribute crawl_end_times.
19
20
21
|
# File 'lib/relevance/tarantula/crawler.rb', line 19
def crawl_end_times
  # Time stamps appended by #do_crawl when a queue-drain pass finishes.
  @crawl_end_times
end
|
#crawl_queue ⇒ Object
Returns the value of attribute crawl_queue.
15
16
17
|
# File 'lib/relevance/tarantula/crawler.rb', line 15
def crawl_queue
  # Pending work items (links and form submissions), popped by #crawl_the_queue.
  @crawl_queue
end
|
#crawl_start_times ⇒ Object
Returns the value of attribute crawl_start_times.
19
20
21
|
# File 'lib/relevance/tarantula/crawler.rb', line 19
def crawl_start_times
  # Time stamps appended by #do_crawl when a queue-drain pass begins;
  # indexed by pass number in #elasped_time_for_pass.
  @crawl_start_times
end
|
#crawl_timeout ⇒ Object
Returns the value of attribute crawl_timeout.
15
16
17
|
# File 'lib/relevance/tarantula/crawler.rb', line 15
def crawl_timeout
  # Per-pass time budget in seconds (defaults to 20.minutes in #initialize);
  # exceeded budgets raise CrawlTimeout via #timeout_if_too_long.
  @crawl_timeout
end
|
Returns the value of attribute failures.
19
20
21
|
# File 'lib/relevance/tarantula/crawler.rb', line 19
def failures
  # Initialized to [] in #initialize; presumably populated with failing
  # results elsewhere -- no writer is visible in this chunk.
  @failures
end
|
Returns the value of attribute form_signatures_queued.
15
16
17
|
# File 'lib/relevance/tarantula/crawler.rb', line 15
def form_signatures_queued
  # Set of form signatures already queued (added in #queue_form); used to
  # de-duplicate submissions in #should_skip_form_submission?.
  @form_signatures_queued
end
|
Returns the value of attribute fuzzers.
15
16
17
|
# File 'lib/relevance/tarantula/crawler.rb', line 15
def fuzzers
  # Objects that mutate a Form into submission variants (see #queue_form);
  # defaults to [Relevance::Tarantula::FormSubmission].
  @fuzzers
end
|
Returns the value of attribute handlers.
15
16
17
|
# File 'lib/relevance/tarantula/crawler.rb', line 15
def handlers
  # Result handlers run over every crawled link/form response
  # (see #handle_link_results / #handle_form_results); defaults to [Result].
  @handlers
end
|
#links_queued ⇒ Object
Returns the value of attribute links_queued.
15
16
17
|
# File 'lib/relevance/tarantula/crawler.rb', line 15
def links_queued
  # Set of Link objects already queued (added in #queue_link); used to
  # de-duplicate in #should_skip_link?.
  @links_queued
end
|
#log_grabber ⇒ Object
Returns the value of attribute log_grabber.
15
16
17
|
# File 'lib/relevance/tarantula/crawler.rb', line 15
def log_grabber
  # Optional collaborator whose #grab! supplies log output for results
  # (see #grab_log!); nil unless configured -- no setter visible here.
  @log_grabber
end
|
#max_url_length ⇒ Object
Returns the value of attribute max_url_length.
15
16
17
|
# File 'lib/relevance/tarantula/crawler.rb', line 15
def max_url_length
  # Longest URL the crawler will follow (default 1024); enforced in
  # #should_skip_url?.
  @max_url_length
end
|
Returns the value of attribute proxy.
15
16
17
|
# File 'lib/relevance/tarantula/crawler.rb', line 15
def proxy
  # The HTTP driver that #follow and #submit dispatch verb methods to;
  # no default is assigned in #initialize, so it must be set externally.
  @proxy
end
|
#referrers ⇒ Object
Returns the value of attribute referrers.
19
20
21
|
# File 'lib/relevance/tarantula/crawler.rb', line 19
def referrers
  # Map of queued action URL => referrer (written in #queue_form, reset
  # between passes in #crawl).
  @referrers
end
|
#reporters ⇒ Object
Returns the value of attribute reporters.
15
16
17
|
# File 'lib/relevance/tarantula/crawler.rb', line 15
def reporters
  # Receivers of #report/#finish_report calls (see #save_result and
  # #generate_reports); defaults to a single IOReporter on $stderr.
  @reporters
end
|
#response_code_handler ⇒ Object
Returns the value of attribute response_code_handler.
15
16
17
|
# File 'lib/relevance/tarantula/crawler.rb', line 15
def response_code_handler
  # Target of NNN-style dynamic calls in #method_missing; initialized to
  # Result in #initialize.
  @response_code_handler
end
|
#skip_uri_patterns ⇒ Object
Returns the value of attribute skip_uri_patterns.
15
16
17
|
# File 'lib/relevance/tarantula/crawler.rb', line 15
def skip_uri_patterns
  # Regexes matched against candidate URLs in #should_skip_url?; defaults
  # cover javascript:, mailto: and absolute http(s) links.
  @skip_uri_patterns
end
|
#successes ⇒ Object
Returns the value of attribute successes.
19
20
21
|
# File 'lib/relevance/tarantula/crawler.rb', line 19
def successes
  # Initialized to [] in #initialize; presumably populated with passing
  # results elsewhere -- no writer is visible in this chunk.
  @successes
end
|
#test_name ⇒ Object
Returns the value of attribute test_name.
15
16
17
|
# File 'lib/relevance/tarantula/crawler.rb', line 15
def test_name
  # Label stamped onto every Result (see #make_result) and passed to each
  # reporter's #finish_report in #generate_reports.
  @test_name
end
|
#times_to_crawl ⇒ Object
Returns the value of attribute times_to_crawl.
15
16
17
|
# File 'lib/relevance/tarantula/crawler.rb', line 15
def times_to_crawl
  # How many full passes #crawl performs (default 1); state is restored
  # between passes.
  @times_to_crawl
end
|
Returns the value of attribute transform_url_patterns.
19
20
21
|
# File 'lib/relevance/tarantula/crawler.rb', line 19
def transform_url_patterns
  # Transforms applied to each URL in #transform_url via pattern[url];
  # assigned through a custom writer in #initialize, which presumably wraps
  # the raw [regex, replacement] pairs in objects responding to #[] -- confirm
  # against the writer's definition.
  @transform_url_patterns
end
|
Instance Method Details
#blip(number = 0) ⇒ Object
245
246
247
248
249
250
|
# File 'lib/relevance/tarantula/crawler.rb', line 245
def blip(number = 0)
  # Emit a single progress update for pass +number+ and enforce the crawl
  # timeout. Quiet (verbose) runs produce their own output, so skip both.
  return if verbose
  print "\r #{links_completed_count} of #{total_links_count} links completed " if @stdout_tty
  timeout_if_too_long(number)
end
|
#crawl(url = "/") ⇒ Object
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
|
# File 'lib/relevance/tarantula/crawler.rb', line 59
def crawl(url = "/")
  # Crawl the application starting at +url+, @times_to_crawl times.
  # Queue/seen-set state is snapshotted up front and restored between passes;
  # CTRL-C aborts gracefully and results are always reported.
  orig_links_queued = @links_queued.dup
  orig_form_signatures_queued = @form_signatures_queued.dup
  orig_crawl_queue = @crawl_queue.dup
  @times_to_crawl.times do |num|
    queue_link url
    begin
      do_crawl num
    rescue CrawlTimeout => e
      puts
      puts e.message
    end
    puts "#{ActiveSupport::Inflector.ordinalize((num+1))} crawl" if @times_to_crawl > 1
    if num + 1 < @times_to_crawl
      # BUG FIX: restore fresh copies. Assigning the saved collections
      # directly let pass N mutate them in place, so the restore before
      # pass N+1 (for times_to_crawl >= 3) started from polluted state.
      @links_queued = orig_links_queued.dup
      @form_signatures_queued = orig_form_signatures_queued.dup
      @crawl_queue = orig_crawl_queue.dup
      @referrers = {}
    end
  end
rescue Interrupt
  $stderr.puts "CTRL-C"
ensure
  report_results
end
|
#crawl_the_queue(number = 0) ⇒ Object
100
101
102
103
104
105
|
# File 'lib/relevance/tarantula/crawler.rb', line 100
def crawl_the_queue(number = 0)
  # Drain the crawl queue for pass +number+, crawling each request and
  # emitting a progress blip (which may raise CrawlTimeout) after each.
  loop do
    request = @crawl_queue.pop
    break unless request
    request.crawl
    blip(number)
  end
end
|
#do_crawl(number) ⇒ Object
92
93
94
95
96
97
98
|
# File 'lib/relevance/tarantula/crawler.rb', line 92
def do_crawl(number)
  # Keep draining the queue until nothing remains (a drain pass can enqueue
  # new work), recording wall-clock start/end times for each drain.
  until finished?
    @crawl_start_times << Time.now
    crawl_the_queue(number)
    @crawl_end_times << Time.now
  end
end
|
#elasped_time_for_pass(num) ⇒ Object
132
133
134
|
# File 'lib/relevance/tarantula/crawler.rb', line 132
def elasped_time_for_pass(num)
  # Seconds elapsed since crawl pass +num+ began.
  # NOTE(review): "elasped" is a misspelling of "elapsed", but callers
  # (e.g. #timeout_if_too_long) depend on this name, so renaming it would
  # be an interface break.
  Time.now - crawl_start_times[num]
end
|
#finished? ⇒ Boolean
88
89
90
|
# File 'lib/relevance/tarantula/crawler.rb', line 88
def finished?
  # The crawl is done once no work remains on the queue.
  @crawl_queue.size.zero?
end
|
#follow(method, url, data = nil) ⇒ Object
124
125
126
|
# File 'lib/relevance/tarantula/crawler.rb', line 124
def follow(method, url, data = nil)
  # Replay a link through the HTTP proxy using the given verb (:get, :post, ...).
  proxy.__send__(method, url, data)
end
|
#generate_reports ⇒ Object
214
215
216
217
218
219
220
221
222
223
224
225
226
|
# File 'lib/relevance/tarantula/crawler.rb', line 214
def generate_reports
  # Let every reporter finalize its report, collecting failures so that one
  # broken reporter cannot prevent the others from finishing; afterwards the
  # collected messages are re-raised as a single RuntimeError.
  errors = []
  reporters.each do |reporter|
    begin
      reporter.finish_report(test_name)
    rescue StandardError => e
      # Broadened from `rescue RuntimeError`: a reporter raising e.g.
      # IOError or ArgumentError previously escaped mid-loop, aborting the
      # remaining reporters and skipping error aggregation entirely.
      errors << e
    end
  end
  unless errors.empty?
    raise errors.map(&:message).join("\n")
  end
end
|
#grab_log! ⇒ Object
136
137
138
|
# File 'lib/relevance/tarantula/crawler.rb', line 136
def grab_log!
  # Fetch accumulated log output from the grabber, if one is configured;
  # returns the (falsy) grabber itself otherwise.
  return @log_grabber unless @log_grabber
  @log_grabber.grab!
end
|
148
149
150
151
152
153
154
155
156
157
158
|
# File 'lib/relevance/tarantula/crawler.rb', line 148
def handle_form_results(form, response)
  # Build a frozen Result for a completed form submission and run it through
  # every handler, reporting each handler's verdict via #save_result.
  # NOTE(review): :referrer is set to form.action, not the page the form came
  # from -- confirm that is intentional.
  # NOTE(review): unlike #handle_link_results, a raising handler here is NOT
  # rescued, so one bad handler aborts the remaining handlers.
  handlers.each do |h|
    save_result h.handle(Result.new(:method => form.method,
                                    :url => form.action,
                                    :response => response,
                                    :log => grab_log!,
                                    :referrer => form.action,
                                    :data => form.data.inspect,
                                    :test_name => test_name).freeze)
  end
end
|
#handle_link_results(link, result) ⇒ Object
113
114
115
116
117
118
119
120
121
122
|
# File 'lib/relevance/tarantula/crawler.rb', line 113
def handle_link_results(link, result)
  # Run +result+ through every handler, logging (rather than propagating)
  # anything a handler raises so a single bad handler cannot abort the crawl.
  handlers.each do |h|
    begin
      save_result h.handle(result)
    rescue StandardError => e
      # Narrowed from `rescue Exception`, which also swallowed
      # SignalException/SystemExit/NoMemoryError; StandardError keeps the
      # best-effort behavior without masking process-level signals.
      log "error handling #{link} #{e.message}"
    end
  end
end
|
#links_completed_count ⇒ Object
241
242
243
|
# File 'lib/relevance/tarantula/crawler.rb', line 241
def links_completed_count
  # How many queued items have already been processed.
  remaining = links_remaining_count
  total_links_count - remaining
end
|
#links_remaining_count ⇒ Object
237
238
239
|
# File 'lib/relevance/tarantula/crawler.rb', line 237
def links_remaining_count
  # Items still waiting on the crawl queue.
  @crawl_queue.length
end
|
#make_result(options) ⇒ Object
140
141
142
143
144
145
146
|
# File 'lib/relevance/tarantula/crawler.rb', line 140
def make_result(options)
  # Build a frozen Result, filling in the current log capture and test name
  # unless the caller supplied their own values.
  base = {
    :log => grab_log!,
    :test_name => test_name
  }
  Result.new(base.merge(options)).freeze
end
|
197
198
199
200
201
202
203
204
205
206
207
208
|
# File 'lib/relevance/tarantula/crawler.rb', line 197
def queue_form(form, referrer = nil)
  # Expand +form+ into fuzzed submission variants and queue every variant
  # that has not already been seen (tracked by signature).
  fuzzers.each do |fuzzer|
    fuzzer.mutate(Form.new(form, self, referrer)).each do |fs|
      fs.action = transform_url(fs.action)
      # BUG FIX: `next`, not `return` -- a single duplicate/skippable
      # submission previously aborted the whole method, discarding all
      # remaining mutations from this fuzzer and every later fuzzer.
      next if should_skip_form_submission?(fs)
      @referrers[fs.action] = referrer if referrer
      @crawl_queue << fs
      @form_signatures_queued << fs.signature
    end
  end
end
|
#queue_link(dest, referrer = nil) ⇒ Object
189
190
191
192
193
194
195
|
# File 'lib/relevance/tarantula/crawler.rb', line 189
def queue_link(dest, referrer = nil)
  # Wrap +dest+ in a Link and queue it unless it should be skipped
  # (bad scheme, too long, or already queued). Returns the Link, or nil
  # when skipped.
  link = Link.new(dest, self, referrer)
  return if should_skip_link?(link)
  @crawl_queue << link
  @links_queued << link
  link
end
|
#report_dir ⇒ Object
210
211
212
|
# File 'lib/relevance/tarantula/crawler.rb', line 210
def report_dir
  # Reports are written under tmp/tarantula inside the Rails root.
  File.join(rails_root, *%w[tmp tarantula])
end
|
#report_results ⇒ Object
228
229
230
231
|
# File 'lib/relevance/tarantula/crawler.rb', line 228
def report_results
  # Print a one-line crawl summary, then finalize every reporter.
  summary = "Crawled #{total_links_count} links and forms."
  puts summary
  generate_reports
end
|
#save_result(result) ⇒ Object
107
108
109
110
111
|
# File 'lib/relevance/tarantula/crawler.rb', line 107
def save_result(result)
  # Fan a handled result out to every configured reporter.
  reporters.each { |reporter| reporter.report(result) }
end
|
176
177
178
|
# File 'lib/relevance/tarantula/crawler.rb', line 176
def should_skip_form_submission?(fs)
  # Skip a fuzzed submission whose action URL is skippable, or whose
  # signature has already been queued.
  return true if should_skip_url?(fs.action)
  @form_signatures_queued.member?(fs.signature)
end
|
#should_skip_link?(link) ⇒ Boolean
172
173
174
|
# File 'lib/relevance/tarantula/crawler.rb', line 172
def should_skip_link?(link)
  # Skip a link whose href is skippable, or one that was already queued.
  return true if should_skip_url?(link.href)
  @links_queued.member?(link)
end
|
#should_skip_url?(url) ⇒ Boolean
160
161
162
163
164
165
166
167
168
169
170
|
# File 'lib/relevance/tarantula/crawler.rb', line 160
def should_skip_url?(url)
  # True when +url+ is blank, matches a skip pattern, or is too long;
  # falls through (nil) otherwise. Skips are logged.
  return true if url.blank?
  if @skip_uri_patterns.any? { |pattern| pattern =~ url }
    log "Skipping #{url}"
    return true
  end
  return unless url.length > max_url_length
  log "Skipping long url #{url}"
  true
end
|
#submit(method, action, data) ⇒ Object
128
129
130
|
# File 'lib/relevance/tarantula/crawler.rb', line 128
def submit(method, action, data)
  # Drive a form submission through the HTTP proxy using the given verb.
  proxy.__send__(method, action, data)
end
|
#timeout_if_too_long(number = 0) ⇒ Object
252
253
254
255
256
|
# File 'lib/relevance/tarantula/crawler.rb', line 252
def timeout_if_too_long(number = 0)
  # Abort the current pass with CrawlTimeout once its elapsed time exceeds
  # the configured crawl_timeout.
  return unless elasped_time_for_pass(number) > crawl_timeout
  raise CrawlTimeout, "Exceeded crawl timeout of #{crawl_timeout} seconds - skipping to the next crawl..."
end
|
#total_links_count ⇒ Object
233
234
235
|
# File 'lib/relevance/tarantula/crawler.rb', line 233
def total_links_count
  # Everything ever queued: unique links plus unique form signatures.
  [@links_queued, @form_signatures_queued].sum(&:size)
end
|
180
181
182
183
184
185
186
187
|
# File 'lib/relevance/tarantula/crawler.rb', line 180
def transform_url(url)
  # nil-safe URL normalization: decode HTML entities, then apply each
  # configured transform in order (each transform responds to #[]).
  return unless url
  decoded = @decoder.decode(url)
  @transform_url_patterns.reduce(decoded) do |transformed, pattern|
    pattern[transformed]
  end
end
|