Class: Relevance::Tarantula::Crawler
Constant Summary
VERSION
Instance Attribute Summary collapse
Instance Method Summary collapse
#log, #rails_root, #tarantula_home, #verbose
Constructor Details
Returns a new instance of Crawler.
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
|
# File 'lib/relevance/tarantula/crawler.rb', line 14
# Sets up a crawler with its default configuration: empty result lists and
# work queues, the default URL skip/transform patterns, a reporter writing to
# $stderr, an HTML entity decoder, a single crawl pass and one fuzzer.
def initialize
  @max_url_length = 1024
  @successes, @failures = [], []
  # Result doubles as the response-code handler and is the only default handler.
  @response_code_handler = Result
  @handlers = [@response_code_handler]
  @links_queued, @form_signatures_queued = Set.new, Set.new
  @links_to_crawl, @forms_to_crawl = [], []
  @referrers = {}
  @skip_uri_patterns = [/^javascript/, /^mailto/, /^http/]
  # Goes through the writer rather than the ivar; by default fragments
  # ("#...") are stripped from URLs.
  self.transform_url_patterns = [[/#.*$/, '']]
  @reporters = [Relevance::Tarantula::IOReporter.new($stderr)]
  @decoder = HTMLEntities.new
  @times_to_crawl = 1
  @fuzzers = [Relevance::Tarantula::FormSubmission]
end
|
Dynamic Method Handling
This class handles dynamic methods through the method_missing method
#method_missing(meth, *args) ⇒ Object
38
39
40
41
|
# File 'lib/relevance/tarantula/crawler.rb', line 38
# Forwards any call whose name matches Result::ALLOW_NNN_FOR to the
# response code handler; every other unknown method falls through to the
# default behaviour (NoMethodError).
#
# @param meth [Symbol] name of the missing method
# @param args [Array] arguments to forward
# @param block [Proc] optional block, forwarded as well
# @return [Object] whatever the response code handler returns
def method_missing(meth, *args, &block)
  return super unless Result::ALLOW_NNN_FOR =~ meth.to_s
  @response_code_handler.send(meth, *args, &block)
end

# Keeps respond_to? truthful for the methods method_missing accepts.
#
# @param meth [Symbol] method name being queried
# @param include_private [Boolean] passed through to the default lookup
# @return [Boolean]
def respond_to_missing?(meth, include_private = false)
  (Result::ALLOW_NNN_FOR =~ meth.to_s) ? true : super
end
|
Instance Attribute Details
#failures ⇒ Object
Returns the value of attribute failures.
12
13
14
|
# File 'lib/relevance/tarantula/crawler.rb', line 12
# Reader for +@failures+; the constructor starts it as an empty Array.
def failures; @failures; end
|
#form_signatures_queued ⇒ Object
Returns the value of attribute form_signatures_queued.
8
9
10
|
# File 'lib/relevance/tarantula/crawler.rb', line 8
# Reader for +@form_signatures_queued+; the constructor starts it as an empty Set.
def form_signatures_queued; @form_signatures_queued; end
|
#forms_to_crawl ⇒ Object
Returns the value of attribute forms_to_crawl.
8
9
10
|
# File 'lib/relevance/tarantula/crawler.rb', line 8
# Reader for +@forms_to_crawl+, the queue of form submissions still to be
# crawled (an empty Array after construction).
def forms_to_crawl; @forms_to_crawl; end
|
#fuzzers ⇒ Object
Returns the value of attribute fuzzers.
8
9
10
|
# File 'lib/relevance/tarantula/crawler.rb', line 8
# Reader for +@fuzzers+; the constructor seeds it with
# Relevance::Tarantula::FormSubmission.
def fuzzers; @fuzzers; end
|
#handlers ⇒ Object
Returns the value of attribute handlers.
8
9
10
|
# File 'lib/relevance/tarantula/crawler.rb', line 8
# Reader for +@handlers+; the constructor seeds it with a one-element Array
# holding Result.
def handlers; @handlers; end
|
#links_queued ⇒ Object
Returns the value of attribute links_queued.
8
9
10
|
# File 'lib/relevance/tarantula/crawler.rb', line 8
# Reader for +@links_queued+, the collection of every link queued so far
# (an empty Set after construction).
def links_queued; @links_queued; end
|
#links_to_crawl ⇒ Object
Returns the value of attribute links_to_crawl.
8
9
10
|
# File 'lib/relevance/tarantula/crawler.rb', line 8
# Reader for +@links_to_crawl+, the queue of links still to be requested
# (an empty Array after construction).
def links_to_crawl; @links_to_crawl; end
|
#log_grabber ⇒ Object
Returns the value of attribute log_grabber.
8
9
10
|
# File 'lib/relevance/tarantula/crawler.rb', line 8
# Reader for +@log_grabber+, the optional object that #grab_log! asks for
# log output via +grab!+.
def log_grabber; @log_grabber; end
|
#max_url_length ⇒ Object
Returns the value of attribute max_url_length.
8
9
10
|
# File 'lib/relevance/tarantula/crawler.rb', line 8
# Reader for +@max_url_length+ (1024 after construction); longer URLs are
# skipped by #should_skip_url?.
def max_url_length; @max_url_length; end
|
#proxy ⇒ Object
Returns the value of attribute proxy.
8
9
10
|
# File 'lib/relevance/tarantula/crawler.rb', line 8
# Reader for +@proxy+, the object that receives the HTTP-verb calls issued
# while crawling links and forms.
def proxy; @proxy; end
|
#referrers ⇒ Object
Returns the value of attribute referrers.
12
13
14
|
# File 'lib/relevance/tarantula/crawler.rb', line 12
# Reader for +@referrers+, the Hash recording where each queued link or
# form action was found (empty after construction).
def referrers; @referrers; end
|
#reporters ⇒ Object
Returns the value of attribute reporters.
8
9
10
|
# File 'lib/relevance/tarantula/crawler.rb', line 8
# Reader for +@reporters+; the constructor seeds it with an IOReporter
# writing to $stderr.
def reporters; @reporters; end
|
#response_code_handler ⇒ Object
Returns the value of attribute response_code_handler.
8
9
10
|
# File 'lib/relevance/tarantula/crawler.rb', line 8
# Reader for +@response_code_handler+ (Result after construction), the
# target of calls forwarded by #method_missing.
def response_code_handler; @response_code_handler; end
|
#skip_uri_patterns ⇒ Object
Returns the value of attribute skip_uri_patterns.
8
9
10
|
# File 'lib/relevance/tarantula/crawler.rb', line 8
# Reader for +@skip_uri_patterns+, the regexps matched against URLs to
# decide whether they are skipped.
def skip_uri_patterns; @skip_uri_patterns; end
|
#successes ⇒ Object
Returns the value of attribute successes.
12
13
14
|
# File 'lib/relevance/tarantula/crawler.rb', line 12
# Reader for +@successes+; the constructor starts it as an empty Array.
def successes; @successes; end
|
#test_name ⇒ Object
Returns the value of attribute test_name.
8
9
10
|
# File 'lib/relevance/tarantula/crawler.rb', line 8
# Reader for +@test_name+, the label attached to each Result and handed to
# reporters when reports are finished.
def test_name; @test_name; end
|
#times_to_crawl ⇒ Object
Returns the value of attribute times_to_crawl.
8
9
10
|
# File 'lib/relevance/tarantula/crawler.rb', line 8
# Reader for +@times_to_crawl+ (1 after construction), the number of passes
# #crawl performs.
def times_to_crawl; @times_to_crawl; end
|
#transform_url_patterns ⇒ Object
Returns the value of attribute transform_url_patterns.
12
13
14
|
# File 'lib/relevance/tarantula/crawler.rb', line 12
# Reader for +@transform_url_patterns+, the transformations #transform_url
# applies in order to every URL.
def transform_url_patterns; @transform_url_patterns; end
|
Instance Method Details
235
236
237
238
239
|
# File 'lib/relevance/tarantula/crawler.rb', line 235
# Prints an in-place progress line (leading carriage return, no newline)
# unless verbose mode is on.
def blip
  return if verbose
  print "\r #{links_completed_count} of #{total_links_count} links completed "
end
|
#crawl(url = "/") ⇒ Object
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
|
# File 'lib/relevance/tarantula/crawler.rb', line 49
# Crawls the site starting at +url+, repeating the whole crawl
# +@times_to_crawl+ times. Between passes the queues are reset to the state
# they had on entry and the referrer map is cleared. CTRL-C stops the crawl
# early; results are reported in every case.
#
# @param url [String] the URL to start from
def crawl(url = "/")
  orig_links_queued = @links_queued.dup
  orig_form_signatures_queued = @form_signatures_queued.dup
  orig_links_to_crawl = @links_to_crawl.dup
  orig_forms_to_crawl = @forms_to_crawl.dup
  @times_to_crawl.times do |i|
    queue_link url
    do_crawl
    puts "#{(i+1).ordinalize} crawl" if @times_to_crawl > 1
    if i + 1 < @times_to_crawl
      # Restore copies, never the snapshots themselves: the next pass mutates
      # these collections, and a shared snapshot would leave every pass after
      # the second starting with the links already marked as queued.
      @links_queued = orig_links_queued.dup
      @form_signatures_queued = orig_form_signatures_queued.dup
      @links_to_crawl = orig_links_to_crawl.dup
      @forms_to_crawl = orig_forms_to_crawl.dup
      @referrers = {}
    end
  end
rescue Interrupt
  $stderr.puts "CTRL-C"
ensure
  report_results
end
|
116
117
118
119
120
121
122
123
|
# File 'lib/relevance/tarantula/crawler.rb', line 116
# Submits one form through the proxy and returns the response. If the
# request raises ActiveRecord::RecordNotFound, the miss is logged and a
# synthetic 404 response carrying the exception message is returned.
#
# @param form [#method, #action, #data] the submission to send
# @return [Object] the proxy's response, or the synthetic 404 response
def crawl_form(form)
  reply = proxy.send(form.method, form.action, form.data)
  log "Response #{reply.code} for #{form}"
  reply
rescue ActiveRecord::RecordNotFound => missing
  log "Skipping #{form.action}, presumed ok that record is missing"
  Relevance::Tarantula::Response.new(
    :code => "404",
    :body => missing.message,
    :content_type => "text/plain"
  )
end
|
125
126
127
128
129
130
131
|
# File 'lib/relevance/tarantula/crawler.rb', line 125
# Drains the form queue from the end (last queued, first crawled): each
# submission is crawled, its response handed to the handlers, and the
# progress indicator updated.
def crawl_queued_forms
  while (submission = @forms_to_crawl.pop)
    handle_form_results(submission, crawl_form(submission))
    blip
  end
end
|
#crawl_queued_links ⇒ Object
85
86
87
88
89
90
91
92
|
# File 'lib/relevance/tarantula/crawler.rb', line 85
# Drains the link queue from the end (last queued, first crawled): each link
# is requested through the proxy using the link's own HTTP method, the
# response code is logged, and the response is passed to the handlers.
def crawl_queued_links
  while (target = @links_to_crawl.pop)
    reply = proxy.send(target.method, target.href)
    log "Response #{reply.code} for #{target}"
    handle_link_results(target, reply)
    blip
  end
end
|
78
79
80
81
82
83
|
# File 'lib/relevance/tarantula/crawler.rb', line 78
# Alternates between draining the link queue and the form queue until both
# are empty; crawling either kind may queue more of the other.
def do_crawl
  until finished?
    crawl_queued_links
    crawl_queued_forms
  end
end
|
#finished? ⇒ Boolean
74
75
76
|
# File 'lib/relevance/tarantula/crawler.rb', line 74
# True once neither links nor forms are waiting to be crawled.
#
# @return [Boolean]
def finished?
  return false unless @links_to_crawl.empty?
  @forms_to_crawl.empty?
end
|
#generate_reports ⇒ Object
205
206
207
208
209
210
211
212
213
214
215
216
217
|
# File 'lib/relevance/tarantula/crawler.rb', line 205
# Asks every reporter to finish its report for the current test name. A
# RuntimeError from one reporter does not stop the others; all collected
# messages are re-raised afterwards as one RuntimeError, one message per line.
#
# @raise [RuntimeError] if at least one reporter raised a RuntimeError
def generate_reports
  failures_seen = []
  reporters.each do |reporter|
    begin
      reporter.finish_report(test_name)
    rescue RuntimeError => problem
      failures_seen << problem
    end
  end
  raise failures_seen.map { |problem| problem.message }.join("\n") unless failures_seen.empty?
end
|
#grab_log! ⇒ Object
133
134
135
|
# File 'lib/relevance/tarantula/crawler.rb', line 133
# Returns whatever the log grabber's +grab!+ yields, or the falsy grabber
# itself when none is configured.
def grab_log!
  grabber = @log_grabber
  return grabber unless grabber
  grabber.grab!
end
|
137
138
139
140
141
142
143
144
145
146
147
|
# File 'lib/relevance/tarantula/crawler.rb', line 137
# Wraps a form submission and its response in a frozen Result, runs it
# through each handler and passes whatever the handler returns to
# #save_result. A fresh Result (and log grab) is built per handler.
#
# NOTE(review): the referrer reported is the form's own action, not the
# value stored under that action in +referrers+ — confirm this is intended.
#
# @param form [#method, #action, #data] the submission that was sent
# @param response [Object] the response it produced
def handle_form_results(form, response)
  handlers.each do |handler|
    outcome = Result.new(
      :method => form.method,
      :url => form.action,
      :response => response,
      :log => grab_log!,
      :referrer => form.action,
      :data => form.data.inspect,
      :test_name => test_name
    ).freeze
    save_result(handler.handle(outcome))
  end
end
|
#handle_link_results(link, response) ⇒ Object
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
|
# File 'lib/relevance/tarantula/crawler.rb', line 100
# Wraps a crawled link and its response in a frozen Result, runs it through
# each handler and passes whatever the handler returns to #save_result. An
# ordinary error in one handler is logged and does not stop the others.
#
# @param link [#method, #href] the link that was requested
# @param response [Object] the response it produced
def handle_link_results(link, response)
  handlers.each do |h|
    begin
      save_result h.handle(Result.new(:method => link.method,
                                     :url => link.href,
                                     :response => response,
                                     :log => grab_log!,
                                     :referrer => referrers[link],
                                     :test_name => test_name).freeze)
    # Only StandardError is swallowed: rescuing Exception here would also eat
    # Interrupt and SystemExit, so CTRL-C could never reach #crawl's handler.
    rescue StandardError => e
      log "error handling #{link} #{e.message}"
    end
  end
end
|
#links_completed_count ⇒ Object
231
232
233
|
# File 'lib/relevance/tarantula/crawler.rb', line 231
# Number of queued items already processed: everything queued so far minus
# what is still waiting.
def links_completed_count
  queued = total_links_count
  pending = links_remaining_count
  queued - pending
end
|
#links_remaining_count ⇒ Object
227
228
229
|
# File 'lib/relevance/tarantula/crawler.rb', line 227
# Number of links plus form submissions still waiting to be crawled.
def links_remaining_count
  [@links_to_crawl, @forms_to_crawl].inject(0) { |sum, queue| sum + queue.size }
end
|
188
189
190
191
192
193
194
195
196
197
198
199
|
# File 'lib/relevance/tarantula/crawler.rb', line 188
# Runs +form+ through every fuzzer and queues each resulting submission that
# should not be skipped, recording its signature and (when given) the
# referrer under the submission's transformed action.
#
# @param form [Object] raw form, wrapped in Form before fuzzing
# @param referrer [Object, nil] where the form was found
def queue_form(form, referrer = nil)
  fuzzers.each do |fuzzer|
    fuzzer.mutate(Form.new(form)).each do |fs|
      fs.action = transform_url(fs.action)
      # Skip only this submission. A `return` here would abandon every
      # remaining mutation and fuzzer as soon as one submission is skipped.
      next if should_skip_form_submission?(fs)
      @referrers[fs.action] = referrer if referrer
      @forms_to_crawl << fs
      @form_signatures_queued << fs.signature
    end
  end
end
|
#queue_link(dest, referrer = nil) ⇒ Object
178
179
180
181
182
183
184
185
186
|
# File 'lib/relevance/tarantula/crawler.rb', line 178
# Wraps +dest+ in a Link, normalises its href and, unless it should be
# skipped, appends it to the crawl queue and to the collection of queued links.
#
# @param dest [Object] link target, passed to Link.new
# @param referrer [Object, nil] where the link was found
# @return [Link, nil] the queued link, or nil when it was skipped
def queue_link(dest, referrer = nil)
  link = Link.new(dest)
  link.href = transform_url(link.href)
  return if should_skip_link?(link)
  @referrers[link] = referrer if referrer
  [@links_to_crawl, @links_queued].each { |collection| collection << link }
  link
end
|
#report_dir ⇒ Object
201
202
203
|
# File 'lib/relevance/tarantula/crawler.rb', line 201
# Path of the tmp/tarantula directory under the Rails root.
#
# @return [String]
def report_dir
  File.join(rails_root, *%w[tmp tarantula])
end
|
#report_results ⇒ Object
219
220
221
|
# File 'lib/relevance/tarantula/crawler.rb', line 219
# Reports the crawl's results by delegating to #generate_reports.
def report_results; generate_reports; end
|
#save_result(result) ⇒ Object
94
95
96
97
98
|
# File 'lib/relevance/tarantula/crawler.rb', line 94
# Hands +result+ to every configured reporter.
#
# @param result [Object] the handled result to report
def save_result(result)
  reporters.each { |sink| sink.report(result) }
end
|
165
166
167
|
# File 'lib/relevance/tarantula/crawler.rb', line 165
# A submission is skipped when its action URL is skippable or when a
# submission with the same signature has already been queued.
#
# @param fs [#action, #signature] candidate submission
def should_skip_form_submission?(fs)
  url_verdict = should_skip_url?(fs.action)
  return url_verdict if url_verdict
  @form_signatures_queued.member?(fs.signature)
end
|
#should_skip_link?(link) ⇒ Boolean
161
162
163
|
# File 'lib/relevance/tarantula/crawler.rb', line 161
# A link is skipped when its href is skippable or when the link has already
# been queued.
#
# @param link [#href] candidate link
def should_skip_link?(link)
  url_verdict = should_skip_url?(link.href)
  return url_verdict if url_verdict
  @links_queued.member?(link)
end
|
#should_skip_url?(url) ⇒ Boolean
149
150
151
152
153
154
155
156
157
158
159
|
# File 'lib/relevance/tarantula/crawler.rb', line 149
# Decides whether +url+ should be left out of the crawl: blank URLs, URLs
# matching any skip pattern, and URLs longer than +max_url_length+ are
# skipped (the last two are logged).
#
# @param url [String, nil] candidate URL
# @return [true, nil] true when the URL is skipped, nil otherwise
def should_skip_url?(url)
  return true if url.blank?
  matches_skip_pattern = @skip_uri_patterns.any? { |pattern| pattern =~ url }
  if matches_skip_pattern
    log "Skipping #{url}"
    true
  elsif url.length > max_url_length
    log "Skipping long url #{url}"
    true
  end
end
|
#total_links_count ⇒ Object
223
224
225
|
# File 'lib/relevance/tarantula/crawler.rb', line 223
# Number of links plus form signatures queued so far.
def total_links_count
  link_total = @links_queued.size
  form_total = @form_signatures_queued.size
  link_total + form_total
end
|
169
170
171
172
173
174
175
176
|
# File 'lib/relevance/tarantula/crawler.rb', line 169
# Decodes +url+ with the entity decoder, then threads it through every
# transform pattern in order (each pattern is invoked with +[]+).
#
# @param url [String, nil] raw URL
# @return [String, nil] the transformed URL, or nil when +url+ is nil/false
def transform_url(url)
  return unless url
  @transform_url_patterns.inject(@decoder.decode(url)) do |current, pattern|
    pattern[current]
  end
end
|