Class: Relevance::Tarantula::Crawler

Inherits:
Object
  • Object
show all
Extended by:
Forwardable
Includes:
Relevance::Tarantula
Defined in:
lib/relevance/tarantula/crawler.rb

Constant Summary

Constants included from Relevance::Tarantula

VERSION

Instance Attribute Summary collapse

Instance Method Summary collapse

Methods included from Relevance::Tarantula

#log, #rails_root, #tarantula_home, #verbose

Constructor Details

#initialize ⇒ Crawler

Returns a new instance of Crawler.



14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
# File 'lib/relevance/tarantula/crawler.rb', line 14

# Sets up a crawler with its default configuration.
#
# Defaults established here:
# * @max_url_length is 1024 and @times_to_crawl is 1.
# * The success/failure lists, the link and form queues, the "already
#   queued" sets and the referrer map all start out empty.
# * Result is both the response code handler and the only handler.
# * URLs starting with "javascript", "mailto" or "http" are skip patterns.
# * One URL transform pattern replaces everything from "#" onward with ''.
# * A single IOReporter writing to $stderr is the only reporter.
# * FormSubmission is the only fuzzer.
def initialize
  @max_url_length = 1024
  @successes, @failures = [], []

  # Result doubles as the response code handler and as the sole default handler.
  @response_code_handler = Result
  @handlers = [@response_code_handler]

  # "Already seen" sets and their matching work queues.
  @links_queued, @form_signatures_queued = Set.new, Set.new
  @links_to_crawl, @forms_to_crawl = [], []
  @referrers = {}

  @skip_uri_patterns = [/^javascript/, /^mailto/, /^http/]

  # Assigned through the writer method (not visible in this listing) rather
  # than directly to the instance variable.
  self.transform_url_patterns = [[/#.*$/, '']]

  @reporters = [Relevance::Tarantula::IOReporter.new($stderr)]
  @decoder = HTMLEntities.new
  @times_to_crawl = 1
  @fuzzers = [Relevance::Tarantula::FormSubmission]
end

Dynamic Method Handling

This class handles dynamic methods through the method_missing method

#method_missing(meth, *args) ⇒ Object



38
39
40
41
# File 'lib/relevance/tarantula/crawler.rb', line 38

# Forwards dynamically named calls to the response code handler.
#
# A call whose name matches Result::ALLOW_NNN_FOR is sent, together with its
# arguments, to @response_code_handler and that call's result is returned.
# Any other unknown method falls through to the default method_missing,
# which raises NoMethodError.
#
# meth - Symbol name of the missing method.
# args - arguments given to the missing method.
def method_missing(meth, *args)
  return super unless Result::ALLOW_NNN_FOR =~ meth.to_s
  @response_code_handler.send(meth, *args)
end

# Keeps respond_to? and Object#method consistent with the delegation done in
# method_missing: names matching Result::ALLOW_NNN_FOR are reported as
# supported, everything else defers to the default answer.
#
# NOTE(review): this only checks the name pattern; it assumes the response
# code handler accepts every matching name -- confirm against Result.
def respond_to_missing?(meth, include_private = false)
  (Result::ALLOW_NNN_FOR =~ meth.to_s) ? true : super
end

Instance Attribute Details

#failures ⇒ Object (readonly)

Returns the value of attribute failures.



12
13
14
# File 'lib/relevance/tarantula/crawler.rb', line 12

# Reader for @failures, which #initialize sets to an empty Array.
# NOTE(review): presumably filled with failed results elsewhere; nothing in
# the visible methods appends to it.
def failures
  @failures
end

#form_signatures_queued ⇒ Object

Returns the value of attribute form_signatures_queued.



8
9
10
# File 'lib/relevance/tarantula/crawler.rb', line 8

# Reader for @form_signatures_queued: a Set (empty after #initialize) to which
# #queue_form adds the signature of every form submission it queues, and which
# #should_skip_form_submission? consults to avoid queueing duplicates.
def form_signatures_queued
  @form_signatures_queued
end

#forms_to_crawl ⇒ Object

Returns the value of attribute forms_to_crawl.



8
9
10
# File 'lib/relevance/tarantula/crawler.rb', line 8

# Reader for @forms_to_crawl: the Array of pending form submissions.
# #queue_form appends to it and #crawl_queued_forms pops from it.
def forms_to_crawl
  @forms_to_crawl
end

#fuzzers ⇒ Object

Returns the value of attribute fuzzers.



8
9
10
# File 'lib/relevance/tarantula/crawler.rb', line 8

# Reader for @fuzzers, which #initialize sets to
# [Relevance::Tarantula::FormSubmission]. #queue_form calls +mutate+ on each
# entry to produce the form submissions to queue.
def fuzzers
  @fuzzers
end

#handlers ⇒ Object

Returns the value of attribute handlers.



8
9
10
# File 'lib/relevance/tarantula/crawler.rb', line 8

# Reader for @handlers, which #initialize sets to [Result]. Every entry has
# +handle+ called on it with each Result built by #handle_link_results and
# #handle_form_results.
def handlers
  @handlers
end

#links_queued ⇒ Object

Returns the value of attribute links_queued.



8
9
10
# File 'lib/relevance/tarantula/crawler.rb', line 8

# Reader for @links_queued: a Set (empty after #initialize) of every link
# #queue_link has accepted; #should_skip_link? checks it to avoid re-queueing.
def links_queued
  @links_queued
end

#links_to_crawl ⇒ Object

Returns the value of attribute links_to_crawl.



8
9
10
# File 'lib/relevance/tarantula/crawler.rb', line 8

# Reader for @links_to_crawl: the Array of pending links. #queue_link appends
# to it and #crawl_queued_links pops from it.
def links_to_crawl
  @links_to_crawl
end

#log_grabber ⇒ Object

Returns the value of attribute log_grabber.



8
9
10
# File 'lib/relevance/tarantula/crawler.rb', line 8

# Reader for @log_grabber. It is not assigned in #initialize, so it is nil
# until a caller sets it; when set, #grab_log! calls +grab!+ on it.
def log_grabber
  @log_grabber
end

#max_url_length ⇒ Object

Returns the value of attribute max_url_length.



8
9
10
# File 'lib/relevance/tarantula/crawler.rb', line 8

# Reader for @max_url_length (1024 after #initialize). #should_skip_url? skips
# any URL whose length is greater than this value.
def max_url_length
  @max_url_length
end

#proxy ⇒ Object

Returns the value of attribute proxy.



8
9
10
# File 'lib/relevance/tarantula/crawler.rb', line 8

# Reader for @proxy, the object requests are dispatched to: #crawl_queued_links
# and #crawl_form +send+ it the link's / form's method name with the target
# URL (and form data). Not assigned in #initialize, so nil until set.
def proxy
  @proxy
end

#referrers ⇒ Object (readonly)

Returns the value of attribute referrers.



12
13
14
# File 'lib/relevance/tarantula/crawler.rb', line 12

# Reader for @referrers: a Hash (empty after #initialize, and reset between
# repeated crawls) in which #queue_link stores referrers keyed by link and
# #queue_form stores them keyed by form action. #handle_link_results reads it.
def referrers
  @referrers
end

#reporters ⇒ Object

Returns the value of attribute reporters.



8
9
10
# File 'lib/relevance/tarantula/crawler.rb', line 8

# Reader for @reporters, which #initialize sets to a single IOReporter writing
# to $stderr. #save_result calls +report+ and #generate_reports calls
# +finish_report+ on each entry.
def reporters
  @reporters
end

#response_code_handler ⇒ Object

Returns the value of attribute response_code_handler.



8
9
10
# File 'lib/relevance/tarantula/crawler.rb', line 8

# Reader for @response_code_handler (Result after #initialize); it is the
# target that #method_missing forwards Result::ALLOW_NNN_FOR-style calls to.
def response_code_handler
  @response_code_handler
end

#skip_uri_patterns ⇒ Object

Returns the value of attribute skip_uri_patterns.



8
9
10
# File 'lib/relevance/tarantula/crawler.rb', line 8

# Reader for @skip_uri_patterns: the patterns #should_skip_url? matches URLs
# against. After #initialize these are /^javascript/, /^mailto/ and /^http/.
def skip_uri_patterns
  @skip_uri_patterns
end

#successes ⇒ Object (readonly)

Returns the value of attribute successes.



12
13
14
# File 'lib/relevance/tarantula/crawler.rb', line 12

# Reader for @successes, which #initialize sets to an empty Array.
# NOTE(review): presumably filled with successful results elsewhere; nothing
# in the visible methods appends to it.
def successes
  @successes
end

#test_name ⇒ Object

Returns the value of attribute test_name.



8
9
10
# File 'lib/relevance/tarantula/crawler.rb', line 8

# Reader for @test_name. Not assigned in #initialize, so nil until set. Its
# value is attached to every Result (:test_name) and passed to each
# reporter's +finish_report+ by #generate_reports.
def test_name
  @test_name
end

#times_to_crawl ⇒ Object

Returns the value of attribute times_to_crawl.



8
9
10
# File 'lib/relevance/tarantula/crawler.rb', line 8

# Reader for @times_to_crawl (1 after #initialize): the number of passes
# #crawl makes over the site.
def times_to_crawl
  @times_to_crawl
end

#transform_url_patterns ⇒ Object

Returns the value of attribute transform_url_patterns.



12
13
14
# File 'lib/relevance/tarantula/crawler.rb', line 12

# Reader for @transform_url_patterns. #transform_url applies each entry to a
# URL via +[]+. #initialize assigns it through the transform_url_patterns=
# writer, which is not visible here, so the stored element type may differ
# from the [regexp, replacement] pairs passed in -- verify against the writer.
def transform_url_patterns
  @transform_url_patterns
end

Instance Method Details

#blip ⇒ Object



235
236
237
238
239
# File 'lib/relevance/tarantula/crawler.rb', line 235

# Prints a one-line progress indicator ("N of M links completed") to standard
# output. The leading "\r" returns to the start of the line so each call
# overwrites the previous one. Nothing is printed when +verbose+ is truthy.
def blip
  return if verbose
  print "\r #{links_completed_count} of #{total_links_count} links completed               "
end

#crawl(url = "/") ⇒ Object



49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
# File 'lib/relevance/tarantula/crawler.rb', line 49

# Crawls the application starting from +url+, repeating the entire crawl
# @times_to_crawl times, and always finishes by calling #report_results.
#
# Before the first pass the current queues and "already queued" sets are
# snapshotted. After every pass except the last, they are restored from that
# snapshot and the referrer map is cleared, so each pass starts from the same
# state. When more than one pass is configured, a line such as "1st crawl" is
# printed after each pass.
#
# An Interrupt (CTRL-C) stops the crawl, prints "CTRL-C" to $stderr and still
# produces the report.
#
# url - the URL to queue at the start of each pass (default "/").
def crawl(url = "/")
  orig_links_queued = @links_queued.dup
  orig_form_signatures_queued = @form_signatures_queued.dup
  orig_links_to_crawl = @links_to_crawl.dup
  orig_forms_to_crawl = @forms_to_crawl.dup
  @times_to_crawl.times do |i|
    queue_link url
    do_crawl

    puts "#{(i+1).ordinalize} crawl" if @times_to_crawl > 1

    if i + 1 < @times_to_crawl
      # Restore *copies* of the snapshots. Handing out the snapshot objects
      # themselves would let the next pass mutate them, so a third or later
      # pass would start with the start URL already marked as queued and
      # crawl nothing.
      @links_queued = orig_links_queued.dup
      @form_signatures_queued = orig_form_signatures_queued.dup
      @links_to_crawl = orig_links_to_crawl.dup
      @forms_to_crawl = orig_forms_to_crawl.dup
      @referrers = {}
    end
  end
rescue Interrupt
  $stderr.puts "CTRL-C"
ensure
  report_results
end

#crawl_form(form) ⇒ Object



116
117
118
119
120
121
122
123
# File 'lib/relevance/tarantula/crawler.rb', line 116

# Submits one form through the proxy and returns the response.
#
# The form's method name is sent to +proxy+ with the form's action and data;
# the response code is logged. If the submission raises
# ActiveRecord::RecordNotFound, the failure is logged as presumed-ok and a
# synthetic "404" text/plain Response carrying the exception message is
# returned instead.
#
# form - object responding to +method+, +action+ and +data+.
def crawl_form(form)
  proxy.send(form.method, form.action, form.data).tap do |reply|
    log "Response #{reply.code} for #{form}"
  end
rescue ActiveRecord::RecordNotFound => missing
  log "Skipping #{form.action}, presumed ok that record is missing"
  Relevance::Tarantula::Response.new(:code => "404", :body => missing.message, :content_type => "text/plain")
end

#crawl_queued_forms ⇒ Object



125
126
127
128
129
130
131
# File 'lib/relevance/tarantula/crawler.rb', line 125

# Drains @forms_to_crawl from the end: each form submission is crawled with
# #crawl_form, its response handed to #handle_form_results, and progress is
# shown via #blip. Stops when the queue yields nothing more.
def crawl_queued_forms
  while (pending = @forms_to_crawl.pop)
    handle_form_results(pending, crawl_form(pending))
    blip
  end
end


#crawl_queued_links ⇒ Object



85
86
87
88
89
90
91
92
# File 'lib/relevance/tarantula/crawler.rb', line 85

# Drains @links_to_crawl from the end: each link's method name is sent to
# +proxy+ with the link's href, the response code is logged, the response is
# passed to #handle_link_results, and progress is shown via #blip.
def crawl_queued_links
  while (target = @links_to_crawl.pop)
    reply = proxy.send(target.method, target.href)
    log "Response #{reply.code} for #{target}"
    handle_link_results(target, reply)
    blip
  end
end

#do_crawl ⇒ Object



78
79
80
81
82
83
# File 'lib/relevance/tarantula/crawler.rb', line 78

# Alternates between draining the link queue and the form queue until
# #finished? reports that both are empty. Handling results may queue new
# work, hence the outer loop.
def do_crawl
  until finished?
    crawl_queued_links
    crawl_queued_forms
  end
end

#finished? ⇒ Boolean

Returns:

  • (Boolean)


74
75
76
# File 'lib/relevance/tarantula/crawler.rb', line 74

# True when there is no queued work left, i.e. both @links_to_crawl and
# @forms_to_crawl are empty.
def finished?
  [@links_to_crawl, @forms_to_crawl].all? { |queue| queue.empty? }
end

#generate_reports ⇒ Object



205
206
207
208
209
210
211
212
213
214
215
216
217
# File 'lib/relevance/tarantula/crawler.rb', line 205

# Asks every reporter to finish its report for +test_name+.
#
# A RuntimeError from one reporter does not stop the others from finishing:
# such errors are collected and, once all reporters have run, re-raised as a
# single RuntimeError whose message joins the individual messages with
# newlines. Returns nil when no reporter failed.
def generate_reports
  collected = reporters.map do |reporter|
    begin
      reporter.finish_report(test_name)
      nil
    rescue RuntimeError => problem
      problem
    end
  end.compact

  raise collected.map(&:message).join("\n") unless collected.empty?
end

#grab_log! ⇒ Object



133
134
135
# File 'lib/relevance/tarantula/crawler.rb', line 133

# Returns the result of calling +grab!+ on @log_grabber when one is set;
# otherwise returns the (falsy) value of @log_grabber itself.
def grab_log!
  grabber = @log_grabber
  return grabber unless grabber
  grabber.grab!
end

#handle_form_results(form, response) ⇒ Object



137
138
139
140
141
142
143
144
145
146
147
# File 'lib/relevance/tarantula/crawler.rb', line 137

# Builds a frozen Result for a form submission and runs it through every
# handler, passing each handler's return value to #save_result.
#
# A fresh Result is built (and #grab_log! is called) once per handler.
#
# NOTE(review): :referrer is set to the form's own action here, whereas
# #queue_form records a referrer under @referrers[fs.action] -- confirm which
# of the two is intended.
#
# form     - object responding to +method+, +action+ and +data+.
# response - the response obtained for the submission.
def handle_form_results(form, response)
  handlers.each do |handler|
    outcome = Result.new(:method => form.method,
                         :url => form.action,
                         :response => response,
                         :log => grab_log!,
                         :referrer => form.action,
                         :data => form.data.inspect,
                         :test_name => test_name).freeze
    save_result(handler.handle(outcome))
  end
end


#handle_link_results(link, response) ⇒ Object



100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
# File 'lib/relevance/tarantula/crawler.rb', line 100

# Builds a frozen Result for a crawled link and runs it through every
# handler, passing each handler's return value to #save_result.
#
# A failure while building, handling or saving the result for one handler is
# logged and does not prevent the remaining handlers from running. Only
# StandardError is treated this way: Interrupt, SystemExit and other
# non-StandardError exceptions propagate, so the CTRL-C handling in #crawl
# keeps working while results are being processed.
#
# link     - object responding to +method+ and +href+; also the key used to
#            look up its referrer in +referrers+.
# response - the response obtained for the link.
def handle_link_results(link, response)
  handlers.each do |h|
    begin
      save_result h.handle(Result.new(:method => link.method,
                                     :url => link.href,
                                     :response => response,
                                     :log => grab_log!,
                                     :referrer => referrers[link],
                                     :test_name => test_name).freeze)
    rescue StandardError => e
      log "error handling #{link} #{e.message}"
      # TODO: pass to results
    end
  end
end


#links_completed_count ⇒ Object



231
232
233
# File 'lib/relevance/tarantula/crawler.rb', line 231

# Number of queued items already processed: everything queued so far
# (#total_links_count) minus what is still waiting (#links_remaining_count).
def links_completed_count
  queued = total_links_count
  waiting = links_remaining_count
  queued - waiting
end


#links_remaining_count ⇒ Object



227
228
229
# File 'lib/relevance/tarantula/crawler.rb', line 227

# Number of items still waiting to be crawled: the combined size of
# @links_to_crawl and @forms_to_crawl.
def links_remaining_count
  [@links_to_crawl, @forms_to_crawl].inject(0) { |count, queue| count + queue.size }
end

#queue_form(form, referrer = nil) ⇒ Object



188
189
190
191
192
193
194
195
196
197
198
199
# File 'lib/relevance/tarantula/crawler.rb', line 188

# Queues the form submissions that the fuzzers derive from +form+.
#
# Each fuzzer's +mutate+ is called with Form.new(form). For every submission
# it yields, the action is normalised with #transform_url, and, unless
# #should_skip_form_submission? rejects it, the submission is appended to
# @forms_to_crawl and its signature is recorded in @form_signatures_queued.
# When +referrer+ is given it is stored in @referrers under the action.
#
# A skipped submission only skips itself: the remaining submissions, and the
# remaining fuzzers, are still considered.
#
# form     - raw form passed to Form.new.
# referrer - optional referrer recorded for each queued submission.
#
# The return value is not meaningful.
def queue_form(form, referrer = nil)
  fuzzers.each do |fuzzer|
    fuzzer.mutate(Form.new(form)).each do |fs|
      fs.action = transform_url(fs.action)
      # `next`, not `return`: one duplicate or skippable submission must not
      # discard every submission that comes after it.
      next if should_skip_form_submission?(fs)
      @referrers[fs.action] = referrer if referrer
      @forms_to_crawl << fs
      @form_signatures_queued << fs.signature
    end
  end
end


#queue_link(dest, referrer = nil) ⇒ Object



178
179
180
181
182
183
184
185
186
# File 'lib/relevance/tarantula/crawler.rb', line 178

# Wraps +dest+ in a Link, normalises its href with #transform_url and, unless
# #should_skip_link? rejects it, appends it to @links_to_crawl and records it
# in @links_queued. When +referrer+ is given it is stored in @referrers under
# the link.
#
# dest     - value passed to Link.new.
# referrer - optional referrer recorded for the link.
#
# Returns the queued Link, or nil when the link was skipped.
def queue_link(dest, referrer = nil)
  link = Link.new(dest)
  link.href = transform_url(link.href)

  unless should_skip_link?(link)
    @referrers[link] = referrer if referrer
    @links_to_crawl << link
    @links_queued << link
    link
  end
end

#report_dir ⇒ Object



201
202
203
# File 'lib/relevance/tarantula/crawler.rb', line 201

# Path of the report directory: "tmp/tarantula" underneath +rails_root+.
def report_dir
  File.join(rails_root, *%w[tmp tarantula])
end

#report_results ⇒ Object



219
220
221
# File 'lib/relevance/tarantula/crawler.rb', line 219

# Produces the final reports by delegating to #generate_reports. #crawl calls
# this from its +ensure+ clause, so it runs even when the crawl is interrupted
# or fails. Any error raised by #generate_reports propagates to the caller.
def report_results
  generate_reports
end

#save_result(result) ⇒ Object



94
95
96
97
98
# File 'lib/relevance/tarantula/crawler.rb', line 94

# Hands +result+ to every reporter by calling +report+ on each one, in order.
#
# result - the handled result to report.
def save_result(result)
  reporters.each { |sink| sink.report(result) }
end

#should_skip_form_submission?(fs) ⇒ Boolean

Returns:

  • (Boolean)


165
166
167
# File 'lib/relevance/tarantula/crawler.rb', line 165

# Whether a form submission should not be queued: either its action is a URL
# that #should_skip_url? rejects, or its signature is already present in
# @form_signatures_queued.
#
# fs - object responding to +action+ and +signature+.
def should_skip_form_submission?(fs)
  url_verdict = should_skip_url?(fs.action)
  url_verdict || @form_signatures_queued.member?(fs.signature)
end

#should_skip_link?(link) ⇒ Boolean

Returns:

  • (Boolean)


161
162
163
# File 'lib/relevance/tarantula/crawler.rb', line 161

# Whether a link should not be queued: either its href is a URL that
# #should_skip_url? rejects, or the link is already present in @links_queued.
#
# link - object responding to +href+.
def should_skip_link?(link)
  url_verdict = should_skip_url?(link.href)
  url_verdict || @links_queued.member?(link)
end

#should_skip_url?(url) ⇒ Boolean

Returns:

  • (Boolean)


149
150
151
152
153
154
155
156
157
158
159
# File 'lib/relevance/tarantula/crawler.rb', line 149

# Decides whether +url+ must not be crawled.
#
# Returns true when the URL is blank (silently), when it matches one of
# @skip_uri_patterns, or when it is longer than +max_url_length+; the latter
# two cases are logged. Returns nil (falsy) when the URL is acceptable.
#
# url - the URL string to check (may be nil).
def should_skip_url?(url)
  return true if url.blank?

  # Pattern matches take precedence over the length check.
  reason =
    if @skip_uri_patterns.any? { |pattern| pattern =~ url }
      "Skipping #{url}"
    elsif url.length > max_url_length
      "Skipping long url #{url}"
    end
  return unless reason

  log reason
  true
end


#total_links_count ⇒ Object



223
224
225
# File 'lib/relevance/tarantula/crawler.rb', line 223

# Number of items queued so far: the combined size of @links_queued and
# @form_signatures_queued.
def total_links_count
  [@links_queued, @form_signatures_queued].inject(0) { |count, seen| count + seen.size }
end

#transform_url(url) ⇒ Object



169
170
171
172
173
174
175
176
# File 'lib/relevance/tarantula/crawler.rb', line 169

# Normalises a URL before it is queued: the URL is passed through
# @decoder.decode, then each entry of @transform_url_patterns is applied in
# order via +[]+, each receiving the previous entry's output.
#
# url - the URL to normalise.
#
# Returns the transformed URL, or nil when +url+ is nil or false.
def transform_url(url)
  return unless url

  @transform_url_patterns.inject(@decoder.decode(url)) do |current, pattern|
    pattern[current]
  end
end