Class: Relevance::Tarantula::Crawler

Inherits: Object
Extended by: Forwardable
Includes: Relevance::Tarantula
Defined in: lib/relevance/tarantula/crawler.rb

Defined Under Namespace

Classes: CrawlTimeout

Constant Summary

Constants included from Relevance::Tarantula

VERSION

Instance Attribute Summary

Instance Method Summary

Methods included from Relevance::Tarantula

#log, #rails_root, #tarantula_home, #verbose

Constructor Details

#initialize ⇒ Crawler

Returns a new instance of Crawler.



# File 'lib/relevance/tarantula/crawler.rb', line 21

def initialize
  @max_url_length = 1024
  @successes = []
  @failures = []
  @handlers = [@response_code_handler = Result]
  @links_queued = Set.new
  @form_signatures_queued = Set.new
  @crawl_queue = []
  @crawl_start_times, @crawl_end_times = [], []
  @crawl_timeout = 20.minutes
  @referrers = {}
  @skip_uri_patterns = [
    /^javascript/,
    /^mailto/,
    /^http/,
  ]
  self.transform_url_patterns = [
    [/#.*$/, '']
  ]
  @reporters = [Relevance::Tarantula::IOReporter.new($stderr)]
  @decoder = HTMLEntities.new
  @times_to_crawl = 1
  @fuzzers = [Relevance::Tarantula::FormSubmission]

  @stdout_tty = $stdout.tty?
end
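
These defaults are all plain accessors, so a test can override them before starting a crawl. A minimal sketch with illustrative values (in real use the crawler also needs proxy pointed at something that responds to get/post, which Tarantula's test helpers normally arrange):

crawler = Relevance::Tarantula::Crawler.new
crawler.times_to_crawl = 2            # run the whole crawl twice
crawler.crawl_timeout  = 10.minutes   # per-pass budget enforced by #timeout_if_too_long
crawler.max_url_length = 512          # longer URLs are skipped by #should_skip_url?
crawler.crawl("/")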

Dynamic Method Handling

This class handles dynamic methods through the method_missing method

#method_missing(meth, *args) ⇒ Object



# File 'lib/relevance/tarantula/crawler.rb', line 48

def method_missing(meth, *args)
  super unless Result::ALLOW_NNN_FOR =~ meth.to_s
  @response_code_handler.send(meth, *args)
end
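
Because Result is the default response_code_handler, any call whose name matches Result::ALLOW_NNN_FOR (the allow_NNN_for family) is forwarded to it; anything else falls through to super and raises NoMethodError as usual. For example (the URL pattern is illustrative):

crawler.allow_404_for(%r{/users/\d+})   # forwarded to Result via method_missing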

Instance Attribute Details

#crawl_end_times ⇒ Object (readonly)

Returns the value of attribute crawl_end_times.



# File 'lib/relevance/tarantula/crawler.rb', line 19

def crawl_end_times
  @crawl_end_times
end

#crawl_queue ⇒ Object

Returns the value of attribute crawl_queue.



# File 'lib/relevance/tarantula/crawler.rb', line 15

def crawl_queue
  @crawl_queue
end

#crawl_start_times ⇒ Object (readonly)

Returns the value of attribute crawl_start_times.



# File 'lib/relevance/tarantula/crawler.rb', line 19

def crawl_start_times
  @crawl_start_times
end

#crawl_timeout ⇒ Object

Returns the value of attribute crawl_timeout.



# File 'lib/relevance/tarantula/crawler.rb', line 15

def crawl_timeout
  @crawl_timeout
end

#failures ⇒ Object (readonly)

Returns the value of attribute failures.



# File 'lib/relevance/tarantula/crawler.rb', line 19

def failures
  @failures
end

#form_signatures_queued ⇒ Object

Returns the value of attribute form_signatures_queued.



# File 'lib/relevance/tarantula/crawler.rb', line 15

def form_signatures_queued
  @form_signatures_queued
end

#fuzzers ⇒ Object

Returns the value of attribute fuzzers.



# File 'lib/relevance/tarantula/crawler.rb', line 15

def fuzzers
  @fuzzers
end

#handlers ⇒ Object

Returns the value of attribute handlers.



# File 'lib/relevance/tarantula/crawler.rb', line 15

def handlers
  @handlers
end

#links_queued ⇒ Object

Returns the value of attribute links_queued.



# File 'lib/relevance/tarantula/crawler.rb', line 15

def links_queued
  @links_queued
end

#log_grabber ⇒ Object

Returns the value of attribute log_grabber.



# File 'lib/relevance/tarantula/crawler.rb', line 15

def log_grabber
  @log_grabber
end

#max_url_length ⇒ Object

Returns the value of attribute max_url_length.



# File 'lib/relevance/tarantula/crawler.rb', line 15

def max_url_length
  @max_url_length
end

#proxy ⇒ Object

Returns the value of attribute proxy.



# File 'lib/relevance/tarantula/crawler.rb', line 15

def proxy
  @proxy
end

#referrers ⇒ Object (readonly)

Returns the value of attribute referrers.



# File 'lib/relevance/tarantula/crawler.rb', line 19

def referrers
  @referrers
end

#reporters ⇒ Object

Returns the value of attribute reporters.



# File 'lib/relevance/tarantula/crawler.rb', line 15

def reporters
  @reporters
end

#response_code_handler ⇒ Object

Returns the value of attribute response_code_handler.



# File 'lib/relevance/tarantula/crawler.rb', line 15

def response_code_handler
  @response_code_handler
end

#skip_uri_patterns ⇒ Object

Returns the value of attribute skip_uri_patterns.



# File 'lib/relevance/tarantula/crawler.rb', line 15

def skip_uri_patterns
  @skip_uri_patterns
end

#successes ⇒ Object (readonly)

Returns the value of attribute successes.



# File 'lib/relevance/tarantula/crawler.rb', line 19

def successes
  @successes
end

#test_name ⇒ Object

Returns the value of attribute test_name.



# File 'lib/relevance/tarantula/crawler.rb', line 15

def test_name
  @test_name
end

#times_to_crawl ⇒ Object

Returns the value of attribute times_to_crawl.



# File 'lib/relevance/tarantula/crawler.rb', line 15

def times_to_crawl
  @times_to_crawl
end

#transform_url_patterns ⇒ Object

Returns the value of attribute transform_url_patterns.



# File 'lib/relevance/tarantula/crawler.rb', line 19

def transform_url_patterns
  @transform_url_patterns
end

Instance Method Details

#blip(number = 0) ⇒ Object



# File 'lib/relevance/tarantula/crawler.rb', line 245

def blip(number = 0)
  unless verbose
    print "\r #{links_completed_count} of #{total_links_count} links completed               " if @stdout_tty
    timeout_if_too_long(number)
  end
end

#crawl(url = "/") ⇒ Object



# File 'lib/relevance/tarantula/crawler.rb', line 59

def crawl(url = "/")
  orig_links_queued = @links_queued.dup
  orig_form_signatures_queued = @form_signatures_queued.dup
  orig_crawl_queue = @crawl_queue.dup
  @times_to_crawl.times do |num|
    queue_link url

    begin 
      do_crawl num
    rescue CrawlTimeout => e
      puts
      puts e.message
    end

    puts "#{ActiveSupport::Inflector.ordinalize((num+1))} crawl" if @times_to_crawl > 1

    if num + 1 < @times_to_crawl
      @links_queued = orig_links_queued
      @form_signatures_queued = orig_form_signatures_queued
      @crawl_queue = orig_crawl_queue
      @referrers = {}
    end
  end
rescue Interrupt
  $stderr.puts "CTRL-C"
ensure
  report_results
end
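
crawl is usually driven from a Rails integration test. A sketch, assuming Tarantula's tarantula_crawler helper and whichever integration-test base class the Rails version provides:

class TarantulaTest < ActionController::IntegrationTest
  def test_tarantula
    t = tarantula_crawler(self)  # a Crawler whose proxy is this integration session
    t.crawl "/"                  # seed the queue with "/" and crawl until it is empty
  end
end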

#crawl_the_queue(number = 0) ⇒ Object



# File 'lib/relevance/tarantula/crawler.rb', line 100

def crawl_the_queue(number = 0)
  while (request = @crawl_queue.pop)
    request.crawl
    blip(number)
  end
end

#do_crawl(number) ⇒ Object



# File 'lib/relevance/tarantula/crawler.rb', line 92

def do_crawl(number)
  while (!finished?)
    @crawl_start_times << Time.now
    crawl_the_queue(number)
    @crawl_end_times << Time.now
  end
end

#elasped_time_for_pass(num) ⇒ Object



# File 'lib/relevance/tarantula/crawler.rb', line 132

def elasped_time_for_pass(num)
  Time.now - crawl_start_times[num]
end

#finished? ⇒ Boolean

Returns:

  • (Boolean)


# File 'lib/relevance/tarantula/crawler.rb', line 88

def finished?
  @crawl_queue.empty?
end

#follow(method, url, data = nil) ⇒ Object



# File 'lib/relevance/tarantula/crawler.rb', line 124

def follow(method, url, data=nil)
  proxy.send(method, url, data)
end

#generate_reports ⇒ Object



# File 'lib/relevance/tarantula/crawler.rb', line 214

def generate_reports
  errors = []
  reporters.each do |reporter|
    begin
      reporter.finish_report(test_name)
    rescue RuntimeError => e
      errors << e
    end
  end
  unless errors.empty?
    raise errors.map(&:message).join("\n")
  end
end
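
Reporters are pluggable: save_result passes each one report(result), and generate_reports calls finish_report(test_name) on each. An HTML report can be added alongside the default IOReporter; this sketch assumes the gem's HtmlReporter, which takes an output directory:

crawler.reporters << Relevance::Tarantula::HtmlReporter.new(crawler.report_dir)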

#grab_log! ⇒ Object



# File 'lib/relevance/tarantula/crawler.rb', line 136

def grab_log!
  @log_grabber && @log_grabber.grab!
end

#handle_form_results(form, response) ⇒ Object



# File 'lib/relevance/tarantula/crawler.rb', line 148

def handle_form_results(form, response)
  handlers.each do |h|
    save_result h.handle(Result.new(:method => form.method,
                                    :url => form.action,
                                    :response => response,
                                    :log => grab_log!,
                                    :referrer => form.action,
                                    :data => form.data.inspect,
                                    :test_name => test_name).freeze)
  end
end


#handle_link_results(link, result) ⇒ Object

# File 'lib/relevance/tarantula/crawler.rb', line 113

def handle_link_results(link, result)
  handlers.each do |h|
    begin
      save_result h.handle(result)
    rescue Exception => e
      log "error handling #{link} #{e.message}"
      # TODO: pass to results
    end
  end
end


#links_completed_count ⇒ Object

# File 'lib/relevance/tarantula/crawler.rb', line 241

def links_completed_count
  total_links_count - links_remaining_count
end


#links_remaining_count ⇒ Object

# File 'lib/relevance/tarantula/crawler.rb', line 237

def links_remaining_count
  @crawl_queue.size
end

#make_result(options) ⇒ Object



# File 'lib/relevance/tarantula/crawler.rb', line 140

def make_result(options)
  defaults = {
    :log       => grab_log!,
    :test_name => test_name      
  }
  Result.new(defaults.merge(options)).freeze
end

#queue_form(form, referrer = nil) ⇒ Object



# File 'lib/relevance/tarantula/crawler.rb', line 197

def queue_form(form, referrer = nil)
  fuzzers.each do |fuzzer|
    fuzzer.mutate(Form.new(form, self, referrer)).each do |fs|
      # fs = fuzzer.new(Form.new(form, self, referrer))
      fs.action = transform_url(fs.action)
      return if should_skip_form_submission?(fs)
      @referrers[fs.action] = referrer if referrer
      @crawl_queue << fs
      @form_signatures_queued << fs.signature
    end
  end
end
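
A fuzzer only needs to respond to mutate(form) and return the form submissions to enqueue. FormSubmission is the default; the sketch below assumes the gem's attack-oriented fuzzer, Relevance::Tarantula::AttackFormSubmission, so that each form is also submitted with attack payloads:

crawler.fuzzers << Relevance::Tarantula::AttackFormSubmission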


#queue_link(dest, referrer = nil) ⇒ Object

# File 'lib/relevance/tarantula/crawler.rb', line 189

def queue_link(dest, referrer = nil)
  dest = Link.new(dest, self, referrer)
  return if should_skip_link?(dest)
  @crawl_queue << dest
  @links_queued << dest
  dest
end
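
queue_link can also be called directly to seed entry points that no crawled page links to; an illustrative sketch (the /sitemap path is made up):

crawler.queue_link("/sitemap")   # wrapped in a Link; skipped if it matches skip_uri_patterns
crawler.crawl("/")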

#report_dir ⇒ Object



# File 'lib/relevance/tarantula/crawler.rb', line 210

def report_dir
  File.join(rails_root, "tmp", "tarantula")
end

#report_results ⇒ Object



# File 'lib/relevance/tarantula/crawler.rb', line 228

def report_results
  puts "Crawled #{total_links_count} links and forms."
  generate_reports
end

#save_result(result) ⇒ Object



# File 'lib/relevance/tarantula/crawler.rb', line 107

def save_result(result)
  reporters.each do |reporter|
    reporter.report(result)
  end
end

#should_skip_form_submission?(fs) ⇒ Boolean

Returns:

  • (Boolean)


# File 'lib/relevance/tarantula/crawler.rb', line 176

def should_skip_form_submission?(fs)
  should_skip_url?(fs.action) || @form_signatures_queued.member?(fs.signature)
end

#should_skip_link?(link) ⇒ Boolean

Returns:

  • (Boolean)


# File 'lib/relevance/tarantula/crawler.rb', line 172

def should_skip_link?(link)
  should_skip_url?(link.href) || @links_queued.member?(link)
end

#should_skip_url?(url) ⇒ Boolean

Returns:

  • (Boolean)


# File 'lib/relevance/tarantula/crawler.rb', line 160

def should_skip_url?(url)
  return true if url.blank?
  if @skip_uri_patterns.any? {|pattern| pattern =~ url}
    log "Skipping #{url}"
    return true
  end
  if url.length > max_url_length
    log "Skipping long url #{url}"
    return true
  end
end
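
Because skip_uri_patterns is a plain array of regexps, destructive or irrelevant links can be excluded before a run; the patterns below are illustrative:

crawler.skip_uri_patterns << /logout/    # do not sign the test session out mid-crawl
crawler.skip_uri_patterns << /\.pdf$/    # skip binary downloads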

#submit(method, action, data) ⇒ Object



# File 'lib/relevance/tarantula/crawler.rb', line 128

def submit(method, action, data)
  proxy.send(method, action, data)
end

#timeout_if_too_long(number = 0) ⇒ Object



# File 'lib/relevance/tarantula/crawler.rb', line 252

def timeout_if_too_long(number = 0)
  if elasped_time_for_pass(number) > crawl_timeout
    raise CrawlTimeout, "Exceeded crawl timeout of #{crawl_timeout} seconds - skipping to the next crawl..."
  end
end


#total_links_count ⇒ Object

# File 'lib/relevance/tarantula/crawler.rb', line 233

def total_links_count
  @links_queued.size + @form_signatures_queued.size
end

#transform_url(url) ⇒ Object



# File 'lib/relevance/tarantula/crawler.rb', line 180

def transform_url(url)
  return unless url
  url = @decoder.decode(url)
  @transform_url_patterns.each do |pattern|
    url = pattern[url]
  end
  url
end
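
The constructor assigns transform_url_patterns through its writer, and each [regexp, replacement] pair is applied to every URL in turn. A sketch that keeps the default fragment-stripping rule and adds one more (the query-string pattern is illustrative):

crawler.transform_url_patterns = [
  [/#.*$/, ''],    # default rule: drop fragment identifiers
  [/\?\d+$/, '']   # also drop numeric cache-busting query strings
]
crawler.transform_url("/home#top")   # => "/home"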