Class: Relevance::Tarantula::Crawler

Inherits:
Object
Extended by:
Forwardable
Includes:
Relevance::Tarantula
Defined in:
lib/relevance/tarantula/crawler.rb

Defined Under Namespace

Classes: CrawlTimeout

Constant Summary

Constants included from Relevance::Tarantula

VERSION

Instance Attribute Summary

Instance Method Summary

Methods included from Relevance::Tarantula

#log, #rails_root, #tarantula_home, #verbose

Constructor Details

#initialize ⇒ Crawler

Returns a new instance of Crawler.



# File 'lib/relevance/tarantula/crawler.rb', line 22

def initialize
  @max_url_length = 1024
  @successes = []
  @failures = []
  @handlers = [@response_code_handler = Result]
  @links_queued = Set.new
  @form_signatures_queued = Set.new
  @crawl_queue = []
  @crawl_start_times, @crawl_end_times = [], []
  @crawl_timeout = 20.minutes
  @referrers = {}
  @skip_uri_patterns = [
    /^javascript/,
    /^mailto/,
    /^http/,
  ]
  self.transform_url_patterns = [
    [/#.*$/, '']
  ]
  @reporters = [Relevance::Tarantula::IOReporter.new($stderr)]
  @decoder = HTMLEntities.new
  @times_to_crawl = 1
  @fuzzers = [Relevance::Tarantula::FormSubmission]

  @stdout_tty = $stdout.tty?
end
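
The defaults set in the constructor can be overridden through the writable attributes before crawling. A minimal sketch using only the attributes documented on this page (the values are illustrative, not recommendations):

require 'relevance/tarantula'

crawler = Relevance::Tarantula::Crawler.new

# Override a few of the defaults established in #initialize (illustrative values).
crawler.times_to_crawl = 2              # crawl the whole app twice
crawler.crawl_timeout  = 10.minutes     # per-pass limit (ActiveSupport duration)
crawler.max_url_length = 512            # skip unusually long URLs
crawler.skip_uri_patterns << /logout/   # never follow logout links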

Dynamic Method Handling

This class handles dynamic methods through the #method_missing method.

#method_missing(meth, *args) ⇒ Object



# File 'lib/relevance/tarantula/crawler.rb', line 49

def method_missing(meth, *args)
  super unless Result::ALLOW_NNN_FOR =~ meth.to_s
  @response_code_handler.send(meth, *args)
end
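
Result::ALLOW_NNN_FOR matches method names of the form allow_<status>_for, and those calls are forwarded to the response code handler. A hedged sketch, building on the crawler constructed in the #initialize example above (the exact method name is assumed from that pattern, not shown on this page):

# Delegated to the response code handler via #method_missing:
# treat 404s for matching URLs as expected rather than as failures.
# (Method name assumed from Result::ALLOW_NNN_FOR.)
crawler.allow_404_for %r{/users/\d+/archived}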

Instance Attribute Details

#crawl_end_times ⇒ Object (readonly)

Returns the value of attribute crawl_end_times.



# File 'lib/relevance/tarantula/crawler.rb', line 20

def crawl_end_times
  @crawl_end_times
end

#crawl_queue ⇒ Object

Returns the value of attribute crawl_queue.



# File 'lib/relevance/tarantula/crawler.rb', line 16

def crawl_queue
  @crawl_queue
end

#crawl_start_times ⇒ Object (readonly)

Returns the value of attribute crawl_start_times.



# File 'lib/relevance/tarantula/crawler.rb', line 20

def crawl_start_times
  @crawl_start_times
end

#crawl_timeout ⇒ Object

Returns the value of attribute crawl_timeout.



# File 'lib/relevance/tarantula/crawler.rb', line 16

def crawl_timeout
  @crawl_timeout
end

#failures ⇒ Object (readonly)

Returns the value of attribute failures.



# File 'lib/relevance/tarantula/crawler.rb', line 20

def failures
  @failures
end

#form_signatures_queued ⇒ Object

Returns the value of attribute form_signatures_queued.



# File 'lib/relevance/tarantula/crawler.rb', line 16

def form_signatures_queued
  @form_signatures_queued
end

#fuzzers ⇒ Object

Returns the value of attribute fuzzers.



# File 'lib/relevance/tarantula/crawler.rb', line 16

def fuzzers
  @fuzzers
end

#handlers ⇒ Object

Returns the value of attribute handlers.



# File 'lib/relevance/tarantula/crawler.rb', line 16

def handlers
  @handlers
end

#links_queued ⇒ Object

Returns the value of attribute links_queued.



# File 'lib/relevance/tarantula/crawler.rb', line 16

def links_queued
  @links_queued
end

#log_grabber ⇒ Object

Returns the value of attribute log_grabber.



# File 'lib/relevance/tarantula/crawler.rb', line 16

def log_grabber
  @log_grabber
end

#max_url_length ⇒ Object

Returns the value of attribute max_url_length.



# File 'lib/relevance/tarantula/crawler.rb', line 16

def max_url_length
  @max_url_length
end

#proxy ⇒ Object

Returns the value of attribute proxy.



# File 'lib/relevance/tarantula/crawler.rb', line 16

def proxy
  @proxy
end

#referrers ⇒ Object (readonly)

Returns the value of attribute referrers.



# File 'lib/relevance/tarantula/crawler.rb', line 20

def referrers
  @referrers
end

#reporters ⇒ Object

Returns the value of attribute reporters.



# File 'lib/relevance/tarantula/crawler.rb', line 16

def reporters
  @reporters
end

#response_code_handler ⇒ Object

Returns the value of attribute response_code_handler.



# File 'lib/relevance/tarantula/crawler.rb', line 16

def response_code_handler
  @response_code_handler
end

#skip_uri_patterns ⇒ Object

Returns the value of attribute skip_uri_patterns.



# File 'lib/relevance/tarantula/crawler.rb', line 16

def skip_uri_patterns
  @skip_uri_patterns
end

#successes ⇒ Object (readonly)

Returns the value of attribute successes.



# File 'lib/relevance/tarantula/crawler.rb', line 20

def successes
  @successes
end

#test_name ⇒ Object

Returns the value of attribute test_name.



# File 'lib/relevance/tarantula/crawler.rb', line 16

def test_name
  @test_name
end

#times_to_crawl ⇒ Object

Returns the value of attribute times_to_crawl.



# File 'lib/relevance/tarantula/crawler.rb', line 16

def times_to_crawl
  @times_to_crawl
end

#transform_url_patterns ⇒ Object

Returns the value of attribute transform_url_patterns.



# File 'lib/relevance/tarantula/crawler.rb', line 20

def transform_url_patterns
  @transform_url_patterns
end

Instance Method Details

#append_to_queue(request) ⇒ Object



# File 'lib/relevance/tarantula/crawler.rb', line 211

def append_to_queue(request)
  @crawl_queue.insert(index_to_insert(request), request)
end

#blip(number = 0) ⇒ Object



# File 'lib/relevance/tarantula/crawler.rb', line 264

def blip(number = 0)
  unless verbose
    print "\r #{links_completed_count} of #{total_links_count} links completed               " if @stdout_tty
    timeout_if_too_long(number)
  end
end

#crawl(url = "/") ⇒ Object



# File 'lib/relevance/tarantula/crawler.rb', line 60

def crawl(url = "/")
  orig_links_queued = @links_queued.dup
  orig_form_signatures_queued = @form_signatures_queued.dup
  orig_crawl_queue = @crawl_queue.dup
  @times_to_crawl.times do |num|
    queue_link url

    begin
      do_crawl num
    rescue CrawlTimeout => e
      puts
      puts e.message
    end

    puts "#{ActiveSupport::Inflector.ordinalize((num+1))} crawl" if @times_to_crawl > 1

    if num + 1 < @times_to_crawl
      @links_queued = orig_links_queued
      @form_signatures_queued = orig_form_signatures_queued
      @crawl_queue = orig_crawl_queue
      @referrers = {}
    end
  end
rescue Interrupt
  $stderr.puts "CTRL-C"
ensure
  report_results
end
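
crawl is the main entry point. A sketch of driving it: the proxy only needs to respond to get/post/put/delete the way a Rails integration session does (see #follow and #submit); integration_session below is a hypothetical stand-in for whatever supplies that interface in your test setup.

require 'relevance/tarantula'

crawler = Relevance::Tarantula::Crawler.new
crawler.proxy          = integration_session   # hypothetical: any object responding to get/post/put/delete
crawler.times_to_crawl = 2
crawler.crawl "/"                              # start from the root URL; reports run from the ensure block even on CTRL-C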

#crawl_the_queue(number = 0) ⇒ Object



# File 'lib/relevance/tarantula/crawler.rb', line 101

def crawl_the_queue(number = 0)
  while (request = @crawl_queue.shift)
    request.crawl
    blip(number)
  end
end

#do_crawl(number) ⇒ Object



# File 'lib/relevance/tarantula/crawler.rb', line 93

def do_crawl(number)
  while (!finished?)
    @crawl_start_times << Time.now
    crawl_the_queue(number)
    @crawl_end_times << Time.now
  end
end

#elasped_time_for_pass(num) ⇒ Object



# File 'lib/relevance/tarantula/crawler.rb', line 133

def elasped_time_for_pass(num)
  Time.now - crawl_start_times[num]
end

#finished? ⇒ Boolean

Returns:

  • (Boolean)


# File 'lib/relevance/tarantula/crawler.rb', line 89

def finished?
  @crawl_queue.empty?
end

#follow(method, url, data = nil) ⇒ Object



# File 'lib/relevance/tarantula/crawler.rb', line 125

def follow(method, url, data=nil)
  proxy.send(method, url, data)
end

#generate_reports ⇒ Object



# File 'lib/relevance/tarantula/crawler.rb', line 233

def generate_reports
  errors = []
  reporters.each do |reporter|
    begin
      reporter.finish_report(test_name)
    rescue RuntimeError => e
      errors << e
    end
  end
  unless errors.empty?
    raise errors.map(&:message).join("\n")
  end
end
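
Reporters receive each Result via #report (see #save_result) and write their output in #finish_report. A sketch of adding an HTML report alongside the default IOReporter, building on the crawler from the #initialize example; HtmlReporter is assumed from the wider Tarantula gem and is not documented on this page:

crawler.test_name = "full site crawl"
crawler.reporters << Relevance::Tarantula::HtmlReporter.new(crawler.report_dir)  # assumed reporter class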

#grab_log! ⇒ Object



# File 'lib/relevance/tarantula/crawler.rb', line 137

def grab_log!
  @log_grabber && @log_grabber.grab!
end

#handle_form_results(form, response) ⇒ Object



# File 'lib/relevance/tarantula/crawler.rb', line 149

def handle_form_results(form, response)
  handlers.each do |h|
    save_result h.handle(Result.new(:method => form.meth,
                                    :url => form.action,
                                    :response => response,
                                    :log => grab_log!,
                                    :referrer => form.action,
                                    :data => form.data.inspect,
                                    :test_name => test_name).freeze)
  end
end


#handle_link_results(link, result) ⇒ Object

# File 'lib/relevance/tarantula/crawler.rb', line 114

def handle_link_results(link, result)
  handlers.each do |h|
    begin
      save_result h.handle(result)
    rescue Exception => e
      log "error handling #{link} #{e.message}"
      # TODO: pass to results
    end
  end
end

#index_to_insert(request) ⇒ Object

Appends GET requests before other requests, DELETE requests at the end of the queue, and all other requests just before the first DELETE request.



# File 'lib/relevance/tarantula/crawler.rb', line 217

def index_to_insert(request)
  case request.meth
  when 'get'
    last_get = @crawl_queue.rindex { |r| r.meth == 'get' } || -1
    last_get + 1
  when 'delete'
    @crawl_queue.index {|r| r.meth == 'delete' && request.url.start_with?(r.url) } || -1
  else
    @crawl_queue.index {|r| r.meth == 'delete' } || -1
  end
end
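
A small illustration of the resulting queue order, using a hypothetical Struct in place of real Link/FormSubmission objects (only #meth and #url are needed here):

require 'relevance/tarantula'

Req = Struct.new(:meth, :url)                            # hypothetical stand-in for queued requests
crawler = Relevance::Tarantula::Crawler.new
crawler.append_to_queue Req.new('get',    '/posts')      # GETs group at the front
crawler.append_to_queue Req.new('delete', '/posts/1')    # DELETEs sink to the back
crawler.append_to_queue Req.new('post',   '/posts')      # others land before the first DELETE
crawler.crawl_queue.map(&:meth)                          # => ["get", "post", "delete"]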


#links_completed_count ⇒ Object

# File 'lib/relevance/tarantula/crawler.rb', line 260

def links_completed_count
  total_links_count - links_remaining_count
end


#links_remaining_count ⇒ Object

# File 'lib/relevance/tarantula/crawler.rb', line 256

def links_remaining_count
  @crawl_queue.size
end

#make_result(options) ⇒ Object



# File 'lib/relevance/tarantula/crawler.rb', line 141

def make_result(options)
  defaults = {
    :log       => grab_log!,
    :test_name => test_name
  }
  Result.new(defaults.merge(options)).freeze
end

#queue_form(form, referrer = nil) ⇒ Object



# File 'lib/relevance/tarantula/crawler.rb', line 198

def queue_form(form, referrer = nil)
  fuzzers.each do |fuzzer|
    fuzzer.mutate(Form.new(form, self, referrer)).each do |fs|
      # fs = fuzzer.new(Form.new(form, self, referrer))
      fs.action = transform_url(fs.action)
      return if should_skip_form_submission?(fs)
      @referrers[fs.action] = referrer if referrer
      append_to_queue(fs)
      @form_signatures_queued << fs.signature
    end
  end
end
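
Each entry in #fuzzers must respond to mutate(form) and return form submission objects; FormSubmission is the default installed by #initialize. A sketch of adding a second fuzzer to the crawler from the #initialize example (AttackFormSubmission is assumed from the wider gem, not documented on this page):

crawler.fuzzers << Relevance::Tarantula::AttackFormSubmission  # assumed fuzzer class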


#queue_link(dest, referrer = nil) ⇒ Object

# File 'lib/relevance/tarantula/crawler.rb', line 190

def queue_link(dest, referrer = nil)
  dest = Link.new(dest, self, referrer)
  return if should_skip_link?(dest)
  append_to_queue(dest)
  @links_queued << dest
  dest
end

#report_dir ⇒ Object



# File 'lib/relevance/tarantula/crawler.rb', line 229

def report_dir
  File.join(rails_root, "tmp", "tarantula")
end

#report_results ⇒ Object



# File 'lib/relevance/tarantula/crawler.rb', line 247

def report_results
  puts "Crawled #{total_links_count} links and forms."
  generate_reports
end

#save_result(result) ⇒ Object



# File 'lib/relevance/tarantula/crawler.rb', line 108

def save_result(result)
  reporters.each do |reporter|
    reporter.report(result)
  end
end

#should_skip_form_submission?(fs) ⇒ Boolean

Returns:

  • (Boolean)


# File 'lib/relevance/tarantula/crawler.rb', line 177

def should_skip_form_submission?(fs)
  should_skip_url?(fs.action) || @form_signatures_queued.member?(fs.signature)
end

#should_skip_link?(link) ⇒ Boolean

Returns:

  • (Boolean)


# File 'lib/relevance/tarantula/crawler.rb', line 173

def should_skip_link?(link)
  should_skip_url?(link.href) || @links_queued.member?(link)
end

#should_skip_url?(url) ⇒ Boolean

Returns:

  • (Boolean)


# File 'lib/relevance/tarantula/crawler.rb', line 161

def should_skip_url?(url)
  return true if url.blank?
  if @skip_uri_patterns.any? {|pattern| pattern =~ url}
    log "Skipping #{url}"
    return true
  end
  if url.length > max_url_length
    log "Skipping long url #{url}"
    return true
  end
end
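
A sketch of tightening the skip rules on the crawler from the #initialize example; the URL and pattern below are illustrative:

crawler.skip_uri_patterns << %r{/admin}    # never crawl the admin area
crawler.max_url_length = 512
crawler.should_skip_url?("/admin/users")   # => true (and logs "Skipping /admin/users")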

#submit(method, action, data) ⇒ Object



# File 'lib/relevance/tarantula/crawler.rb', line 129

def submit(method, action, data)
  proxy.send(method, action, data)
end

#timeout_if_too_long(number = 0) ⇒ Object



# File 'lib/relevance/tarantula/crawler.rb', line 271

def timeout_if_too_long(number = 0)
  if elasped_time_for_pass(number) > crawl_timeout
    raise CrawlTimeout, "Exceeded crawl timeout of #{crawl_timeout} seconds - skipping to the next crawl..."
  end
end


#total_links_count ⇒ Object

# File 'lib/relevance/tarantula/crawler.rb', line 252

def total_links_count
  @links_queued.size + @form_signatures_queued.size
end

#transform_url(url) ⇒ Object



# File 'lib/relevance/tarantula/crawler.rb', line 181

def transform_url(url)
  return unless url
  url = @decoder.decode(url)
  @transform_url_patterns.each do |pattern|
    url = pattern[url]
  end
  url
end
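
A sketch of adding a second transform to the crawler from the #initialize example, assuming the transform_url_patterns= writer accepts [pattern, replacement] pairs the same way #initialize does:

crawler.transform_url_patterns = [
  [/#.*$/, ''],    # default: strip fragments
  [/\?.*$/, '']    # also strip query strings before queueing
]
crawler.transform_url("/posts?page=2#comments")  # => "/posts"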