Class: HttpSpell::Spider

Inherits:
Object
  • Object
show all
Defined in:
lib/httpspell/spider.rb

Instance Attribute Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(starting_point, whitelist: nil, blacklist: [], verbose: false, tracing: false) ⇒ Spider

Returns a new instance of Spider.



11
12
13
14
15
16
17
18
19
# File 'lib/httpspell/spider.rb', line 11

# Builds a spider whose crawl queue is seeded with +starting_point+.
#
# @param starting_point [String] URL to start from; it is parsed with
#   Addressable::URI.parse and pushed onto the +todo+ queue.
# @param whitelist [Array<Regexp>, nil] stored in @whitelist. When nil, a
#   single pattern is used that matches any string beginning with the
#   literal text of +starting_point+.
# @param blacklist [Array] stored as given in @blacklist.
# @param verbose [Boolean] stored in @verbose.
# @param tracing [Boolean] stored in @tracing.
def initialize(starting_point, whitelist: nil, blacklist: [], verbose: false, tracing: false)
  @todo = []
  @done = []
  todo << Addressable::URI.parse(starting_point)
  # The URL is escaped so that characters such as "." or "?" are matched
  # literally. Unescaped, a start URL with a query string would not even match
  # its own default whitelist, and "(" or "[" could raise a RegexpError.
  # \A anchors at the very start of the string rather than at any line start.
  @whitelist = whitelist || [/\A#{Regexp.escape(starting_point.to_s)}/]
  @blacklist = blacklist
  @verbose = verbose
  @tracing = tracing
end

Instance Attribute Details

#done ⇒ Object (readonly)

Returns the value of attribute done.



9
10
11
# File 'lib/httpspell/spider.rb', line 9

# Reader for @done, which the constructor initialises to an empty array and
# to which #start appends each URL once its links have been extracted.
#
# @return [Array] the URLs recorded as processed so far.
def done
  @done
end

#todo ⇒ Object (readonly)

Returns the value of attribute todo.



9
10
11
# File 'lib/httpspell/spider.rb', line 9

# Reader for @todo, the queue of URLs still waiting to be crawled. The
# constructor seeds it with the parsed starting point; #start pops entries
# off the end and appends newly discovered links.
#
# @return [Array] the pending URLs.
def todo
  @todo
end

Instance Method Details

#start ⇒ Object



21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
# File 'lib/httpspell/spider.rb', line 21

# Crawls until the +todo+ queue is empty.
#
# Each URL is popped from the end of +todo+ and passed to +links+. The block
# given to +links+ forwards its two arguments to the caller's block, if any.
# After +links+ returns, the URL is appended to +done+ and every returned link
# that is not already in +done+, in +todo+, or known to have failed is queued,
# at most once each.
#
# An error raised by the caller's block is reported with +warn+ and otherwise
# ignored. An error raised while processing a URL is reported with +warn+,
# the URL is skipped, and the overall result becomes false. Backtraces are
# printed only when @tracing is set.
#
# @yield [u, d] the two values +links+ passes to its block
#   (presumably the URL and its parsed document; confirm against +links+).
# @return [Boolean] true if no URL had to be skipped, false otherwise.
def start
  success = true
  # URLs that raised while being processed. They are kept out of +done+ but
  # must not be queued again, or they would be re-fetched (and fail again)
  # every time another page links to them.
  failed = []

  while todo.any?
    url = todo.pop

    begin
      extracted = links(url) do |u, d|
        yield u, d if block_given?
      rescue StandardError => e
        # A faulty callback must not abort the crawl of this page.
        warn "Callback error for #{url}: #{e}"
        warn e.backtrace if @tracing
      end

      done.append(url)
      # Array#- keeps duplicates of the receiver, so a page linking to the
      # same target twice would otherwise queue that target twice.
      todo.concat((extracted - done - todo - failed).uniq)
    rescue StandardError => e
      warn "Skipping #{url} because of #{e.message}"
      warn e.backtrace if @tracing
      failed << url
      success = false
    end
  end

  success
end