Class: Anemone::Core

Inherits:
Object show all
Includes:
Arachni::UI::Output
Defined in:
lib/anemone/core.rb

Overview

Overrides Anemone’s Core class method skip_link?( link ) to support regexp matching against the whole URL and to enforce redundancy checks. Messages were also added to inform the user in case of redundant URLs.

@author: Tasos “Zapotek” Laskos

<[email protected]>
<[email protected]>

@version: 0.1

Constant Summary collapse

# Default crawl options.
DEFAULT_OPTS = {
  # number of Tentacle threads used to fetch pages
  threads: 4,
  # verbose output is off by default
  verbose: false,
  # keep each page's response body after it has been scanned for links
  discard_page_bodies: false,
  # User-Agent string: Anemone/VERSION
  user_agent: "Anemone/#{Anemone::VERSION}",
  # delay between requests (none by default)
  delay: 0,
  # the robots exclusion protocol is not obeyed by default
  obey_robots_txt: false,
  # crawl depth is unlimited by default
  depth_limit: false,
  # how many times HTTP redirects will be followed
  redirect_limit: 5,
  # storage engine; falls back to Hash in +process_options+ when left nil
  storage: nil,
  # Hash of cookie name => value sent with HTTP requests
  cookies: nil,
  # whether cookies from the server are accepted and sent back
  accept_cookies: false,
  # whether links with a query string (e.g. http://foo.com/?u=user) are skipped
  skip_query_strings: false
}

Instance Attribute Summary collapse

Class Method Summary collapse

Instance Method Summary collapse

Methods included from Arachni::UI::Output

#buffer, #debug!, #debug?, #flush_buffer, #mute!, #muted?, #only_positives!, #only_positives?, #print_debug, #print_debug_backtrace, #print_debug_pp, #print_error, #print_error_backtrace, #print_info, #print_line, #print_ok, #print_status, #print_verbose, #reroute_to_file, #reroute_to_file?, #unmute!, #verbose!, #verbose?

Constructor Details

#initialize(urls, opts = {}) {|_self| ... } ⇒ Core

Initialize the crawl with starting urls (single URL or Array of URLs) and optional block

Yields:

  • (_self)

Yield Parameters:

  • _self (Anemone::Core)

    the object that the method was called on



92
93
94
95
96
97
98
99
100
101
102
103
104
# File 'lib/anemone/core.rb', line 92

# Set up a crawl from one or more starting URLs.
#
# Anything that is not already a URI is converted with +URI()+; URI objects
# are used as given. A starting URL with an empty path gets '/' as its path.
#
# @param urls [String, URI, Array<String, URI>] a single URL or an Array of URLs
# @param opts [Hash] crawl options, stored as given in +@opts+
# @yield [_self] optional configuration block
# @yieldparam _self [Anemone::Core] the object being initialized
def initialize(urls, opts = {})
  @urls = [urls].flatten.map do |candidate|
    candidate.is_a?(URI) ? candidate : URI(candidate)
  end
  @urls.each do |uri|
    uri.path = '/' if uri.path.empty?
  end

  @opts = opts
  @tentacles = []
  @skip_link_patterns = []
  @on_every_page_blocks = []
  @after_crawl_blocks = []
  # every pattern key lazily gets its own Array of blocks
  @on_pages_like_blocks = Hash.new { |registry, pattern| registry[pattern] = [] }

  yield self if block_given?
end

Instance Attribute Details

#opts ⇒ Object (readonly)

Hash of options for the crawl



52
53
54
# File 'lib/anemone/core.rb', line 52

# Hash of options for the crawl.
#
# @return [Hash] the value held in +@opts+
def opts; @opts; end

#pages ⇒ Object (readonly)

PageStore storing all Page objects encountered during the crawl



50
51
52
# File 'lib/anemone/core.rb', line 50

# PageStore storing all Page objects encountered during the crawl.
#
# @return [Object] the value held in +@pages+
def pages; @pages; end

Class Method Details

.crawl(urls, opts = {}) ⇒ Object

Convenience method to start a new crawl



109
110
111
112
113
114
# File 'lib/anemone/core.rb', line 109

# Convenience method to start a new crawl.
#
# Builds a new instance, hands it to the optional block so the caller can
# configure it, then runs it straight away.
#
# @param urls [String, URI, Array] starting URL(s), passed on to +new+
# @param opts [Hash] crawl options, passed on to +new+
# @yield [core] optional configuration block, called before +run+
# @return [Object] whatever +new+ returns (the result of +run+ is not used)
def self.crawl(urls, opts = {})
  new(urls, opts) do |crawler|
    yield crawler if block_given?
    crawler.run
  end
end

Instance Method Details

#after_crawl(&block) ⇒ Object

Add a block to be executed on the PageStore after the crawl is finished



120
121
122
123
# File 'lib/anemone/core.rb', line 120

# Register a block to be executed on the PageStore after the crawl is finished.
#
# @param block [Proc] the block to append to +@after_crawl_blocks+
# @return [self] so that calls can be chained
def after_crawl(&block)
  @after_crawl_blocks.push(block)
  self
end

#focus_crawl(&block) ⇒ Object

Specify a block which will select which links to follow on each page. The block should return an Array of URI objects.



160
161
162
163
# File 'lib/anemone/core.rb', line 160

# Specify the block that selects which links to follow on each page.
# The block should return an Array of URI objects. A later call replaces
# the previously stored block.
#
# @param block [Proc] the block stored in +@focus_crawl_block+
# @return [self] so that calls can be chained
def focus_crawl(&block)
  tap { @focus_crawl_block = block }
end

#on_every_page(&block) ⇒ Object

Add a block to be executed on every Page as they are encountered during the crawl



138
139
140
141
# File 'lib/anemone/core.rb', line 138

# Register a block to be executed on every Page as it is encountered
# during the crawl.
#
# @param block [Proc] the block to append to +@on_every_page_blocks+
# @return [self] so that calls can be chained
def on_every_page(&block)
  @on_every_page_blocks.push(block)
  self
end

#on_pages_like(*patterns, &block) ⇒ Object

Add a block to be executed on Page objects with a URL matching one or more patterns



147
148
149
150
151
152
153
154
# File 'lib/anemone/core.rb', line 147

# Register a block to be executed on Page objects whose URL matches one or
# more patterns.
#
# Patterns may be given as separate arguments, as Arrays, or as a mix of both;
# nested Arrays are flattened and +nil+ entries are ignored, which mirrors how
# +skip_links_like+ treats its arguments.
#
# @param patterns [Array<Regexp, Array<Regexp>, nil>] URL patterns
# @param block [Proc] the block to register under every given pattern
# @return [self] so that calls can be chained
def on_pages_like(*patterns, &block)
  # A splat parameter is always an Array, so no nil guard is needed here.
  patterns.flatten.compact.each do |pattern|
    @on_pages_like_blocks[pattern] << block
  end
  self
end

#run ⇒ Object

Perform the crawl



168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
# File 'lib/anemone/core.rb', line 168

# Perform the crawl.
#
# Starts +@opts[:threads]+ threads, each running a Tentacle that is handed a
# shared link queue and a shared page queue, and then processes pages taken
# from the page queue until both queues are drained. Finally the threads are
# joined and the after-crawl blocks are run.
#
# @return [self, nil] +self+ once the crawl has finished; +nil+ when none of
#   the starting URLs passes +visit_link?+
def run
  process_options

  # drop starting URLs that should not be visited; nothing to do if none remain
  @urls.delete_if { |url| !visit_link?(url) }
  return if @urls.empty?

  link_queue = Queue.new
  page_queue = Queue.new

  # one thread per configured tentacle, all sharing the same two queues
  @opts[:threads].times do
    @tentacles << Thread.new { Tentacle.new(link_queue, page_queue, @opts).run }
  end

  # seed the link queue with the starting URLs
  @urls.each{ |url| link_queue.enq(url) }

  loop do
    # blocks until a page is available on the page queue
    page = page_queue.deq
    # NOTE(review): presumably marks the URL as seen in the PageStore -- confirm against touch_key
    @pages.touch_key page.url
    puts "#{page.url} Queue: #{link_queue.size}" if @opts[:verbose]
    do_page_blocks page
    page.discard_doc! if @opts[:discard_page_bodies]

    # queue every link to follow together with the referring URL and the next depth
    links = links_to_follow page
    links.each do |link|
      link_queue << [link, page.url.dup, page.depth + 1]
    end
    @pages.touch_keys links

    @pages[page.url] = page

    # if we are done with the crawl, tell the threads to end
    if link_queue.empty? and page_queue.empty?
      # wait until every tentacle thread is blocked waiting on the link queue
      until link_queue.num_waiting == @tentacles.size
        Thread.pass
      end
      # re-check: a page may have been queued while we were waiting
      # (presumably by a tentacle finishing its last fetch -- confirm in Tentacle)
      if page_queue.empty?
        # one :END marker per thread; presumably Tentacle#run stops on it -- confirm
        @tentacles.size.times { link_queue << :END }
        break
      end
    end
  end

  @tentacles.each { |thread| thread.join }
  do_after_crawl_blocks
  self
end

Add one or more Regex patterns for URLs which should not be followed



129
130
131
132
# File 'lib/anemone/core.rb', line 129

# Add one or more Regex patterns for URLs which should not be followed.
#
# Patterns may be given as separate arguments, as (nested) Arrays, or as a
# mix of both; +nil+ entries are dropped.
#
# @param patterns [Array<Regexp, Array<Regexp>, nil>] URL patterns to skip
# @return [self] so that calls can be chained
def skip_links_like(*patterns)
  usable = patterns.flatten.compact
  @skip_link_patterns.concat(usable)
  self
end