Class: Arachni::Spider

# File 'lib/arachni/spider.rb', line 53

# Builds a crawler around the given options: creates the lock, the empty
# bookkeeping collections and the callback lists, then seeds the first paths.
#
# @param [Options] opts
#   Crawl configuration; defaults to `Options.instance`.
def initialize( opts = Options.instance )
    @opts  = opts
    @mutex = Mutex.new

    # Callbacks registered via #on_each_page, #on_each_response and
    # #on_complete.
    @on_complete_blocks      = []
    @on_each_response_blocks = []
    @on_each_page_blocks     = []

    # Crawl bookkeeping.
    @paths     = Set.new
    @visited   = Support::LookUp::HashSet.new
    @sitemap   = {}
    @redirects = []

    # Failure bookkeeping.
    @failures = []
    @retries  = {}

    @pending_requests = 0
    @pass_pages       = true

    # Seed the initial paths once the state above is in place.
    seed_paths
end

Instance Attribute Details

#failures ⇒ `Array<String>` (readonly)

Returns URLs that elicited no response from the server. Not determined by HTTP status codes, we’re talking network failures here.

Returns:

(Array<String>) —

URLs that elicited no response from the server. Not determined by HTTP status codes, we’re talking network failures here.



46
47
48

# File 'lib/arachni/spider.rb', line 46

# @return [Array<String>]
#   URLs that elicited no response from the server (network failures, not
#   HTTP status codes). This is the internal array itself, not a copy.
def failures
  @failures
end

#opts ⇒ `Arachni::Options` (readonly)

Returns:

(Arachni::Options)



38
39
40

# File 'lib/arachni/spider.rb', line 38

# @return [Arachni::Options]
#   The options object this spider was initialized with.
def opts
  @opts
end

#redirects ⇒ `Array<String>` (readonly)

Returns URLs that caused redirects.

Returns:

(Array<String>) —

URLs that caused redirects



41
42
43

# File 'lib/arachni/spider.rb', line 41

# @return [Array<String>]
#   URLs that caused redirects. This is the internal array itself, not a copy.
def redirects
  @redirects
end

Instance Method Details

#done? ⇒ `TrueClass`, `FalseClass`

Returns `true` if the crawl is done, `false` otherwise.

Returns:

(TrueClass, FalseClass) —

`true` if the crawl is done, `false` otherwise.



213
214
215

# File 'lib/arachni/spider.rb', line 213

# Tells whether the crawl has nothing left to do.
#
# The limit check is only consulted when the crawler is not idle.
#
# @return [TrueClass, FalseClass]
#   `true` if the crawler is idle or the limit has been reached,
#   `false` otherwise.
def done?
    crawl_idle = idle?
    return crawl_idle if crawl_idle

    limit_reached?
end

#fancy_sitemap ⇒ `Hash<String, Integer>`

Returns list of crawled URLs with their HTTP codes.

Returns:

(Hash<String, Integer>) —

list of crawled URLs with their HTTP codes



92
93
94

# File 'lib/arachni/spider.rb', line 92

# @return [Hash]
#   The internal sitemap hash itself (not a copy). It is documented as the
#   crawled URLs with their HTTP codes -- presumably URL (String) => code
#   (Integer), since #sitemap returns its keys as the URL list; confirm
#   against the code that fills @sitemap.
def fancy_sitemap
    @sitemap
end

#idle? ⇒ `TrueClass`, `FalseClass`

Returns `true` if the queue is empty and no requests are pending, `false` otherwise.

Returns:

(TrueClass, FalseClass) —

`true` if the queue is empty and no requests are pending, `false` otherwise.

# File 'lib/arachni/spider.rb', line 219

# Checks, inside `synchronize`, whether there is no work queued and nothing
# in flight.
#
# @return [TrueClass, FalseClass]
#   `true` if the path queue is empty and no requests are pending,
#   `false` otherwise.
def idle?
    synchronize { @paths.empty? && @pending_requests == 0 }
end

#on_complete(&block) ⇒ `Object`

Parameters:

block (Block) —

Sets blocks to be called once the crawler is done.

# File 'lib/arachni/spider.rb', line 176

# Registers a block to be called once the crawler is done.
#
# @param [Block] block  Callback to store; mandatory.
#
# @return [self] so that registrations can be chained.
#
# @raise [ArgumentError] if no block was given.
def on_complete( &block )
    fail ArgumentError, 'Block is mandatory!' unless block_given?

    @on_complete_blocks.push( block )
    self
end

#on_each_page(&block) ⇒ `Object`

Parameters:

block (Block) —

Sets blocks to be called every time a page is visited.

# File 'lib/arachni/spider.rb', line 160

# Registers a block to be called every time a page is visited.
#
# @param [Block] block  Callback to store; mandatory.
#
# @return [self] so that registrations can be chained.
#
# @raise [ArgumentError] if no block was given.
def on_each_page( &block )
    fail ArgumentError, 'Block is mandatory!' unless block_given?

    @on_each_page_blocks.push( block )
    self
end

#on_each_response(&block) ⇒ `Object`

Parameters:

block (Block) —

Sets blocks to be called every time a response is received.

# File 'lib/arachni/spider.rb', line 168

# Registers a block to be called every time a response is received.
#
# @param [Block] block  Callback to store; mandatory.
#
# @return [self] so that registrations can be chained.
#
# @raise [ArgumentError] if no block was given.
def on_each_response( &block )
    fail ArgumentError, 'Block is mandatory!' unless block_given?

    @on_each_response_blocks.push( block )
    self
end

#paths ⇒ `Set<String>`

Returns Working paths, paths that haven’t yet been followed. If you want to add more paths use #push.

Returns:

(Set<String>) —

Working paths, paths that haven’t yet been followed. If you want to add more paths use #push.



82
83
84

# File 'lib/arachni/spider.rb', line 82

# @return [Set<String>]
#   Working paths, i.e. paths that haven't been followed yet. This is the
#   internal collection itself, not a copy; to add more paths use #push.
def paths
    @paths
end

#pause ⇒ `TrueClass`

Returns Pauses the system on a best effort basis.

Returns:

(TrueClass) —

Pauses the system on a best effort basis.



226
227
228

# File 'lib/arachni/spider.rb', line 226

# Raises the pause flag; the system is paused on a best effort basis.
#
# @return [TrueClass] always `true`.
def pause
    @pause = true
    true
end

#paused? ⇒ `Bool`

Returns `true` if the system is paused, `false` otherwise.

Returns:

(Bool) —

`true` if the system is paused, `false` otherwise.



237
238
239

# File 'lib/arachni/spider.rb', line 237

# Reports the pause flag, initializing it to `false` when it has not been
# set to a truthy value yet.
#
# @return [Bool] `true` if the system is paused, `false` otherwise.
def paused?
    @pause = false unless @pause
    @pause
end

#push(paths, wakeup = true) ⇒ `Bool`

Pushes new paths for the crawler to follow; if the crawler has finished it will be awakened when new paths are pushed.

The paths will be sanitized and normalized (cleaned up and converted to absolute ones).

Parameters:

paths (String, Array<String>)

Returns:

(Bool) —

`true` if push was successful, `false` otherwise (provided empty or paths that must be skipped).

# File 'lib/arachni/spider.rb', line 195

# Queues new paths for the crawler to follow and, unless told otherwise,
# starts a crawl in a background thread when none is running.
#
# The paths go through `dedup` before being merged into the queue.
#
# @param [String, Array<String>] paths
# @param [Bool] wakeup
#   When true (the default) and the crawler is not running, #run is started
#   in a new thread.
#
# @return [Bool]
#   `true` if the push was successful, `false` if the limit has been reached
#   or nothing was left after `dedup`.
def push( paths, wakeup = true )
    return false if limit_reached?

    fresh = dedup( paths )
    return false if fresh.empty?

    synchronize { @paths |= fresh }

    if wakeup && !running?
        # NOTE: this is a process-wide setting, not just for the new thread.
        Thread.abort_on_exception = true
        Thread.new { run }
    end

    true
end

#resume ⇒ `TrueClass`

Returns Resumes the system.

Returns:

(TrueClass) —

Resumes the system.

# File 'lib/arachni/spider.rb', line 231

# Clears the pause flag set by #pause.
#
# @return [TrueClass] always `true`.
def resume
    @pause = false
    true
end

#run(pass_pages_to_block = true, &block) ⇒ `Array<String>`

Runs the Spider and passes the requested object to the block.

Parameters:

pass_pages_to_block (Bool) (defaults to: true) —

Decides whether the block should be passed Pages or Typhoeus::Responses.
block (Block) —

To be passed each page as visited.

Returns:

(Array<String>) —

sitemap

# File 'lib/arachni/spider.rb', line 106

# Runs the Spider and passes the requested object to the block.
#
# Does nothing and returns `nil` when a crawl is already running, the limit
# has been reached or `@opts.crawl?` is false.
#
# @param [Bool] pass_pages_to_block
#   Decides whether the block is registered via #on_each_page (true) or via
#   #on_each_response (false).
# @param [Block] block
#   Optional callback, registered before crawling starts.
#
# @return [Array<String>, nil]
#   The sitemap, or `nil` if the crawl did not start.
def run( pass_pages_to_block = true, &block )
    return if running? || limit_reached? || !@opts.crawl?

    synchronize { @running = true }

    begin
        # Options could have changed so reseed.
        seed_paths

        if block_given?
            pass_pages_to_block ? on_each_page( &block ) : on_each_response( &block )
        end

        while !done?
            wait_if_paused
            while !done? && (url = next_url)
                wait_if_paused

                visit( url ) do |res|
                    obj = if pass_pages_to_block
                        Page.from_response( res, @opts )
                    else
                        Parser.new( res, @opts )
                    end

                    if @on_each_response_blocks.any?
                        call_on_each_response_blocks( res )
                    end

                    if @on_each_page_blocks.any?
                        # Page callbacks always get a Page, even when the
                        # response was wrapped in a Parser above.
                        page = obj.is_a?( Page ) ? obj : Page.from_response( res, @opts )
                        call_on_each_page_blocks page
                    end

                    distribute( obj.paths )
                end
            end

            http.run
        end
    ensure
        # Always clear the flag, even when something above raised; otherwise
        # #running? would stay true, #run would return early forever and
        # #push would never start a new crawl.
        synchronize { @running = false }
    end

    call_on_complete_blocks

    sitemap
end

#running? ⇒ `Boolean`

Returns:

(Boolean)



154
155
156

# File 'lib/arachni/spider.rb', line 154

# Reads the running flag inside `synchronize` and coerces it to a boolean.
#
# @return [Boolean] `true` while the running flag is set, `false` otherwise.
def running?
    synchronize do
        @running ? true : false
    end
end

#sitemap ⇒ `Array<String>`

Returns list of crawled URLs.

Returns:

(Array<String>) —

list of crawled URLs



87
88
89

# File 'lib/arachni/spider.rb', line 87

# @return [Array<String>]
#   List of crawled URLs, taken from the keys of the internal sitemap hash
#   (a new array on every call).
def sitemap
    @sitemap.keys
end

#url ⇒ `Object`



75
76
77

# File 'lib/arachni/spider.rb', line 75

# Convenience delegator to the options object.
#
# @return [Object]
#   Whatever `@opts.url` returns -- presumably the target URL of the crawl;
#   the exact type is not visible here.
def url
    @opts.url
end

Class: Arachni::Spider

Overview

Direct Known Subclasses

Constant Summary collapse

Instance Attribute Summary collapse

Instance Method Summary collapse

Methods included from Utilities

Methods included from UI::Output

Constructor Details

#initialize(opts = Options.instance) ⇒ Spider

Instance Attribute Details

#failures ⇒ Array<String> (readonly)

#opts ⇒ Arachni::Options (readonly)

#redirects ⇒ Array<String> (readonly)

Instance Method Details

#done? ⇒ TrueClass, FalseClass

#fancy_sitemap ⇒ Hash<String, Integer>

#idle? ⇒ TrueClass, FalseClass

#on_complete(&block) ⇒ Object

#on_each_page(&block) ⇒ Object

#on_each_response(&block) ⇒ Object

#paths ⇒ Set<String>

#pause ⇒ TrueClass

#paused? ⇒ Bool

#push(paths, wakeup = true) ⇒ Bool

#resume ⇒ TrueClass

#run(pass_pages_to_block = true, &block) ⇒ Array<String>

#running? ⇒ Boolean

#sitemap ⇒ Array<String>

#url ⇒ Object

#initialize(opts = Options.instance) ⇒ `Spider`

#failures ⇒ `Array<String>` (readonly)

#opts ⇒ `Arachni::Options` (readonly)

#redirects ⇒ `Array<String>` (readonly)

#done? ⇒ `TrueClass`, `FalseClass`

#fancy_sitemap ⇒ `Hash<String, Integer>`

#idle? ⇒ `TrueClass`, `FalseClass`

#on_complete(&block) ⇒ `Object`

#on_each_page(&block) ⇒ `Object`

#on_each_response(&block) ⇒ `Object`

#paths ⇒ `Set<String>`

#pause ⇒ `TrueClass`

#paused? ⇒ `Bool`

#push(paths, wakeup = true) ⇒ `Bool`

#resume ⇒ `TrueClass`

#run(pass_pages_to_block = true, &block) ⇒ `Array<String>`

#running? ⇒ `Boolean`

#sitemap ⇒ `Array<String>`

#url ⇒ `Object`