Class: Arachni::Spider

Inherits:

Object

Object
Arachni::Spider

show all

Includes: Module::Utilities, UI::Output

Defined in: lib/arachni/spider.rb

Overview

Spider class

Crawls the URL in opts and grabs the HTML code and headers.

@author: Tasos “Zapotek” Laskos

<[email protected]>
<[email protected]>

@version: 0.2.3

Instance Attribute Summary collapse

#opts ⇒ Options readonly
#redirects ⇒ Array readonly

URLs that caused redirects.
#sitemap ⇒ Array readonly

Discovered paths.

Instance Method Summary collapse

#http ⇒ Object
#initialize(opts) ⇒ Spider constructor

Constructor. Instantiates the Spider class with user options.
#pause! ⇒ Object
#paused? ⇒ Boolean
#redundant?(url) ⇒ Boolean
#restricted_to_paths? ⇒ Boolean
#resume! ⇒ Object
#run(parse = true, &block) ⇒ Arachni::Parser::Page

Runs the Spider and passes parsed page to the block.
#skip?(url) ⇒ Boolean
#wait_if_paused ⇒ Object

Methods included from Module::Utilities

#exception_jail, #get_path, #hash_keys_to_str, #normalize_url, #read_file, #seed, #uri_decode, #uri_encode, #uri_parse, #uri_parser, #url_sanitize

Methods included from UI::Output

#buffer, #debug!, #debug?, #flush_buffer, #mute!, #muted?, #only_positives!, #only_positives?, #print_bad, #print_debug, #print_debug_backtrace, #print_debug_pp, #print_error, #print_error_backtrace, #print_info, #print_line, #print_ok, #print_status, #print_verbose, #reroute_to_file, #reroute_to_file?, #uncap_buffer!, #unmute!, #verbose!, #verbose?

Constructor Details

#initialize(opts) ⇒ `Spider`

Constructor. Instantiates the Spider class with user options.

Parameters:

opts (Options)

# File 'lib/arachni/spider.rb', line 58

#
# Sets up a new Spider from the user supplied options.
#
# Queues the target URL together with either the 'restrict' or the
# 'extend' paths and makes sure the options hold at least one
# 'include' pattern.
#
# @param    [Options]    opts
#
def initialize( opts )
    @opts     = opts
    @seed_url = @opts.url.to_s

    @sitemap              = []
    @redirects            = []
    @on_every_page_blocks = []

    @extend_paths   = @opts.extend_paths   || []
    @restrict_paths = @opts.restrict_paths || []

    @paths = [ @seed_url ]

    if restricted_to_paths?
        # when restricted, the given paths also become the sitemap
        @sitemap = @restrict_paths
        @paths  |= @sitemap
    else
        @paths |= @extend_paths
    end

    # no 'include' patterns were given: install a single catch-all
    # one ('.*') on the options object
    if @opts.include.empty?
        @opts.include = [ Regexp.new( '.*' ) ]
    end
end

Instance Attribute Details

#opts ⇒ `Options` (readonly)

Returns:

(Options)



36
37
38

# File 'lib/arachni/spider.rb', line 36

# @return [Options] the options object stored in @opts
def opts
  @opts
end

#redirects ⇒ `Array` (readonly)

URLs that caused redirects

Returns:

(Array)



50
51
52

# File 'lib/arachni/spider.rb', line 50

# @return [Array] URLs that caused redirects (the contents of @redirects)
def redirects
  @redirects
end

#sitemap ⇒ `Array` (readonly)

Discovered paths

Returns:

(Array)



43
44
45

# File 'lib/arachni/spider.rb', line 43

# @return [Array] discovered paths (the contents of @sitemap)
def sitemap
  @sitemap
end

Instance Method Details

#http ⇒ `Object`



180
181
182

# File 'lib/arachni/spider.rb', line 180

#
# Shortcut to the HTTP interface used for all requests.
#
# @return   [Object]    whatever Arachni::HTTP.instance returns
#
def http
    Arachni::HTTP.instance
end

#pause! ⇒ `Object`



218
219
220

# File 'lib/arachni/spider.rb', line 218

#
# Raises the pause flag; #wait_if_paused blocks for as long as it is set.
#
# @return   [true]
#
def pause!
    @pause = true
end

#paused? ⇒ `Boolean`

Returns:

(Boolean)

# File 'lib/arachni/spider.rb', line 226

#
# Tells whether the pause flag is currently set.
#
# The flag is initialized to false the first time it is queried.
#
# @return   [Boolean]
#
def paused?
    @pause ||= false
end

#redundant?(url) ⇒ `Boolean`

Returns:

(Boolean)

# File 'lib/arachni/spider.rb', line 188

#
# Checks the URL against the redundancy rules held by the options.
#
# Every rule is a Hash with a 'regexp' and a 'count' entry. A matching
# rule whose count has reached 0 marks the URL as redundant; any other
# matching rule is reported and has its count decremented by one.
#
# @param    [String]    url
#
# @return   [Boolean]   true if the URL is to be discarded, false otherwise
#
def redundant?( url )
    @opts.redundant.each do |rule|
        next unless url =~ rule['regexp']

        # the rule has no allowance left, the page is redundant
        if rule['count'] == 0
            print_verbose( 'Discarding redundant page: \'' + url + '\'' )
            return true
        end

        print_info( 'Matched redundancy rule: ' + rule['regexp'].to_s +
            ' for page \'' + url + '\'' )
        print_info( 'Count-down: ' + rule['count'].to_s )

        rule['count'] -= 1
    end

    false
end

#restricted_to_paths? ⇒ `Boolean`

Returns:

(Boolean)



83
84
85

# File 'lib/arachni/spider.rb', line 83

#
# Tells whether any 'restrict' paths have been stored in @restrict_paths.
#
# @return   [Boolean]   false when @restrict_paths is empty, true otherwise
#
def restricted_to_paths?
    return false if @restrict_paths.empty?
    true
end

#resume! ⇒ `Object`



222
223
224

# File 'lib/arachni/spider.rb', line 222

#
# Clears the pause flag so that #wait_if_paused stops blocking.
#
# @return   [false]
#
def resume!
    @pause = false
end

#run(parse = true, &block) ⇒ `Arachni::Parser::Page`

Runs the Spider and passes parsed page to the block

Parameters:

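parse (Boolean) (defaults to: true) — when true, each response is turned into an Arachni::Parser::Page before it is passed to the block; otherwise the HTTP response itself is passed

block (Block) — called for every crawled page that does not match a skip rule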

Returns:

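(Array) — the unique paths collected in the sitemap, or nil when the link count limit is 0. Pages are passed to the block, not returned.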

# File 'lib/arachni/spider.rb', line 94

#
# Runs the Spider and passes every crawled page to the block.
#
# The seed URL is fetched first and its paths form the initial queue.
# Queued URLs are then requested one by one; each response's paths are
# added to the sitemap and to the queue (unless the crawl is restricted
# to user supplied paths) until the queue is empty or the link count
# limit has been reached.
#
# @param    [Boolean]   parse   when true the block receives a clone of the
#                               Arachni::Parser::Page built from the response,
#                               otherwise it receives the response itself
# @param    [Block]     block   called for every response whose URL does not
#                               match a skip rule
#
# @return   [Array, nil]    the unique entries of the sitemap, or nil when
#                           the link count limit is 0
#
def run( parse = true, &block )
    # bail out (returning nil) when the link count limit is exactly 0
    return if @opts.link_count_limit == 0

    # URLs for which a request has already been queued
    visited = []

    # options passed to every HTTP request of the crawl
    # (this local shadows the #opts reader inside this method)
    opts = {
        :timeout    => nil,
        :remove_id  => true,
        :follow_location => true,
        :update_cookies  => true
    }

    # we need a parser in order to have access to skip() in case
    # there's a redirect that shouldn't be followed
    #
    # :async => false is merged in for this request only, its response
    # is read right away
    seed_page = http.get( @seed_url, opts.merge( :async => false ) ).response

    print_status( "[HTTP: #{seed_page.code}] " + seed_page.effective_url )

    parser = Parser.new( @opts, seed_page )
    parser.url = @seed_url
    # NOTE(review): this assignment replaces whatever @paths held before
    # (e.g. the restrict/extend paths queued by the constructor) -- confirm
    # that this is intended
    @paths = parser.paths | [@seed_url]

    # the outer loop repeats because the response callbacks below may push
    # new entries onto @paths (presumably while http.run executes -- confirm)
    while( !@paths.empty? )
        # also stops as soon as to_absolute returns a falsy value for the
        # popped path
        while( !@paths.empty? && url = parser.to_absolute( @paths.pop ) )
            # skip? is evaluated first; it delegates to redundant?, which
            # may count down a redundancy rule as a side effect
            next if skip?( url ) || visited.include?( url )

            wait_if_paused

            visited << url

            # the block is the request's completion callback; `next` inside
            # it only abandons the handling of that one response
            http.get( url, opts ).on_complete {
                |res|

                next if parser.skip?( res.effective_url )

                print_status( "[HTTP: #{res.code}] " + res.effective_url )

                if parse
                    # full parse: the page supplies both the paths and the
                    # URL that is checked against the skip rules
                    page = Arachni::Parser::Page.from_http_response( res, @opts )
                    paths = page.paths
                    check_url = page.url
                else
                    # no page object wanted: only extract paths, and only
                    # when the parser reports the response as text
                    c_parser = Parser.new( @opts, res )
                    paths = c_parser.text? ? c_parser.paths : []
                    check_url = c_parser.url
                end

                # the sitemap and the queue only grow when the crawl is not
                # restricted to the user supplied paths
                if !restricted_to_paths?
                    @sitemap |= paths

                    # NOTE(review): assumes headers_hash['Location'] is never
                    # nil, otherwise #empty? raises here -- confirm
                    if !res.headers_hash['Location'].empty?
                        @redirects << res.request.url
                    end

                    # queue every sitemap entry that has not been requested yet
                    @paths   |= @sitemap - visited
                end

                # call the block...if we have one
                if block
                    # exception_jail presumably keeps errors raised by the
                    # block from aborting the crawl -- confirm
                    exception_jail{
                        if !skip?( check_url )
                            # with parse enabled the block gets a clone of
                            # the page, otherwise the response itself
                            block.call( parse ? page.clone : res )
                        else
                            print_info( 'Matched skip rule.' )
                        end
                    }
                end
            }

            # make sure we obey the link count limit and
            # return if we have exceeded it.
            #
            # http.run is called before returning, presumably so that the
            # requests queued so far still get performed -- confirm
            if( @opts.link_count_limit &&
                @opts.link_count_limit > 0 &&
                visited.size >= @opts.link_count_limit )
                http.run
                return @sitemap.uniq
            end

        end

        http.run
    end

    return @sitemap.uniq
end

#skip?(url) ⇒ `Boolean`

Returns:

(Boolean)



184
185
186

# File 'lib/arachni/spider.rb', line 184

#
# Decides whether the URL should be left out of the crawl.
#
# Simply delegates to #redundant?, so calling it may count down a
# redundancy rule.
#
# @param    [String]    url
#
# @return   [Boolean]
#
def skip?( url )
    redundant?( url )
end

#wait_if_paused ⇒ `Object`

# File 'lib/arachni/spider.rb', line 212

#
# Blocks the caller for as long as #paused? is true, re-checking the
# flag after each IO::select call made with a timeout of 1.
#
# @return   [nil]
#
def wait_if_paused
    ::IO::select( nil, nil, nil, 1 ) while paused?
end

Class: Arachni::Spider

Overview

Instance Attribute Summary collapse

Instance Method Summary collapse

Methods included from Module::Utilities

Methods included from UI::Output

Constructor Details

#initialize(opts) ⇒ Spider

Instance Attribute Details

#opts ⇒ Options (readonly)

#redirects ⇒ Array (readonly)

#sitemap ⇒ Array (readonly)

Instance Method Details

#http ⇒ Object

#pause! ⇒ Object

#paused? ⇒ Boolean

#redundant?(url) ⇒ Boolean

#restricted_to_paths? ⇒ Boolean

#resume! ⇒ Object

#run(parse = true, &block) ⇒ Arachni::Parser::Page

#skip?(url) ⇒ Boolean

#wait_if_paused ⇒ Object

#initialize(opts) ⇒ `Spider`

#opts ⇒ `Options` (readonly)

#redirects ⇒ `Array` (readonly)

#sitemap ⇒ `Array` (readonly)

#http ⇒ `Object`

#pause! ⇒ `Object`

#paused? ⇒ `Boolean`

#redundant?(url) ⇒ `Boolean`

#restricted_to_paths? ⇒ `Boolean`

#resume! ⇒ `Object`

#run(parse = true, &block) ⇒ `Arachni::Parser::Page`

#skip?(url) ⇒ `Boolean`

#wait_if_paused ⇒ `Object`