Class: Arachni::Spider

Inherits:

Object

Object
Arachni::Spider

show all

Includes:: Module::Utilities, UI::Output

Defined in:: lib/spider.rb

Overview

Spider class

Crawls the URL in opts and grabs the HTML code and headers.

@author: Tasos “Zapotek” Laskos

<[email protected]>
<[email protected]>

@version: 0.1

Instance Attribute Summary collapse

#on_every_page_blocks ⇒ Array<Proc> readonly

Code blocks to be executed on each page.
#opts ⇒ Options readonly
#pages ⇒ Object readonly

Returns the value of attribute pages.
#sitemap ⇒ Array readonly

Sitemap, array of links.

Instance Method Summary collapse

#initialize(opts) ⇒ Spider constructor

Constructor. Instantiates the Spider class with user options.
#on_every_page(&block) ⇒ self

Hook for further analysis of pages, statistics etc.
#run(&block) ⇒ Array<String>, nil

Runs the Spider and passes each parsed page to the block.
#url_sanitize(url) ⇒ Object

Repeatedly decodes the URL to reverse multiple levels of encoding and strips trailing NULL characters and spaces.

Methods included from Module::Utilities

#exception_jail, #get_path, #normalize_url, #read_file, #seed

Methods included from UI::Output

#buffer, #debug!, #debug?, #flush_buffer, #mute!, #muted?, #only_positives!, #only_positives?, #print_debug, #print_debug_backtrace, #print_debug_pp, #print_error, #print_error_backtrace, #print_info, #print_line, #print_ok, #print_status, #print_verbose, #reroute_to_file, #reroute_to_file?, #unmute!, #verbose!, #verbose?

Constructor Details

#initialize(opts) ⇒ `Spider`

Constructor. Instantiates the Spider class with user options.

Parameters:

opts (Options)

# File 'lib/spider.rb', line 59

#
# Constructor.
#
# Stores the user options, derives the settings handed to Anemone.crawl
# from them and resets the sitemap and the list of per-page hooks.
#
# @param [Options] opts  user options; must respond to #to_h, #include
#                        and #include=
#
def initialize( opts )
    @opts = opts

    # crawler defaults, used for every setting the user did not supply
    @anemone_opts = {
        :threads              =>  1,
        :discard_page_bodies  =>  false,
        :delay                =>  0,
        :obey_robots_txt      =>  false,
        :depth_limit          =>  false,
        :link_count_limit     =>  false,
        :redirect_limit       =>  false,
        :storage              =>  nil,
        :cookies              =>  nil,
        :accept_cookies       =>  true,
        :proxy_addr           =>  nil,
        :proxy_port           =>  nil,
        :proxy_user           =>  nil,
        :proxy_pass           =>  nil
    }

    hash_opts = @opts.to_h

    # User values are looked up under the stringified option name.
    # Only nil counts as "not supplied": testing for truthiness would
    # silently drop an explicit false (e.g. 'accept_cookies' => false).
    @anemone_opts.keys.each do |name|
        supplied = hash_opts[name.to_s]
        @anemone_opts[name] = supplied unless supplied.nil?
    end

    # keep every user option available next to the crawler settings
    @anemone_opts = @anemone_opts.merge( hash_opts )

    @sitemap = []
    @on_every_page_blocks = []

    # if we have no 'include' patterns create one that will match
    # everything, like '.*'
    if @opts.include.nil? || @opts.include.empty?
        @opts.include = [ Regexp.new( '.*' ) ]
    end
end

Instance Attribute Details

#on_every_page_blocks ⇒ `Array<Proc>` (readonly)

Code blocks to be executed on each page

Returns:

(Array<Proc>)



51
52
53

# File 'lib/spider.rb', line 51

# Returns the list of blocks registered for execution on every page.
#
# The constructor initialises it to an empty Array and #on_every_page
# appends to it, so the value is an Array of blocks rather than a
# single Proc.
#
# @return [Array]
def on_every_page_blocks
  @on_every_page_blocks
end

#opts ⇒ `Options` (readonly)

Returns:

(Options)



35
36
37

# File 'lib/spider.rb', line 35

# Returns the options object that was passed to the constructor.
#
# @return [Options]
def opts
  @opts
end

#pages ⇒ `Object` (readonly)

Returns the value of attribute pages.



37
38
39

# File 'lib/spider.rb', line 37

# Returns the keys of the crawler's page store as captured by #run the
# last time a page was handled (presumably the crawled URLs — confirm
# against Anemone).
#
# The constructor does not assign @pages, so this is nil until #run has
# handled at least one page.
#
# @return [Array, nil]
def pages
  @pages
end

#sitemap ⇒ `Array` (readonly)

Sitemap, array of links

Returns:

(Array)



44
45
46

# File 'lib/spider.rb', line 44

# Returns the sanitized URLs that #run has pushed so far.
#
# The list is returned as stored; it is #run that applies `uniq` to its
# own return value.
#
# @return [Array]
def sitemap
  @sitemap
end

Instance Method Details

#on_every_page(&block) ⇒ `self`

Hook for further analysis of pages, statistics etc.

Parameters:

block (Proc) —

code to be executed for every page

Returns:

(self)

# File 'lib/spider.rb', line 194

#
# Hook for further analysis of pages, statistics etc.
#
# Registers a block that will be called for every page. Calling the
# method without a block registers nothing: a nil entry in the list
# would only blow up later, when the registered blocks are called.
#
# @param [Proc] block  code to be executed for every page
#
# @return [self]
#
def on_every_page( &block )
    @on_every_page_blocks.push( block ) if block
    self
end

#run(&block) ⇒ `Array<String>`, `nil`

Runs the Spider and passes each parsed page to the block

Parameters:

block (Block)

Returns:

(Array<String>, nil) — the unique sanitized URLs that were crawled; nil when the link count limit is 0

# File 'lib/spider.rb', line 102

#
# Runs the Spider and passes each parsed page to the block.
#
# Every page matching the 'include' patterns is sanitized, recorded in
# the sitemap, handed to the given block as a parsed page and then
# handed, as the raw crawler page, to the blocks registered via
# #on_every_page. Pages that carry an error are reported and skipped.
#
# @param [Block] block  called with a clone of the parsed page
#
# @return [Array<String>, nil]  the unique sanitized URLs that were crawled,
#                               nil if the link count limit is 0
#
def run( &block )
    return if @opts.link_count_limit == 0

    # number of pages handled so far, counting the current one
    i = 1

    # start the crawl
    Anemone.crawl( @opts.url, @anemone_opts ) do |anemone|

        # apply 'exclude' patterns
        anemone.skip_links_like( @opts.exclude ) if @opts.exclude

        # apply 'include' patterns and grab matching pages
        # as they are discovered
        anemone.on_pages_like( @opts.include ) do |page|

            @pages = anemone.pages.keys || []

            url = url_sanitize( page.url.to_s )

            # something went kaboom, tell the user and skip the page
            if page.error
                print_error( "[Error: #{page.error}] #{url}" )
                print_debug_backtrace( page.error )
                next
            end

            # push the url in the sitemap
            @sitemap.push( url )

            print_line
            print_status( "[HTTP: #{page.code}] #{url}" )

            # call the block...if we have one
            if block
                exception_jail do
                    new_page = Arachni::Parser.new( @opts,
                        Typhoeus::Response.new(
                            :effective_url => url,
                            :body          => page.body,
                            :headers_hash  => page.headers
                        )
                    ).run
                    new_page.code   = page.code
                    new_page.method = 'GET'
                    block.call( new_page.clone )
                end
            end

            # run blocks specified later; the iteration variable must not
            # be called 'block', that would shadow (and on old rubies
            # overwrite) the block this method was called with
            @on_every_page_blocks.each do |hook|
                hook.call( page )
            end

            # we don't need the HTML doc anymore
            page.discard_doc!

            # make sure we obey the link count limit and
            # return if we have exceeded it.
            if @opts.link_count_limit && @opts.link_count_limit <= i
                return @sitemap.uniq
            end

            i += 1
        end
    end

    @sitemap.uniq
end

#url_sanitize(url) ⇒ `Object`

Repeatedly decodes the URL to reverse multiple levels of encoding and strips trailing NULL characters and spaces

# File 'lib/spider.rb', line 177

#
# Repeatedly percent-decodes a URL to reverse multiple levels of encoding.
#
# After every decoding pass trailing NULL bytes and ASCII spaces are
# stripped (String#unpack with the 'A*' directive). A URL that contains
# no '%' is returned untouched.
#
# @param [String] url
#
# @return [String] the decoded URL
#
def url_sanitize( url )
    while url =~ /%/
        # URI.decode is gone from current rubies, the default parser
        # offers the same unescaping
        decoded = URI::DEFAULT_PARSER.unescape( url ).to_s.unpack( 'A*' )[0]

        # a '%' that is not part of a valid escape sequence survives
        # decoding; stop once a pass changes nothing instead of looping
        # forever
        break if decoded == url

        url = decoded
    end

    url
end

Class: Arachni::Spider

Overview

Instance Attribute Summary collapse

Instance Method Summary collapse

Methods included from Module::Utilities

Methods included from UI::Output

Constructor Details

#initialize(opts) ⇒ Spider

Instance Attribute Details

#on_every_page_blocks ⇒ Array<Proc> (readonly)

#opts ⇒ Options (readonly)

#pages ⇒ Object (readonly)

#sitemap ⇒ Array (readonly)

Instance Method Details

#on_every_page(&block) ⇒ self

#run(&block) ⇒ Array<String>, nil

#url_sanitize(url) ⇒ Object

#initialize(opts) ⇒ `Spider`

#on_every_page_blocks ⇒ `Array<Proc>` (readonly)

#opts ⇒ `Options` (readonly)

#pages ⇒ `Object` (readonly)

#sitemap ⇒ `Array` (readonly)

#on_every_page(&block) ⇒ `self`

#run(&block) ⇒ `Array<String>`, `nil`

#url_sanitize(url) ⇒ `Object`