Class: Arachni::Spider

Inherits:
Object show all
Includes:
UI::Output, Utilities
Defined in:
lib/arachni/spider.rb

Overview

Crawls the target webapp until there are no new paths left.

Author:

Instance Attribute Summary collapse

Instance Method Summary collapse

Methods included from Utilities

#cookie_encode, #cookies_from_document, #cookies_from_file, #cookies_from_response, #exception_jail, #exclude_path?, #extract_domain, #form_decode, #form_encode, #form_parse_request_body, #forms_from_document, #forms_from_response, #get_path, #hash_keys_to_str, #html_decode, #html_encode, #include_path?, #links_from_document, #links_from_response, #normalize_url, #page_from_response, #page_from_url, #parse_query, #parse_set_cookie, #parse_url_vars, #path_in_domain?, #path_too_deep?, #remove_constants, #seed, #skip_path?, #to_absolute, #uri_decode, #uri_encode, #uri_parse, #uri_parser, #url_sanitize

Methods included from UI::Output

#debug?, #debug_off, #debug_on, #disable_only_positives, #flush_buffer, #mute, #muted?, old_reset_output_options, #only_positives, #only_positives?, #print_bad, #print_debug, #print_debug_backtrace, #print_debug_pp, #print_error, #print_error_backtrace, #print_info, #print_line, #print_ok, #print_status, #print_verbose, #reroute_to_file, #reroute_to_file?, reset_output_options, #set_buffer_cap, #uncap_buffer, #unmute, #verbose, #verbose?

Constructor Details

#initialize(opts = Options.instance) ⇒ Spider

Instantiates Spider class with user options.

Parameters:



46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
# File 'lib/arachni/spider.rb', line 46

#
# Instantiates Spider class with user options.
#
# Sets up empty crawl state and callback registries, then seeds the
# initial paths.
#
# @param    [Arachni::Options]  opts
#   options to crawl with; falls back to +Options.instance+
#
def initialize( opts = Options.instance )
    @opts = opts

    # Crawl bookkeeping -- everything starts out empty.
    @visited   = Set.new
    @paths     = []
    @redirects = []
    @sitemap   = {}

    # Callback registries, appended to by #on_each_page,
    # #on_each_response and #on_complete.
    @on_complete_blocks      = []
    @on_each_response_blocks = []
    @on_each_page_blocks     = []

    @pending_requests = 0
    @pass_pages       = true

    # NOTE(review): seed_paths is defined outside this excerpt; presumably
    # it queues the initial URL(s) derived from the options -- confirm.
    seed_paths
end

Instance Attribute Details

#opts ⇒ Arachni::Options (readonly)

Returns:



36
37
38
# File 'lib/arachni/spider.rb', line 36

# Read-only accessor for the options held in +@opts+.
#
# @return   [Arachni::Options]
def opts
  @opts
end

#redirects ⇒ Array<String> (readonly)

Returns URLs that caused redirects.

Returns:



39
40
41
# File 'lib/arachni/spider.rb', line 39

# Read-only accessor; hands back the internal +@redirects+ array itself
# (not a copy).
#
# @return   [Array<String>] URLs that caused redirects
def redirects
  @redirects
end

Instance Method Details

#done? ⇒ TrueClass, FalseClass

Returns true if crawl is done, false otherwise.

Returns:

  • (TrueClass, FalseClass)

    true if crawl is done, false otherwise



196
197
198
# File 'lib/arachni/spider.rb', line 196

#
# The crawl is over once the spider is idle or its limit has been reached.
#
# @return   [TrueClass, FalseClass]
#   true if crawl is done, false otherwise
#
def done?
    # Nothing queued and nothing in flight: no need to consult the limit.
    return true if idle?

    limit_reached?
end

#fancy_sitemap ⇒ Hash<String, Integer>

Returns list of crawled URLs with their HTTP codes.

Returns:

  • (Hash<String, Integer>)

    list of crawled URLs with their HTTP codes



82
83
84
# File 'lib/arachni/spider.rb', line 82

# Exposes the internal +@sitemap+ hash itself (not a copy).
#
# @return   [Hash<String, Integer>]
#   crawled URLs (the hash keys, as returned by #sitemap) with their HTTP
#   codes -- NOTE(review): value type taken from the method description,
#   the code that fills the hash is not shown here.
def fancy_sitemap
    @sitemap
end

#idle? ⇒ TrueClass, FalseClass

Returns true if the queue is empty and no requests are pending, false otherwise.

Returns:

  • (TrueClass, FalseClass)

    true if the queue is empty and no requests are pending, false otherwise



202
203
204
# File 'lib/arachni/spider.rb', line 202

#
# @return   [TrueClass, FalseClass]
#   true if the queue is empty and no requests are pending, false otherwise
#
def idle?
    # Paths still waiting to be followed mean there is work left.
    return false unless @paths.empty?

    @pending_requests == 0
end

#on_complete(&block) ⇒ Object

Sets blocks to be called once the crawler is done.

Parameters:

  • block (Block)


166
167
168
169
170
# File 'lib/arachni/spider.rb', line 166

#
# Registers a block to be called once the crawler is done.
#
# @param    [Block] block   appended to the list of completion callbacks
#
# @return   [self]  to allow chaining
#
# @raise    [RuntimeError]  if no block has been given
#
def on_complete( &block )
    raise 'Block is mandatory!' unless block_given?

    @on_complete_blocks.push( block )
    self
end

#on_each_page(&block) ⇒ Object

Sets blocks to be called every time a page is visited.

Parameters:

  • block (Block)


144
145
146
147
148
# File 'lib/arachni/spider.rb', line 144

#
# Registers a block to be called every time a page is visited.
#
# @param    [Block] block   appended to the list of per-page callbacks
#
# @return   [self]  to allow chaining
#
# @raise    [RuntimeError]  if no block has been given
#
def on_each_page( &block )
    raise 'Block is mandatory!' unless block_given?

    @on_each_page_blocks.push( block )
    self
end

#on_each_response(&block) ⇒ Object

Sets blocks to be called every time a response is received.

Parameters:

  • block (Block)


155
156
157
158
159
# File 'lib/arachni/spider.rb', line 155

#
# Registers a block to be called every time a response is received.
#
# @param    [Block] block   appended to the list of per-response callbacks
#
# @return   [self]  to allow chaining
#
# @raise    [RuntimeError]  if no block has been given
#
def on_each_response( &block )
    raise 'Block is mandatory!' unless block_given?

    @on_each_response_blocks.push( block )
    self
end

#paths ⇒ Array<String>

Returns Working paths, paths that haven’t yet been followed. You’ll actually get a copy of the working paths and not the actual object itself; if you want to add more paths use #push.

Returns:

  • (Array<String>)

    Working paths, paths that haven’t yet been followed. You’ll actually get a copy of the working paths and not the actual object itself; if you want to add more paths use #push.



72
73
74
# File 'lib/arachni/spider.rb', line 72

# Returns a shallow copy (+clone+) of the working paths, so changes made to
# the returned array do not affect the crawler's queue; use #push to add
# more paths.
#
# @return   [Array<String>]
#   working paths, i.e. paths that haven't yet been followed
def paths
    @paths.clone
end

#pause ⇒ TrueClass

Pauses the system on a best-effort basis.

Returns:

  • (TrueClass)

    pauses the system on a best effort basis



207
208
209
# File 'lib/arachni/spider.rb', line 207

#
# Raises the pause flag; the crawler honours it on a best effort basis.
#
# @return   [TrueClass]
#
def pause
    @pause = true
    true
end

#paused? ⇒ Bool

Returns true if the system is paused, false otherwise.

Returns:

  • (Bool)

    true if the system is paused, false otherwise



218
219
220
# File 'lib/arachni/spider.rb', line 218

#
# @return   [Bool]  true if the system is paused, false otherwise
#
def paused?
    # The flag is only assigned by #pause / #resume, so it may still be
    # unset here; normalize that to an explicit false.
    @pause = false unless @pause
    @pause
end

#push(paths) ⇒ Bool

Pushes new paths for the crawler to follow; if the crawler has finished it will be awakened when new paths are pushed (note: in the source shown below the wake-up call is currently commented out).

The paths will be sanitized and normalized (cleaned up and converted to absolute ones).

Parameters:

Returns:

  • (Bool)

    true if push was successful, false otherwise (provided empty or paths that must be skipped)



183
184
185
186
187
188
189
190
191
192
193
# File 'lib/arachni/spider.rb', line 183

#
# Pushes new paths for the crawler to follow.
#
# The given paths are first run through +dedup+ (defined outside this
# excerpt -- presumably where the sanitizing/normalizing and the skipping
# of unwanted paths happens; confirm) and whatever is left gets merged
# into the working queue.
#
# NOTE: waking up a finished crawler is currently disabled, see the REVIEW
# comment in the body.
#
# @param    [Object] paths
#   path(s) to follow -- NOTE(review): accepted types depend on +dedup+,
#   presumably a String or Array<String>.
#
# @return   [Bool]
#   true if push was successful, false otherwise (provided empty or
#   paths that must be skipped)
#
def push( paths )
    paths = dedup( paths )
    return false if paths.empty?

    # Array#| returns a duplicate-free union (order preserved), so no
    # follow-up uniq! is needed.
    @paths |= paths

    # REVIEW: This may cause segfaults, Typhoeus::Hydra doesn't like threads.
    #Thread.new { run } if idle? # wake up the crawler
    true
end

#resume ⇒ TrueClass

Resumes the system on a best-effort basis.

Returns:

  • (TrueClass)

    resumes the system on a best effort basis



212
213
214
215
# File 'lib/arachni/spider.rb', line 212

# Clears the pause flag set by #pause; resumes the system on a best effort
# basis.
#
# @return   [TrueClass]
def resume
    @pause = false
    # Explicit return value: the assignment above would evaluate to false.
    true
end

#run(pass_pages_to_block = true, &block) ⇒ Array<String>

Runs the Spider and passes the requested object to the block.

Parameters:

  • pass_pages_to_block (Bool) (defaults to: true)

    decides whether the block should be passed [Arachni::Page]s or [Typhoeus::Response]s

  • block (Block)

    to be passed each page as visited

Returns:



95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
# File 'lib/arachni/spider.rb', line 95

#
# Runs the Spider and passes the requested object to the block.
#
# Keeps draining the path queue -- visiting each path, firing the
# registered callbacks and pushing the newly discovered paths -- until
# #done? reports true, then fires the completion callbacks.
#
# @param    [Bool]  pass_pages_to_block
#   decides whether the block should be passed [Arachni::Page]s or
#   [Typhoeus::Response]s
# @param    [Block] block
#   to be passed each page as visited
#
# @return   [Array<String>, nil]
#   the value of #sitemap, or nil if the options say not to crawl
#
def run( pass_pages_to_block = true, &block )
    return unless @opts.crawl?

    # options could have changed so reseed
    seed_paths

    # The optional block is registered as a regular callback of the
    # requested flavour.
    if block_given?
        if pass_pages_to_block
            on_each_page( &block )
        else
            on_each_response( &block )
        end
    end

    until done?
        wait_if_paused

        # Queue a visit for every path currently waiting; stops early once
        # the crawl is done or the queue runs dry.
        until done? || !( url = @paths.shift )
            wait_if_paused

            visit( url ) do |res|
                obj = pass_pages_to_block ? Page.from_response( res, @opts ) : Parser.new( res, @opts )

                call_on_each_response_blocks( res ) if @on_each_response_blocks.any?

                if @on_each_page_blocks.any?
                    # Page callbacks always get a Page, even when the
                    # caller asked for responses.
                    page = pass_pages_to_block ? obj : Page.from_response( res, @opts )
                    call_on_each_page_blocks( page )
                end

                push( obj.paths )
            end
        end

        http.run
    end

    # NOTE(review): extra run after the loop, presumably to flush anything
    # still queued -- confirm.
    http.run

    call_on_complete_blocks

    sitemap
end

#sitemap ⇒ Array<String>

Returns list of crawled URLs.

Returns:



77
78
79
# File 'lib/arachni/spider.rb', line 77

# Returns the keys of the internal +@sitemap+ hash as a new array.
#
# @return   [Array<String>] list of crawled URLs
def sitemap
    @sitemap.keys
end

#url ⇒ Object



64
65
66
# File 'lib/arachni/spider.rb', line 64

# Convenience delegator to the +url+ of the spider's options.
#
# @return   [Object]
#   whatever +@opts.url+ returns -- presumably the target URL as a String;
#   the type is not documented here, confirm against Arachni::Options.
def url
    @opts.url
end