Class: Arachni::Spider
- Includes: UI::Output, Utilities
- Defined in: lib/arachni/spider.rb
Overview
Crawls the target webapp until there are no new paths left.
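A minimal usage sketch, assuming the arachni gem is loaded and using a placeholder target URL:

require 'arachni'

opts = Arachni::Options.instance
opts.url = 'http://example.com/'

spider = Arachni::Spider.new( opts )
spider.run { |page| puts page.url }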
Constant Summary
- MAX_TRIES = 5
  How many times to retry failed requests.
Instance Attribute Summary
- #failures ⇒ Array<String> (readonly)
  URLs that elicited no response from the server.
- #opts ⇒ Arachni::Options (readonly)
- #redirects ⇒ Array<String> (readonly)
  URLs that caused redirects.
Instance Method Summary
- #done? ⇒ TrueClass, FalseClass
  `true` if the crawl is done, `false` otherwise.
- #fancy_sitemap ⇒ Hash<Integer, String>
  List of crawled URLs with their HTTP codes.
- #idle? ⇒ TrueClass, FalseClass
  `true` if the queue is empty and no requests are pending, `false` otherwise.
- #initialize(opts = Options.instance) ⇒ Spider (constructor)
  Instantiates the Spider class with user options.
- #on_complete(&block) ⇒ Object
- #on_each_page(&block) ⇒ Object
- #on_each_response(&block) ⇒ Object
- #paths ⇒ Set<String>
  Working paths, i.e. paths that haven't yet been followed.
- #pause ⇒ TrueClass
  Pauses the system on a best-effort basis.
- #paused? ⇒ Bool
  `true` if the system is paused, `false` otherwise.
- #push(paths, wakeup = true) ⇒ Bool
  Pushes new paths for the crawler to follow; if the crawler has finished, it will be awakened when new paths are pushed.
- #resume ⇒ TrueClass
  Resumes the system.
- #run(pass_pages_to_block = true, &block) ⇒ Array<String>
  Runs the Spider and passes the requested object to the block.
- #running? ⇒ Boolean
- #sitemap ⇒ Array<String>
  List of crawled URLs.
- #url ⇒ Object
Methods included from Utilities
#available_port, #cookie_encode, #cookies_from_document, #cookies_from_file, #cookies_from_response, #exception_jail, #exclude_path?, #extract_domain, #follow_protocol?, #form_decode, #form_encode, #form_parse_request_body, #forms_from_document, #forms_from_response, #generate_token, #get_path, #html_decode, #html_encode, #include_path?, #links_from_document, #links_from_response, #normalize_url, #page_from_response, #page_from_url, #parse_query, #parse_set_cookie, #parse_url_vars, #path_in_domain?, #path_too_deep?, #port_available?, #rand_port, #redundant_path?, #remove_constants, #seed, #skip_page?, #skip_path?, #skip_resource?, #to_absolute, #uri_decode, #uri_encode, #uri_parse, #uri_parser, #url_sanitize
Methods included from UI::Output
#debug?, #debug_off, #debug_on, #disable_only_positives, #error_logfile, #flush_buffer, #log_error, #mute, #muted?, old_reset_output_options, #only_positives, #only_positives?, #print_bad, #print_debug, #print_debug_backtrace, #print_debug_pp, #print_error, #print_error_backtrace, #print_info, #print_line, #print_ok, #print_status, #print_verbose, #reroute_to_file, #reroute_to_file?, reset_output_options, #set_buffer_cap, #set_error_logfile, #uncap_buffer, #unmute, #verbose, #verbose?
Constructor Details
#initialize(opts = Options.instance) ⇒ Spider
Instantiates Spider class with user options.
# File 'lib/arachni/spider.rb', line 53

def initialize( opts = Options.instance )
    @opts = opts

    @mutex     = Mutex.new
    @sitemap   = {}
    @redirects = []
    @paths     = Set.new
    @visited   = Support::LookUp::HashSet.new

    @on_each_page_blocks     = []
    @on_each_response_blocks = []
    @on_complete_blocks      = []

    @pass_pages       = true
    @pending_requests = 0

    @retries  = {}
    @failures = []

    seed_paths
end
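As an instantiation sketch: the global Options singleton is used by default, but an options object can also be passed explicitly (the URL below is a placeholder):

# Use the global options singleton (the default).
spider = Arachni::Spider.new

# Or configure and pass the options explicitly.
opts = Arachni::Options.instance
opts.url = 'http://example.com/'
spider = Arachni::Spider.new( opts )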
Instance Attribute Details
#failures ⇒ Array<String> (readonly)
Returns URLs that elicited no response from the server. These are not determined by HTTP status codes; we're talking about network failures here.
# File 'lib/arachni/spider.rb', line 46

def failures
    @failures
end
#opts ⇒ Arachni::Options (readonly)
# File 'lib/arachni/spider.rb', line 38

def opts
    @opts
end
Instance Method Details
#done? ⇒ TrueClass, FalseClass
Returns `true` if the crawl is done, `false` otherwise.
# File 'lib/arachni/spider.rb', line 213

def done?
    idle? || limit_reached?
end
#fancy_sitemap ⇒ Hash<Integer, String>
Returns the list of crawled URLs with their HTTP codes.
# File 'lib/arachni/spider.rb', line 92

def fancy_sitemap
    @sitemap
end
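A hedged sketch of inspecting crawl results; it assumes each entry pairs a crawled URL with its HTTP status code, per the @return tag above (#sitemap, documented below, returns just the URLs):

spider.run

spider.fancy_sitemap.each do |url, code|
    puts "#{code} #{url}"   # e.g. "200 http://example.com/"
end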
#idle? ⇒ TrueClass, FalseClass
Returns `true` if the queue is empty and no requests are pending, `false` otherwise.
# File 'lib/arachni/spider.rb', line 219

def idle?
    synchronize do
        @paths.empty? && @pending_requests == 0
    end
end
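Together with #push, these predicates allow polling a crawl that was kicked off asynchronously; a rough sketch (the sleep interval is an arbitrary choice):

spider.push( [ 'http://example.com/' ] )  # spawns a crawl thread if none is running

sleep 0.1 until spider.done?              # poll until the queue drains
puts 'Crawl complete.'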
#on_complete(&block) ⇒ Object
# File 'lib/arachni/spider.rb', line 176

def on_complete( &block )
    fail ArgumentError, 'Block is mandatory!' if !block_given?
    @on_complete_blocks << block
    self
end
#on_each_page(&block) ⇒ Object
# File 'lib/arachni/spider.rb', line 160

def on_each_page( &block )
    fail ArgumentError, 'Block is mandatory!' if !block_given?
    @on_each_page_blocks << block
    self
end
#on_each_response(&block) ⇒ Object
# File 'lib/arachni/spider.rb', line 168

def on_each_response( &block )
    fail ArgumentError, 'Block is mandatory!' if !block_given?
    @on_each_response_blocks << block
    self
end
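Since each registrar returns self, hooks can be chained; a sketch assuming the block signatures implied by #run (Page objects for pages, raw response objects otherwise):

spider.on_each_page     { |page| puts "Page:     #{page.url}"  }.
       on_each_response { |res|  puts "Response: #{res.code}"  }.
       on_complete      {        puts 'Crawl finished.'        }

spider.run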
#paths ⇒ Set<String>
Returns the working paths, i.e. paths that haven't yet been followed. If you want to add more paths, use #push.
# File 'lib/arachni/spider.rb', line 82

def paths
    @paths
end
#pause ⇒ TrueClass
Pauses the system on a best-effort basis.
# File 'lib/arachni/spider.rb', line 226

def pause
    @pause = true
end
#paused? ⇒ Bool
Returns `true` if the system is paused, `false` otherwise.
# File 'lib/arachni/spider.rb', line 237

def paused?
    @pause ||= false
end
#push(paths, wakeup = true) ⇒ Bool
Pushes new paths for the crawler to follow; if the crawler has finished, it will be awakened when new paths are pushed.
The paths will be sanitized and normalized (cleaned up and converted to absolute ones).
# File 'lib/arachni/spider.rb', line 195

def push( paths, wakeup = true )
    return false if limit_reached?

    paths = dedup( paths )
    return false if paths.empty?

    synchronize do
        @paths |= paths
    end

    return true if !wakeup || running?

    Thread.abort_on_exception = true
    Thread.new { run }
    true
end
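A usage sketch; the paths below are placeholders, and Arrays are used to stay conservative about the accepted input types:

# Queue an absolute URL and wake the crawler if it has finished.
spider.push( [ 'http://example.com/admin/' ] )

# Queue relative paths (normalized to absolute) without waking the crawler.
spider.push( [ '/login', '/logout' ], false )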
#resume ⇒ TrueClass
Resumes the system.
# File 'lib/arachni/spider.rb', line 231

def resume
    @pause = false
    true
end
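Pausing is best-effort (requests already in flight complete first), so a usage sketch looks like:

spider.pause    # takes effect at the next checkpoint in the crawl loop
spider.paused?  # => true

spider.resume   # => true; crawling picks up where it left off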
#run(pass_pages_to_block = true, &block) ⇒ Array<String>
Runs the Spider and passes the requested object to the block.
# File 'lib/arachni/spider.rb', line 106

def run( pass_pages_to_block = true, &block )
    return if running? || limit_reached? || !@opts.crawl?

    synchronize { @running = true }

    # Options could have changed so reseed.
    seed_paths

    if block_given?
        pass_pages_to_block ? on_each_page( &block ) : on_each_response( &block )
    end

    while !done?
        wait_if_paused

        while !done? && (url = next_url)
            wait_if_paused

            visit( url ) do |res|
                obj = if pass_pages_to_block
                    Page.from_response( res, @opts )
                else
                    Parser.new( res, @opts )
                end

                if @on_each_response_blocks.any?
                    call_on_each_response_blocks( res )
                end

                if @on_each_page_blocks.any?
                    call_on_each_page_blocks obj.is_a?( Page ) ? obj : Page.from_response( res, @opts )
                end

                distribute( obj.paths )
            end
        end

        http.run
    end

    synchronize { @running = false }

    call_on_complete_blocks

    sitemap
end
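An end-to-end sketch: with pass_pages_to_block at its default the block receives Page objects, while passing false registers it via #on_each_response and yields the raw response objects instead; the return value is the sitemap:

# Receive parsed Page objects (the default).
sitemap = spider.run do |page|
    puts "Visited: #{page.url}"
end

# Or receive the raw responses.
spider.run( false ) { |response| puts response.code }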
#running? ⇒ Boolean
# File 'lib/arachni/spider.rb', line 154

def running?
    synchronize { !!@running }
end
#sitemap ⇒ Array<String>
Returns the list of crawled URLs.
# File 'lib/arachni/spider.rb', line 87

def sitemap
    @sitemap.keys
end
#url ⇒ Object
# File 'lib/arachni/spider.rb', line 75

def url
    @opts.url
end