Class: Arachni::Spider
- Includes:
- Module::Utilities, UI::Output
- Defined in:
- lib/arachni/spider.rb
Overview
Spider class
Crawls the URL in opts and grabs the HTML code and headers.
@author: Tasos “Zapotek” Laskos
<[email protected]>
<[email protected]>
@version: 0.2.3
Instance Attribute Summary collapse
- #opts ⇒ Options readonly
-
#redirects ⇒ Array
readonly
URLs that caused redirects.
-
#sitemap ⇒ Array
readonly
Discovered paths.
Instance Method Summary collapse
- #http ⇒ Object
-
#initialize(opts) ⇒ Spider
constructor
Constructor — instantiates the Spider class with user options.
- #pause! ⇒ Object
- #paused? ⇒ Boolean
- #redundant?(url) ⇒ Boolean
- #restricted_to_paths? ⇒ Boolean
- #resume! ⇒ Object
-
#run(parse = true, &block) ⇒ Arachni::Parser::Page
Runs the Spider and passes parsed page to the block.
- #skip?(url) ⇒ Boolean
- #wait_if_paused ⇒ Object
Methods included from Module::Utilities
#exception_jail, #get_path, #hash_keys_to_str, #normalize_url, #read_file, #seed, #uri_decode, #uri_encode, #uri_parse, #uri_parser, #url_sanitize
Methods included from UI::Output
#buffer, #debug!, #debug?, #flush_buffer, #mute!, #muted?, #only_positives!, #only_positives?, #print_bad, #print_debug, #print_debug_backtrace, #print_debug_pp, #print_error, #print_error_backtrace, #print_info, #print_line, #print_ok, #print_status, #print_verbose, #reroute_to_file, #reroute_to_file?, #uncap_buffer!, #unmute!, #verbose!, #verbose?
Constructor Details
#initialize(opts) ⇒ Spider
Constructor — instantiates the Spider class with user options.
58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 |
# File 'lib/arachni/spider.rb', line 58

#
# Instantiates the Spider with the user-supplied options.
#
# @param [Options] opts
#
def initialize( opts )
    @opts = opts

    @sitemap   = []
    @redirects = []
    @on_every_page_blocks = []

    @seed_url       = @opts.url.to_s
    @extend_paths   = @opts.extend_paths   || []
    @restrict_paths = @opts.restrict_paths || []

    # seed the work-queue; when the crawl is restricted to specific paths
    # the sitemap becomes that path list (shared by reference), otherwise
    # any user-supplied extra paths are appended
    @paths = [ @seed_url ]
    if restricted_to_paths?
        @sitemap = @restrict_paths
        @paths  |= @sitemap
    else
        @paths |= @extend_paths
    end

    # if we have no 'include' patterns create one that will match
    # everything, like '.*'
    @opts.include = [ Regexp.new( '.*' ) ] if @opts.include.empty?
end
Instance Attribute Details
#redirects ⇒ Array (readonly)
URLs that caused redirects
50 51 52 |
# File 'lib/arachni/spider.rb', line 50

#
# URLs that caused redirects.
#
# @return [Array]
#
def redirects
    @redirects
end
#sitemap ⇒ Array (readonly)
Discovered paths
43 44 45 |
# File 'lib/arachni/spider.rb', line 43

#
# Paths discovered during the crawl.
#
# @return [Array]
#
def sitemap
    @sitemap
end
Instance Method Details
#http ⇒ Object
180 181 182 |
# File 'lib/arachni/spider.rb', line 180

#
# Convenience accessor for the framework-wide HTTP interface.
#
# @return [Arachni::HTTP] the singleton HTTP instance
#
def http
    Arachni::HTTP.instance
end
#pause! ⇒ Object
218 219 220 |
# File 'lib/arachni/spider.rb', line 218

#
# Signals the crawler to pause; the crawl loop blocks in
# #wait_if_paused until #resume! is called.
#
def pause!
    @pause = true
end
#paused? ⇒ Boolean
226 227 228 229 |
# File 'lib/arachni/spider.rb', line 226

#
# @return [Bool] true while the crawler is paused, false otherwise
#
def paused?
    # lazily initialize the flag so the first call returns false
    @pause = false if @pause.nil?
    @pause
end
#redundant?(url) ⇒ Boolean
188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 |
# File 'lib/arachni/spider.rb', line 188

#
# Checks the URL against the configured redundancy rules.
#
# Each matching rule's counter is decremented per hit; once a rule's
# counter reaches 0 the URL is reported as redundant.
#
# @param  [String] url
#
# @return [Bool]   true if a matched rule's counter has been exhausted,
#                  false otherwise
#
def redundant?( url )
    @opts.redundant.each do |rule|
        next if ( url =~ rule['regexp'] ).nil?

        # counter exhausted -- drop the page
        if rule['count'] == 0
            print_verbose( 'Discarding redundant page: \'' + url + '\'' )
            return true
        end

        # report the hit (counter shown before decrementing, as before)
        print_info( 'Matched redundancy rule: ' +
            rule['regexp'].to_s + ' for page \'' + url + '\'' )
        print_info( 'Count-down: ' + rule['count'].to_s )

        rule['count'] -= 1
    end
    false
end
#restricted_to_paths? ⇒ Boolean
83 84 85 |
# File 'lib/arachni/spider.rb', line 83

#
# @return [Bool] true when the crawl is limited to a user supplied
#                list of paths, false otherwise
#
def restricted_to_paths?
    @restrict_paths.empty? ? false : true
end
#resume! ⇒ Object
222 223 224 |
# File 'lib/arachni/spider.rb', line 222

#
# Clears the pause flag, letting a crawl blocked in
# #wait_if_paused continue.
#
def resume!
    @pause = false
end
#run(parse = true, &block) ⇒ Arachni::Parser::Page
Runs the Spider and passes parsed page to the block
94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 |
# File 'lib/arachni/spider.rb', line 94 def run( parse = true, &block ) return if @opts.link_count_limit == 0 visited = [] opts = { :timeout => nil, :remove_id => true, :follow_location => true, :update_cookies => true } # we need a parser in order to have access to skip() in case # there's a redirect that shouldn't be followed seed_page = http.get( @seed_url, opts.merge( :async => false ) ).response print_status( "[HTTP: #{seed_page.code}] " + seed_page.effective_url ) parser = Parser.new( @opts, seed_page ) parser.url = @seed_url @paths = parser.paths | [@seed_url] while( !@paths.empty? ) while( !@paths.empty? && url = parser.to_absolute( @paths.pop ) ) next if skip?( url ) || visited.include?( url ) wait_if_paused visited << url http.get( url, opts ).on_complete { |res| next if parser.skip?( res.effective_url ) print_status( "[HTTP: #{res.code}] " + res.effective_url ) if parse page = Arachni::Parser::Page.from_http_response( res, @opts ) paths = page.paths check_url = page.url else c_parser = Parser.new( @opts, res ) paths = c_parser.text? ? c_parser.paths : [] check_url = c_parser.url end if !restricted_to_paths? @sitemap |= paths if !res.headers_hash['Location'].empty? @redirects << res.request.url end @paths |= @sitemap - visited end # call the block...if we have one if block exception_jail{ if !skip?( check_url ) block.call( parse ? page.clone : res ) else print_info( 'Matched skip rule.' ) end } end } # make sure we obey the link count limit and # return if we have exceeded it. if( @opts.link_count_limit && @opts.link_count_limit > 0 && visited.size >= @opts.link_count_limit ) http.run return @sitemap.uniq end end http.run end return @sitemap.uniq end |
#skip?(url) ⇒ Boolean
184 185 186 |
# File 'lib/arachni/spider.rb', line 184

#
# Decides whether the given URL should be skipped.
#
# Currently this only consults the redundancy rules.
#
# @param  [String] url
# @return [Bool]
#
def skip?( url )
    redundant?( url )
end
#wait_if_paused ⇒ Object
212 213 214 215 216 |
# File 'lib/arachni/spider.rb', line 212

#
# Blocks the calling thread for as long as the spider is paused,
# re-checking roughly once per second.
#
def wait_if_paused
    ::IO::select( nil, nil, nil, 1 ) while paused?
end