Class: Wmap::UrlCrawler
- Inherits: Object
- Includes: Utils
- Defined in: lib/wmap/url_crawler.rb, lib/wmap/url_crawler/adware_tag.rb
Overview
Web site crawler class
Direct Known Subclasses: AdwareTag
Defined Under Namespace
Classes: AdwareTag
Constant Summary
- Max_http_timeout = 8000
  Hard stop limit for the HTTP time-out, in milliseconds (8 seconds), to avoid a severe performance penalty on certain slow or misbehaving site(s).
- Crawl_timeout = 1200000
  Hard stop limit for the crawler time-out, in milliseconds (1200 seconds, or 20 minutes).
Constants included from Wmap::Utils::UrlMagic
Wmap::Utils::UrlMagic::User_agent
Constants included from Wmap::Utils::DomainRoot
Wmap::Utils::DomainRoot::File_ccsld, Wmap::Utils::DomainRoot::File_cctld, Wmap::Utils::DomainRoot::File_gtld, Wmap::Utils::DomainRoot::File_tld
Instance Attribute Summary
- #crawl_depth ⇒ Object
  Returns the value of attribute crawl_depth.
- #crawl_done ⇒ Object (readonly)
  Returns the value of attribute crawl_done.
- #crawl_page_limit ⇒ Object
  Returns the value of attribute crawl_page_limit.
- #crawl_start ⇒ Object (readonly)
  Returns the value of attribute crawl_start.
- #data_dir ⇒ Object
  Returns the value of attribute data_dir.
- #discovered_urls_by_crawler ⇒ Object (readonly)
  Returns the value of attribute discovered_urls_by_crawler.
- #http_timeout ⇒ Object
  Returns the value of attribute http_timeout.
- #max_parallel ⇒ Object
  Returns the value of attribute max_parallel.
- #signature_file ⇒ Object
  Returns the value of attribute signature_file.
- #tag_file ⇒ Object
  Returns the value of attribute tag_file.
- #tag_signatures ⇒ Object (readonly)
  Returns the value of attribute tag_signatures.
- #tag_store ⇒ Object (readonly)
  Returns the value of attribute tag_store.
- #user_agent ⇒ Object
  Returns the value of attribute user_agent.
- #verbose ⇒ Object
  Returns the value of attribute verbose.
- #visited_urls_by_crawler ⇒ Object (readonly)
  Returns the value of attribute visited_urls_by_crawler.
Instance Method Summary
- #crawl(url) ⇒ Object (also: #query)
  Crawl a single URL; for example, by crawling 'www.yahoo.com/' it could discover 'login.yahoo.com/'.
- #crawl_worker(url0) ⇒ Object
  The worker instance of the crawler that performs the actual crawl work.
- #crawl_workers(targets, num = @max_parallel) ⇒ Object (also: #crawls)
  Fast crawling by using a parallel fork manager to spawn a number of child processes at once; each child process keeps working on the target pool until all the work is done.
- #crawl_workers_on_file(file) ⇒ Object (also: #query_file, #crawl_file)
  Fast crawling method - build the target pool from the input file.
- #get_discovered_sites_by_crawler ⇒ Object (also: #get_sites)
  Method to retrieve the discovered site result.
- #initialize(params = {}) ⇒ UrlCrawler (constructor)
  Crawler instance default variables.
- #pre_crawl(url) ⇒ Object
  Pre-crawl profiler, used for network profiling to maximize the crawler performance.
- #print_discovered_urls_by_crawler ⇒ Object (also: #print)
  Method to print out the discovered URL result.
- #save_discovered_urls(file) ⇒ Object (also: #save)
  Method to save the URL discovery result.
Methods included from Utils
#cidr_2_ips, #file_2_hash, #file_2_list, #get_nameserver, #get_nameservers, #host_2_ip, #host_2_ips, #is_cidr?, #is_fqdn?, #is_ip?, #list_2_file, #reverse_dns_lookup, #sort_ips, #valid_dns_record?, #zone_transferable?
Methods included from Wmap::Utils::Logger
Methods included from Wmap::Utils::UrlMagic
#create_absolute_url_from_base, #create_absolute_url_from_context, #host_2_url, #is_site?, #is_ssl?, #is_url?, #landing_location, #make_absolute, #normalize_url, #open_page, #redirect_location, #response_code, #response_headers, #url_2_host, #url_2_path, #url_2_port, #url_2_site, #urls_on_same_domain?
Methods included from Wmap::Utils::DomainRoot
#get_domain_root, #get_domain_root_by_ccsld, #get_domain_root_by_cctld, #get_domain_root_by_tlds, #get_sub_domain, #is_domain_root?, #print_ccsld, #print_cctld, #print_gtld
Constructor Details
#initialize(params = {}) ⇒ UrlCrawler
Crawler instance default variables
# File 'lib/wmap/url_crawler.rb', line 32

def initialize (params = {})
  @verbose=params.fetch(:verbose, false)
  @data_dir=params.fetch(:data_dir, File.dirname(__FILE__)+'/../../data/')
  @http_timeout=params.fetch(:http_timeout, 5000)
  @crawl_depth=params.fetch(:crawl_depth, 4)
  @crawl_page_limit=params.fetch(:crawl_page_limit, 1000)
  @max_parallel=params.fetch(:max_parallel, 40)
  @user_agent=params.fetch(:user_agent, "OWASP WMAP Spider")
  # Discovered data store
  @discovered_urls_by_crawler=Hash.new
  @visited_urls_by_crawler=Hash.new
  @crawl_start=Hash.new
  @crawl_done=Hash.new
  Dir.mkdir(@data_dir) unless Dir.exist?(@data_dir)
  @log_dir=@data_dir + "/../logs/"
  Dir.mkdir(@log_dir) unless Dir.exist?(@log_dir)
  @log_file=@log_dir + "crawler.log"
end
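As a usage illustration (not taken from the library documentation), the constructor accepts any subset of the options above; omitted keys fall back to the defaults shown. The require path below is an assumption about how the gem is loaded.

# Assumes the wmap gem is installed and loadable via this require path.
require 'wmap'

# Override a few defaults; anything omitted keeps the values shown above.
crawler = Wmap::UrlCrawler.new(
  :verbose          => true,
  :crawl_depth      => 2,     # follow links at most 2 levels deep
  :crawl_page_limit => 500,   # stop a site after 500 pages
  :max_parallel     => 10     # at most 10 child crawler processes
)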
Instance Attribute Details
#crawl_depth ⇒ Object
Returns the value of attribute crawl_depth.
# File 'lib/wmap/url_crawler.rb', line 20

def crawl_depth
  @crawl_depth
end
#crawl_done ⇒ Object (readonly)
Returns the value of attribute crawl_done.
# File 'lib/wmap/url_crawler.rb', line 22

def crawl_done
  @crawl_done
end
#crawl_page_limit ⇒ Object
Returns the value of attribute crawl_page_limit.
# File 'lib/wmap/url_crawler.rb', line 20

def crawl_page_limit
  @crawl_page_limit
end
#crawl_start ⇒ Object (readonly)
Returns the value of attribute crawl_start.
# File 'lib/wmap/url_crawler.rb', line 22

def crawl_start
  @crawl_start
end
#data_dir ⇒ Object
Returns the value of attribute data_dir.
# File 'lib/wmap/url_crawler.rb', line 20

def data_dir
  @data_dir
end
#discovered_urls_by_crawler ⇒ Object (readonly)
Returns the value of attribute discovered_urls_by_crawler.
# File 'lib/wmap/url_crawler.rb', line 22

def discovered_urls_by_crawler
  @discovered_urls_by_crawler
end
#http_timeout ⇒ Object
Returns the value of attribute http_timeout.
# File 'lib/wmap/url_crawler.rb', line 20

def http_timeout
  @http_timeout
end
#max_parallel ⇒ Object
Returns the value of attribute max_parallel.
# File 'lib/wmap/url_crawler.rb', line 20

def max_parallel
  @max_parallel
end
#signature_file ⇒ Object
Returns the value of attribute signature_file.
# File 'lib/wmap/url_crawler/adware_tag.rb', line 15

def signature_file
  @signature_file
end
#tag_file ⇒ Object
Returns the value of attribute tag_file.
# File 'lib/wmap/url_crawler/adware_tag.rb', line 15

def tag_file
  @tag_file
end
#tag_signatures ⇒ Object (readonly)
Returns the value of attribute tag_signatures.
# File 'lib/wmap/url_crawler/adware_tag.rb', line 16

def tag_signatures
  @tag_signatures
end
#tag_store ⇒ Object (readonly)
Returns the value of attribute tag_store.
# File 'lib/wmap/url_crawler/adware_tag.rb', line 16

def tag_store
  @tag_store
end
#user_agent ⇒ Object
Returns the value of attribute user_agent.
# File 'lib/wmap/url_crawler.rb', line 20

def user_agent
  @user_agent
end
#verbose ⇒ Object
Returns the value of attribute verbose.
# File 'lib/wmap/url_crawler.rb', line 20

def verbose
  @verbose
end
#visited_urls_by_crawler ⇒ Object (readonly)
Returns the value of attribute visited_urls_by_crawler.
# File 'lib/wmap/url_crawler.rb', line 22

def visited_urls_by_crawler
  @visited_urls_by_crawler
end
Instance Method Details
#crawl(url) ⇒ Object Also known as: query
Crawl a single URL; for example, by crawling 'www.yahoo.com/' it could discover 'login.yahoo.com/'.
# File 'lib/wmap/url_crawler.rb', line 72

def crawl(url)
  puts "Start web crawling on #{url}"
  result=Array.new
  url=url.chomp.strip
  result.push(url_2_site(url))
  raise "Error! Invalid url format: #{url}" unless is_url?(url)
  # Add logic to profile the web server before crawling; this is used to optimize the crawling speed
  pre_crawl(url)
  status = Timeout::timeout(Crawl_timeout/1000) {
    result+=crawl_worker(url).keys
  }
  puts "Web crawling time-out on #{url}: #{status}" if @verbose
  return result
rescue => ee
  puts "Exception on method #{__method__} for URL #{url}: #{ee}"
  return result
end
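A minimal sketch of calling #crawl (or its alias #query) on a single seed URL; the seed value is illustrative.

crawler = Wmap::UrlCrawler.new
# Returns an array of URLs discovered under the seed site.
urls = crawler.crawl("http://www.example.com/")
puts urls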
#crawl_worker(url0) ⇒ Object
The worker instance of the crawler that performs the actual crawl work.
# File 'lib/wmap/url_crawler.rb', line 92

def crawl_worker(url0)
  puts "Please be aware that it may take a while to crawl #{url0}, depending on the site's responsiveness and the amount of contents."
  # Input URL sanity check first
  if is_url?(url0)
    host=url_2_host(url0)
    ip=host_2_ip(host).to_s
    raise "Invalid IP address: #{url0}" if ip.nil?
    port=url_2_port(url0).to_s
    raise "Invalid port number: #{url0}" if port.nil?
  else
    raise "Invalid URL: #{url0}. Please check it out with your browser again."
  end
  log_info=Hash.new
  log_info[1]="Start working on #{url0}"
  url_stores=Hash.new
  url_stores[url0]=true unless url_stores.key?(url0)
  @discovered_urls_by_crawler[url0]=true unless @discovered_urls_by_crawler.key?(url0)
  @crawl_start[url0]=true unless @crawl_start.key?(url0)
  # $discovered_urls[url0]=true unless $discovered_urls.key?(url0)
  @crawl_depth.times do
    url_stores.keys.each do |url|
      # 10/01/2013 add logic to avoid unnecessary crawling within the same child instance
      next if @visited_urls_by_crawler.key?(url)
      url_object = open_url(url)
      next if url_object == nil
      url = update_url_if_redirected(url, url_object)
      url_body = read_url(url)
      # Protection code - to avoid parsing failure on the empty or nil object
      next if url_body.nil? or url_body.empty?
      url_stores[url]=true unless url_stores.key?(url)
      @discovered_urls_by_crawler[url]=true unless @discovered_urls_by_crawler.key?(url)
      # $discovered_urls[url]=true unless $discovered_urls.key?(url)
      doc = Nokogiri::HTML(url_body)
      next if doc == nil
      if url_stores.size >= @crawl_page_limit
        #@visited_urls_by_crawler.merge!(url_stores)
        @discovered_urls_by_crawler.merge!(url_stores)
        # $discovered_urls.merge!(url_stores)
        puts "Finish web crawling the url: #{url0}"
        return url_stores
      end
      page_urls = find_urls_on_page(doc, url)
      page_urls.uniq!
      page_urls.map do |y|
        y=normalize_url(y)
        url_stores[y]=true unless url_stores.key?(y)
        @discovered_urls_by_crawler[y]=true unless @discovered_urls_by_crawler.key?(y)
        # $discovered_urls[y]=true unless $discovered_urls.key?(y)
      end
    end
  end
  puts "Finish web crawling on: #{url0}"
  log_info[2]="Finish working on: #{url0}"
  wlog(log_info, "UrlCrawler", @log_file)
  @crawl_done[url0]=true unless @crawl_done.key?(url0)
  return url_stores
rescue => ee
  puts "Exception on method #{__method__} for URL #{url0}: #{ee}" if @verbose
  log_info[3]="Exception on #{url0}"
  wlog(log_info,"UrlCrawler",@log_file)
  return url_stores
end
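For illustration only: the worker can also be driven directly when the time-out wrapper in #crawl is not wanted. It returns a Hash keyed by the discovered URLs; the seed below is an assumption.

crawler = Wmap::UrlCrawler.new(:crawl_depth => 2)
pages = crawler.crawl_worker("http://www.example.com/")   # Hash of url => true
puts "#{pages.size} URLs discovered"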
#crawl_workers(targets, num = @max_parallel) ⇒ Object Also known as: crawls
Fast crawling by using a parallel fork manager to spawn a number of child processes at once; each child process keeps working on the target pool until all the work is done.
# File 'lib/wmap/url_crawler.rb', line 157

def crawl_workers (targets,num=@max_parallel)
  raise "Input error - expecting targets in an array format: #{targets}" unless targets.kind_of? Array
  puts "Sanitize the URL seeds to eliminate the unnecessary duplication(s) ..." if @verbose
  #puts "This could be awhile depending on the list size. Please be patient ..."
  # 09/30/2013 Add additional logic to eliminate the duplicate target site(s) before the crawlers are invoked.
  targets -= ["", nil]
  uniq_sites=Hash.new
  targets.dup.map do |target|
    if is_url?(target)
      host=url_2_host(target)
      ip=host_2_ip(host).to_s
      next if ip.nil?
      port=url_2_port(target).to_s
      next if port.nil?
      site_key=ip+":"+port
      unless uniq_sites.key?(site_key)
        uniq_sites[site_key]=target
      end
    end
  end
  puts "Sanitization done! " if @verbose
  puts "Start the parallel engine on the normalized crawling list:\n #{targets} "
  puts "Maximum number of web crawling sessions allowed: #{num}" #if @verbose
  raise "Error: target list is empty!" if targets.size < 1
  Parallel.map(uniq_sites.values, :in_processes => num) { |target|
    puts "Working on #{target} ..." if @verbose
    crawl(target)
  }.dup.each do |process|
    puts "process.inspect: #{process}" if @verbose
    urls=process
    urls-=["",nil] unless urls.nil?
    if urls.nil?
      next
    elsif urls.empty?
      next
      #do nothing
    else
      urls.map do |url|
        url.strip!
        @discovered_urls_by_crawler[url]=true unless @discovered_urls_by_crawler.key?(url)
        #$discovered_urls[url]=true unless $discovered_urls.key?(url)
      end
    end
  end
  #return sites
  return @discovered_urls_by_crawler.keys
rescue Exception => ee
  puts "Exception on method #{__method__}: #{ee}" if @verbose
  return nil
end
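A usage sketch of parallel crawling over several seed sites; the seed URLs and worker count are illustrative assumptions.

crawler = Wmap::UrlCrawler.new(:verbose => true)
seeds = [
  "http://www.example.com/",
  "https://login.example.com/"
]
# Spawn up to 5 child processes; returns the accumulated list of discovered URLs.
urls = crawler.crawl_workers(seeds, 5)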
#crawl_workers_on_file(file) ⇒ Object Also known as: query_file, crawl_file
Fast crawling method - build the target pool from the input file
# File 'lib/wmap/url_crawler.rb', line 210

def crawl_workers_on_file (file)
  puts "Web crawl the list of targets from file: #{file}"
  targets=file_2_list(file)
  sites=crawl_workers(targets,num=@max_parallel)
  return sites
rescue => ee
  puts "Exception on method #{__method__}: #{ee}" if @verbose
  return nil
end
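A sketch assuming a plain-text seed file with one target URL per line; the file name is hypothetical.

crawler = Wmap::UrlCrawler.new
# 'seeds.txt' is a hypothetical file containing one URL per line.
sites = crawler.crawl_file("seeds.txt")   # alias of crawl_workers_on_file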
#get_discovered_sites_by_crawler ⇒ Object Also known as: get_sites
Method to retrieve the discovered site result.
# File 'lib/wmap/url_crawler.rb', line 328

def get_discovered_sites_by_crawler
  puts "Print summary report of discovered sites. " if @verbose
  puts "\nSummary Report of Discovered Sites from the Crawler:"
  sites = Hash.new
  @discovered_urls_by_crawler.keys.each do |url|
    site=url_2_site(url)
    sites[site]=true unless sites.key?(site)
  end
  sites.keys.map { |site| puts site }
  puts "Total: #{sites.size}"
  puts "End of the summary"
  return sites.keys
rescue => ee
  puts "Exception on method #{__method__}: #{ee}" if @verbose
  return nil
end
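After a crawl, the unique sites behind the discovered URLs can be summarized; a short sketch with an illustrative seed:

crawler = Wmap::UrlCrawler.new
crawler.crawl("http://www.example.com/")
sites = crawler.get_sites   # alias of get_discovered_sites_by_crawler
# Prints the summary report and returns the unique site list, e.g. ["http://www.example.com/"]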
#pre_crawl(url) ⇒ Object
Pre-crawl profiler, used for network profiling to maximize the crawler performance.
# File 'lib/wmap/url_crawler.rb', line 52

def pre_crawl(url)
  begin
    puts "Perform network profiling works on the web server before the web crawling: #{url}" if @verbose
    host=url_2_host(url)
    # Use the following formula to 'guess' the right http time-out threshold for the scanner
    nwk_to=Wmap::NetworkProfiler.new.profile(host).to_i
    if (1500 + Wmap::NetworkProfiler.new.profile(host)*2).to_i > Max_http_timeout
      @http_timeout = Max_http_timeout
    else
      @http_timeout = 1500 + nwk_to*2
    end
    puts "Done with the pre-scan works: reset @http_timeout to: #{@http_timeout} ms" if @verbose
  rescue Exception => ee
    puts "Exception on method #{__method__} for #{host}: #{ee}" if @verbose
    @http_timeout = Max_http_timeout
  end
end
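In other words, the profiler sets @http_timeout to a 1500 ms base plus twice the measured network latency, capped at Max_http_timeout. A small illustrative calculation (the measured value is hypothetical):

# Hypothetical round-trip measurement from Wmap::NetworkProfiler, in milliseconds.
nwk_to = 500
http_timeout = [1500 + nwk_to * 2, 8000].min   # => 2500 ms
# Any measurement above 3250 ms would leave the time-out capped at 8000 ms.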
#print_discovered_urls_by_crawler ⇒ Object Also known as: print
Method to print out the discovered URL result.
# File 'lib/wmap/url_crawler.rb', line 302

def print_discovered_urls_by_crawler
  puts "Print discovered url by the crawler. " if @verbose
  puts "\nSummary Report of Discovered URLs from the Crawler:"
  @discovered_urls_by_crawler.keys.each do |url|
    puts url
  end
  puts "Total: #{@discovered_urls_by_crawler.keys.size}"
  puts "End of the summary"
rescue => ee
  puts "Exception on method #{__method__}: #{ee}" if @verbose
  return nil
end
#save_discovered_urls(file) ⇒ Object Also known as: save
Method to save URL discovery result
# File 'lib/wmap/url_crawler.rb', line 317

def save_discovered_urls (file)
  puts "Save discovered urls by the crawler to file: #{file} "
  list_2_file(@discovered_urls_by_crawler.keys, file)
  puts "Done!"
rescue => ee
  puts "Exception on method #{__method__}: #{ee}" if @verbose
  return nil
end
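A closing sketch that ties the reporting methods together; the output file name and seed URL are hypothetical.

crawler = Wmap::UrlCrawler.new
crawler.crawl("http://www.example.com/")
crawler.print                         # alias of print_discovered_urls_by_crawler
crawler.save("discovered_urls.txt")   # alias of save_discovered_urls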