Class: CobwebCrawler
- Inherits: Object
- Defined in: lib/cobweb_crawler.rb
Overview
CobwebCrawler is a standalone crawler. It includes a built-in statistics monitor using Sinatra.
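For example, a minimal crawl might look like the sketch below. It assumes the cobweb gem is installed and a Redis server is reachable on 127.0.0.1 (the constructor's default); :crawl_limit and :debug are options used by the source shown later on this page, and the content hash's :url key is assumed per the gem's README (:body appears in the crawl source below).

require 'cobweb'

crawler = CobwebCrawler.new(:crawl_limit => 100, :debug => false)

# The block is yielded the content hash and the statistics hash for each page.
statistics = crawler.crawl("http://example.com") do |content, stats|
  puts "Fetched #{content[:url]} (#{content[:body].to_s.length} bytes)"
end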
Instance Method Summary
- #crawl(base_url, crawl_options = {}, &block) ⇒ Object
  Initiates a crawl starting at the base_url and applying the options supplied.
- #initialize(options = {}) ⇒ CobwebCrawler (constructor)
  See the README for more information on the available options.
Constructor Details
#initialize(options = {}) ⇒ CobwebCrawler
See the README for more information on the available options.
# File 'lib/cobweb_crawler.rb', line 10

def initialize(options={})
  @options = options
  @statistic = {}

  @options[:redis_options] = {:host => "127.0.0.1"} unless @options.has_key? :redis_options

  if @options.has_key? :crawl_id
    @crawl_id = @options[:crawl_id]
  else
    @crawl_id = Digest::MD5.hexdigest(DateTime.now.inspect.to_s)
    @options[:crawl_id] = @crawl_id
  end

  @redis = Redis::Namespace.new("cobweb-#{Cobweb.version}-#{@crawl_id}", :redis => Redis.new(@options[:redis_options]))
  @options[:internal_urls] = [] if @options[:internal_urls].nil?
  @options[:internal_urls].map{|url| @redis.sadd("internal_urls", url)}

  @debug = @options[:debug]

  @stats = Stats.new(@options.merge(:crawl_id => @crawl_id))

  if @options[:web_statistics]
    Server.start(@options)
  end

  @cobweb = Cobweb.new(@options)
end
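Note that all crawl state is kept in Redis under a namespace built from the gem version and the crawl id, so separate crawls do not collide, and reusing a crawl id resumes the same keyspace. A sketch of pointing the crawler at a non-default Redis instance (the host and crawl id here are illustrative, not defaults):

crawler = CobwebCrawler.new(
  :redis_options => {:host => "redis.internal.example", :port => 6379}, # illustrative host
  :crawl_id => "site-audit" # illustrative id; keys land under cobweb-<version>-site-audit
)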
Instance Method Details
#crawl(base_url, crawl_options = {}, &block) ⇒ Object
Initiates a crawl starting at the base_url and applying the options supplied. Can also take a block, which is executed and passed the content hash and the statistics hash.
# File 'lib/cobweb_crawler.rb', line 37

def crawl(base_url, crawl_options = {}, &block)
  @options[:base_url] = base_url unless @options.has_key? :base_url

  @options[:internal_urls] << base_url if @options[:internal_urls].empty?
  @redis.sadd("internal_urls", base_url) if @options[:internal_urls].empty?

  @crawl_options = crawl_options

  @redis.sadd("queued", base_url) unless base_url.nil? || @redis.sismember("crawled", base_url) || @redis.sismember("queued", base_url)
  crawl_counter = @redis.scard("crawled").to_i
  queue_counter = @redis.scard("queued").to_i

  begin
    @stats.start_crawl(@options)

    while queue_counter > 0 && (@options[:crawl_limit].to_i == 0 || @options[:crawl_limit].to_i > crawl_counter)
      thread = Thread.new do

        url = @redis.spop "queued"
        @options[:url] = url
        unless @redis.sismember("crawled", url.to_s)
          begin
            @stats.update_status("Requesting #{url}...")
            content = @cobweb.get(url)

            if content.nil?
              queue_counter = queue_counter - 1 #@redis.scard("queued").to_i
            else
              @stats.update_status("Processing #{url}...")
              @redis.sadd "crawled", url.to_s
              @redis.incr "crawl-counter"

              internal_links = ContentLinkParser.new(url, content[:body]).all_links(:valid_schemes => [:http, :https])

              # select the link if its internal (eliminate external before expensive lookups in queued and crawled)
              cobweb_links = CobwebLinks.new(@options)
              internal_links = internal_links.select{|link| cobweb_links.internal?(link)}

              # reject the link if we've crawled it or queued it
              internal_links.reject!{|link| @redis.sismember("crawled", link)}
              internal_links.reject!{|link| @redis.sismember("queued", link)}
              internal_links.reject!{|link| link.nil? || link.empty?}

              internal_links.each do |link|
                puts "Added #{link.to_s} to queue" if @debug
                @redis.sadd "queued", link unless link.nil?
                children = @redis.hget("navigation", url)
                children = [] if children.nil?
                children << link
                @redis.hset "navigation", url, children
                queue_counter += 1
              end

              crawl_counter = crawl_counter + 1 #@redis.scard("crawled").to_i
              queue_counter = queue_counter - 1 #@redis.scard("queued").to_i

              @stats.update_statistics(content, crawl_counter, queue_counter)
              @stats.update_status("Completed #{url}.")

              yield content, @stats.get_statistics if block_given?
            end
          rescue => e
            puts "!!!!!!!!!!!! ERROR !!!!!!!!!!!!!!!!"
            ap e
            ap e.backtrace
          end
        else
          puts "Already crawled #{@options[:url]}" if @debug
        end
      end
      thread.join
    end
  ensure
    @stats.end_crawl(@options)
  end
  @stats.get_statistics
end
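Because the "queued" and "crawled" sets and the parent-to-children "navigation" hash all live in the namespaced Redis keyspace, they can be inspected after (or during) a crawl from a separate connection. A sketch, assuming the same crawl id the crawler was given; note that hget returns the children array in the string form written by hset above.

require 'cobweb'
require 'redis'
require 'redis-namespace'

crawl_id = "site-audit" # must match the crawler's :crawl_id
redis = Redis::Namespace.new("cobweb-#{Cobweb.version}-#{crawl_id}",
                             :redis => Redis.new(:host => "127.0.0.1"))

redis.smembers("crawled").each do |url|
  puts "#{url} -> #{redis.hget("navigation", url)}"
end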