Class: CobwebCrawler
- Inherits: Object
- Hierarchy: Object → CobwebCrawler
- Defined in:
- lib/cobweb_crawler.rb
Overview
CobwebCrawler is a standalone crawler, it includes a built in statistics monitor using Sinatra.
Instance Method Summary collapse
-
#crawl(base_url, crawl_options = {}, &block) ⇒ Object
Initiates a crawl starting at the base_url and applying the options supplied.
-
#initialize(options = {}) ⇒ CobwebCrawler
constructor
See README for more information on options available.
Constructor Details
#initialize(options = {}) ⇒ CobwebCrawler
See README for more information on options available
10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 |
# File 'lib/cobweb_crawler.rb', line 10 def initialize(={}) @options = @statistic = {} @options[:redis_options] = {:host => "127.0.0.1"} unless @options.has_key? :redis_options if @options.has_key? :crawl_id @crawl_id = @options[:crawl_id] else @crawl_id = Digest::MD5.hexdigest(DateTime.now.inspect.to_s) @options[:crawl_id] = @crawl_id end @redis = Redis::Namespace.new("cobweb-#{Cobweb.version}-#{@crawl_id}", :redis => Redis.new(@options[:redis_options])) @options[:internal_urls] = [] if @options[:internal_urls].nil? @options[:internal_urls].map{|url| @redis.sadd("internal_urls", url)} @options[:crawl_linked_external] = false unless @options.has_key? :crawl_linked_external @debug = @options[:debug] @stats = Stats.new(@options.merge(:crawl_id => @crawl_id)) if @options[:web_statistics] Server.start(@options) end @cobweb = Cobweb.new(@options) end |
Instance Method Details
#crawl(base_url, crawl_options = {}, &block) ⇒ Object
Initiates a crawl starting at the base_url and applying the options supplied. Can also take a block that is executed and passed the content hash and statistic hash.
40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 |
# File 'lib/cobweb_crawler.rb', line 40 def crawl(base_url, = {}, &block) @options[:base_url] = base_url unless @options.has_key? :base_url @options[:internal_urls] << base_url if @options[:internal_urls].empty? @redis.sadd("internal_urls", base_url) if @options[:internal_urls].empty? @crawl_options = @redis.sadd("queued", base_url) unless base_url.nil? || @redis.sismember("crawled", base_url) || @redis.sismember("queued", base_url) crawl_counter = @redis.scard("crawled").to_i queue_counter = @redis.scard("queued").to_i begin @stats.start_crawl(@options) while queue_counter>0 && (@options[:crawl_limit].to_i == 0 || @options[:crawl_limit].to_i > crawl_counter) thread = Thread.new do url = @redis.spop "queued" queue_counter = 0 if url.nil? @options[:url] = url unless @redis.sismember("crawled", url.to_s) begin @stats.update_status("Requesting #{url}...") content = @cobweb.get(url) unless url.nil? if content.nil? queue_counter = queue_counter - 1 #@redis.scard("queued").to_i else @stats.update_status("Processing #{url}...") @redis.sadd "crawled", url.to_s @redis.incr "crawl-counter" internal_links = ContentLinkParser.new(url, content[:body]).all_links(:valid_schemes => [:http, :https]) # select the link if its internal (eliminate external before expensive lookups in queued and crawled) cobweb_links = CobwebLinks.new(@options) internal_links = internal_links.select{|link| cobweb_links.internal?(link) || (@options[:crawl_linked_external] && cobweb_links.internal?(url.to_s))} all_internal_links = internal_links # reject the link if we've crawled it or queued it internal_links.reject!{|link| @redis.sismember("crawled", link)} internal_links.reject!{|link| @redis.sismember("queued", link)} internal_links.reject!{|link| link.nil? || link.empty?} internal_links.each do |link| puts "Added #{link.to_s} to queue" if @debug @redis.sadd "queued", link unless link.nil? children = @redis.hget("navigation", url) children = [] if children.nil? 
children << link @redis.hset "navigation", url, children queue_counter += 1 end if @options[:store_refered_url] all_internal_links.each do |link| @redis.sadd("inbound_links_#{Digest::MD5.hexdigest(link)}", url) end end crawl_counter = @redis.scard("crawled").to_i queue_counter = @redis.scard("queued").to_i @stats.update_statistics(content, crawl_counter, queue_counter) @stats.update_status("Completed #{url}.") yield content, @stats.get_statistics if block_given? end rescue => e puts "Error loading #{url}: #{e}" #puts "!!!!!!!!!!!! ERROR !!!!!!!!!!!!!!!!" #ap e #ap e.backtrace ensure crawl_counter = @redis.scard("crawled").to_i queue_counter = @redis.scard("queued").to_i end else puts "Already crawled #{@options[:url]}" if @debug end end thread.join end ensure @stats.end_crawl(@options) end @stats end |