Class: CobwebCrawler

Inherits:
Object
Defined in:
lib/cobweb_crawler.rb

Overview

CobwebCrawler is a standalone crawler that includes a built-in statistics monitor served via Sinatra.
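
For orientation, a minimal usage sketch (this assumes the cobweb gem is installed and a Redis server is reachable at 127.0.0.1; the URL is illustrative):

require 'cobweb'

crawler = CobwebCrawler.new(:debug => false)
statistics = crawler.crawl("http://www.example.com")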

Instance Method Summary

Constructor Details

#initialize(options = {}) ⇒ CobwebCrawler

See the README for more information on the available options.
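
For illustration, a sketch using only options that appear in the source below (all values are placeholders):

crawler = CobwebCrawler.new(
  :redis_options  => {:host => "127.0.0.1"},      # where crawl state is stored
  :crawl_id       => "my-crawl",                  # defaults to an MD5 digest of the current time
  :internal_urls  => ["http://www.example.com/"], # seeds the "internal_urls" set in Redis
  :web_statistics => true,                        # starts the Sinatra statistics server
  :debug          => true
)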



# File 'lib/cobweb_crawler.rb', line 10

def initialize(options={})
  @options = options
  
  @statistic = {}
  
  # default to a local Redis instance unless one has been configured
  @options[:redis_options] = {:host => "127.0.0.1"} unless @options.has_key? :redis_options
  # use the supplied crawl_id, or generate one from the current time
  if @options.has_key? :crawl_id
    @crawl_id = @options[:crawl_id]
  else
    @crawl_id = Digest::MD5.hexdigest(DateTime.now.inspect.to_s)
    @options[:crawl_id] = @crawl_id
  end
  
  # namespace all Redis keys with the crawl_id so separate crawls don't collide
  @redis = Redis::Namespace.new("cobweb-#{Cobweb.version}-#{@crawl_id}", :redis => Redis.new(@options[:redis_options]))
  @options[:internal_urls] = [] if @options[:internal_urls].nil?
  @options[:internal_urls].each{|url| @redis.sadd("internal_urls", url)}

  @options[:crawl_linked_external] = false unless @options.has_key? :crawl_linked_external
  
  @debug = @options[:debug]
  
  @stats = Stats.new(@options.merge(:crawl_id => @crawl_id))
  # start the Sinatra-based statistics server if requested
  if @options[:web_statistics]
    Server.start(@options)
  end
  
  @cobweb = Cobweb.new(@options)
end

Instance Method Details

#crawl(base_url, crawl_options = {}, &block) ⇒ Object

Initiates a crawl starting at the base_url and applying the supplied options. Can also take a block, which is executed and passed the content hash and the statistics hash.
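
For example (a sketch; the only content key relied on here is :body, which the source below also uses):

crawler.crawl("http://www.example.com") do |content, statistics|
  # content is the hash returned by Cobweb#get; statistics comes from Stats#get_statistics
  puts "fetched a page of #{content[:body].to_s.length} bytes"
end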



# File 'lib/cobweb_crawler.rb', line 40

def crawl(base_url, crawl_options = {}, &block)
  @options[:base_url] = base_url unless @options.has_key? :base_url
  
  @options[:internal_urls] << base_url if @options[:internal_urls].empty?
  @redis.sadd("internal_urls", base_url) if @options[:internal_urls].empty?
  
  @crawl_options = crawl_options
  
  @redis.sadd("queued", base_url) unless base_url.nil? || @redis.sismember("crawled", base_url) || @redis.sismember("queued", base_url)
  crawl_counter = @redis.scard("crawled").to_i
  queue_counter = @redis.scard("queued").to_i

  begin
    @stats.start_crawl(@options)
    while queue_counter>0 && (@options[:crawl_limit].to_i == 0 || @options[:crawl_limit].to_i > crawl_counter)      
      thread = Thread.new do

        # take the next url from the queue; spop returns nil when the queue is empty
        url = @redis.spop "queued"
        queue_counter = 0 if url.nil?

        @options[:url] = url
        unless @redis.sismember("crawled", url.to_s)
          begin
            @stats.update_status("Requesting #{url}...")
            content = @cobweb.get(url) unless url.nil?
            if content.nil?
              queue_counter = queue_counter - 1 #@redis.scard("queued").to_i
            else
              @stats.update_status("Processing #{url}...")

              @redis.sadd "crawled", url.to_s
              @redis.incr "crawl-counter" 
            
              internal_links = ContentLinkParser.new(url, content[:body]).all_links(:valid_schemes => [:http, :https])

              # select only the internal links (eliminate external links before the more expensive queued/crawled lookups)
              cobweb_links = CobwebLinks.new(@options)

              internal_links = internal_links.select{|link| cobweb_links.internal?(link) || (@options[:crawl_linked_external] && cobweb_links.internal?(url.to_s))}

              all_internal_links = internal_links
              
              # reject the link if we've crawled it or queued it
              internal_links.reject!{|link| @redis.sismember("crawled", link)}
              internal_links.reject!{|link| @redis.sismember("queued", link)}
              internal_links.reject!{|link| link.nil? || link.empty?}
            
              internal_links.each do |link|
                puts "Added #{link.to_s} to queue" if @debug
                @redis.sadd "queued", link unless link.nil?
                # record the link under its parent url in the navigation hash
                children = @redis.hget("navigation", url)
                children = [] if children.nil?
                children << link
                @redis.hset "navigation", url, children
                queue_counter += 1
              end

              if @options[:store_refered_url]
                all_internal_links.each do |link|
                  @redis.sadd("inbound_links_#{Digest::MD5.hexdigest(link)}", url)
                end
              end
            
              crawl_counter = @redis.scard("crawled").to_i
              queue_counter = @redis.scard("queued").to_i
            
              @stats.update_statistics(content, crawl_counter, queue_counter)
              @stats.update_status("Completed #{url}.")
              yield content, @stats.get_statistics if block_given?
            end
          rescue => e
            puts "Error loading #{url}: #{e}"
            #puts "!!!!!!!!!!!! ERROR !!!!!!!!!!!!!!!!"
            #ap e
            #ap e.backtrace
          ensure
            crawl_counter = @redis.scard("crawled").to_i
            queue_counter = @redis.scard("queued").to_i
          end
        else
          puts "Already crawled #{@options[:url]}" if @debug
        end
      end
      thread.join
    end
  ensure
    @stats.end_crawl(@options)
  end
  @stats
end
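
Note that #crawl returns the Stats instance, so the final figures remain available once the crawl has completed. A brief sketch (the URL is illustrative):

stats = crawler.crawl("http://www.example.com")
final_statistics = stats.get_statistics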