Class: Crawler::Webcrawler

Inherits: Object
Includes: Observable
Defined in: lib/crawler/webcrawler.rb

Instance Attribute Summary

Instance Method Summary

Constructor Details

#initialize(options = {}) ⇒ Webcrawler

Accepts the following options:

  • timeout – Time limit for the crawl operation, in seconds; if it is exceeded, a Timeout::Error is raised. Defaults to infinity.
  • external – Whether links pointing to other hosts should be followed. Defaults to false.
  • exclude – Array of strings; any URI whose path contains one of them is skipped. Defaults to [].



# File 'lib/crawler/webcrawler.rb', line 21

def initialize(options={})
  @crawled = Set.new
  @queue = []
  @options = {
    :timeout => 1.0/0, # Infinity
    :external => false,
    :exclude => []
  }.merge(options)
end
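
For instance, a crawler confined to its starting host, with a 30-second budget and login pages excluded, could be built as follows (a usage sketch; the require path and option values are assumptions, not part of this file):

require 'crawler/webcrawler'

# 30-second crawl budget, stay on the starting host,
# skip any URI whose path contains "login".
crawler = Crawler::Webcrawler.new(
  :timeout => 30,
  :external => false,
  :exclude  => ["login"]
)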

Instance Attribute Details

#crawled ⇒ Object

Set of all URIs which have been crawled



# File 'lib/crawler/webcrawler.rb', line 13

def crawled
  @crawled
end

#options ⇒ Object

Hash of crawler options (see #initialize)



# File 'lib/crawler/webcrawler.rb', line 17

def options
  @options
end

#queue ⇒ Object

Queue of URIs to be crawled. An Array used as a FIFO queue: new URIs are appended to the back and the next URI is taken from the front with shift.



# File 'lib/crawler/webcrawler.rb', line 15

def queue
  @queue
end
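
Once a crawl has finished, these attributes can be inspected directly; a brief sketch (the URL is a placeholder):

crawler = Crawler::Webcrawler.new(:timeout => 10)
crawler.crawl(URI.parse("http://example.com/"))

crawler.crawled.include?(URI.parse("http://example.com/")) # => true
crawler.queue                                              # => [] once the queue has drained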

Instance Method Details

#crawl(start_uri) ⇒ Object

Given a URI object, the crawler explores every linked page using a breadth-first search: links discovered on each page are appended to the queue and visited in the order they were found.



# File 'lib/crawler/webcrawler.rb', line 33

def crawl(start_uri)
  start_uri = start_uri.normalize
  @queue << start_uri

  timeout(@options[:timeout]) {
    while(uri = @queue.shift)

      Net::HTTP.start(uri.host, uri.port) do |http|

        head = http.head(uri.path)
        next if head.content_type != "text/html" # If the page retrieved is not an HTML document, we'll choke on it anyway. Skip it.

        resp = http.get(uri.path)

        # Notify any attached observers (see Observable) of the fetched page.
        changed
        notify_observers(resp, uri)

        # Collect the href of every <a> tag; hrefs that fail to resolve
        # against the current URI become nil and are compacted away.
        html = Nokogiri.parse(resp.body)
        a_tags = html.search("a")
        @queue = @queue + a_tags.collect do |t|
          begin
            next_uri = uri + t.attribute("href").to_s.strip
          rescue
            nil
          end
        end
        @queue = @queue.compact.uniq
        # Drop URIs already crawled, the current page, non-HTTP URIs,
        # external hosts (unless :external is set), and excluded paths.
        @queue = @queue.reject {|u|
          @crawled.include?(u) or
          u == uri or
          !(u.kind_of?(URI::HTTP)) or
          (u.host != uri.host and !@options[:external]) or
          (@options[:exclude].any? { |excl| u.path.include?(excl)})
        }
      end
      @crawled << uri
    end
  }
end
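
Because Webcrawler includes Observable, every fetched page is delivered to observers through notify_observers(resp, uri), so each observer receives the Net::HTTP response and the URI it came from. A minimal sketch of wiring one up (PageLogger and the URL are illustrative, not part of the gem):

require 'crawler/webcrawler'
require 'uri'

# Observable invokes #update on each observer with the arguments
# given to notify_observers, i.e. (response, uri).
class PageLogger
  def update(response, uri)
    puts "#{response.code} #{uri}"
  end
end

crawler = Crawler::Webcrawler.new(:timeout => 60)
crawler.add_observer(PageLogger.new)
crawler.crawl(URI.parse("http://example.com/"))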