Class: Wgit::Crawler

Inherits:
Object
  • Object
show all
Includes:
Assertable
Defined in:
lib/wgit/crawler.rb

Overview

Crawler class provides a means of crawling web URLs. Note that any redirects will not be followed during crawling.

Constant Summary

Constants included from Assertable

Assertable::DEFAULT_DUCK_FAIL_MSG, Assertable::DEFAULT_TYPE_FAIL_MSG, Assertable::WRONG_METHOD_MSG

Instance Attribute Summary collapse

Instance Method Summary collapse

Methods included from Assertable

#assert_arr_types, #assert_respond_to, #assert_types

Constructor Details

#initialize(*urls) ⇒ Crawler



18
19
20
21
# File 'lib/wgit/crawler.rb', line 18

# Initializes the Crawler with the Urls to be crawled.
#
# @param urls [*Wgit::Url] zero or more urls to crawl later.
def initialize(*urls)
  # A splat parameter defaults to [] and is never nil, so the previous
  # `unless urls.nil?` guard was dead code; assign unconditionally
  # (which is what always happened anyway).
  self.urls = urls
  @docs = []
end

Instance Attribute Details

#docs ⇒ Object (readonly)

Returns the value of attribute docs.



16
17
18
# File 'lib/wgit/crawler.rb', line 16

# Reader for the documents collected by the most recent crawl.
#
# @return [Array<Wgit::Document>] the docs crawled so far (reset to []
#   at the start of each Crawler#crawl_urls call).
def docs
  @docs
end

#urls ⇒ Object

Returns the value of attribute urls.



16
17
18
# File 'lib/wgit/crawler.rb', line 16

# Reader for the urls this Crawler will crawl.
#
# @return [Array<Wgit::Url>, nil] the urls set via #initialize, #[] or
#   the urls= writer.
def urls
  @urls
end

Instance Method Details

#<<(url) ⇒ Object



32
33
34
# File 'lib/wgit/crawler.rb', line 32

# Appends a url to the list of urls to crawl by delegating to #add_url
# (defined elsewhere in this class).
#
# @param url [Wgit::Url] the url to append.
def <<(url)
    add_url(url)
end

#[](*urls) ⇒ Object



28
29
30
# File 'lib/wgit/crawler.rb', line 28

# Sets the urls this Crawler will crawl, replacing any existing ones.
#
# @param urls [*Wgit::Url] the new urls to crawl.
def [](*urls)
  # A splat parameter defaults to [] and is never nil, so the previous
  # `unless urls.nil?` guard was dead code; assign unconditionally
  # (which is what always happened anyway).
  self.urls = urls
end

#crawl_site(base_url = @urls.first, &block) ⇒ Object Also known as: crawl_r

Crawls an entire site by recursively going through its internal_links. Also yield(doc) for each crawled doc if a block is provided. A block is the only way to interact with the crawled docs. Returns a unique array of external urls collected from the site or nil if the base_url could not be crawled successfully.



65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
# File 'lib/wgit/crawler.rb', line 65

# Crawls an entire site by recursively following its internal links.
# Yields each crawled doc to the given block (the only way to interact
# with the crawled docs).
#
# @param base_url [Wgit::Url] the site's root url (defaults to the first
#   url given to this Crawler).
# @yield [Wgit::Document] each successfully crawled page.
# @return [Array<Wgit::Url>, nil] the unique external urls collected from
#   the site, or nil if base_url couldn't be crawled.
def crawl_site(base_url = @urls.first, &block)
  # Assertable (included by this class) provides #assert_types,
  # not #assert_type — the latter would raise NoMethodError.
  assert_types(base_url, Url)

  doc = crawl_url(base_url, &block)
  return nil if doc.nil?

  crawled_urls  = []
  external_urls = doc.external_links
  internal_urls = doc.internal_links

  return external_urls.uniq if internal_urls.empty?

  loop do
    # Array#uniq never returns nil, so the previous
    # `unless internal_urls.uniq.nil?` guard was dead code and cost an
    # extra O(n) pass; de-duplicate unconditionally.
    internal_urls.uniq!

    links = internal_urls - crawled_urls
    break if links.empty?

    links.each do |link|
      doc = crawl_url(Wgit::Url.concat(base_url.to_base, link), &block)
      crawled_urls << link
      next if doc.nil?

      internal_urls.concat(doc.internal_links)
      external_urls.concat(doc.external_links)
    end
  end

  external_urls.uniq
end

#crawl_url(url = @urls.first, &block) ⇒ Object

Also yield(doc) if a block is provided. The doc is passed to the block regardless of the crawl success so the doc.url can be used if needed.



51
52
53
54
55
56
57
58
# File 'lib/wgit/crawler.rb', line 51

# Crawls a single url and builds a Document from its markup. Yields the
# doc regardless of crawl success so doc.url is usable either way.
#
# @param url [Wgit::Url] the url to crawl (defaults to the first url
#   given to this Crawler).
# @yield [Wgit::Document] the crawled doc, even if the fetch failed.
# @return [Wgit::Document, nil] the doc, or nil if no markup was fetched.
def crawl_url(url = @urls.first, &block)
  # Assertable (included by this class) provides #assert_types,
  # not #assert_type — the latter would raise NoMethodError.
  assert_types(url, Url)
  markup = fetch(url)
  url.crawled = true
  doc = Wgit::Document.new(url, markup)
  block.call(doc) if block_given?
  doc.empty? ? nil : doc
end

#crawl_urls(urls = @urls, &block) ⇒ Object Also known as: crawl

Crawls individual urls, not entire sites. Returns the last crawled doc. Yields each doc to the provided block or adds each doc to @docs which can be accessed by Crawler#docs after the method returns.



40
41
42
43
44
45
46
# File 'lib/wgit/crawler.rb', line 40

# Crawls individual urls (not whole sites). Each crawled doc is yielded
# to the given block, or stored in @docs (readable via Crawler#docs)
# when no block is given.
#
# @param urls [Array<Wgit::Url>] the urls to crawl (defaults to @urls).
# @raise [RuntimeError] if urls is nil.
# @return [Wgit::Document, nil] the last crawled doc.
def crawl_urls(urls = @urls, &block)
  raise "No urls to crawl" unless urls

  @docs = []
  last_doc = nil
  Wgit::Utils.each(urls) { |url| last_doc = handle_crawl_block(url, &block) }
  last_doc || @docs.last
end