Class: Grell::Crawler

Inherits:
Object
  • Object
show all
Defined in:
lib/grell/crawler.rb

Overview

This is the class that starts and controls the crawling

Instance Attribute Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(options = {}) ⇒ Crawler

Creates a crawler. The options hash accepts :logger, which must point to an object with the same interface as Logger in the standard library; when it is omitted, a Logger writing to STDOUT is used.



10
11
12
13
14
15
16
17
18
# File 'lib/grell/crawler.rb', line 10

# Builds a crawler and prepares its browser driver.
#
# Note that the logger is stored on the Grell module (Grell.logger=), not on
# this instance, so constructing a crawler replaces the logger Grell uses.
#
# @param options [Hash] configuration; the whole hash is also forwarded to
#   CapybaraDriver.setup
# @option options [#info] :logger object installed as Grell.logger; when
#   absent (or falsy) a Logger writing to STDOUT is installed instead
def initialize(options = {})
  Grell.logger = options[:logger] || Logger.new(STDOUT)

  @driver = CapybaraDriver.setup(options)
end

Instance Attribute Details

#collection ⇒ Object (readonly)

Returns the value of attribute collection.



6
7
8
# File 'lib/grell/crawler.rb', line 6

# Reader for the page collection used by the crawl.
#
# The instance variable is assigned in #start_crawling (a new PageCollection
# per call); #initialize does not set it, so this returns nil before the
# first crawl has been started.
#
# @return [PageCollection, nil] the collection of the current or most recent crawl
def collection
  @collection
end

Instance Method Details

#blacklist(list) ⇒ Object

Sets up a blacklist filter. Accepts a regexp, a string, or an array of either to be matched.



33
34
35
# File 'lib/grell/crawler.rb', line 33

# Stores the blacklist pattern for this crawler.
#
# The argument is handed to Regexp.union, so plain strings are matched
# literally (they are escaped) and several patterns are OR-ed together.
# NOTE(review): presumably consulted by filter! when links are pruned -
# that method is not visible here, confirm.
#
# @param list [Regexp, String, Array<Regexp, String>] pattern(s) to block
# @return [Regexp] the combined pattern kept in @blacklist_regexp
def blacklist(list)
  pattern = Regexp.union(list)
  @blacklist_regexp = pattern
end

#crawl(site, block) ⇒ Object



50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
# File 'lib/grell/crawler.rb', line 50

# Visits a single page and registers the links found on it.
#
# Sequence: log the visit, load the page, then (when a block was supplied)
# pass the page to crawl_block. While crawl_block returns :retry the page is
# loaded again; there is no upper bound on the number of retries, so the
# caller's block decides when to stop. Finally every entry of site.links is
# added to @collection with site.id as its second argument.
#
# filter!, add_redirect_url and crawl_block are defined elsewhere in the
# class. NOTE(review): filter! presumably prunes site.links in place using
# the whitelist/blacklist patterns - confirm against its definition.
#
# @param site [Object] page object responding to url, id, links and navigate
# @param block [Proc, nil] user callback; nil skips the callback and retries
# @return [Object] whatever site.links.each returns
def crawl(site, block)
  Grell.logger.info "Visiting #{site.url}, visited_links: #{@collection.visited_pages.size}, discovered #{@collection.discovered_pages.size}"

  # One full load of the page; reused for the first visit and for each retry.
  load_page = lambda do
    site.navigate
    filter!(site.links)
    add_redirect_url(site)
  end
  load_page.call

  if block # The user of this block can send us a :retry to retry accessing the page
    while crawl_block(block, site) == :retry
      Grell.logger.info "Retrying our visit to #{site.url}"
      load_page.call
    end
  end

  site.links.each { |url| @collection.create_page(url, site.id) }
end

#restart ⇒ Object

Restarts the PhantomJS process without modifying the state of visited and discovered pages.



21
22
23
24
25
# File 'lib/grell/crawler.rb', line 21

# Restarts the underlying driver, logging before and after.
#
# Only @driver is touched; @collection is not read or written here, so the
# crawl state is left as it was. If @driver.restart raises, the second log
# line is not emitted.
#
# @return [Object] the result of the final Grell.logger.info call
def restart
  Grell.logger.info("GRELL is restarting")
  @driver.restart
  Grell.logger.info("GRELL has restarted")
end

#start_crawling(url, options = {}, &block) ⇒ Object

Main method, it starts crawling on the given URL and calls a block for each of the pages found.



38
39
40
41
42
43
44
45
46
47
48
# File 'lib/grell/crawler.rb', line 38

# Entry point of a crawl: seeds a fresh page collection with +url+ and keeps
# crawling until no discovered pages are left.
#
# Every call replaces @collection with a new PageCollection, so state from a
# previous crawl is discarded. The seed page is created with nil as its
# second argument. Each page returned by @collection.next_page is passed to
# #crawl together with the caller's block (nil when no block is given).
#
# @param url [String] address the crawl starts from
# @param options [Hash]
# @option options [Object] :add_match_block passed to PageCollection.new;
#   default_add_match (defined elsewhere) is used when it is missing
# @param block [Proc] optional callback forwarded to #crawl for each page
# @return [Object] the result of the final Grell.logger.info call
def start_crawling(url, options = {}, &block)
  Grell.logger.info "GRELL Started crawling"

  matcher = options[:add_match_block] || default_add_match
  @collection = PageCollection.new(matcher)
  @collection.create_page(url, nil)

  crawl(@collection.next_page, block) until @collection.discovered_pages.empty?

  Grell.logger.info "GRELL finished crawling"
end

#whitelist(list) ⇒ Object

Sets up a whitelist filter. Accepts a regexp, a string, or an array of either to be matched.



28
29
30
# File 'lib/grell/crawler.rb', line 28

# Stores the whitelist pattern for this crawler.
#
# The argument is handed to Regexp.union, so plain strings are matched
# literally (they are escaped) and several patterns are OR-ed together.
# NOTE(review): presumably consulted by filter! when links are pruned -
# that method is not visible here, confirm.
#
# @param list [Regexp, String, Array<Regexp, String>] pattern(s) to allow
# @return [Regexp] the combined pattern kept in @whitelist_regexp
def whitelist(list)
  pattern = Regexp.union(list)
  @whitelist_regexp = pattern
end