Class: Grell::Crawler

Inherits:
Object
  • Object
show all
Defined in:
lib/grell/crawler.rb

Overview

This is the class that starts and controls the crawling

Instance Attribute Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(options = {}) ⇒ Crawler

Creates a crawler. The options hash accepts :logger, which may point to any object with the same interface as Logger in the standard library.



10
11
12
13
14
15
16
17
18
# File 'lib/grell/crawler.rb', line 10

# Builds a crawler and prepares its browser driver.
#
# @param options [Hash] configuration; +:logger+ may supply a Logger-compatible
#   object, and the whole hash is forwarded to CapybaraDriver.setup.
def initialize(options = {})
  # Fall back to a STDOUT logger when the caller did not provide one.
  Grell.logger = options[:logger] || Logger.new(STDOUT)
  @driver = CapybaraDriver.setup(options)
end

Instance Attribute Details

#collection ⇒ Object (readonly)

Returns the value of attribute collection.



6
7
8
# File 'lib/grell/crawler.rb', line 6

# Read-only accessor for the +@collection+ instance variable.
#
# @return [Object, nil] whatever is currently stored in +@collection+
#   (nil if it has not been assigned yet).
def collection
  @collection
end

Instance Method Details

#blacklist(list) ⇒ Object

Sets up a blacklist filter. Accepts a regexp, a string, or an array of either to be matched.



39
40
41
# File 'lib/grell/crawler.rb', line 39

# Stores the blacklist pattern in +@blacklist_regexp+.
#
# @param list [Regexp, String, Array<Regexp, String>] pattern(s) to combine.
# @return [Regexp] the combined pattern that was stored.
def blacklist(list)
  # Regexp.union escapes plain strings and joins all entries with alternation.
  combined = Regexp.union(list)
  @blacklist_regexp = combined
end

#crawl(site, block) ⇒ Object



56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
# File 'lib/grell/crawler.rb', line 56

# Visits a single page, optionally hands it to the caller's block, and then
# registers every link left on the page with the collection.
#
# @param site [Object] the page to visit; must respond to +url+, +navigate+,
#   +links+ and +id+.
# @param block [Proc, nil] optional callback run through +crawl_block+; when
#   that call yields +:retry+ the page is loaded again.
# @return [Object] the result of iterating over +site.links+.
def crawl(site, block)
  Grell.logger.info "Visiting #{site.url}, visited_links: #{@collection.visited_pages.size}, discovered #{@collection.discovered_pages.size}"

  # One full page load: navigate, filter the links, record any redirect.
  load_page = lambda do
    site.navigate
    filter!(site.links)
    add_redirect_url(site)
  end

  load_page.call

  # Without a block the callback is never consulted; with one, keep reloading
  # for as long as the callback asks for a retry.
  while block && crawl_block(block, site) == :retry
    Grell.logger.info "Retrying our visit to #{site.url}"
    load_page.call
  end

  site.links.each { |url| @collection.create_page(url, site.id) }
end

#quit ⇒ Object

Quits the poltergeist driver.



28
29
30
31
# File 'lib/grell/crawler.rb', line 28

# Logs the shutdown and then tells the driver to quit.
#
# @return [Object] whatever the driver's +quit+ returns.
def quit
  Grell.logger.info("GRELL is quitting the poltergeist driver")
  @driver.quit
end

#restart ⇒ Object

Restarts the PhantomJS process without modifying the state of visited and discovered pages.



21
22
23
24
25
# File 'lib/grell/crawler.rb', line 21

# Restarts the driver, logging a message before and after.
# Nothing here touches +@collection+, so crawl state is left as it was.
#
# @return [Object] the result of the final logger call.
def restart
  Grell.logger.info("GRELL is restarting")
  @driver.restart
  Grell.logger.info("GRELL has restarted")
end

#start_crawling(url, options = {}, &block) ⇒ Object

Main method, it starts crawling on the given URL and calls a block for each of the pages found.



44
45
46
47
48
49
50
51
52
53
54
# File 'lib/grell/crawler.rb', line 44

# Entry point of a crawl: seeds a fresh page collection with +url+ and keeps
# crawling the next page until no discovered pages remain.
#
# @param url [String] the address the crawl starts from.
# @param options [Hash] +:add_match_block+ overrides the matcher handed to
#   PageCollection; otherwise +default_add_match+ is used.
# @param block [Proc] optional callback forwarded to +crawl+ for each page.
# @return [Object] the result of the final logger call.
def start_crawling(url, options = {}, &block)
  Grell.logger.info("GRELL Started crawling")

  matcher = options[:add_match_block] || default_add_match
  @collection = PageCollection.new(matcher)
  # The seed page has no parent, hence the nil.
  @collection.create_page(url, nil)

  crawl(@collection.next_page, block) until @collection.discovered_pages.empty?

  Grell.logger.info("GRELL finished crawling")
end

#whitelist(list) ⇒ Object

Sets up a whitelist filter. Accepts a regexp, a string, or an array of either to be matched.



34
35
36
# File 'lib/grell/crawler.rb', line 34

# Stores the whitelist pattern in +@whitelist_regexp+.
#
# @param list [Regexp, String, Array<Regexp, String>] pattern(s) to combine.
# @return [Regexp] the combined pattern that was stored.
def whitelist(list)
  # Regexp.union escapes plain strings and joins all entries with alternation.
  combined = Regexp.union(list)
  @whitelist_regexp = combined
end