Class: Creepycrawler::Site

Inherits:
Object
Defined in:
lib/creepy-crawler/site.rb

Overview

Handles the discovery of a site's pages through crawling.

Constant Summary

DEFAULT_OPTIONS =
{
  # whether to print crawling information
  :verbose => true,
  # whether to obey robots.txt
  :obey_robots => true,
  # maximum number of pages to crawl; nil attempts to crawl all pages
  :max_page_crawl => nil,
  # whether pages should be written to the Neo4j database; likely only disabled
  # for testing, or when only the broken_links data is needed
  :graph_to_neo4j => true
}
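
Any of these defaults can be overridden by passing an options hash to Site.new; #crawl merges the two via DEFAULT_OPTIONS.merge(@options). A minimal sketch, with a placeholder URL:

site = Creepycrawler::Site.new("http://example.com",
                               :verbose => false,        # suppress crawl output
                               :max_page_crawl => 50,    # stop after 50 pages
                               :graph_to_neo4j => false) # skip Neo4j writes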

Instance Attribute Summary

Instance Method Summary

Constructor Details

#initialize(url, options = {}) ⇒ Site

Returns a new instance of Site.



# File 'lib/creepy-crawler/site.rb', line 40

def initialize(url, options = {})
  # follow any redirects (open-uri with the open_uri_redirections gem's
  # :allow_redirections option) so we store the canonical URL
  response = open(url, :allow_redirections => :all)
  url_parsed = Addressable::URI.parse(response.base_uri)
  @domain = url_parsed.host
  @url = url_parsed.to_s
  @page_crawl_count = 0
  @options = options
  # add the initial url to our crawl queue
  @crawl_queue = [@url]
  @broken_links = []
  @visited_queue = []
  @graph = Creepycrawler::Graph.new
end
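
Because the constructor resolves redirects before parsing, #url and #domain reflect the final, post-redirect location. A hypothetical illustration, assuming http://example.com redirects to https://www.example.com/:

site = Creepycrawler::Site.new("http://example.com")
site.url    # => "https://www.example.com/"
site.domain # => "www.example.com"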

Instance Attribute Details

#broken_links ⇒ Object (readonly)

holds dead or broken links



# File 'lib/creepy-crawler/site.rb', line 20

def broken_links
  @broken_links
end

#crawl_queue ⇒ Object (readonly)

queue used to store discovered pages and crawl the site



# File 'lib/creepy-crawler/site.rb', line 12

def crawl_queue
  @crawl_queue
end

#domain ⇒ Object (readonly)

the site domain



# File 'lib/creepy-crawler/site.rb', line 6

def domain
  @domain
end

#options ⇒ Object (readonly)

hash of additional options passed to the constructor



# File 'lib/creepy-crawler/site.rb', line 10

def options
  @options
end

#page_crawl_count ⇒ Object (readonly)

number of pages crawled



# File 'lib/creepy-crawler/site.rb', line 16

def page_crawl_count
  @page_crawl_count
end

#root_node ⇒ Object (readonly)

holds the root node information



# File 'lib/creepy-crawler/site.rb', line 18

def root_node
  @root_node
end

#url ⇒ Object (readonly)

url the crawl began with



# File 'lib/creepy-crawler/site.rb', line 8

def url
  @url
end

#visited_queue ⇒ Object (readonly)

queue used to store visited pages



# File 'lib/creepy-crawler/site.rb', line 14

def visited_queue
  @visited_queue
end

Instance Method Details

#crawl ⇒ Object

crawl the site, following local links until the queue is empty or :max_page_crawl is reached; returns self



# File 'lib/creepy-crawler/site.rb', line 54

def crawl
  # merge default and passed-in options into one hash
  @options = DEFAULT_OPTIONS.merge(@options)

  # begin crawl loop
  loop do
    # break if we have crawled all pages, or reached :max_page_crawl
    break if @crawl_queue.empty? or (!@options[:max_page_crawl].nil? and @page_crawl_count >= @options[:max_page_crawl])
    
    begin
      # pull next page from crawl_queue and setup page
      page = Page.new(@crawl_queue.shift)
      
      # add url to visited queue to keep track of where we have been
      @visited_queue.push(page.url.to_s)
      
      # respect robots.txt
      if @options[:obey_robots] and page.robots_disallowed?
        puts "Not crawling #{page.url} per robots.txt request" if @options[:verbose]
        next
      end

      puts "Crawling and indexing: #{page.url}" if @options[:verbose]
      
      # retrieve page
      page.fetch
    
      current_page_node = @graph.add_page(page.url) if @options[:graph_to_neo4j] 
      # TODO: fix this. On the first run current_page_node is a hash; on subsequent runs it is an array of hashes
      @root_node = current_page_node if @page_crawl_count == 0 and @options[:graph_to_neo4j]
      
      # Loop through all links on the current page
      page.links.each do |link|

        # add to crawl queue - only push local links that are not already queued and have not been visited
        @crawl_queue.push(link) if local? link and !@crawl_queue.include? link and !@visited_queue.include? link.to_s

        # add link page to graph
        current_link_node = @graph.add_page(link) if @options[:graph_to_neo4j]

        # create a links_to relationship from the current page node to link node
        @graph.create_relationship("links_to", current_page_node, current_link_node) if @options[:graph_to_neo4j]
      end
    rescue => e
      puts "Exception thrown: #{e.message} - Skipping Page" if @options[:verbose]
      @broken_links.push(page.url)
      next
    end
    @page_crawl_count += 1
  end # end of loop

  return self
end
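
A sketch of a typical crawl, again with a placeholder URL; since #crawl returns self, the attribute readers can be called on its result:

site = Creepycrawler::Site.new("http://example.com",
                               :max_page_crawl => 10,
                               :graph_to_neo4j => false).crawl
site.page_crawl_count # => pages fetched, capped at 10 here
site.broken_links     # => URLs whose fetch raised an exception
site.visited_queue    # => every URL pulled from the crawl queue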

#local?(link) ⇒ Boolean

is the link local to the site's domain?

Returns:

  • (Boolean)


# File 'lib/creepy-crawler/site.rb', line 109

def local?(link)
  uri = Addressable::URI.parse(link)
  return true if uri.host == @domain
  return false
end
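
Because the check is an exact host comparison, subdomains and relative links (which parse to a nil host) count as non-local. A sketch, assuming site.domain is "www.example.com":

site.local?("https://www.example.com/about") # => true
site.local?("https://blog.example.com/")     # => false (host differs)
site.local?("/about")                        # => false (relative link, nil host)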