Class: Creepycrawler::Site

Inherits:
Object
Defined in:
lib/creepy-crawler/site.rb

Overview

Handles the discovery of a site's pages through crawling.

Constant Summary

DEFAULT_OPTIONS =
{
  # whether to print crawling information
  :verbose => true,
  # whether to obey robots.txt
  :obey_robots => true,
  # maximum number of pages to crawl; nil attempts to crawl all pages
  :max_page_crawl => nil,
  # whether pages should be written to the Neo4j database; likely only disabled
  # for testing, or when only the broken_links data is needed
  :graph_to_neo4j => true
}
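
Any of these defaults can be overridden by passing an options hash to Site.new; #crawl merges the two via DEFAULT_OPTIONS.merge(@options). A minimal sketch, with a placeholder URL:

site = Creepycrawler::Site.new("http://example.com",
                               :verbose => false,        # suppress crawl output
                               :max_page_crawl => 50,    # stop after 50 pages
                               :graph_to_neo4j => false) # skip Neo4j writes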

Instance Attribute Summary

Instance Method Summary

Constructor Details

#initialize(url, options = {}) ⇒ Site

Returns a new instance of Site.



# File 'lib/creepy-crawler/site.rb', line 40

def initialize(url, options = {})
  # follow any redirects (open-uri with the open_uri_redirections gem's
  # :allow_redirections option) so we store the canonical URL
  response = open(url, :allow_redirections => :all)
  url_parsed = Addressable::URI.parse(response.base_uri)
  @domain = url_parsed.host
  @url = url_parsed.to_s
  @page_crawl_count = 0
  @options = options
  # add the initial url to our crawl queue
  @crawl_queue = [@url]
  @broken_links = []
  @visited_queue = []
  @graph = Creepycrawler::Graph.new
end
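
Because the constructor resolves redirects before parsing, #url and #domain reflect the final, post-redirect location. A hypothetical illustration, assuming http://example.com redirects to https://www.example.com/:

site = Creepycrawler::Site.new("http://example.com")
site.url    # => "https://www.example.com/"
site.domain # => "www.example.com"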

Instance Attribute Details

#broken_links ⇒ Object (readonly)

holds dead or broken links



# File 'lib/creepy-crawler/site.rb', line 20

def broken_links
  @broken_links
end

#crawl_queue ⇒ Object (readonly)

queue used to store discovered pages and crawl the site



# File 'lib/creepy-crawler/site.rb', line 12

def crawl_queue
  @crawl_queue
end

#domain ⇒ Object (readonly)

the site domain



# File 'lib/creepy-crawler/site.rb', line 6

def domain
  @domain
end

#options ⇒ Object (readonly)

hash of additional options passed to the constructor



# File 'lib/creepy-crawler/site.rb', line 10

def options
  @options
end

#page_crawl_count ⇒ Object (readonly)

number of pages crawled



# File 'lib/creepy-crawler/site.rb', line 16

def page_crawl_count
  @page_crawl_count
end

#root_node ⇒ Object (readonly)

holds the root node information



# File 'lib/creepy-crawler/site.rb', line 18

def root_node
  @root_node
end

#url ⇒ Object (readonly)

url the crawl began with



# File 'lib/creepy-crawler/site.rb', line 8

def url
  @url
end

#visited_queue ⇒ Object (readonly)

queue used to store visited pages



# File 'lib/creepy-crawler/site.rb', line 14

def visited_queue
  @visited_queue
end

Instance Method Details

#crawl ⇒ Object

crawl the site, following local links until the queue is empty or :max_page_crawl is reached; returns self



# File 'lib/creepy-crawler/site.rb', line 54

def crawl
  # merge default and passed-in options into one hash
  @options = DEFAULT_OPTIONS.merge(@options)

  # begin crawl loop
  loop do
    # break if we have crawled all pages, or reached :max_page_crawl
    break if @crawl_queue.empty? or (!@options[:max_page_crawl].nil? and @page_crawl_count >= @options[:max_page_crawl])
    
    begin
      # pull next page from crawl_queue and setup page
      page = Page.new(@crawl_queue.shift)
      
      # add url to visited queue to keep track of where we have been
      @visited_queue.push(page.url.to_s)
      
      # respect robots.txt
      if @options[:obey_robots] and page.robots_disallowed?
        puts "Not crawling #{page.url} per robots.txt request" if @options[:verbose]
        next
      end

      puts "Crawling and indexing: #{page.url}" if @options[:verbose]
      
      # retrieve page
      page.fetch
    
      current_page_node = @graph.add_page(page.url) if @options[:graph_to_neo4j] 
      # TODO: fix this. On the first run current_page_node is a hash; on subsequent runs it is an array of hashes
      @root_node = current_page_node if @page_crawl_count == 0 and @options[:graph_to_neo4j]
      
      # Loop through all links on the current page
      page.links.each do |link|

        # add to crawl queue - only push local links that are not already queued and have not been visited
        @crawl_queue.push(link) if local? link and !@crawl_queue.include? link and !@visited_queue.include? link.to_s

        # add link page to graph
        current_link_node = @graph.add_page(link) if @options[:graph_to_neo4j]

        # create a links_to relationship from the current page node to link node
        @graph.create_relationship("links_to", current_page_node, current_link_node) if @options[:graph_to_neo4j]
      end
    rescue => e
      puts "Exception thrown: #{e.message} - Skipping Page" if @options[:verbose]
      @broken_links.push(page.url)
      next
    end
    @page_crawl_count += 1
  end # end of loop

  return self
end
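
A sketch of a typical crawl, again with a placeholder URL; since #crawl returns self, the attribute readers can be called on its result:

site = Creepycrawler::Site.new("http://example.com",
                               :max_page_crawl => 10,
                               :graph_to_neo4j => false).crawl
site.page_crawl_count # => pages fetched, capped at 10 here
site.broken_links     # => URLs whose fetch raised an exception
site.visited_queue    # => every URL pulled from the crawl queue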

#local?(link) ⇒ Boolean

is the link local to the site's domain?

Returns:

  • (Boolean)


# File 'lib/creepy-crawler/site.rb', line 109

def local?(link)
  uri = Addressable::URI.parse(link)
  return true if uri.host == @domain
  return false
end
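
Because the check is an exact host comparison, subdomains and relative links (which parse to a nil host) count as non-local. A sketch, assuming site.domain is "www.example.com":

site.local?("https://www.example.com/about") # => true
site.local?("https://blog.example.com/")     # => false (host differs)
site.local?("/about")                        # => false (relative link, nil host)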