Class: Creepycrawler::Site
- Inherits: Object
- Defined in: lib/creepy-crawler/site.rb
Overview
Object that handles the discovery of a site's pages through crawling.
Constant Summary
- DEFAULT_OPTIONS =
{
  # whether to print crawling information
  :verbose => true,
  # whether to obey robots.txt
  :obey_robots => true,
  # maximum number of pages to crawl; a value of nil will attempt to crawl all pages
  :max_page_crawl => nil,
  # should pages be written to the database. Likely only used for testing,
  # but may be used if you only wanted to get at the broken_links data
  :graph_to_neo4j => true
}
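A minimal usage sketch (the start URL and option values below are illustrative, and the URL must be reachable since the constructor fetches it) showing how these defaults can be overridden when constructing a Site:

# Illustrative only: crawl at most 50 pages quietly, without writing to Neo4j.
site = Creepycrawler::Site.new("http://example.com",
                               :verbose        => false,
                               :max_page_crawl => 50,
                               :graph_to_neo4j => false)

Any options not supplied keep the defaults above, since #crawl merges the passed-in hash over DEFAULT_OPTIONS.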
Instance Attribute Summary

- #broken_links ⇒ Object (readonly)
  Holds dead or broken links.
- #crawl_queue ⇒ Object (readonly)
  Queue used to store discovered pages and crawl the site.
- #domain ⇒ Object (readonly)
  The site domain.
- #options ⇒ Object (readonly)
  Hash of additional options passed in.
- #page_crawl_count ⇒ Object (readonly)
  Number of pages crawled.
- #root_node ⇒ Object (readonly)
  Holds the root node information.
- #url ⇒ Object (readonly)
  The URL the crawl began with.
- #visited_queue ⇒ Object (readonly)
  Queue used to store visited pages.
Instance Method Summary

- #crawl ⇒ Object
- #initialize(url, options = {}) ⇒ Site (constructor)
  A new instance of Site.
- #local?(link) ⇒ Boolean
  Is the link local to the site?
Constructor Details
#initialize(url, options = {}) ⇒ Site
Returns a new instance of Site.
# File 'lib/creepy-crawler/site.rb', line 40

def initialize(url, options = {})
  response = open(url, :allow_redirections => :all)
  url_parsed = Addressable::URI.parse(response.base_uri)

  @domain = url_parsed.host
  @url = url_parsed.to_s
  @page_crawl_count = 0
  @options = options
  # add the initial url to our crawl queue
  @crawl_queue = [@url]
  @broken_links = []
  @visited_queue = []
  @graph = Creepycrawler::Graph.new
end
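Because the constructor follows redirects (open-uri with :allow_redirections => :all), #url and #domain reflect the final URI rather than the one passed in. A hypothetical example, assuming http://example.com redirects to http://www.example.com/:

site = Creepycrawler::Site.new("http://example.com")
site.url     # => "http://www.example.com/"  (assumed redirect target)
site.domain  # => "www.example.com"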
Instance Attribute Details
#broken_links ⇒ Object (readonly)
Holds dead or broken links.

# File 'lib/creepy-crawler/site.rb', line 20

def broken_links
  @broken_links
end

#crawl_queue ⇒ Object (readonly)
Queue used to store discovered pages and crawl the site.

# File 'lib/creepy-crawler/site.rb', line 12

def crawl_queue
  @crawl_queue
end

#domain ⇒ Object (readonly)
The site domain.

# File 'lib/creepy-crawler/site.rb', line 6

def domain
  @domain
end

#options ⇒ Object (readonly)
Hash of additional options passed in.

# File 'lib/creepy-crawler/site.rb', line 10

def options
  @options
end

#page_crawl_count ⇒ Object (readonly)
Number of pages crawled.

# File 'lib/creepy-crawler/site.rb', line 16

def page_crawl_count
  @page_crawl_count
end

#root_node ⇒ Object (readonly)
Holds the root node information.

# File 'lib/creepy-crawler/site.rb', line 18

def root_node
  @root_node
end

#url ⇒ Object (readonly)
The URL the crawl began with.

# File 'lib/creepy-crawler/site.rb', line 8

def url
  @url
end

#visited_queue ⇒ Object (readonly)
Queue used to store visited pages.

# File 'lib/creepy-crawler/site.rb', line 14

def visited_queue
  @visited_queue
end
Instance Method Details
#crawl ⇒ Object
# File 'lib/creepy-crawler/site.rb', line 54

def crawl
  # merge default and passed in options into one hash
  @options = DEFAULT_OPTIONS.merge(@options)

  # begin crawl loop
  loop do
    # break if we have crawled all pages, or reached :max_page_crawl
    break if @crawl_queue.empty? or (!@options[:max_page_crawl].nil? and @page_crawl_count >= @options[:max_page_crawl])

    begin
      # pull next page from crawl_queue and set up page
      page = Page.new(@crawl_queue.shift)
      # add url to visited queue to keep track of where we have been
      @visited_queue.push(page.url.to_s)

      # respect robots.txt
      if @options[:obey_robots] and page.robots_disallowed?
        puts "Not crawling #{page.url} per Robots.txt request" if @options[:verbose]
        next
      end

      puts "Crawling and indexing: #{page.url}" if @options[:verbose]

      # retrieve page
      page.fetch

      current_page_node = @graph.add_page(page.url) if @options[:graph_to_neo4j]
      # todo: fix this. on first run current_page_node is a hash, on subsequent runs an array of hashes
      @root_node = current_page_node if @page_crawl_count == 0 and @options[:graph_to_neo4j]

      # loop through all links on the current page
      page.links.each do |link|
        # add to crawl queue - only push local links, links not already in the queue,
        # and links that haven't been visited
        @crawl_queue.push(link) if local? link and !@crawl_queue.include? link and !@visited_queue.include? link.to_s
        # add link page to graph
        current_link_node = @graph.add_page(link) if @options[:graph_to_neo4j]
        # create a links_to relationship from the current page node to the link node
        @graph.create_relationship("links_to", current_page_node, current_link_node) if @options[:graph_to_neo4j]
      end
    rescue => e
      puts "Exception thrown: #{e.message} - Skipping Page" if @options[:verbose]
      @broken_links.push(page.url)
      next
    end

    @page_crawl_count += 1
  end

  return self
end
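A sketch of a typical crawl, assuming a reachable start URL; graphing is disabled here so only the crawl counters and broken links are collected (leave :graph_to_neo4j enabled, and have Neo4j configured, if you want the page graph written):

site = Creepycrawler::Site.new("http://example.com",
                               :max_page_crawl => 10,
                               :graph_to_neo4j => false)
site.crawl  # returns the Site instance itself
puts "Crawled #{site.page_crawl_count} pages"
puts "Broken links: #{site.broken_links.inspect}"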
#local?(link) ⇒ Boolean
Is the link local to the site?
# File 'lib/creepy-crawler/site.rb', line 109

def local?(link)
  uri = Addressable::URI.parse(link)
  return true if uri.host == @domain
  return false
end
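An illustrative call, assuming the Site resolved to the domain "example.com". Note the check is an exact host comparison, so a different subdomain is not considered local:

site = Creepycrawler::Site.new("http://example.com")
site.local?("http://example.com/about")  # => true
site.local?("http://www.example.com/")   # => false (different host)
site.local?("http://other.org/")         # => false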