Class: Spiderz
- Inherits:
-
Object
- Object
- Spiderz
- Defined in:
- lib/spiderz.rb
Constant Summary collapse
- VERSION =
"0.1.1"
Instance Method Summary collapse
- #bookmark?(href) ⇒ Boolean
- #completed(&action) ⇒ Object
- #crawl(url) ⇒ Object
- #external?(href) ⇒ Boolean
- #failure(&action) ⇒ Object
-
#initialize(root) ⇒ Spiderz
constructor
root should be like http://www.google.com (i.e. with the http:// scheme included).
- #mail?(href) ⇒ Boolean
- #page_links(url) ⇒ Object
- #skip(&action) ⇒ Object
- #started(&action) ⇒ Object
- #success(&action) ⇒ Object
Constructor Details
#initialize(root) ⇒ Spiderz
root should be like http://www.google.com (i.e. with the http:// scheme included)
10 11 12 13 14 15 16 17 18 19 20 21 22 |
# File 'lib/spiderz.rb', line 10
# Builds a spider with empty crawl history and the default callbacks.
#
# @param root [String] prefix that is prepended to every path before it is
#   fetched, so it must carry the scheme (e.g. "http://www.google.com")
def initialize(root)
  @root = root
  @crawled = {}

  # Default reporters simply print progress to stdout.
  @started = Proc.new { |url| puts "Started crawling from url: #{url}" }
  @success = Proc.new { |url, _doc| puts "Successfully read url: #{url}" }
  @failure = Proc.new { |url| puts "failure to read/parse url: #{url}" }
  @completed = Proc.new { |_url| puts "Crawling complete" }

  # Default filter: drop anchors without an href, links leaving the site,
  # mail links and in-page bookmarks.
  @skip = Proc.new do |href|
    next true unless href

    external?(href) || mail?(href) || bookmark?(href)
  end
end
Instance Method Details
#bookmark?(href) ⇒ Boolean
40 41 42 |
# File 'lib/spiderz.rb', line 40
# Tells whether +href+ is an in-page fragment link such as "#top".
#
# @param href [String] raw value of an anchor's href attribute
# @return [Boolean] true only when the href itself begins with "#"
def bookmark?(href)
  # start_with? returns a real Boolean and, unlike the line-anchored /^#/,
  # is not fooled by a "#" that merely follows a newline inside the value.
  href.start_with?("#")
end
#completed(&action) ⇒ Object
52 53 54 |
# File 'lib/spiderz.rb', line 52
# Installs the callback that #crawl invokes, with the starting url, once the
# crawl queue has been drained.
#
# @yieldparam url [Object] the url originally passed to #crawl
# @return [Proc, nil] the block that was stored (nil when none is given)
def completed(&action)
  @completed = action
end
#crawl(url) ⇒ Object
24 25 26 27 28 29 30 31 32 33 34 |
# File 'lib/spiderz.rb', line 24
# Crawls outward from +url+, processing pages in first-in, first-out order.
#
# Fires the started callback, repeatedly takes the oldest queued url and
# appends whatever #page_links returns for it, then fires the completed
# callback once the queue is empty.
#
# @param url [String] path to start from
# @return [Object] whatever the completed callback returns
def crawl(url)
  @started.call(url)

  @to_crawl = [url]
  until @to_crawl.empty?
    current = @to_crawl.shift
    @to_crawl += page_links(current)
  end

  @completed.call(url)
end
#external?(href) ⇒ Boolean
36 37 38 |
# File 'lib/spiderz.rb', line 36
# Tells whether +href+ points away from the site being crawled.
#
# A link counts as external when it contains a "scheme://" marker and does
# not contain the configured root.
#
# @param href [String] raw value of an anchor's href attribute
# @return [Boolean]
def external?(href)
  return false unless href =~ %r{[a-z]+://}

  # Compare the root literally. Treating it as a regular expression let "."
  # match any character and made roots containing "(", "?" or "+" either
  # mis-match or raise RegexpError.
  !href.include?(@root)
end
#failure(&action) ⇒ Object
56 57 58 |
# File 'lib/spiderz.rb', line 56
# Installs the callback that #page_links invokes when fetching or parsing a
# page raises.
#
# @yieldparam url [String] the path that could not be read
# @return [Proc, nil] the block that was stored (nil when none is given)
def failure(&action)
  @failure = action
end
#mail?(href) ⇒ Boolean
44 45 46 |
# File 'lib/spiderz.rb', line 44
# Tells whether +href+ is a mail link.
#
# @param href [String] raw value of an anchor's href attribute
# @return [Boolean] true when the href uses the "mailto:" scheme
def mail?(href)
  # Only a leading "mailto:" scheme counts; a plain substring test also
  # rejected ordinary pages whose path merely contains the word "mailto".
  # Leading whitespace and letter case are ignored.
  href.lstrip.downcase.start_with?("mailto:")
end
#page_links(url) ⇒ Object
68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 |
# File 'lib/spiderz.rb', line 68
# Fetches one page and returns the links on it that still need crawling.
#
# The page is read from @root + url and parsed with Hpricot. A url that was
# already visited yields an empty list without being fetched again. If the
# fetch or parse raises, the failure callback is fired and an empty list is
# returned; otherwise the success callback receives the url and the parsed
# document.
#
# @param url [String] path appended to @root
# @return [Array] href values of the page's <a> elements, minus those already
#   crawled and those for which the skip callback returns a truthy value
def page_links(url)
  return [] if @crawled[url]
  @crawled[url] = true

  begin
    # Block form of open: the handle is released as soon as parsing finishes
    # instead of staying open for the rest of the crawl.
    # NOTE(review): assumes open is Kernel#open extended by open-uri - confirm.
    doc = open(@root + url) { |io| Hpricot(io) }
  rescue
    @failure.call(url)
    return []
  end

  @success.call(url, doc)

  # Anchors without an href produce nil here; the skip callback sees them.
  hrefs = (doc / "a").map { |anchor| anchor.attributes["href"] }
  hrefs.reject { |href| @crawled[href] || @skip.call(href) }
end
#skip(&action) ⇒ Object
64 65 66 |
# File 'lib/spiderz.rb', line 64
# Installs the filter that #page_links applies to every href it finds; a
# truthy result drops the link from the crawl.
#
# @yieldparam href [String, nil] href attribute value (nil when absent)
# @return [Proc, nil] the block that was stored (nil when none is given)
def skip(&action)
  @skip = action
end
#started(&action) ⇒ Object
48 49 50 |
# File 'lib/spiderz.rb', line 48
# Installs the callback that #crawl invokes, with the starting url, before
# any page is fetched.
#
# @yieldparam url [Object] the url passed to #crawl
# @return [Proc, nil] the block that was stored (nil when none is given)
def started(&action)
  @started = action
end
#success(&action) ⇒ Object
60 61 62 |
# File 'lib/spiderz.rb', line 60
# Installs the callback that #page_links invokes after a page has been
# fetched and parsed.
#
# @yieldparam url [String] the path that was read
# @yieldparam doc [Object] the parsed document returned by Hpricot
# @return [Proc, nil] the block that was stored (nil when none is given)
def success(&action)
  @success = action
end