Class: Magellan::Cartographer
- Inherits:
-
Object
- Object
- Magellan::Cartographer
- Includes:
- Observable
- Defined in:
- lib/magellan/cartographer.rb
Instance Method Summary
- #a_domain_we_care_about?(url) ⇒ Boolean
- #crawl ⇒ Object
- #i_am_not_too_deep?(depth) ⇒ Boolean
- #i_have_seen_this_url_before?(url) ⇒ Boolean
-
#initialize(settings) ⇒ Cartographer
constructor
A new instance of Cartographer.
- #recursive_explore(urls, depth) ⇒ Object
- #remove_javascript_and_print_warning(result) ⇒ Object
Constructor Details
#initialize(settings) ⇒ Cartographer
Returns a new instance of Cartographer.
8 9 10 11 12 13 14 15 |
# File 'lib/magellan/cartographer.rb', line 8

# Builds a cartographer from a settings hash.
#
# @param settings [Hash] crawl configuration, read by symbol key:
#   :origin_url       - stored as the URL that #crawl starts from
#   :ignored_urls     - seeds the list of already-known URLs (may be nil)
#   :domains          - each entry is parsed with URI.parse
#   :depth_to_explore - upper bound used by #i_am_not_too_deep?
#   :links_to_explore - handed to Explorer.new by #recursive_explore
#   :trace            - when truthy, #recursive_explore prints the URLs it explores
# @return [Cartographer] a new instance of Cartographer
def initialize(settings)
  @origin_url = settings[:origin_url]
  # Defensive copy: #recursive_explore appends every visited URL to
  # @known_urls, which must not grow the caller's :ignored_urls array.
  # Array() also turns a missing (nil) list into an empty one.
  @known_urls = Array(settings[:ignored_urls]).dup
  @domains = settings[:domains].map { |domain| URI.parse(domain) }
  @depth_to_explore = settings[:depth_to_explore]
  @links_we_want_to_explore = settings[:links_to_explore]
  @trace = settings[:trace]
end
Instance Method Details
#a_domain_we_care_about?(url) ⇒ Boolean
52 53 54 55 56 57 58 |
# File 'lib/magellan/cartographer.rb', line 52

# Tells whether +url+ belongs to one of the configured domains.
#
# The URL's host is compared for equality with each configured domain's
# host. When the URL cannot be parsed, the check falls back to a prefix
# comparison on the URL with a leading http:// or https:// removed.
#
# @param url [String] the URL to test
# @return [Boolean] true when at least one configured domain matches
def a_domain_we_care_about?(url)
  # Parse once instead of once per configured domain.
  host = URI.parse(url).host
  @domains.any? { |domain| domain.host == host }
rescue URI::Error
  # Only an optional http/https scheme at the very start is dropped.
  without_scheme = url.sub(%r{\Ahttps?://}, '')
  # NOTE(review): this is a prefix match, so a host that merely begins with
  # a configured host (e.g. "a.com.other.org" for "a.com") also matches.
  @domains.any? { |domain| without_scheme.start_with?(domain.host) }
end
#crawl ⇒ Object
17 18 19 |
# File 'lib/magellan/cartographer.rb', line 17

# Starts the crawl: explores the origin URL as a one-element batch at depth 1.
#
# @return [Object] whatever #recursive_explore returns
def crawl
  starting_points = [@origin_url]
  recursive_explore(starting_points, 1)
end
#i_am_not_too_deep?(depth) ⇒ Boolean
48 49 50 |
# File 'lib/magellan/cartographer.rb', line 48

# Checks a depth against the configured exploration limit (inclusive).
#
# @param depth [Integer] the depth about to be explored
# @return [Boolean] true when +depth+ does not exceed the limit
def i_am_not_too_deep?(depth)
  limit = @depth_to_explore
  depth <= limit
end
#i_have_seen_this_url_before?(url) ⇒ Boolean
44 45 46 |
# File 'lib/magellan/cartographer.rb', line 44

# Looks +url+ up in the list of known URLs, using the value returned by
# its #remove_fragment as the lookup key.
#
# @param url [#remove_fragment] the URL to look up
# @return [Boolean] true when the fragment-less URL is already known
def i_have_seen_this_url_before?(url)
  lookup_key = url.remove_fragment
  @known_urls.include?(lookup_key)
end
#recursive_explore(urls, depth) ⇒ Object
21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 |
# File 'lib/magellan/cartographer.rb', line 21

# Explores a batch of URLs, reports every result to the observers, then
# recurses one level deeper into the newly discovered links.
#
# For each result the observers are notified with the current time and the
# result, its url and destination_url (fragments removed) are recorded as
# known, and javascript: links are dropped from its linked resources.
# The links collected from all results are de-duplicated, restricted to the
# configured domains, stripped of already-known URLs, and explored in
# batches of at most 40.
#
# @param urls [Array<String>] the URLs to explore in this step
# @param depth [Integer] depth of this step; #crawl starts at 1
# @return [Array<Array>, nil] the batches explored at the next depth, or nil
#   when +depth+ is beyond the configured limit
def recursive_explore(urls, depth)
  return unless i_am_not_too_deep?(depth)

  $stdout.puts "exploring:\n#{urls.join("\n")}" if @trace
  results = Explorer.new(urls, @links_we_want_to_explore).explore

  results.each do |result|
    changed
    notify_observers(Time.now, result)
    @known_urls << result.url.remove_fragment
    @known_urls << result.destination_url.remove_fragment
    remove_javascript_and_print_warning result
  end

  # TODO: handle any other url parsing error
  candidates = results.map { |result| result.absolute_linked_resources }.flatten.uniq
  # Same check order as before: domain filter first, then the known-URL filter.
  candidates = candidates.reject do |url|
    !a_domain_we_care_about?(url) || i_have_seen_this_url_before?(url)
  end

  # each_slice is core Ruby; Enumerable#chunk takes a block, not a size, so
  # chunk(40) only worked through a project-level Array extension.
  # NOTE(review): assumes that extension split into consecutive groups of at
  # most 40 elements - confirm against its definition.
  candidates.each_slice(40).to_a.each do |batch|
    recursive_explore(batch, depth + 1)
  end
end
#remove_javascript_and_print_warning(result) ⇒ Object
60 61 62 63 64 65 66 67 |
# File 'lib/magellan/cartographer.rb', line 60

# Deletes, in place, every entry of result.linked_resources that is a
# javascript: pseudo-link. The match ignores case and leading whitespace.
#
# Despite the method name, nothing is printed: the warning is still
# commented out below.
#
# @param result [#linked_resources] an exploration result whose
#   linked_resources collection is mutated
# @return [Array] the same linked_resources collection, after deletion
def remove_javascript_and_print_warning(result)
  result.linked_resources.delete_if do |linked_resource|
    # lstrip: a link such as " javascript:foo()" is still a javascript link.
    starts_with_javascript = linked_resource.lstrip.downcase.start_with?('javascript:')
    # TODO: put this in the logger
    # $stderr.puts "Found obtrusive javascript: #{linked_resource} on page #{result.url}" if starts_with_javascript
    starts_with_javascript
  end
end