Class: Magellan::Cartographer

Inherits:
Object
  • Object
show all
Includes:
Observable
Defined in:
lib/magellan/cartographer.rb

Instance Method Summary collapse

Constructor Details

#initialize(settings) ⇒ Cartographer

Returns a new instance of Cartographer.



8
9
10
11
12
13
14
15
# File 'lib/magellan/cartographer.rb', line 8

# Builds a cartographer from +settings+, a collection indexed by symbol keys.
#
# Keys read:
#   :origin_url       - stored as the URL the crawl starts from
#   :ignored_urls     - stored as the initial list of already-known URLs
#   :domains          - collection of URL strings; each is parsed with URI.parse
#   :depth_to_explore - stored as the maximum crawl depth
#   :links_to_explore - stored and later handed to Explorer
#   :trace            - when truthy, progress is printed while crawling
#
# Raises whatever URI.parse raises when a domain string cannot be parsed.
def initialize(settings)
  domain_strings = settings[:domains]

  @origin_url = settings[:origin_url]
  # NOTE: kept by reference, not copied. recursive_explore appends to this
  # object, so the collection passed in as :ignored_urls grows during a crawl.
  @known_urls = settings[:ignored_urls]
  @domains = domain_strings.map { |domain_string| URI.parse(domain_string) }
  @depth_to_explore = settings[:depth_to_explore]
  @links_we_want_to_explore = settings[:links_to_explore]
  @trace = settings[:trace]
end

Instance Method Details

#a_domain_we_care_about?(url) ⇒ Boolean

Returns:

  • (Boolean)


52
53
54
55
56
57
58
# File 'lib/magellan/cartographer.rb', line 52

# Reports whether +url+ belongs to one of the domains held in @domains.
#
# The URL is parsed once and its host is compared with the host of every
# configured domain. When the URL cannot be parsed, a best-effort fallback
# strips any "http://" / "https://" from the raw string and checks whether the
# remainder begins with one of the configured hosts.
#
# url - the URL String to test.
#
# Returns true when a configured domain matches, false otherwise.
def a_domain_we_care_about?(url)
  host = URI.parse(url).host
  @domains.any? { |domain| host == domain.host }
rescue URI::Error
  # Only URI parsing failures land here; other errors are left to propagate
  # instead of being silently swallowed.
  without_scheme = url.gsub(%r{https?://}, '')
  # NOTE(review): this is a plain prefix test, so "example.com.other.org" is
  # accepted for the domain "example.com" -- confirm that is acceptable.
  @domains.any? do |domain|
    # A domain configured without a scheme parses to a nil host; skip it
    # rather than raising TypeError from start_with?.
    domain.host && without_scheme.start_with?(domain.host)
  end
end

#crawl ⇒ Object



17
18
19
# File 'lib/magellan/cartographer.rb', line 17

# Kicks off the crawl: explores the origin URL as a one-element batch at
# depth 1 and returns whatever recursive_explore returns.
def crawl
  starting_batch = [@origin_url]
  recursive_explore(starting_batch, 1)
end

#i_am_not_too_deep?(depth) ⇒ Boolean

Returns:

  • (Boolean)


48
49
50
# File 'lib/magellan/cartographer.rb', line 48

# Reports whether +depth+ is still within the configured exploration limit.
#
# depth - the current crawl depth, compared against @depth_to_explore.
#
# Returns true when depth is less than or equal to the limit.
def i_am_not_too_deep?(depth)
  deepest_allowed = @depth_to_explore
  depth <= deepest_allowed
end

#i_have_seen_this_url_before?(url) ⇒ Boolean

Returns:

  • (Boolean)


44
45
46
# File 'lib/magellan/cartographer.rb', line 44

# Reports whether +url+ is already recorded in @known_urls.
#
# The lookup key is url.remove_fragment (a helper defined outside this
# listing; presumably it drops the "#fragment" part -- confirm).
#
# Returns true when the key is present in @known_urls, false otherwise.
def i_have_seen_this_url_before?(url)
  lookup_key = url.remove_fragment
  @known_urls.include?(lookup_key)
end

#recursive_explore(urls, depth) ⇒ Object



21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
# File 'lib/magellan/cartographer.rb', line 21

# Explores one batch of URLs, reports every result to the observers, then
# recurses into the newly discovered links one level deeper.
#
# urls  - the batch of URLs handed to Explorer.
# depth - the depth of this batch; nothing happens once
#         i_am_not_too_deep?(depth) is false.
#
# Returns nil when the depth limit has been passed, otherwise the value of
# iterating over the chunked list of follow-up URLs.
def recursive_explore(urls,depth)
  return unless i_am_not_too_deep?(depth)

  $stdout.puts "exploring:\n#{urls.join("\n")}" if @trace

  explored = Explorer.new(urls,@links_we_want_to_explore).explore
  explored.each do |page|
    # Mark as changed first, then notify with the time of notification.
    changed
    notify_observers(Time.now, page)
    # Record both the requested URL and the URL it ended up at.
    @known_urls << page.url.remove_fragment
    @known_urls << page.destination_url.remove_fragment
    remove_javascript_and_print_warning page
  end

  discovered = explored.map(&:absolute_linked_resources).flatten.uniq
  #TODO: handle any other url parsing error
  in_scope = discovered.select { |candidate| a_domain_we_care_about?(candidate) }
  unvisited = in_scope.reject { |candidate| i_have_seen_this_url_before?(candidate) }

  # chunk(40) is not core Ruby when called with an Integer, so it is provided
  # elsewhere; presumably it yields batches of at most 40 URLs -- confirm.
  unvisited.chunk(40).each { |batch| recursive_explore(batch,depth+1) }
end

#remove_javascript_and_print_warning(result) ⇒ Object



60
61
62
63
64
65
66
67
# File 'lib/magellan/cartographer.rb', line 60

# Removes every "javascript:" pseudo-link from result.linked_resources,
# mutating that collection in place. The prefix check is case-insensitive.
#
# Despite the method name, no warning is printed at present: the message is
# still waiting to be routed to a logger (see the TODO below).
#
# result - an object whose linked_resources responds to delete_if and yields
#          strings.
#
# Returns the value of delete_if on result.linked_resources.
def remove_javascript_and_print_warning(result)
  result.linked_resources.delete_if do |linked_resource|
    #TODO: put this in the logger
    #$stderr.puts "Found obtrusive javascript: #{linked_resource} on page #{result.url}" if starts_with_javascript
    # start_with? is core String API, so this needs no String extension.
    linked_resource.downcase.start_with?("javascript:")
  end
end