Method: ContentLinkParser#all_links

Defined in:
lib/content_link_parser.rb

Returns an array of all absolutized links. Specify :valid_schemes in options to limit the result to certain schemes (defaults to :http and :https). Also filters out links with repeating folders (i.e. when the crawler has got into a link loop).



44
45
46
47
48
49
50
51
52
53
54
55
# File 'lib/content_link_parser.rb', line 44

# Returns every link found in the content as an absolute URL string.
#
# Each raw link from +link_data+ is resolved first against +@base_url+ and
# then against +@url+ through +UriHelper.join_no_fragment+ (presumably this
# also strips the fragment -- confirm against UriHelper). Links that resolve
# to nil are dropped, and links that look like a crawler loop (repeating
# folders) are filtered out.
#
# @param options [Hash] optional settings; never modified by this method
# @option options [Array<Symbol, String>, Symbol, String] :valid_schemes
#   schemes to keep, defaults to [:http, :https]
# @return [Array<String>] unique absolute links whose scheme is allowed
def all_links(options = {})
  # Read the default with fetch instead of writing it back into the hash, so
  # the caller's options are left untouched and a frozen hash is accepted.
  # Schemes are normalised to strings so symbols and strings both work.
  valid_schemes = Array(options.fetch(:valid_schemes, [:http, :https])).map(&:to_s)

  links = link_data.values.flatten.uniq
  links = links.map { |link| UriHelper.join_no_fragment(@url, UriHelper.join_no_fragment(@base_url, link)) }
               .compact
               .map(&:to_s)

  # Loop guard 1: a folder immediately followed by itself, e.g. /a/a/.
  links = links.reject { |link| link =~ %r{/([^/]+?)/\1/} }
  # Loop guard 2: a pair of folders that shows up again later, e.g. /a/b/c/a/b.
  links = links.reject { |link| link =~ %r{([^/]+?)/([^/]+?)/.*?\1/\2} }

  # The scheme is the text before the first ':'. For an empty string split
  # returns [], so first is nil and the link is simply excluded.
  links.select { |link| valid_schemes.include?(link.split(':', 2).first) }
end