Class: Kudzu::Agent::UrlExtractor

Inherits:
Object
  • Object
show all
Defined in:
lib/kudzu/agent/url_extractor.rb

Defined Under Namespace

Classes: ForHTML, ForXML

Instance Method Summary collapse

Constructor Details

#initialize(config) ⇒ UrlExtractor

Returns a new instance of UrlExtractor.



4
5
6
# File 'lib/kudzu/agent/url_extractor.rb', line 4

# Creates an extractor that keeps the supplied configuration for later use.
#
# @param config [Object] configuration object; it is stored unchanged in
#   @config and passed to ForHTML.new / ForXML.new by #extract
def initialize(config)
  @config = config
end

Instance Method Details

#extract(response) ⇒ Object



8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
# File 'lib/kudzu/agent/url_extractor.rb', line 8

# Gathers the references contained in a response and cleans up their URLs.
#
# HTML responses are delegated to ForHTML and XML responses to ForXML; any
# other response yields no references. Every reference has its url passed
# through #sanitize and then #normalize (with the response's own url as the
# second argument); the reference objects are updated in place.
#
# @param response [#html?, #xml?, #url] the response to inspect
# @return [Array] references whose url is non-nil after processing, with
#   duplicates removed via Array#uniq
def extract(response)
  # Choose the extractor class first; nil means "unsupported content".
  extractor_class =
    if response.html?
      ForHTML
    elsif response.xml?
      ForXML
    end

  found = extractor_class ? extractor_class.new(@config).extract(response) : []

  found.each do |reference|
    # Two separate assignments: normalize reads back the sanitized value.
    reference.url = sanitize(reference.url)
    reference.url = normalize(reference.url, response.url)
  end

  # Drop references left without a url, then collapse duplicates.
  found.reject { |reference| reference.url.nil? }.uniq
end