Class: Kudzu::Agent::UrlExtractor::ForHTML

Inherits:
Object
  • Object
show all
Defined in:
lib/kudzu/agent/url_extractor.rb

Instance Method Summary collapse

Constructor Details

#initialize(config) ⇒ ForHTML

Returns a new instance of ForHTML.



49
50
51
# File 'lib/kudzu/agent/url_extractor.rb', line 49

# Creates an HTML URL extractor bound to a configuration object.
#
# The configuration is kept in @config; #extract reads its
# +respect_nofollow+ flag and calls +find_filter+ on it.
#
# @param config [Object] configuration stored as-is, without copying
def initialize(config)
  @config = config
end

Instance Method Details

#extract(response) ⇒ Object



53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
# File 'lib/kudzu/agent/url_extractor.rb', line 53

# Collects the references found in a parsed HTML response.
#
# Steps, in order:
# 1. Returns an empty list when the configuration respects nofollow and
#    the document is flagged by +nofollow?+.
# 2. If the configuration has a filter for the response URL, narrows the
#    document to the filter's +allow_element+ selectors and/or removes
#    the +deny_element+ matches from a duplicate of the document.
# 3. Concatenates the references from +from_html+ and +from_meta+ and
#    drops those whose +url+ is nil or empty.
#
# @param response [#parsed_doc, #url] the fetched response
# @return [Array] references with a non-nil, non-empty url
def extract(response)
  document = response.parsed_doc
  return [] if @config.respect_nofollow && nofollow?(document)

  filter = @config.find_filter(response.url)
  if filter
    # Restrict the search scope to the allowed elements only.
    document = document.search(*Array(filter.allow_element)) if filter.allow_element

    if filter.deny_element
      # Work on a duplicate before removing the denied elements.
      document = document.dup
      document.search(*Array(filter.deny_element)).remove
    end
  end

  (from_html(document) + from_meta(document)).reject do |candidate|
    candidate.url.nil? || candidate.url.empty?
  end
end