Class: Kudzu::Agent::UrlFilterer
- Inherits: Object
- Inheritance hierarchy: Object → Kudzu::Agent::UrlFilterer
- Defined in:
- lib/kudzu/agent/url_filterer.rb
Instance Method Summary
- #allowed?(uri, base_uri, filter: nil) ⇒ Boolean
- #filter(refs, base_url) ⇒ Object
-
#initialize(config, robots = nil) ⇒ UrlFilterer
constructor
A new instance of UrlFilterer.
Constructor Details
#initialize(config, robots = nil) ⇒ UrlFilterer
Returns a new instance of UrlFilterer.
4 5 6 7 |
# File 'lib/kudzu/agent/url_filterer.rb', line 4
#
# Builds a filterer by remembering the two collaborators it is handed.
#
# @param config [Object] stored as @config; other methods of this class
#   call +find_filter+ on it
# @param robots [Object, nil] stored as @robots; optional
#   (NOTE(review): presumably used for robots.txt checks -- its use is not
#   visible in this excerpt)
def initialize(config, robots = nil)
  @config, @robots = config, robots
end
Instance Method Details
#allowed?(uri, base_uri, filter: nil) ⇒ Boolean
24 25 26 27 28 29 30 31 32 33 34 35 36 37 |
# File 'lib/kudzu/agent/url_filterer.rb', line 24
#
# Decides whether +uri+, discovered from +base_uri+, passes every check.
#
# String arguments are parsed with Addressable::URI.parse; anything else is
# used as given. When no filter is supplied, one is looked up through
# @config.find_filter(base_uri); if there is still no filter, the URI is
# allowed unconditionally.
#
# The individual checks run in a fixed order and stop at the first one that
# fails, so later checks are never consulted for an already rejected URI.
#
# @param uri [String, Object] candidate URI
# @param base_uri [String, Object] URI the candidate was found on
# @param filter [Object, nil] filter to apply instead of looking one up
# @return [Boolean] true when no filter applies; otherwise the result of the
#   chained checks
def allowed?(uri, base_uri, filter: nil)
  target = uri.is_a?(String) ? Addressable::URI.parse(uri) : uri
  origin = base_uri.is_a?(String) ? Addressable::URI.parse(base_uri) : base_uri

  active_filter = filter || @config.find_filter(origin)
  return true unless active_filter

  # Focus checks need both the candidate and the page it came from.
  within_focus = focused_host?(target, origin, active_filter) &&
                 focused_descendants?(target, origin, active_filter)

  # Remaining checks only look at the candidate itself.
  within_focus &&
    allowed_url?(target, active_filter) &&
    allowed_host?(target, active_filter) &&
    allowed_path?(target, active_filter) &&
    allowed_ext?(target, active_filter) &&
    allowed_by_robots?(target)
end
#filter(refs, base_url) ⇒ Object
9 10 11 12 13 14 15 16 17 18 19 20 21 22 |
# File 'lib/kudzu/agent/url_filterer.rb', line 9
#
# Keeps only the refs whose URI is accepted by #allowed?.
#
# The base URL is parsed once and its filter is looked up once through
# @config.find_filter, then reused for every ref. Each decision is logged at
# debug level via Kudzu.log.
#
# @param refs [Enumerable] objects responding to +uri+ and +url+
# @param base_url [String] URL the refs were found on
# @return [Object] the result of +refs.select+, i.e. the accepted refs
def filter(refs, base_url)
  base_uri = Addressable::URI.parse(base_url)
  filter = @config.find_filter(base_uri)

  refs.select do |ref|
    passed = allowed?(ref.uri, base_uri, filter: filter)
    message = passed ? "passed url: #{ref.url}" : "dropped url: #{ref.url}"
    Kudzu.log :debug, message
    passed
  end
end