Class: Kudzu::Agent::UrlFilterer

Inherits:
Object
  • Object
show all
Defined in:
lib/kudzu/agent/url_filterer.rb

Instance Method Summary collapse

Constructor Details

#initialize(config, robots = nil) ⇒ UrlFilterer

Returns a new instance of UrlFilterer.


4
5
6
7
# File 'lib/kudzu/agent/url_filterer.rb', line 4

# Builds a filterer around a crawl configuration.
#
# @param config [Object] stored as @config; the other methods of this class
#   call +find_filter(base_uri)+ on it
# @param robots [Object, nil] stored as @robots; optional, defaults to nil
#   (presumably the robots.txt checker -- its use is not visible here, confirm)
def initialize(config, robots = nil)
  @config, @robots = config, robots
end

Instance Method Details

#allowed?(uri, base_uri, filter: nil) ⇒ Boolean

Returns:

  • (Boolean)

24
25
26
27
28
29
30
31
32
33
34
35
36
37
# File 'lib/kudzu/agent/url_filterer.rb', line 24

# Decides whether +uri+, found while processing +base_uri+, passes the
# configured URL filter.
#
# String arguments are parsed with Addressable::URI.parse; any other object is
# used as given. When +filter+ is not supplied it is looked up through
# @config.find_filter(base_uri). If no filter is available the URI is accepted
# right away and none of the individual checks run.
#
# @param uri [String, Object] candidate URI (String or already-parsed URI)
# @param base_uri [String, Object] URI the candidate was found on
# @param filter [Object, nil] filter to apply; looked up from the config when nil
# @return [Boolean] truthy only when every check passes; otherwise the falsy
#   result of the first failing check
def allowed?(uri, base_uri, filter: nil)
  target = uri.is_a?(String) ? Addressable::URI.parse(uri) : uri
  origin = base_uri.is_a?(String) ? Addressable::URI.parse(base_uri) : base_uri

  rules = filter || @config.find_filter(origin)
  return true unless rules

  # Each `&&=` evaluates its right-hand side only while the verdict is still
  # truthy, so the checks run in this order and stop at the first failure.
  verdict = focused_host?(target, origin, rules)
  verdict &&= focused_descendants?(target, origin, rules)
  verdict &&= allowed_url?(target, rules)
  verdict &&= allowed_host?(target, rules)
  verdict &&= allowed_path?(target, rules)
  verdict &&= allowed_ext?(target, rules)
  verdict &&= allowed_by_robots?(target)
  verdict
end

#filter(refs, base_url) ⇒ Object


9
10
11
12
13
14
15
16
17
18
19
20
21
22
# File 'lib/kudzu/agent/url_filterer.rb', line 9

# Keeps only the references whose URI passes #allowed? for the given base URL,
# logging one debug line per reference.
#
# The base URL is parsed once with Addressable::URI.parse and the matching
# filter is fetched once from @config.find_filter; both are then handed to
# #allowed? for every reference.
#
# @param refs [Enumerable] objects responding to +uri+ and +url+
# @param base_url [String] URL the references were found on
# @return [Array] the references that were allowed, in their original order
def filter(refs, base_url)
  origin = Addressable::URI.parse(base_url)
  rules = @config.find_filter(origin)

  refs.select do |ref|
    passed = allowed?(ref.uri, origin, filter: rules)
    message = passed ? "passed url: #{ref.url}" : "dropped url: #{ref.url}"
    Kudzu.log :debug, message
    passed
  end
end