Class: Spider::ExclusionParser

Inherits:
Object
  • Object
show all
Defined in:
lib/exclusion.rb

Constant Summary collapse

# Sentinel pattern: excluded? treats a rule whose pattern equals this value
# as matching every url.
NULL_MATCH = '*!*'.freeze
# Directive-name strings.
# NOTE(review): presumably compared against robots.txt field names by the
# parser, which is not visible here - confirm.
DISALLOW = 'disallow'.freeze
DELAY = 'crawl-delay'.freeze
ALLOW = 'allow'.freeze
# NOTE(review): looks like an upper bound on the number of directives read;
# its use is not visible here - confirm.
MAX_DIRECTIVES = 1_000

Instance Attribute Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(text, agent = nil, status = 200) ⇒ ExclusionParser

Returns a new instance of ExclusionParser.



27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
# File 'lib/exclusion.rb', line 27

# Builds the exclusion rules for one robots.txt response.
#
# @param text [String, nil] raw body of the robots.txt response
# @param agent [String, nil] user-agent key, stored in @agent_key
# @param status [Integer] HTTP status returned by the robots.txt request
def initialize(text, agent = nil, status = 200)
  @skip_list = []
  @agent_key = agent

  # A 401/403 answer excludes everything regardless of the body. This check
  # must come before the empty-body guard, otherwise a forbidden response
  # with no body would leave the rule list empty and allow every url.
  if [401, 403].include?(status)
    @skip_list << [NULL_MATCH, true]
    return
  end

  return if text.nil? || text.empty?

  begin
    grab_list(parse_text(text))
  rescue StandardError
    # Deliberately best-effort: a body that cannot be parsed must not abort
    # construction. Whatever state was set before the failure is left as is.
  end
end

Instance Attribute Details

#wait_time ⇒ Object

Returns the value of attribute wait_time.



18
19
20
# File 'lib/exclusion.rb', line 18

# Reader for the @wait_time instance variable; yields nil when it was never
# assigned.
# NOTE(review): presumably populated from a crawl-delay directive by the
# parser, which is not visible here - confirm.
def wait_time
  instance_variable_get(:@wait_time)
end

Instance Method Details

#allowed?(url) ⇒ Boolean

Returns:

  • (Boolean)


58
59
60
# File 'lib/exclusion.rb', line 58

# Inverse of excluded?: true when excluded? gives a falsy answer for +url+,
# false otherwise.
#
# @param url [String] url to check
# @return [Boolean]
def allowed?(url)
  return false if excluded?(url)

  true
end

#excluded?(url) ⇒ Boolean

Check to see if the given url is matched by any rule in the file, and return its associated status

Returns:

  • (Boolean)


48
49
50
51
52
53
54
55
56
# File 'lib/exclusion.rb', line 48

# Looks +url+ up in the rule list and reports the status of the first rule
# that applies.
#
# Rules are tried in @skip_list order. A rule applies when its first element
# occurs anywhere in the unescaped url, or when it equals NULL_MATCH.
#
# @param url [String] url to check; passed through safe_unescape first
# @return [Boolean] last element of the first applicable rule, or false when
#   no rule applies
def excluded?(url)
  decoded = safe_unescape(url)
  hit = @skip_list.find do |rule|
    decoded.include?(rule.first) || rule.first == NULL_MATCH
  end

  hit ? hit.last : false
end