Class: Spider::ExclusionParser

Inherits:
Object
  • Object
show all
Defined in:
lib/exclusion.rb

Constant Summary collapse

DISALLOW =
"disallow"
DELAY =
"crawl-delay"
ALLOW =
"allow"
MAX_DIRECTIVES =
1000
NULL_MATCH =
"*!*"

Instance Attribute Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(text, agent = nil) ⇒ ExclusionParser

Returns a new instance of ExclusionParser.



28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
# File 'lib/exclusion.rb', line 28

# Builds the list of exclusion rules from a fetched rules document
# (presumably a robots.txt body, judging by the directive constants).
#
# @param text [#length, nil] the fetched body. When it also responds to
#   +http_status+ and that status is 401 or 403, a single catch-all rule is
#   stored so that every URL is reported as excluded.
# @param agent [String, nil] stored in @agent_key; not read in this method.
def initialize(text, agent=nil)
  @skip_list = []
  @agent_key = agent
  # Give the wait_time reader a defined value even when nothing is parsed.
  @wait_time = nil

  # Nothing fetched: leave the rule list empty.
  return if text.nil? || text.length.zero?

  # Only bodies that carry an HTTP status can signal "access denied"; a body
  # without one (e.g. a plain String) goes straight to parsing instead of
  # raising NoMethodError here.
  if text.respond_to?(:http_status) && [401, 403].include?(text.http_status)
    @skip_list << [NULL_MATCH, true]
    return
  end

  begin
    config = parse_text(text)
    grab_list(config)
  rescue StandardError
    # Parsing is deliberately best-effort: on failure keep whatever rules
    # were collected so far rather than aborting construction.
  end
end

Instance Attribute Details

#wait_time ⇒ Object

Returns the value of attribute wait_time.



19
20
21
# File 'lib/exclusion.rb', line 19

# Reader for the @wait_time instance variable.
# Presumably the parsed crawl delay -- confirm against the code that assigns it.
#
# @return [Object, nil] the stored value, or nil when it was never assigned
def wait_time
  defined?(@wait_time) ? @wait_time : nil
end

Instance Method Details

#allowed?(url) ⇒ Boolean

Returns:

  • (Boolean)


59
60
61
# File 'lib/exclusion.rb', line 59

# Inverse of #excluded?: reports whether the given URL may be fetched.
#
# @param url [String] the URL to check
# @return [Boolean] true when #excluded? returns a falsy value for the URL
def allowed?(url)
  blocked = excluded?(url)
  !blocked
end

#excluded?(url) ⇒ Boolean

Checks whether the given URL is matched by any rule in the file and returns its associated status.

Returns:

  • (Boolean)


49
50
51
52
53
54
55
56
57
# File 'lib/exclusion.rb', line 49

# Looks up the first stored rule that applies to the URL and returns the
# status stored with it.
#
# A rule applies when its pattern occurs anywhere in the unescaped URL
# (substring match), or when its pattern is the NULL_MATCH catch-all.
# Rules are tried in stored order; the first hit wins.
#
# @param url [String] the URL to check; passed through safe_unescape first
# @return [Boolean] the status of the first applicable rule, or false when
#   no rule applies
def excluded?(url)
  decoded = safe_unescape(url)

  hit = @skip_list.find do |rule|
    decoded.include?(rule.first) || rule.first == NULL_MATCH
  end

  hit ? hit.last : false
end