Class: Polipus::Robotex

Inherits:
Object
  • Object
show all
Defined in:
lib/polipus/robotex.rb

Overview

Defined Under Namespace

Classes: ParsedRobots

Constant Summary collapse

DEFAULT_TIMEOUT =
3
VERSION =
'1.0.0'

Class Attribute Summary collapse

Instance Attribute Summary collapse

Class Method Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(user_agent = nil) ⇒ Robotex

Returns a new instance of Robotex.



111
112
113
114
115
116
# File 'lib/polipus/robotex.rb', line 111

# Sets up a new instance with no cached robots data.
#
# @param user_agent [String, nil] the User-Agent to identify as; when nil a
#   default "Robotex/<VERSION> (...)" string is substituted
def initialize(user_agent = nil)
  @user_agent =
    if user_agent.nil?
      "Robotex/#{VERSION} (http://www.github.com/chriskite/robotex)"
    else
      user_agent
    end
  # Per-host cache, populated lazily by #parse_host.
  @parsed = {}
  # Start far in the past so the first #delay! call sees a large elapsed time.
  @last_accessed = Time.at(1)
end

Class Attribute Details

.timeout ⇒ Object



107
108
109
# File 'lib/polipus/robotex.rb', line 107

# Timeout applied when fetching robots.txt (passed to Timeout.timeout by
# .get_robots_txt).
#
# @return [Object] the class-level @timeout if one is set (truthy),
#   otherwise DEFAULT_TIMEOUT
def self.timeout
  return @timeout if @timeout

  DEFAULT_TIMEOUT
end

Instance Attribute Details

#user_agent ⇒ Object (readonly)

Returns the value of attribute user_agent.



13
14
15
# File 'lib/polipus/robotex.rb', line 13

# Reader for the User-Agent value stored by the constructor: either the
# caller-supplied value or the default "Robotex/<VERSION> (...)" string.
#
# @return [Object] the stored user agent (normally a String)
def user_agent
  @user_agent
end

Class Method Details

.get_robots_txt(uri, user_agent) ⇒ Object



95
96
97
98
99
100
101
# File 'lib/polipus/robotex.rb', line 95

# Tries to open "/robots.txt" on the server that +uri+ points at.
#
# Best-effort: any StandardError raised while building or opening the URL is
# swallowed and yields nil. If the whole attempt exceeds Robotex.timeout a
# message is written to STDERR and nil is returned.
#
# @param uri [#to_s] any URL on the target server
# @param user_agent [String] value sent as the User-Agent request header
# @return [Object, nil] whatever URI#open returns, or nil on failure/timeout
def self.get_robots_txt(uri, user_agent)
  Timeout.timeout(Robotex.timeout) do
    begin
      robots_url = URI.join(uri.to_s, '/robots.txt')
      robots_url.open('User-Agent' => user_agent)
    rescue StandardError
      nil
    end
  end
rescue Timeout::Error
  STDERR.puts 'robots.txt request timed out'
end

Instance Method Details

#allowed?(uri) ⇒ Boolean

Download the server’s robots.txt, and return true if we are allowed to access the url, false otherwise

Returns:

  • (Boolean)


126
127
128
# File 'lib/polipus/robotex.rb', line 126

# Asks the robots rules for the URI's host whether this instance's user
# agent may fetch +uri+.
#
# @param uri [URI, #to_s] the URL to check
# @return [Boolean] the answer given by the host's ParsedRobots object
def allowed?(uri)
  robots = parse_host(uri)
  robots.allowed?(uri, @user_agent)
end

#delay(uri) ⇒ Object

Return the value of the Crawl-Delay directive, or nil if none



132
133
134
# File 'lib/polipus/robotex.rb', line 132

# Looks up the crawl delay that the host of +uri+ requests from this
# instance's user agent.
#
# @param uri [URI, #to_s] any URL on the host of interest
# @return [Object, nil] whatever ParsedRobots#delay reports for the agent
def delay(uri)
  robots = parse_host(uri)
  robots.delay(@user_agent)
end

#delay!(uri) ⇒ Object

Sleep for the amount of time necessary to obey the Crawl-Delay specified by the server



139
140
141
142
143
# File 'lib/polipus/robotex.rb', line 139

# Sleeps for however much of the host's crawl delay has not yet elapsed since
# the previous call, then records the current time as the last access.
#
# Nothing is slept when the host specifies no delay, or when the delay has
# already fully elapsed. The latter is always true on the first call, because
# @last_accessed starts at Time.at(1); passing the resulting negative interval
# to Kernel#sleep would raise ArgumentError.
#
# @param uri [URI, #to_s] any URL on the host about to be accessed
# @return [Time] the newly recorded last-access time
def delay!(uri)
  crawl_delay = delay(uri)
  if crawl_delay
    remaining = crawl_delay - (Time.now - @last_accessed)
    sleep(remaining) if remaining > 0
  end
  @last_accessed = Time.now
end

#parse_host(uri) ⇒ Object



118
119
120
121
# File 'lib/polipus/robotex.rb', line 118

# Returns the ParsedRobots object for the host of +uri+, creating and caching
# it on first use.
#
# The cache is keyed by host name only, so URLs that differ just in scheme or
# port share one entry.
#
# @param uri [URI, #to_s] a URI object, or anything whose #to_s parses as one
# @return [ParsedRobots] the cached or newly built object for that host
def parse_host(uri)
  target = uri.is_a?(URI) ? uri : URI.parse(uri.to_s)
  host = target.host
  @parsed[host] ||= ParsedRobots.new(target, @user_agent)
end