Class: Polipus::Robotex

Inherits:
Object
  • Object
show all
Defined in:
lib/polipus/robotex.rb

Overview

Defined Under Namespace

Classes: ParsedRobots

Constant Summary collapse

DEFAULT_TIMEOUT =
3
VERSION =
'1.0.0'

Instance Attribute Summary collapse

Class Method Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(user_agent = nil) ⇒ Robotex

Returns a new instance of Robotex.



119
120
121
122
123
124
# File 'lib/polipus/robotex.rb', line 119

# Builds a new Robotex.
#
# @param user_agent [String, nil] the User-Agent to identify as; when nil a
#   default built from VERSION is used instead
def initialize(user_agent = nil)
  @user_agent =
    if user_agent.nil?
      "Robotex/#{VERSION} (http://www.github.com/chriskite/robotex)"
    else
      user_agent
    end
  # A timestamp far in the past, so the first access is never considered recent.
  @last_accessed = Time.at(1)
  # Cache of ParsedRobots objects, filled in by parse_host and keyed by host.
  @parsed = {}
end

Instance Attribute Details

#user_agent ⇒ Object (readonly)

Returns the value of attribute user_agent.



14
15
16
# File 'lib/polipus/robotex.rb', line 14

# Reader for the User-Agent string stored in @user_agent by the constructor.
#
# @return [Object] the current value of @user_agent
def user_agent
  @user_agent
end

Class Method Details

.get_robots_txt(uri, user_agent) ⇒ Object



101
102
103
104
105
106
107
108
109
# File 'lib/polipus/robotex.rb', line 101

# Fetches "/robots.txt" from the site that +uri+ belongs to.
#
# The request is bounded by Robotex.timeout seconds. Any StandardError raised
# while building or opening the URL is swallowed and nil is returned; a
# timeout is reported on STDERR and also yields nil.
#
# @param uri [#to_s] any URI on the target site
# @param user_agent [String] value sent in the User-Agent request header
# @return [Object, nil] whatever `open` returns for the robots.txt URL, or nil
#   on error or timeout
def self.get_robots_txt(uri, user_agent)
  Timeout.timeout(Robotex.timeout) do
    begin
      robots_url = URI.join(uri.to_s, "/robots.txt")
      robots_url.open("User-Agent" => user_agent)
    rescue StandardError
      # Best effort: a failed fetch is treated as "no robots.txt available".
      nil
    end
  end
rescue Timeout::Error
  STDERR.puts "robots.txt request timed out"
end

.timeout ⇒ Object



115
116
117
# File 'lib/polipus/robotex.rb', line 115

# The timeout currently in effect.
#
# @return [Object] the value stored in @timeout when it is truthy, otherwise
#   DEFAULT_TIMEOUT
def self.timeout
  return @timeout if @timeout

  DEFAULT_TIMEOUT
end

.timeout=(t) ⇒ Object



111
112
113
# File 'lib/polipus/robotex.rb', line 111

# Stores +t+ in @timeout. A nil or false value makes the timeout reader fall
# back to DEFAULT_TIMEOUT.
#
# @param t [Object] the new timeout value
def self.timeout=(t)
  @timeout = t
end

Instance Method Details

#allowed?(uri) ⇒ Boolean

Download the server’s robots.txt, and return true if we are allowed to access the URL, false otherwise.

Returns:

  • (Boolean)


134
135
136
# File 'lib/polipus/robotex.rb', line 134

# Asks the robots rules for the host of +uri+ whether our user agent may
# fetch it.
#
# @param uri [Object] the URL to check; passed to parse_host and on to the
#   rules object unchanged
# @return [Boolean] the answer given by the rules object's allowed?
def allowed?(uri)
  robots = parse_host(uri)
  robots.allowed?(uri, @user_agent)
end

#delay(uri) ⇒ Object

Return the value of the Crawl-Delay directive, or nil if none



140
141
142
# File 'lib/polipus/robotex.rb', line 140

# Looks up the crawl delay that applies to our user agent on the host of +uri+.
#
# @param uri [Object] any URL on the target host; passed to parse_host
# @return [Object] whatever the rules object's delay returns for @user_agent
def delay(uri)
  robots = parse_host(uri)
  robots.delay(@user_agent)
end

#delay!(uri) ⇒ Object

Sleep for the amount of time necessary to obey the Crawl-Delay specified by the server



147
148
149
150
151
# File 'lib/polipus/robotex.rb', line 147

# Sleeps for as long as is still needed to honour the crawl delay that applies
# to +uri+, then records the current time as the last access.
#
# @param uri [Object] the URL about to be fetched; passed to delay
# @return [Time] the new value of @last_accessed
def delay!(uri)
  crawl_delay = delay(uri)
  if crawl_delay
    remaining = crawl_delay - (Time.now - @last_accessed)
    # Kernel#sleep raises ArgumentError for a negative interval, which is what
    # happens whenever more time than the crawl delay has already elapsed
    # (always true on the first call, since @last_accessed starts at Time.at(1)).
    sleep(remaining) if remaining > 0
  end
  @last_accessed = Time.now
end

#parse_host(uri) ⇒ Object



126
127
128
129
# File 'lib/polipus/robotex.rb', line 126

# Returns the ParsedRobots for the host of +uri+, creating and caching it in
# @parsed on first use.
#
# @param uri [URI, #to_s] a URI object, or anything whose string form can be
#   parsed as one
# @return [ParsedRobots] the cached rules object for that host
def parse_host(uri)
  target = uri.is_a?(URI) ? uri : URI.parse(uri.to_s)
  host = target.host

  cached = @parsed[host]
  return cached if cached

  @parsed[host] = ParsedRobots.new(target, @user_agent)
end