Class: WebRobots

Inherits: Object
Defined in:
lib/webrobots.rb,
lib/webrobots/robotstxt.rb

Defined Under Namespace

Classes: Error, ParseError, RobotsTxt

Instance Attribute Summary

Instance Method Summary

Constructor Details

#initialize(user_agent, options = nil) ⇒ WebRobots

Creates a WebRobots object for a robot named user_agent, with an optional options hash.

  • :http_get => a custom method, proc, or anything that responds to .call(uri), to be used for fetching robots.txt. It must return the response body if successful, return an empty string if the resource is not found, and return nil or raise any error on failure. Redirects should be handled within this proc; a usage sketch follows the source below.



# File 'lib/webrobots.rb', line 20

def initialize(user_agent, options = nil)
  @user_agent = user_agent
  @parser = RobotsTxt::Parser.new(user_agent)
  @parser_mutex = Mutex.new

  options ||= {}
  @http_get = options[:http_get] || method(:http_get)

  @robotstxt = create_cache()
end
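
For illustration, here is one way to supply the :http_get option. This is only a sketch: the redirect loop, the five-hop limit, and the bot name are assumptions of the example, not behavior required by the library.

require 'webrobots'
require 'net/http'
require 'uri'

# A custom fetcher satisfying the :http_get contract described above:
# body on success, '' when robots.txt is missing, nil on failure.
fetcher = lambda do |uri|
  5.times do                                   # assumed redirect limit
    response = Net::HTTP.get_response(uri)
    case response
    when Net::HTTPSuccess     then return response.body
    when Net::HTTPNotFound    then return ''   # no robots.txt at this site
    when Net::HTTPRedirection then uri = URI.join(uri.to_s, response['location'])
    else                           return nil  # treat anything else as failure
    end
  end
  nil                                          # give up after too many redirects
end

robots = WebRobots.new('MyBot/1.0', :http_get => fetcher)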

Instance Attribute Details

#user_agent ⇒ Object (readonly)

Returns the robot name initially given.



# File 'lib/webrobots.rb', line 42

def user_agent
  @user_agent
end

Instance Method Details

#allowed?(url) ⇒ Boolean

Tests if the robot is allowed to access a resource at url. If a malformed URI string is given, URI::InvalidURIError is raised. If a relative URI or a non-HTTP/HTTPS URI is given, ArgumentError is raised.

Returns:

  • (Boolean)


# File 'lib/webrobots.rb', line 48

def allowed?(url)
  site, request_uri = split_uri(url)
  return true if request_uri == '/robots.txt'
  robots_txt = get_robots_txt(site)
  robots_txt.allow?(request_uri)
end
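
A minimal usage sketch; the bot name and URLs are placeholders.

require 'webrobots'

robots = WebRobots.new('MyBot/1.0')

url = 'http://www.example.com/some/page.html'
if robots.allowed?(url)
  # safe to fetch the page
else
  # skip it; disallowed?(url) is simply the negation
end

robots.allowed?('http://www.example.com/robots.txt')  #=> true (always allowed)
robots.allowed?('/relative/path')                     # raises ArgumentError
robots.allowed?('mailto:user@example.com')            # raises ArgumentError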

#create_cache ⇒ Object

Creates the cache object used to store parsed robots.txt data per site; the returned object must respond to [], []=, delete and clear. (:nodoc:)



# File 'lib/webrobots.rb', line 32

def create_cache
  Hash.new	# Must respond to [], []=, delete and clear.
end
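
Because the constructor obtains its cache via create_cache, a subclass can override it to plug in a different store. The class below is a hypothetical sketch; any object responding to [], []=, delete and clear would do.

require 'webrobots'

# Hypothetical subclass that shares one cache across all instances.
class SharedCacheWebRobots < WebRobots
  CACHE = {}   # a plain Hash used for illustration; not thread-safe across instances

  def create_cache
    CACHE
  end
end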

#disallowed?(url) ⇒ Boolean

Equivalent to !allowed?(url).

Returns:

  • (Boolean)


# File 'lib/webrobots.rb', line 56

def disallowed?(url)
  !allowed?(url)
end

#error(url) ⇒ Object

Returns an error object if there was an error in fetching or parsing the robots.txt of the site that url belongs to.



# File 'lib/webrobots.rb', line 80

def error(url)
  robots_txt_for(url).error
end

#error!(url) ⇒ Object

Raises the error if there was an error in fetching or parsing the robots.txt of the site that url belongs to.



# File 'lib/webrobots.rb', line 86

def error!(url)
  robots_txt_for(url).error!
end
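
A sketch of checking for fetch or parse problems after a query; the URLs and bot name are placeholders, and the final rescue clause is deliberately broad because the stored error may be any exception raised during fetching or parsing.

require 'webrobots'

robots = WebRobots.new('MyBot/1.0')
robots.allowed?('http://www.example.com/')   # triggers fetching and parsing robots.txt

if (err = robots.error('http://www.example.com/'))
  warn "robots.txt problem: #{err.inspect}"
end

begin
  robots.error!('http://www.example.com/')   # re-raises the stored error, if any
rescue WebRobots::ParseError => e
  warn "robots.txt could not be parsed: #{e.message}"
rescue => e
  warn "robots.txt could not be fetched: #{e.message}"
end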

#flush_cache ⇒ Object

Flushes the robots.txt cache.



# File 'lib/webrobots.rb', line 37

def flush_cache
  @robotstxt.clear
end

#option(url, token) ⇒ Object

Equivalent to options(url)[token.downcase].



# File 'lib/webrobots.rb', line 68

def option(url, token)
  options(url)[token.downcase]
end

#options(url) ⇒ Object

Returns extended option values for a resource at url in a hash with each field name lower-cased. See allowed?() for a list of errors that may be raised.



# File 'lib/webrobots.rb', line 63

def options(url)
  robots_txt_for(url).options
end
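
A sketch of reading extended fields; the "crawl-delay" key is only an illustration of what a site might define, not a guaranteed entry.

require 'webrobots'

robots = WebRobots.new('MyBot/1.0')

opts = robots.options('http://www.example.com/')
# opts is a Hash keyed by lower-cased field name; which keys exist
# depends entirely on the site's robots.txt.
delay = opts['crawl-delay']                    # hypothetical field

# option(url, token) is shorthand for options(url)[token.downcase],
# so the token lookup is effectively case-insensitive.
robots.option('http://www.example.com/', 'Crawl-Delay')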

#reset(url) ⇒ Object

Removes the cached robots.txt for the site of url.



# File 'lib/webrobots.rb', line 91

def reset(url)
  site, = split_uri(url)
  @robotstxt.delete(site)
end
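
A sketch of cache management during a long-running crawl; the URLs and bot name are placeholders.

require 'webrobots'

robots = WebRobots.new('MyBot/1.0')
robots.allowed?('http://www.example.com/')   # caches the site's robots.txt

# Force a re-fetch of a single site's robots.txt on the next query...
robots.reset('http://www.example.com/')

# ...or drop every cached robots.txt at once.
robots.flush_cache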

#sitemaps(url) ⇒ Object

Returns an array of Sitemap URLs. See allowed?() for a list of errors that may be raised.



# File 'lib/webrobots.rb', line 74

def sitemaps(url)
  robots_txt_for(url).sitemaps
end
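
A sketch of listing Sitemap entries; the bot name and URL are placeholders.

require 'webrobots'

robots = WebRobots.new('MyBot/1.0')
robots.sitemaps('http://www.example.com/').each do |sitemap_url|
  puts sitemap_url   # each Sitemap: URL declared in the site's robots.txt
end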