Class: WebRobots

Inherits:
Object
  • Object
show all
Defined in:
lib/webrobots.rb,
lib/webrobots/robotstxt.rb

Defined Under Namespace

Classes: Error, ParseError, RobotsTxt

Instance Attribute Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(user_agent, options = nil) ⇒ WebRobots

Creates a WebRobots object for a robot named user_agent, with optional options.

  • :http_get => a custom method, proc, or anything that responds to .call(uri), to be used for fetching robots.txt. It must return the response body if successful, return an empty string if the resource is not found, and return nil or raise any error on failure. Redirects should be handled within this proc.

  • :crawl_delay => determines how to react to Crawl-delay directives. If :sleep is given, WebRobots sleeps as demanded when allowed?(url)/disallowed?(url) is called. This is the default behavior. If :ignore is given, WebRobots does nothing. If a custom method, proc, or anything that responds to .call(delay, last_checked_at) is given, it is called.


28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
# File 'lib/webrobots.rb', line 28

# Creates a WebRobots object for a robot named +user_agent+.
#
# Supported options:
# * :http_get    - a callable used to fetch robots.txt; defaults to the
#   built-in http_get method.
# * :crawl_delay - :sleep (default), :ignore, or a callable receiving
#   (delay, last_checked_at).
#
# Raises ArgumentError when :crawl_delay is neither a recognized symbol
# nor something callable.
def initialize(user_agent, options = nil)
  options ||= {}

  @user_agent = user_agent
  @http_get = options[:http_get] || method(:http_get)

  delay_option = options[:crawl_delay] || :sleep
  delay_handler =
    if delay_option == :ignore
      nil
    elsif delay_option == :sleep
      # Default policy: block the caller for the demanded delay.
      method(:crawl_delay_handler)
    elsif delay_option.respond_to?(:call)
      delay_option
    else
      raise ArgumentError, "invalid Crawl-delay handler: #{delay_option.inspect}"
    end

  @parser = RobotsTxt::Parser.new(user_agent, delay_handler)
  @parser_mutex = Mutex.new

  @robotstxt = create_cache
end

Instance Attribute Details

#user_agentObject (readonly)

Returns the robot name initially given.


64
65
66
# File 'lib/webrobots.rb', line 64

def user_agent
  @user_agent
end

Instance Method Details

#allowed?(url) ⇒ Boolean

Tests if the robot is allowed to access a resource at url. If a malformed URI string is given, URI::InvalidURIError is raised. If a relative URI or a non-HTTP/HTTPS URI is given, ArgumentError is raised.

Returns:

  • (Boolean)

70
71
72
73
74
75
# File 'lib/webrobots.rb', line 70

# Tests whether the robot may access the resource at +url+.
# The path /robots.txt itself is unconditionally allowed.
# A malformed URI string raises URI::InvalidURIError; a relative or
# non-HTTP/HTTPS URI raises ArgumentError (both via split_uri).
def allowed?(url)
  site, request_uri = split_uri(url)
  return true if request_uri == '/robots.txt'
  get_robots_txt(site).allow?(request_uri)
end

#crawl_delay(url) ⇒ Object

Returns the number of seconds that the configured agent should wait between successive requests to the site identified by url according to the site's robots.txt Crawl-delay directive.


85
86
87
# File 'lib/webrobots.rb', line 85

# Returns the number of seconds the configured agent should wait
# between successive requests to the site of +url+, per its
# robots.txt Crawl-delay directive.
def crawl_delay(url)
  robots_txt_for(url).crawl_delay
end

#create_cacheObject

:nodoc:


54
55
56
# File 'lib/webrobots.rb', line 54

# Builds the robots.txt cache store. Override to plug in a custom
# backend; it must respond to [], []=, delete and clear.
def create_cache
  {}
end

#disallowed?(url) ⇒ Boolean

Equivalent to !allowed?(url).

Returns:

  • (Boolean)

78
79
80
# File 'lib/webrobots.rb', line 78

# Inverse of allowed?: true when the robot must not access +url+.
def disallowed?(url)
  allowed?(url) ? false : true
end

#error(url) ⇒ Object

Returns an error object if there is an error in fetching or parsing robots.txt of the site url.


109
110
111
# File 'lib/webrobots.rb', line 109

# Returns the error object recorded while fetching or parsing
# robots.txt for the site of +url+, if any.
def error(url)
  robots_txt = robots_txt_for(url)
  robots_txt.error
end

#error!(url) ⇒ Object

Raises the error if there was an error in fetching or parsing robots.txt of the site url.


115
116
117
# File 'lib/webrobots.rb', line 115

# Raises the error recorded while fetching or parsing robots.txt for
# the site of +url+, if any.
def error!(url)
  robots_txt = robots_txt_for(url)
  robots_txt.error!
end

#flush_cacheObject

Flushes robots.txt cache.


59
60
61
# File 'lib/webrobots.rb', line 59

# Discards every cached robots.txt entry so subsequent queries
# re-fetch from the sites.
def flush_cache
  cache = @robotstxt
  cache.clear
end

#option(url, token) ⇒ Object

Equivalent to options(url)[token.downcase].


97
98
99
# File 'lib/webrobots.rb', line 97

# Shorthand for options(url)[token.downcase].
def option(url, token)
  key = token.downcase
  options(url)[key]
end

#options(url) ⇒ Object

Returns extended option values for a resource at url in a hash with each field name lower-cased. See allowed?() for a list of errors that may be raised.


92
93
94
# File 'lib/webrobots.rb', line 92

# Returns extended option values for the resource at +url+ as a hash
# keyed by lower-cased field name. See allowed?() for the errors that
# may be raised.
def options(url)
  robots_txt = robots_txt_for(url)
  robots_txt.options
end

#reset(url) ⇒ Object

Removes robots.txt cache for the site url.


120
121
122
123
# File 'lib/webrobots.rb', line 120

# Removes the cached robots.txt for the site of +url+.
def reset(url)
  site = split_uri(url).first
  @robotstxt.delete(site)
end

#sitemaps(url) ⇒ Object

Returns an array of Sitemap URLs. See allowed?() for a list of errors that may be raised.


103
104
105
# File 'lib/webrobots.rb', line 103

# Returns the array of Sitemap URLs declared in the site's robots.txt.
# See allowed?() for the errors that may be raised.
def sitemaps(url)
  robots_txt = robots_txt_for(url)
  robots_txt.sitemaps
end