Class: SpiderInstance

Inherits:
Object
  • Object
show all
Defined in:
lib/spider/spider_instance.rb

Defined Under Namespace

Classes: HeaderSetter

Instance Method Summary collapse

Constructor Details

#initialize(next_urls, seen = [], rules = nil, robots_seen = []) ⇒ SpiderInstance

:nodoc:



27
28
29
30
31
32
33
34
35
36
37
38
39
# File 'lib/spider/spider_instance.rb', line 27

def initialize(next_urls, seen = [], rules = nil, robots_seen = []) #:nodoc:
  # Queue of pending work; each entry is a hash of prior URL => next URL(s).
  @next_urls   = [next_urls]
  # Store of already-visited URLs (anything responding to << and include?).
  @seen        = seen
  # robots.txt rule set; built fresh when the caller supplies none.
  @rules       = rules || RobotRules.new("Ruby Spider #{Spider::VERSION}")
  # Hosts whose robots.txt has already been fetched and parsed.
  @robots_seen = robots_seen
  @url_checks  = []
  @callbacks   = {}
  @headers     = {}
  @cache       = :memory
  @setup       = nil
  @teardown    = nil
  @interrupted = false
end

Instance Method Details

#add_url_check(&block) ⇒ Object

Add a predicate that determines whether to continue down this URL’s path. All predicates must be true in order for a URL to proceed.

Takes a block that takes a string and produces a boolean. For example, this will ensure that the URL starts with ‘cashcats.biz’:

add_url_check { |a_url| a_url =~ %r{^http://cashcats.biz.*} }


48
49
50
# File 'lib/spider/spider_instance.rb', line 48

# Register a URL predicate; every registered predicate must return a
# truthy value for a URL to be crawled (see #allowable_url?).
def add_url_check(&block)
  @url_checks.push(block)
end

#allowable_url?(a_url, parsed_url) ⇒ Boolean

:nodoc:

Returns:

  • (Boolean)


205
206
207
208
# File 'lib/spider/spider_instance.rb', line 205

# A URL is crawlable when it parsed successfully, has not been seen
# before, passes robots.txt, and satisfies every user-supplied check.
def allowable_url?(a_url, parsed_url) #:nodoc:
  return false if parsed_url.nil?
  return false if @seen.include?(parsed_url)
  return false unless allowed?(a_url, parsed_url)
  @url_checks.all? { |check| check.call(a_url) }
end

#allowed?(a_url, parsed_url) ⇒ Boolean

True if the robots.txt for that URL allows access to it.

Returns:

  • (Boolean)


211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
# File 'lib/spider/spider_instance.rb', line 211

# True if the robots.txt for that URL allows access to it.
#
# robots.txt is fetched and parsed at most once per host (tracked in
# @robots_seen). An HTTP error fetching it means "no robots.txt" and
# access is allowed; any other failure denies access rather than
# crashing the crawl.
def allowed?(a_url, parsed_url) # :nodoc:
  return false unless ['http','https'].include?(parsed_url.scheme)
  u = "#{parsed_url.scheme}://#{parsed_url.host}:#{parsed_url.port}/robots.txt"
  parsed_u = URI.parse(u)
  return false unless @url_checks.map{|url_check|url_check.call(a_url)}.all?
  begin
    unless @robots_seen.include?(u)
      get_page(parsed_u) do |r|
        @rules.parse(u, r.body)
      end
      @robots_seen << u
    end
    @rules.allowed?(a_url)
  rescue OpenURI::HTTPError
    true # No robots.txt
  rescue Timeout::Error, StandardError
    # Best-effort guard so a broken host cannot crash the crawl. Was
    # `rescue Exception`, which also swallowed SignalException and
    # SystemExit — never rescue Exception.
    false
  end
end

#check_already_seen_with(cacher) ⇒ Object

The Web is a graph; to avoid cycles we store the nodes (URLs) already visited. The Web is a really, really, really big graph; as such, this list of visited nodes grows really, really, really big.

Change the object used to store these seen nodes with this. The default object is an instance of Array. Available with Spider is a wrapper of memcached.

You can implement a custom class for this; any object passed to check_already_seen_with must understand just << and include? .

# default
check_already_seen_with Array.new

# memcached
require 'spider/included_in_memcached'
check_already_seen_with IncludedInMemcached.new('localhost:11211')


69
70
71
72
73
74
75
# File 'lib/spider/spider_instance.rb', line 69

# Swap in a new store for already-visited URLs. The store must respond
# to << (record a URL) and include? (membership test) — exactly the
# two calls made by #get_page and #allowable_url?.
#
# Raises ArgumentError when the object lacks either method. (The old
# message named `included?`, which is not the method actually checked.)
def check_already_seen_with(cacher)
  if cacher.respond_to?(:<<) && cacher.respond_to?(:include?)
    @seen = cacher
  else
    raise ArgumentError, 'expected something that responds to << and include?'
  end
end

#clear_headersObject

Reset the headers hash.



160
161
162
# File 'lib/spider/spider_instance.rb', line 160

# Drop every request header previously set, restoring an empty hash.
def clear_headers
  @headers = Hash.new
end

#construct_complete_url(base_url, additional_url, parsed_additional_url = nil) ⇒ Object

:nodoc:



285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
# File 'lib/spider/spider_instance.rb', line 285

# Resolve +additional_url+ against +base_url+. Links that already carry
# a scheme are returned untouched; scheme-less links are attached to the
# base's scheme and host (root-relative when they start with '/').
# NOTE(review): the base's port is not carried over — preserved from the
# original behavior.
def construct_complete_url(base_url, additional_url, parsed_additional_url = nil) #:nodoc:
  parsed_additional_url ||= URI.parse(additional_url)
  return additional_url unless parsed_additional_url.scheme.nil?

  base = base_url.is_a?(URI) ? base_url : URI.parse(base_url)
  root = "#{base.scheme}://#{base.host}"
  if additional_url[0].chr == '/'
    "#{root}#{additional_url}"
  elsif base.path.nil? || base.path == ''
    "#{root}/#{additional_url}"
  elsif base.path[0].chr == '/'
    "#{root}#{base.path}/#{additional_url}"
  else
    "#{root}/#{base.path}/#{additional_url}"
  end
end

#do_callbacks(a_url, resp, prior_url) ⇒ Object

:nodoc:



256
257
258
259
260
261
262
263
264
# File 'lib/spider/spider_instance.rb', line 256

# Fire the registered callbacks for one fetched page, in order:
# :every, then :success or :failure, then any handler keyed on the
# response's HTTP status code. Missing handlers are skipped.
def do_callbacks(a_url, resp, prior_url) #:nodoc:
  # Net::HTTPResponse#code is a String ("404"), but #on registers
  # Integer triggers under Integer keys, so a plain @callbacks[resp.code]
  # lookup never found them — check both representations.
  status_cb = @callbacks[resp.code] || @callbacks[resp.code.to_i]
  cbs = [@callbacks[:every],
    resp.success? ?  @callbacks[:success] : @callbacks[:failure],
    status_cb]

  cbs.each do |cb|
    cb.call(a_url, resp, prior_url) if cb
  end
end

#generate_next_urls(a_url, resp) ⇒ Object

:nodoc:



266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
# File 'lib/spider/spider_instance.rb', line 266

# Extract every href from the page body and resolve it into an absolute
# URL. The base is the page's <base href> when present, otherwise the
# page URL truncated at its last '/'. Unparseable links are dropped.
def generate_next_urls(a_url, resp) #:nodoc:
  web_page = resp.body
  base_url = (web_page.scan(/base\s+href="(.*?)"/i).flatten +
              [a_url[0,a_url.rindex('/')]])[0]
  base_url = remove_trailing_slash(base_url)
  web_page.scan(/href="(.*?)"/i).flatten.map do |link|
    begin
      parsed_link = URI.parse(link)
      # Skip pure in-page anchors ("#", "#section"). URI#fragment never
      # includes the '#' character itself, so the original test
      # `parsed_link.fragment == '#'` could never match and anchor-only
      # links leaked into the crawl queue.
      if link.start_with?('#')
        nil
      else
        construct_complete_url(base_url, link, parsed_link)
      end
    rescue
      nil
    end
  end.compact
end

#get_page(parsed_url, &block) ⇒ Object

:nodoc:



235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
# File 'lib/spider/spider_instance.rb', line 235

# Fetch +parsed_url+ over HTTP(S) and yield the Net::HTTPResponse to the
# block. Redirects are followed up to +redirect_limit+ hops (new optional
# parameter, default 10) — the original recursed without bound, so a
# redirect cycle would loop forever. Network errors are printed and
# swallowed; the method then returns nil.
# NOTE(review): SSL verification is disabled (VERIFY_NONE) — preserved
# from the original, but worth revisiting.
def get_page(parsed_url, redirect_limit = 10, &block) #:nodoc:
  @seen << parsed_url
  begin
    http = Net::HTTP.new(parsed_url.host, parsed_url.port)
    if parsed_url.scheme == 'https'
      http.use_ssl = true
      http.verify_mode = OpenSSL::SSL::VERIFY_NONE
    end
    # Uses start because http.finish cannot be called.
    r = http.start {|h| h.request(Net::HTTP::Get.new(parsed_url.request_uri, @headers))}
    if r.redirect?
      # Give up once the hop budget is spent instead of recursing forever.
      return nil if redirect_limit <= 0
      get_page(URI.parse(construct_complete_url(parsed_url, r['Location'])),
               redirect_limit - 1, &block)
    else
      block.call(r)
    end
  rescue Timeout::Error, Errno::EINVAL, Errno::ECONNRESET, EOFError => e
    p e
    nil
  end
end

#headersObject

Use like a hash:

headers['Cookies'] = 'user_id=1;password=btrross3'


148
149
150
# File 'lib/spider/spider_instance.rb', line 148

# Use like a hash:
#
#   headers['Cookies'] = 'user_id=1;password=btrross3'
#
# Returns a fresh HeaderSetter wrapping this instance on every call;
# presumably HeaderSetter writes through to @headers via #raw_headers= —
# confirm against the HeaderSetter class.
def headers
  HeaderSetter.new(self)
end

#on(code, p = nil, &block) ⇒ Object

Add a response handler. A response handler’s trigger can be :every, :success, :failure, or any HTTP status code. The handler itself can be either a Proc or a block.

The arguments to the block are: the URL as a string, an instance of Net::HTTPResponse, and the prior URL as a string.

For example:

on 404 do |a_url, resp, prior_url|
  puts "URL not found: #{a_url}"
end

on :success do |a_url, resp, prior_url|
  puts a_url
  puts resp.body
end

on :every do |a_url, resp, prior_url|
  puts "Given this code: #{resp.code}"
end


123
124
125
126
127
128
129
130
131
# File 'lib/spider/spider_instance.rb', line 123

# Register a response handler. +code+ is :every, :success, :failure, or
# an HTTP status code; the handler is the Proc +p+ when given, otherwise
# the attached block. Non-integer triggers are normalized to Symbols.
def on(code, p = nil, &block)
  handler = p || block
  if code.is_a?(Integer)
    @callbacks[code] = handler
  else
    @callbacks[code.to_sym] = handler
  end
end

#raw_headersObject

:nodoc:



152
153
154
# File 'lib/spider/spider_instance.rb', line 152

# The underlying request-header Hash sent with every fetch (see
# #get_page); #headers is the hash-style facade over it.
def raw_headers #:nodoc:
  @headers
end

#raw_headers=(v) ⇒ Object

:nodoc:



155
156
157
# File 'lib/spider/spider_instance.rb', line 155

# Replace the request-header Hash wholesale; +v+ is used as-is (not
# copied) on subsequent fetches.
def raw_headers=(v) #:nodoc:
  @headers = v
end

#remove_trailing_slash(s) ⇒ Object

:nodoc:



304
305
306
# File 'lib/spider/spider_instance.rb', line 304

# Strip any run of trailing '/' characters from +s+, returning a new
# string (the argument is not mutated).
def remove_trailing_slash(s) #:nodoc:
  s.sub(/\/*$/, '')
end

#setup(p = nil, &block) ⇒ Object

Run before the HTTP request. Given the URL as a string.

setup do |a_url|
  headers['Cookies'] = 'user_id=1;admin=true'
end


137
138
139
# File 'lib/spider/spider_instance.rb', line 137

# Register a hook run before each HTTP request; it receives the URL as
# a string. Accepts either a Proc argument or a block.
def setup(p = nil, &block)
  @setup = p || block
end

#start!Object

:nodoc:



164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
# File 'lib/spider/spider_instance.rb', line 164

# Crawl until the pending-URL queue empties or the spider is interrupted
# (SIGINT or #stop!). Each iteration pops one hash of
# {prior_url => url-or-urls}, filters every URL through #allowable_url?,
# fetches it, runs the callbacks, and pushes the page's outbound links
# back onto the queue.
def start! #:nodoc:
  trap("SIGINT") { @interrupted = true }
  begin
    next_urls = @next_urls.pop
    tmp_n_u = {}
    next_urls.each do |prior_url, urls|
      # A hash value may be a single URL string or an Array of them.
      urls = [urls] unless urls.kind_of?(Array)
      urls.map do |a_url|
        # Pair each raw URL with its parse; nil marks an unparseable URL,
        # which allowable_url? then rejects.
        [a_url, (URI.parse(a_url) rescue nil)]
      end.select do |a_url, parsed_url|
        allowable_url?(a_url, parsed_url)
      end.each do |a_url, parsed_url|
        @setup.call(a_url) unless @setup.nil?
        get_page(parsed_url) do |response|
          do_callbacks(a_url, response, prior_url)
          #tmp_n_u[a_url] = generate_next_urls(a_url, response)
          #@next_urls.push tmp_n_u
          generate_next_urls(a_url, response).each do |a_next_url|
            # Queue each link individually, remembering which page it
            # came from (becomes prior_url on the next pass).
            @next_urls.push a_url => a_next_url
          end
          #exit if interrupted
        end
        @teardown.call(a_url) unless @teardown.nil?
        # Stop fetching mid-batch once interrupted; the outer while
        # condition then ends the crawl.
        break if @interrupted
      end
    end
  end while !@next_urls.empty? && !@interrupted
end

#stop!Object

:nodoc:



193
194
195
# File 'lib/spider/spider_instance.rb', line 193

# Flag the crawl to stop; #start! checks @interrupted between pages and
# in its loop condition.
def stop! #:nodoc:
  @interrupted = true
end

#store_next_urls_with(a_store) ⇒ Object

The Web is a really, really, really big graph; as such, this list of nodes to visit grows really, really, really big.

Change the object used to store nodes we have yet to walk. The default object is an instance of Array. Available with Spider is a wrapper of AmazonSQS.

You can implement a custom class for this; any object passed to store_next_urls_with must understand just push and pop .

# default
store_next_urls_with Array.new

# AmazonSQS
require 'spider/next_urls_in_sqs'
store_next_urls_with NextUrlsInSQS.new(AWS_ACCESS_KEY, AWS_SECRET_ACCESS_KEY, queue_name)


93
94
95
96
97
98
99
# File 'lib/spider/spider_instance.rb', line 93

# Swap in a new backing store for the pending-URL queue (anything
# responding to push and pop), migrating already-queued entries into it
# in their original order.
def store_next_urls_with(a_store)
  pending = @next_urls
  @next_urls = a_store
  pending.each { |url_hash| @next_urls.push(url_hash) }
end

#success_or_failure(code) ⇒ Object

:nodoc:



197
198
199
200
201
202
203
# File 'lib/spider/spider_instance.rb', line 197

# Map a numeric HTTP status code to :success (2xx) or :failure
# (everything else).
def success_or_failure(code) #:nodoc:
  (code > 199 && code < 300) ? :success : :failure
end

#teardown(p = nil, &block) ⇒ Object

Run last, once for each page. Given the URL as a string.



142
143
144
# File 'lib/spider/spider_instance.rb', line 142

# Register a hook run after each page is processed; it receives the URL
# as a string. Accepts either a Proc argument or a block.
def teardown(p = nil, &block)
  @teardown = p || block
end