Class: SpiderInstance

Inherits: Object
Defined in:
lib/spider/spider_instance.rb

Defined Under Namespace

Classes: HeaderSetter

Instance Method Summary

Constructor Details

#initialize(next_urls, seen = [], rules = nil, robots_seen = []) ⇒ SpiderInstance

:nodoc:



# File 'lib/spider/spider_instance.rb', line 27

def initialize(next_urls, seen = [], rules = nil, robots_seen = []) #:nodoc:
  @url_checks  = []
  @cache       = :memory
  @callbacks   = {}
  @next_urls   = [next_urls]
  @seen        = seen
  @rules       = rules || RobotRules.new("Ruby Spider #{Spider::VERSION}")
  @robots_seen = robots_seen
  @headers     = {}
  @setup       = nil
  @teardown    = nil
end

Instance Method Details

#add_url_check(&block) ⇒ Object

Add a predicate that determines whether to continue down this URL’s path. All predicates must be true in order for a URL to proceed.

Takes a block that receives the URL as a string and returns a boolean. For example, this ensures that the URL starts with 'cashcats.biz':

add_url_check { |a_url| a_url =~ %r{^http://cashcats\.biz} }


# File 'lib/spider/spider_instance.rb', line 47

def add_url_check(&block)
  @url_checks << block
end
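
All registered checks must pass, so repeated calls compose as a logical AND. A sketch with two illustrative rules (both invented for this example, not defaults):

add_url_check { |a_url| a_url !~ /\.(pdf|zip|jpg)$/i }  # skip binary-looking links
add_url_check { |a_url| a_url.length < 300 }            # ignore absurdly long URLs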

#allowable_url?(a_url, parsed_url) ⇒ Boolean

:nodoc:

Returns:

  • (Boolean)


# File 'lib/spider/spider_instance.rb', line 201

def allowable_url?(a_url, parsed_url) #:nodoc:
  !parsed_url.nil? && !@seen.include?(parsed_url) && allowed?(a_url, parsed_url) &&
    @url_checks.map{|url_check|url_check.call(a_url)}.all?
end

#allowed?(a_url, parsed_url) ⇒ Boolean

True if the robots.txt for that URL allows access to it.

Returns:

  • (Boolean)


# File 'lib/spider/spider_instance.rb', line 207

def allowed?(a_url, parsed_url) # :nodoc:
  return false unless ['http','https'].include?(parsed_url.scheme)
  u = "#{parsed_url.scheme}://#{parsed_url.host}:#{parsed_url.port}/robots.txt"
  parsed_u = URI.parse(u)
  return false unless @url_checks.map{|url_check|url_check.call(a_url)}.all?
  begin
    unless @robots_seen.include?(u)
      get_page(parsed_u) do |r|
        @rules.parse(u, r.body)
      end
      @robots_seen << u
    end
    @rules.allowed?(a_url)
  rescue OpenURI::HTTPError
    true # No robots.txt
  rescue Exception, Timeout::Error # to keep it from crashing
    false
  end
end

#check_already_seen_with(cacher) ⇒ Object

The Web is a graph; to avoid cycles we store the nodes (URLs) already visited. The Web is a really, really, really big graph; as such, this list of visited nodes grows really, really, really big.

Change the object used to store these seen nodes with this. The default object is an instance of Array. Available with Spider is a wrapper of memcached.

You can implement a custom class for this; any object passed to check_already_seen_with must understand just << and include?.

# default
check_already_seen_with Array.new

# memcached
require 'spider/included_in_memcached'
check_already_seen_with IncludedInMemcached.new('localhost:11211')


# File 'lib/spider/spider_instance.rb', line 68

def check_already_seen_with(cacher)
  if cacher.respond_to?(:<<) && cacher.respond_to?(:include?)
    @seen = cacher
  else
    raise ArgumentError, 'expected something that responds to << and include?'
  end
end
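
For example, a Set-backed store satisfies the same duck type. This class is a hypothetical sketch, not part of Spider:

require 'set'

# Sketch: keeps seen URLs in a Set for fast membership tests.
# Responds to << and include?, as check_already_seen_with requires.
class IncludedInSet
  def initialize
    @set = Set.new
  end

  def <<(v)
    @set << v.to_s
  end

  def include?(v)
    @set.include?(v.to_s)
  end
end

check_already_seen_with IncludedInSet.new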

#clear_headersObject

Reset the headers hash.



# File 'lib/spider/spider_instance.rb', line 159

def clear_headers
  @headers = {}
end

#construct_complete_url(base_url, additional_url, parsed_additional_url = nil) ⇒ Object

:nodoc:



# File 'lib/spider/spider_instance.rb', line 281

def construct_complete_url(base_url, additional_url, parsed_additional_url = nil) #:nodoc:
  parsed_additional_url ||= URI.parse(additional_url)
  case parsed_additional_url.scheme
  when nil
    u = base_url.is_a?(URI) ? base_url : URI.parse(base_url)
    if additional_url[0].chr == '/'
      "#{u.scheme}://#{u.host}#{additional_url}"
    elsif u.path.nil? || u.path == ''
      "#{u.scheme}://#{u.host}/#{additional_url}"
    elsif u.path[0].chr == '/'
      "#{u.scheme}://#{u.host}#{u.path}/#{additional_url}"
    else
      "#{u.scheme}://#{u.host}/#{u.path}/#{additional_url}"
    end
  else
    additional_url
  end
end
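
To illustrate the branches above (the URLs are hypothetical):

construct_complete_url('http://example.com/articles', 'page2.html')
# => "http://example.com/articles/page2.html"
construct_complete_url('http://example.com/articles', '/about')
# => "http://example.com/about"
construct_complete_url('http://example.com', 'https://other.site/x')
# => "https://other.site/x"   (absolute URLs pass through unchanged)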

#do_callbacks(a_url, resp, prior_url) ⇒ Object

:nodoc:



# File 'lib/spider/spider_instance.rb', line 252

def do_callbacks(a_url, resp, prior_url) #:nodoc:
  cbs = [@callbacks[:every],
    resp.success? ?  @callbacks[:success] : @callbacks[:failure],
    @callbacks[resp.code]]

  cbs.each do |cb|
    cb.call(a_url, resp, prior_url) if cb
  end
end

#generate_next_urls(a_url, resp) ⇒ Object

:nodoc:



# File 'lib/spider/spider_instance.rb', line 262

def generate_next_urls(a_url, resp) #:nodoc:
  web_page = resp.body
  base_url = (web_page.scan(/base\s+href="(.*?)"/i).flatten +
              [a_url[0,a_url.rindex('/')]])[0]
  base_url = remove_trailing_slash(base_url)
  web_page.scan(/href="(.*?)"/i).flatten.map do |link|
    begin
      parsed_link = URI.parse(link)
      if parsed_link.fragment == '#'
        nil
      else
        construct_complete_url(base_url, link, parsed_link)
      end
    rescue
      nil
    end
  end.compact
end

#get_page(parsed_url, &block) ⇒ Object

:nodoc:



# File 'lib/spider/spider_instance.rb', line 231

def get_page(parsed_url, &block) #:nodoc:
  @seen << parsed_url
  begin
    http = Net::HTTP.new(parsed_url.host, parsed_url.port)
    if parsed_url.scheme == 'https'
      http.use_ssl = true
      http.verify_mode = OpenSSL::SSL::VERIFY_NONE
    end
    # Use the block form of start so the connection is closed automatically
    # (no explicit call to finish is needed).
    r = http.start {|h| h.request(Net::HTTP::Get.new(parsed_url.request_uri, @headers))}
    if r.redirect?
      get_page(URI.parse(construct_complete_url(parsed_url,r['Location'])), &block)
    else
      block.call(r)
    end
  rescue Timeout::Error, Errno::EINVAL, Errno::ECONNRESET, EOFError => e
    p e
    nil
  end
end

#headersObject

Use like a hash:

headers['Cookies'] = 'user_id=1;password=btrross3'


# File 'lib/spider/spider_instance.rb', line 147

def headers
  HeaderSetter.new(self)
end
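
Headers set this way are sent with every subsequent request (get_page passes them to Net::HTTP::Get), and clear_headers resets them. For example:

headers['User-Agent'] = "Ruby Spider #{Spider::VERSION}"
headers['Cookies']    = 'user_id=1;password=btrross3'
clear_headers  # drop everything set so far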

#on(code, p = nil, &block) ⇒ Object

Add a response handler. A response handler’s trigger can be :every, :success, :failure, or any HTTP status code. The handler itself can be either a Proc or a block.

The arguments to the block are: the URL as a string, an instance of Net::HTTPResponse, and the prior URL as a string.

For example:

on 404 do |a_url, resp, prior_url|
  puts "URL not found: #{a_url}"
end

on :success do |a_url, resp, prior_url|
  puts a_url
  puts resp.body
end

on :every do |a_url, resp, prior_url|
  puts "Given this code: #{resp.code}"
end


# File 'lib/spider/spider_instance.rb', line 122

def on(code, p = nil, &block)
  f = p ? p : block
  case code
  when Integer
    @callbacks[code] = f
  else
    @callbacks[code.to_sym] = f
  end
end

#raw_headersObject

:nodoc:



# File 'lib/spider/spider_instance.rb', line 151

def raw_headers #:nodoc:
  @headers
end

#raw_headers=(v) ⇒ Object

:nodoc:



# File 'lib/spider/spider_instance.rb', line 154

def raw_headers=(v) #:nodoc:
  @headers = v
end

#remove_trailing_slash(s) ⇒ Object

:nodoc:



# File 'lib/spider/spider_instance.rb', line 300

def remove_trailing_slash(s) #:nodoc:
  s.sub(%r{/*$},'')
end

#setup(p = nil, &block) ⇒ Object

Run before the HTTP request. Given the URL as a string.

setup do |a_url|
  headers['Cookies'] = 'user_id=1;admin=true'
end


# File 'lib/spider/spider_instance.rb', line 136

def setup(p = nil, &block)
  @setup = p ? p : block
end

#start!Object

:nodoc:



# File 'lib/spider/spider_instance.rb', line 163

def start! #:nodoc:
  interrupted = false
  trap("SIGINT") { interrupted = true }
  begin
    next_urls = @next_urls.pop
    tmp_n_u = {}
    next_urls.each do |prior_url, urls|
      urls = [urls] unless urls.kind_of?(Array)
      urls.map do |a_url|
        [a_url, (URI.parse(a_url) rescue nil)]
      end.select do |a_url, parsed_url|
        allowable_url?(a_url, parsed_url)
      end.each do |a_url, parsed_url|
        @setup.call(a_url) unless @setup.nil?
        get_page(parsed_url) do |response|
          do_callbacks(a_url, response, prior_url)
          generate_next_urls(a_url, response).each do |a_next_url|
            @next_urls.push a_url => a_next_url
          end
        end
        @teardown.call(a_url) unless @teardown.nil?
        exit if interrupted
      end
    end
  end while !@next_urls.empty?
end
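
start! is normally invoked for you by the Spider front end rather than called directly. A minimal end-to-end session, assuming the gem's usual Spider.start_at entry point (the URL and rules are illustrative):

require 'spider'

Spider.start_at('http://cashcats.biz/') do |s|
  # Stay within the starting site.
  s.add_url_check { |a_url| a_url =~ %r{^http://cashcats\.biz} }

  # Print every successfully fetched URL with its status code.
  s.on(:success) do |a_url, resp, prior_url|
    puts "#{resp.code} #{a_url}"
  end
end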

#store_next_urls_with(a_store) ⇒ Object

The Web is a really, really, really big graph; as such, this list of nodes to visit grows really, really, really big.

Change the object used to store nodes we have yet to walk. The default object is an instance of Array. Available with Spider is a wrapper of AmazonSQS.

You can implement a custom class for this; any object passed to store_next_urls_with must understand push, pop, and empty? (the crawl loop in start! also checks emptiness).

# default
store_next_urls_with Array.new

# AmazonSQS
require 'spider/next_urls_in_sqs'
store_next_urls_with NextUrlsInSQS.new(AWS_ACCESS_KEY, AWS_SECRET_ACCESS_KEY, queue_name)


# File 'lib/spider/spider_instance.rb', line 92

def store_next_urls_with(a_store)
  tmp_next_urls = @next_urls
  @next_urls = a_store
  tmp_next_urls.each do |a_url_hash|
    @next_urls.push a_url_hash
  end
end
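
A hypothetical custom store (a sketch, not part of Spider) that logs queue activity while delegating to an Array:

# Sketch: implements push, pop, and empty?, as the crawl loop requires.
class LoggingUrlStore
  def initialize
    @queue = []
  end

  def push(url_hash)
    puts "queued #{url_hash.inspect}"
    @queue.push(url_hash)
  end

  def pop
    @queue.pop
  end

  def empty?
    @queue.empty?
  end
end

store_next_urls_with LoggingUrlStore.new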

#success_or_failure(code) ⇒ Object

:nodoc:



# File 'lib/spider/spider_instance.rb', line 193

def success_or_failure(code) #:nodoc:
  if code > 199 && code < 300
    :success
  else
    :failure
  end
end

#teardown(p = nil, &block) ⇒ Object

Run last, once for each page. Given the URL as a string.



# File 'lib/spider/spider_instance.rb', line 141

def teardown(p = nil, &block)
  @teardown = p ? p : block
end
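
For example, to log each page as the spider finishes with it:

teardown do |a_url|
  puts "finished #{a_url}"
end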