Class: SpiderInstance

Inherits:
Object
  • Object
show all
Defined in:
lib/spider/spider_instance.rb

Defined Under Namespace

Classes: HeaderSetter

Instance Method Summary collapse

Constructor Details

#initialize(next_urls, seen = [], rules = nil, robots_seen = []) ⇒ SpiderInstance

:nodoc:



27
28
29
30
31
32
33
34
35
36
37
38
39
# File 'lib/spider/spider_instance.rb', line 27

def initialize(next_urls, seen = [], rules = nil, robots_seen = []) #:nodoc:
  # Queue of pending work; each entry is a hash of prior URL => next URL(s).
  @next_urls   = [next_urls]
  # Store of already-visited URLs (anything responding to << and include?).
  @seen        = seen
  # robots.txt rule set; built fresh when the caller supplies none.
  @rules       = rules || RobotRules.new("Ruby Spider #{Spider::VERSION}")
  # Hosts whose robots.txt has already been fetched and parsed.
  @robots_seen = robots_seen
  @url_checks  = []
  @callbacks   = {}
  @headers     = {}
  @cache       = :memory
  @setup       = nil
  @teardown    = nil
  @interrupted = false
end

Instance Method Details

#add_url_check(&block) ⇒ Object

Add a predicate that determines whether to continue down this URL’s path. All predicates must be true in order for a URL to proceed.

Takes a block that takes a string and produces a boolean. For example, this will ensure that the URL starts with ‘cashcats.biz’:

add_url_check { |a_url| a_url =~ %r{^http://cashcats.biz.*} }


48
49
50
# File 'lib/spider/spider_instance.rb', line 48

# Register a URL predicate; every registered predicate must return a
# truthy value for a URL to be crawled (see #allowable_url?).
def add_url_check(&block)
  @url_checks.push(block)
end

#allowable_url?(a_url, parsed_url) ⇒ Boolean

:nodoc:

Returns:

  • (Boolean)


205
206
207
208
# File 'lib/spider/spider_instance.rb', line 205

# A URL is crawlable when it parsed successfully, has not been seen
# before, passes robots.txt, and satisfies every user-supplied check.
def allowable_url?(a_url, parsed_url) #:nodoc:
  return false if parsed_url.nil?
  return false if @seen.include?(parsed_url)
  return false unless allowed?(a_url, parsed_url)
  @url_checks.all? { |check| check.call(a_url) }
end

#allowed?(a_url, parsed_url) ⇒ Boolean

True if the robots.txt for that URL allows access to it.

Returns:

  • (Boolean)


211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
# File 'lib/spider/spider_instance.rb', line 211

# True if the robots.txt for that URL allows access to it.
#
# robots.txt is fetched and parsed at most once per host (tracked in
# @robots_seen). An HTTP error fetching it means "no robots.txt" and
# access is allowed; any other failure denies access rather than
# crashing the crawl.
def allowed?(a_url, parsed_url) # :nodoc:
  return false unless ['http','https'].include?(parsed_url.scheme)
  u = "#{parsed_url.scheme}://#{parsed_url.host}:#{parsed_url.port}/robots.txt"
  parsed_u = URI.parse(u)
  return false unless @url_checks.map{|url_check|url_check.call(a_url)}.all?
  begin
    unless @robots_seen.include?(u)
      get_page(parsed_u) do |r|
        @rules.parse(u, r.body)
      end
      @robots_seen << u
    end
    @rules.allowed?(a_url)
  rescue OpenURI::HTTPError
    true # No robots.txt
  rescue Timeout::Error, StandardError
    # Best-effort guard so a broken host cannot crash the crawl. Was
    # `rescue Exception`, which also swallowed SignalException and
    # SystemExit — never rescue Exception.
    false
  end
end

#check_already_seen_with(cacher) ⇒ Object

The Web is a graph; to avoid cycles we store the nodes (URLs) already visited. The Web is a really, really, really big graph; as such, this list of visited nodes grows really, really, really big.

Change the object used to store these seen nodes with this. The default object is an instance of Array. Available with Spider is a wrapper of memcached.

You can implement a custom class for this; any object passed to check_already_seen_with must understand just << and include? .

# default
check_already_seen_with Array.new

# memcached
require 'spider/included_in_memcached'
check_already_seen_with IncludedInMemcached.new('localhost:11211')


69
70
71
72
73
74
75
# File 'lib/spider/spider_instance.rb', line 69

# Swap in a new store for already-visited URLs. The store must respond
# to << (record a URL) and include? (membership test) — exactly the
# two calls made by #get_page and #allowable_url?.
#
# Raises ArgumentError when the object lacks either method. (The old
# message named `included?`, which is not the method actually checked.)
def check_already_seen_with(cacher)
  if cacher.respond_to?(:<<) && cacher.respond_to?(:include?)
    @seen = cacher
  else
    raise ArgumentError, 'expected something that responds to << and include?'
  end
end

#clear_headersObject

Reset the headers hash.



160
161
162
# File 'lib/spider/spider_instance.rb', line 160

# Drop every request header previously set, restoring an empty hash.
def clear_headers
  @headers = Hash.new
end

#construct_complete_url(base_url, additional_url, parsed_additional_url = nil) ⇒ Object

:nodoc:



285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
# File 'lib/spider/spider_instance.rb', line 285

# Resolve +additional_url+ against +base_url+. Links that already carry
# a scheme are returned untouched; scheme-less links are attached to the
# base's scheme and host (root-relative when they start with '/').
# NOTE(review): the base's port is not carried over — preserved from the
# original behavior.
def construct_complete_url(base_url, additional_url, parsed_additional_url = nil) #:nodoc:
  parsed_additional_url ||= URI.parse(additional_url)
  return additional_url unless parsed_additional_url.scheme.nil?

  base = base_url.is_a?(URI) ? base_url : URI.parse(base_url)
  root = "#{base.scheme}://#{base.host}"
  if additional_url[0].chr == '/'
    "#{root}#{additional_url}"
  elsif base.path.nil? || base.path == ''
    "#{root}/#{additional_url}"
  elsif base.path[0].chr == '/'
    "#{root}#{base.path}/#{additional_url}"
  else
    "#{root}/#{base.path}/#{additional_url}"
  end
end

#do_callbacks(a_url, resp, prior_url) ⇒ Object

:nodoc:



256
257
258
259
260
261
262
263
264
# File 'lib/spider/spider_instance.rb', line 256

# Fire the registered callbacks for one fetched page, in order:
# :every, then :success or :failure, then any handler keyed on the
# response's HTTP status code. Missing handlers are skipped.
def do_callbacks(a_url, resp, prior_url) #:nodoc:
  # Net::HTTPResponse#code is a String ("404"), but #on registers
  # Integer triggers under Integer keys, so a plain @callbacks[resp.code]
  # lookup never found them — check both representations.
  status_cb = @callbacks[resp.code] || @callbacks[resp.code.to_i]
  cbs = [@callbacks[:every],
    resp.success? ?  @callbacks[:success] : @callbacks[:failure],
    status_cb]

  cbs.each do |cb|
    cb.call(a_url, resp, prior_url) if cb
  end
end

#generate_next_urls(a_url, resp) ⇒ Object

:nodoc:



266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
# File 'lib/spider/spider_instance.rb', line 266

# Extract every href from the page body and resolve it into an absolute
# URL. The base is the page's <base href> when present, otherwise the
# page URL truncated at its last '/'. Unparseable links are dropped.
def generate_next_urls(a_url, resp) #:nodoc:
  web_page = resp.body
  base_url = (web_page.scan(/base\s+href="(.*?)"/i).flatten +
              [a_url[0,a_url.rindex('/')]])[0]
  base_url = remove_trailing_slash(base_url)
  web_page.scan(/href="(.*?)"/i).flatten.map do |link|
    begin
      parsed_link = URI.parse(link)
      # Skip pure in-page anchors ("#", "#section"). URI#fragment never
      # includes the '#' character itself, so the original test
      # `parsed_link.fragment == '#'` could never match and anchor-only
      # links leaked into the crawl queue.
      if link.start_with?('#')
        nil
      else
        construct_complete_url(base_url, link, parsed_link)
      end
    rescue
      nil
    end
  end.compact
end

#get_page(parsed_url, &block) ⇒ Object

:nodoc:



235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
# File 'lib/spider/spider_instance.rb', line 235

# Fetch +parsed_url+ over HTTP(S) and yield the Net::HTTPResponse to the
# block. Redirects are followed up to +redirect_limit+ hops (new optional
# parameter, default 10) — the original recursed without bound, so a
# redirect cycle would loop forever. Network errors are printed and
# swallowed; the method then returns nil.
# NOTE(review): SSL verification is disabled (VERIFY_NONE) — preserved
# from the original, but worth revisiting.
def get_page(parsed_url, redirect_limit = 10, &block) #:nodoc:
  @seen << parsed_url
  begin
    http = Net::HTTP.new(parsed_url.host, parsed_url.port)
    if parsed_url.scheme == 'https'
      http.use_ssl = true
      http.verify_mode = OpenSSL::SSL::VERIFY_NONE
    end
    # Uses start because http.finish cannot be called.
    r = http.start {|h| h.request(Net::HTTP::Get.new(parsed_url.request_uri, @headers))}
    if r.redirect?
      # Give up once the hop budget is spent instead of recursing forever.
      return nil if redirect_limit <= 0
      get_page(URI.parse(construct_complete_url(parsed_url, r['Location'])),
               redirect_limit - 1, &block)
    else
      block.call(r)
    end
  rescue Timeout::Error, Errno::EINVAL, Errno::ECONNRESET, EOFError => e
    p e
    nil
  end
end

#headersObject

Use like a hash:

headers['Cookies'] = 'user_id=1;password=btrross3'


148
149
150
# File 'lib/spider/spider_instance.rb', line 148

# Use like a hash:
#
#   headers['Cookies'] = 'user_id=1;password=btrross3'
#
# Returns a fresh HeaderSetter wrapping this instance on every call;
# presumably HeaderSetter writes through to @headers via #raw_headers= —
# confirm against the HeaderSetter class.
def headers
  HeaderSetter.new(self)
end

#on(code, p = nil, &block) ⇒ Object

Add a response handler. A response handler’s trigger can be :every, :success, :failure, or any HTTP status code. The handler itself can be either a Proc or a block.

The arguments to the block are: the URL as a string, an instance of Net::HTTPResponse, and the prior URL as a string.

For example:

on 404 do |a_url, resp, prior_url|
  puts "URL not found: #{a_url}"
end

on :success do |a_url, resp, prior_url|
  puts a_url
  puts resp.body
end

on :every do |a_url, resp, prior_url|
  puts "Given this code: #{resp.code}"
end


123
124
125
126
127
128
129
130
131
# File 'lib/spider/spider_instance.rb', line 123

# Register a response handler. +code+ is :every, :success, :failure, or
# an HTTP status code; the handler is the Proc +p+ when given, otherwise
# the attached block. Non-integer triggers are normalized to Symbols.
def on(code, p = nil, &block)
  handler = p || block
  if code.is_a?(Integer)
    @callbacks[code] = handler
  else
    @callbacks[code.to_sym] = handler
  end
end

#raw_headersObject

:nodoc:



152
153
154
# File 'lib/spider/spider_instance.rb', line 152

# The underlying request-header Hash sent with every fetch (see
# #get_page); #headers is the hash-style facade over it.
def raw_headers #:nodoc:
  @headers
end

#raw_headers=(v) ⇒ Object

:nodoc:



155
156
157
# File 'lib/spider/spider_instance.rb', line 155

# Replace the request-header Hash wholesale; +v+ is used as-is (not
# copied) on subsequent fetches.
def raw_headers=(v) #:nodoc:
  @headers = v
end

#remove_trailing_slash(s) ⇒ Object

:nodoc:



304
305
306
# File 'lib/spider/spider_instance.rb', line 304

# Strip any run of trailing '/' characters from +s+, returning a new
# string (the argument is not mutated).
def remove_trailing_slash(s) #:nodoc:
  s.sub(/\/*$/, '')
end

#setup(p = nil, &block) ⇒ Object

Run before the HTTP request. Given the URL as a string.

setup do |a_url|
  headers['Cookies'] = 'user_id=1;admin=true'
end


137
138
139
# File 'lib/spider/spider_instance.rb', line 137

# Register a hook run before each HTTP request; it receives the URL as
# a string. Accepts either a Proc argument or a block.
def setup(p = nil, &block)
  @setup = p || block
end

#start!Object

:nodoc:



164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
# File 'lib/spider/spider_instance.rb', line 164

# Crawl until the pending-URL queue empties or the spider is interrupted
# (SIGINT or #stop!). Each iteration pops one hash of
# {prior_url => url-or-urls}, filters every URL through #allowable_url?,
# fetches it, runs the callbacks, and pushes the page's outbound links
# back onto the queue.
def start! #:nodoc:
  trap("SIGINT") { @interrupted = true }
  begin
    next_urls = @next_urls.pop
    tmp_n_u = {}
    next_urls.each do |prior_url, urls|
      # A hash value may be a single URL string or an Array of them.
      urls = [urls] unless urls.kind_of?(Array)
      urls.map do |a_url|
        # Pair each raw URL with its parse; nil marks an unparseable URL,
        # which allowable_url? then rejects.
        [a_url, (URI.parse(a_url) rescue nil)]
      end.select do |a_url, parsed_url|
        allowable_url?(a_url, parsed_url)
      end.each do |a_url, parsed_url|
        @setup.call(a_url) unless @setup.nil?
        get_page(parsed_url) do |response|
          do_callbacks(a_url, response, prior_url)
          #tmp_n_u[a_url] = generate_next_urls(a_url, response)
          #@next_urls.push tmp_n_u
          generate_next_urls(a_url, response).each do |a_next_url|
            # Queue each link individually, remembering which page it
            # came from (becomes prior_url on the next pass).
            @next_urls.push a_url => a_next_url
          end
          #exit if interrupted
        end
        @teardown.call(a_url) unless @teardown.nil?
        # Stop fetching mid-batch once interrupted; the outer while
        # condition then ends the crawl.
        break if @interrupted
      end
    end
  end while !@next_urls.empty? && !@interrupted
end

#stop!Object

:nodoc:



193
194
195
# File 'lib/spider/spider_instance.rb', line 193

# Flag the crawl to stop; #start! checks @interrupted between pages and
# in its loop condition.
def stop! #:nodoc:
  @interrupted = true
end

#store_next_urls_with(a_store) ⇒ Object

The Web is a really, really, really big graph; as such, this list of nodes to visit grows really, really, really big.

Change the object used to store nodes we have yet to walk. The default object is an instance of Array. Available with Spider is a wrapper of AmazonSQS.

You can implement a custom class for this; any object passed to store_next_urls_with must understand just push and pop .

# default
store_next_urls_with Array.new

# AmazonSQS
require 'spider/next_urls_in_sqs'
store_next_urls_with NextUrlsInSQS.new(AWS_ACCESS_KEY, AWS_SECRET_ACCESS_KEY, queue_name)


93
94
95
96
97
98
99
# File 'lib/spider/spider_instance.rb', line 93

# Swap in a new backing store for the pending-URL queue (anything
# responding to push and pop), migrating already-queued entries into it
# in their original order.
def store_next_urls_with(a_store)
  pending = @next_urls
  @next_urls = a_store
  pending.each { |url_hash| @next_urls.push(url_hash) }
end

#success_or_failure(code) ⇒ Object

:nodoc:



197
198
199
200
201
202
203
# File 'lib/spider/spider_instance.rb', line 197

# Map a numeric HTTP status code to :success (2xx) or :failure
# (everything else).
def success_or_failure(code) #:nodoc:
  (code > 199 && code < 300) ? :success : :failure
end

#teardown(p = nil, &block) ⇒ Object

Run last, once for each page. Given the URL as a string.



142
143
144
# File 'lib/spider/spider_instance.rb', line 142

# Register a hook run after each page is processed; it receives the URL
# as a string. Accepts either a Proc argument or a block.
def teardown(p = nil, &block)
  @teardown = p || block
end