Class: SpiderInstance

Inherits: Object
Defined in:
lib/spider/spider_instance.rb

Defined Under Namespace

Classes: HeaderSetter

Instance Method Summary

Constructor Details

#initialize(next_urls, seen = [], rules = nil, robots_seen = []) ⇒ SpiderInstance

:nodoc:



# File 'lib/spider/spider_instance.rb', line 27

def initialize(next_urls, seen = [], rules = nil, robots_seen = []) #:nodoc:
  @url_checks  = []
  @cache       = :memory
  @callbacks   = {}
  @next_urls   = [next_urls]
  @seen        = seen
  @rules       = rules || RobotRules.new("Ruby Spider #{Spider::VERSION}")
  @robots_seen = robots_seen
  @headers     = {}
  @setup       = nil
  @teardown    = nil
end

Instance Method Details

#add_url_check(&block) ⇒ Object

Add a predicate that determines whether to continue down this URL’s path. All predicates must be true in order for a URL to proceed.

Takes a block that receives the URL as a string and returns a boolean. For example, this ensures that the URL starts with 'cashcats.biz':

add_url_check { |a_url| a_url =~ %r{^http://cashcats\.biz} }


# File 'lib/spider/spider_instance.rb', line 47

def add_url_check(&block)
  @url_checks << block
end
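
All registered checks must pass, so repeated calls compose as a logical AND. A sketch with two illustrative rules (both invented for this example, not defaults):

add_url_check { |a_url| a_url !~ /\.(pdf|zip|jpg)$/i }  # skip binary-looking links
add_url_check { |a_url| a_url.length < 300 }            # ignore absurdly long URLs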

#allowable_url?(a_url, parsed_url) ⇒ Boolean

:nodoc:

Returns:

  • (Boolean)


# File 'lib/spider/spider_instance.rb', line 201

def allowable_url?(a_url, parsed_url) #:nodoc:
  !parsed_url.nil? && !@seen.include?(parsed_url) && allowed?(a_url, parsed_url) &&
    @url_checks.map{|url_check|url_check.call(a_url)}.all?
end

#allowed?(a_url, parsed_url) ⇒ Boolean

True if the robots.txt for that URL allows access to it.

Returns:

  • (Boolean)


# File 'lib/spider/spider_instance.rb', line 207

def allowed?(a_url, parsed_url) # :nodoc:
  return false unless ['http','https'].include?(parsed_url.scheme)
  u = "#{parsed_url.scheme}://#{parsed_url.host}:#{parsed_url.port}/robots.txt"
  parsed_u = URI.parse(u)
  return false unless @url_checks.map{|url_check|url_check.call(a_url)}.all?
  begin
    unless @robots_seen.include?(u)
      get_page(parsed_u) do |r|
        @rules.parse(u, r.body)
      end
      @robots_seen << u
    end
    @rules.allowed?(a_url)
  rescue OpenURI::HTTPError
    true # No robots.txt
  rescue Exception, Timeout::Error # to keep it from crashing
    false
  end
end

#check_already_seen_with(cacher) ⇒ Object

The Web is a graph; to avoid cycles we store the nodes (URLs) already visited. The Web is a really, really, really big graph; as such, this list of visited nodes grows really, really, really big.

Change the object used to store these seen nodes with this. The default object is an instance of Array. Available with Spider is a wrapper of memcached.

You can implement a custom class for this; any object passed to check_already_seen_with must understand just << and include?.

# default
check_already_seen_with Array.new

# memcached
require 'spider/included_in_memcached'
check_already_seen_with IncludedInMemcached.new('localhost:11211')


# File 'lib/spider/spider_instance.rb', line 68

def check_already_seen_with(cacher)
  if cacher.respond_to?(:<<) && cacher.respond_to?(:include?)
    @seen = cacher
  else
    raise ArgumentError, 'expected something that responds to << and include?'
  end
end
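
For example, a Set-backed store satisfies the same duck type. This class is a hypothetical sketch, not part of Spider:

require 'set'

# Sketch: keeps seen URLs in a Set for fast membership tests.
# Responds to << and include?, as check_already_seen_with requires.
class IncludedInSet
  def initialize
    @set = Set.new
  end

  def <<(v)
    @set << v.to_s
  end

  def include?(v)
    @set.include?(v.to_s)
  end
end

check_already_seen_with IncludedInSet.new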

#clear_headersObject

Reset the headers hash.



# File 'lib/spider/spider_instance.rb', line 159

def clear_headers
  @headers = {}
end

#construct_complete_url(base_url, additional_url, parsed_additional_url = nil) ⇒ Object

:nodoc:



# File 'lib/spider/spider_instance.rb', line 281

def construct_complete_url(base_url, additional_url, parsed_additional_url = nil) #:nodoc:
  parsed_additional_url ||= URI.parse(additional_url)
  case parsed_additional_url.scheme
  when nil
    u = base_url.is_a?(URI) ? base_url : URI.parse(base_url)
    if additional_url[0].chr == '/'
      "#{u.scheme}://#{u.host}#{additional_url}"
    elsif u.path.nil? || u.path == ''
      "#{u.scheme}://#{u.host}/#{additional_url}"
    elsif u.path[0].chr == '/'
      "#{u.scheme}://#{u.host}#{u.path}/#{additional_url}"
    else
      "#{u.scheme}://#{u.host}/#{u.path}/#{additional_url}"
    end
  else
    additional_url
  end
end
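
To illustrate the branches above (the URLs are hypothetical):

construct_complete_url('http://example.com/articles', 'page2.html')
# => "http://example.com/articles/page2.html"
construct_complete_url('http://example.com/articles', '/about')
# => "http://example.com/about"
construct_complete_url('http://example.com', 'https://other.site/x')
# => "https://other.site/x"   (absolute URLs pass through unchanged)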

#do_callbacks(a_url, resp, prior_url) ⇒ Object

:nodoc:



# File 'lib/spider/spider_instance.rb', line 252

def do_callbacks(a_url, resp, prior_url) #:nodoc:
  cbs = [@callbacks[:every],
    resp.success? ?  @callbacks[:success] : @callbacks[:failure],
    @callbacks[resp.code]]

  cbs.each do |cb|
    cb.call(a_url, resp, prior_url) if cb
  end
end

#generate_next_urls(a_url, resp) ⇒ Object

:nodoc:



# File 'lib/spider/spider_instance.rb', line 262

def generate_next_urls(a_url, resp) #:nodoc:
  web_page = resp.body
  base_url = (web_page.scan(/base\s+href="(.*?)"/i).flatten +
              [a_url[0,a_url.rindex('/')]])[0]
  base_url = remove_trailing_slash(base_url)
  web_page.scan(/href="(.*?)"/i).flatten.map do |link|
    begin
      parsed_link = URI.parse(link)
      if parsed_link.fragment == '#'
        nil
      else
        construct_complete_url(base_url, link, parsed_link)
      end
    rescue
      nil
    end
  end.compact
end

#get_page(parsed_url, &block) ⇒ Object

:nodoc:



# File 'lib/spider/spider_instance.rb', line 231

def get_page(parsed_url, &block) #:nodoc:
  @seen << parsed_url
  begin
    http = Net::HTTP.new(parsed_url.host, parsed_url.port)
    if parsed_url.scheme == 'https'
      http.use_ssl = true
      http.verify_mode = OpenSSL::SSL::VERIFY_NONE
    end
    # Use the block form of start so the connection is closed automatically
    # (no explicit call to finish is needed).
    r = http.start {|h| h.request(Net::HTTP::Get.new(parsed_url.request_uri, @headers))}
    if r.redirect?
      get_page(URI.parse(construct_complete_url(parsed_url,r['Location'])), &block)
    else
      block.call(r)
    end
  rescue Timeout::Error, Errno::EINVAL, Errno::ECONNRESET, EOFError => e
    p e
    nil
  end
end

#headersObject

Use like a hash:

headers['Cookies'] = 'user_id=1;password=btrross3'


# File 'lib/spider/spider_instance.rb', line 147

def headers
  HeaderSetter.new(self)
end
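
Headers set this way are sent with every subsequent request (get_page passes them to Net::HTTP::Get), and clear_headers resets them. For example:

headers['User-Agent'] = "Ruby Spider #{Spider::VERSION}"
headers['Cookies']    = 'user_id=1;password=btrross3'
clear_headers  # drop everything set so far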

#on(code, p = nil, &block) ⇒ Object

Add a response handler. A response handler’s trigger can be :every, :success, :failure, or any HTTP status code. The handler itself can be either a Proc or a block.

The arguments to the block are: the URL as a string, an instance of Net::HTTPResponse, and the prior URL as a string.

For example:

on 404 do |a_url, resp, prior_url|
  puts "URL not found: #{a_url}"
end

on :success do |a_url, resp, prior_url|
  puts a_url
  puts resp.body
end

on :every do |a_url, resp, prior_url|
  puts "Given this code: #{resp.code}"
end


# File 'lib/spider/spider_instance.rb', line 122

def on(code, p = nil, &block)
  f = p ? p : block
  case code
  when Integer
    @callbacks[code] = f
  else
    @callbacks[code.to_sym] = f
  end
end

#raw_headersObject

:nodoc:



# File 'lib/spider/spider_instance.rb', line 151

def raw_headers #:nodoc:
  @headers
end

#raw_headers=(v) ⇒ Object

:nodoc:



# File 'lib/spider/spider_instance.rb', line 154

def raw_headers=(v) #:nodoc:
  @headers = v
end

#remove_trailing_slash(s) ⇒ Object

:nodoc:



# File 'lib/spider/spider_instance.rb', line 300

def remove_trailing_slash(s) #:nodoc:
  s.sub(%r{/*$},'')
end

#setup(p = nil, &block) ⇒ Object

Run before the HTTP request. Given the URL as a string.

setup do |a_url|
  headers['Cookies'] = 'user_id=1;admin=true'
end


# File 'lib/spider/spider_instance.rb', line 136

def setup(p = nil, &block)
  @setup = p ? p : block
end

#start!Object

:nodoc:



# File 'lib/spider/spider_instance.rb', line 163

def start! #:nodoc:
  interrupted = false
  trap("SIGINT") { interrupted = true }
  begin
    next_urls = @next_urls.pop
    tmp_n_u = {}
    next_urls.each do |prior_url, urls|
      urls = [urls] unless urls.kind_of?(Array)
      urls.map do |a_url|
        [a_url, (URI.parse(a_url) rescue nil)]
      end.select do |a_url, parsed_url|
        allowable_url?(a_url, parsed_url)
      end.each do |a_url, parsed_url|
        @setup.call(a_url) unless @setup.nil?
        get_page(parsed_url) do |response|
          do_callbacks(a_url, response, prior_url)
          generate_next_urls(a_url, response).each do |a_next_url|
            @next_urls.push a_url => a_next_url
          end
        end
        @teardown.call(a_url) unless @teardown.nil?
        exit if interrupted
      end
    end
  end while !@next_urls.empty?
end
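
start! is normally invoked for you by the Spider front end rather than called directly. A minimal end-to-end session, assuming the gem's usual Spider.start_at entry point (the URL and rules are illustrative):

require 'spider'

Spider.start_at('http://cashcats.biz/') do |s|
  # Stay within the starting site.
  s.add_url_check { |a_url| a_url =~ %r{^http://cashcats\.biz} }

  # Print every successfully fetched URL with its status code.
  s.on(:success) do |a_url, resp, prior_url|
    puts "#{resp.code} #{a_url}"
  end
end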

#store_next_urls_with(a_store) ⇒ Object

The Web is a really, really, really big graph; as such, this list of nodes to visit grows really, really, really big.

Change the object used to store nodes we have yet to walk. The default object is an instance of Array. Available with Spider is a wrapper of AmazonSQS.

You can implement a custom class for this; any object passed to store_next_urls_with must understand push, pop, and empty? (the crawl loop in start! also checks emptiness).

# default
store_next_urls_with Array.new

# AmazonSQS
require 'spider/next_urls_in_sqs'
store_next_urls_with NextUrlsInSQS.new(AWS_ACCESS_KEY, AWS_SECRET_ACCESS_KEY, queue_name)


# File 'lib/spider/spider_instance.rb', line 92

def store_next_urls_with(a_store)
  tmp_next_urls = @next_urls
  @next_urls = a_store
  tmp_next_urls.each do |a_url_hash|
    @next_urls.push a_url_hash
  end
end
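
A hypothetical custom store (a sketch, not part of Spider) that logs queue activity while delegating to an Array:

# Sketch: implements push, pop, and empty?, as the crawl loop requires.
class LoggingUrlStore
  def initialize
    @queue = []
  end

  def push(url_hash)
    puts "queued #{url_hash.inspect}"
    @queue.push(url_hash)
  end

  def pop
    @queue.pop
  end

  def empty?
    @queue.empty?
  end
end

store_next_urls_with LoggingUrlStore.new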

#success_or_failure(code) ⇒ Object

:nodoc:



# File 'lib/spider/spider_instance.rb', line 193

def success_or_failure(code) #:nodoc:
  if code > 199 && code < 300
    :success
  else
    :failure
  end
end

#teardown(p = nil, &block) ⇒ Object

Run last, once for each page. Given the URL as a string.



# File 'lib/spider/spider_instance.rb', line 141

def teardown(p = nil, &block)
  @teardown = p ? p : block
end
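
For example, to log each page as the spider finishes with it:

teardown do |a_url|
  puts "finished #{a_url}"
end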