Class: Rwspider::Client

Inherits:
Object
Defined in:
lib/rwspider/client.rb

Constant Summary

DEFAULT_OPTIONS =

follow_HTTP_redirection: set to true to follow HTTP redirections

timeout: the timeout, in seconds, for the analysis of a single URL

{
  :useragent => 'RW Spider/' + Rwspider::VERSION,
  :robot_name => 'rwspider',
  :scan_documents_limit => 100,
  :scan_domain_limit => nil,
  :scan_images => false,
  :scan_other_files => false,
  :follow_robotstxt_directive => true,
  :follow_HTTP_redirection => true,
  :timeout => 5
}
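
A minimal sketch of overriding a few of these defaults when building a client (the keys come from DEFAULT_OPTIONS above; anything left out keeps its default value, assuming the constructor merges the supplied options over the defaults):

opts = {
  :scan_documents_limit => 20,   # stop after 20 downloaded documents
  :scan_images          => true, # also collect image URLs
  :timeout              => 10    # allow up to 10 seconds per URL
}
spider = Rwspider::Client.new(opts)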

Instance Attribute Summary

Instance Method Summary

Constructor Details

#initialize(options = {}) ⇒ Client

Creates a new Client for a crawl job; the available options are described under DEFAULT_OPTIONS.

spider = Rwspider::Client.new(opts)



# File 'lib/rwspider/client.rb', line 83

def initialize(options = {})
  load_options options
  @robotstxt_cache = Hash.new    # cache of fetched robots.txt directives
  @main_hostname = ''
  @scanned_documents = 0
  @queue = Rwspider::Queue.new   # documents waiting to be crawled
end

Instance Attribute Details

#opts ⇒ Object

Hash of options for the spider job



# File 'lib/rwspider/client.rb', line 31

def opts
  @opts
end
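
Because opts is exposed as a plain attribute reader, the option set can be inspected after construction. A small sketch, assuming the constructor merges the supplied options over DEFAULT_OPTIONS:

spider = Rwspider::Client.new(:timeout => 10)
puts spider.opts[:timeout]     # => 10, the overridden value
puts spider.opts[:robot_name]  # => "rwspider", if the defaults were merged in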

Instance Method Details

#start(start_url) ⇒ Object

Starts crawling from the given URL.

Rwspider::Client#start yields an instance of the Rwspider::Document class for each page downloaded and, when execution finishes, returns the collection of all Rwspider::Document instances processed.

spider = Rwspider::Client.new
spider.start('http://www.rwspider.com') do |d|
  puts 'Current URL ' + d.url.normalize.to_s
end

arr = spider.start('http://www.rwspider.com')
arr.each do |d|
  puts 'Current URL ' + d.url.normalize.to_s
end
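
Each yielded document records the outcome of its fetch (the source below sets download_time, as_downloaded and http_response on it). A post-crawl report is then a short sketch, assuming those attributes are readable on Rwspider::Document:

docs = spider.start('http://www.rwspider.com')
docs.each do |d|
  status = d.as_downloaded ? format('%.2fs', d.download_time) : 'failed'
  puts "#{d.url.normalize} (#{status})"
end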


# File 'lib/rwspider/client.rb', line 107

def start(start_url)
  @queue << Rwspider::Document.new(start_url)

  @queue.each do |link|
    @main_url = link.url
    if @opts[:scan_documents_limit].nil? || @scanned_documents < @opts[:scan_documents_limit]
      set_as_visited link
      # Remember the host of the first URL as the crawl's main hostname.
      @main_hostname = link.url.host.downcase if @main_hostname.length == 0

      t = Thread.new(link) do |doc|
        begin
          Timeout.timeout(@opts[:timeout]) do
            beginning = Time.now
            response = get_uri(doc.url)
            doc.download_time = Time.now - beginning
            doc.as_downloaded = true
            doc.http_response = response

            case response
            when Net::HTTPSuccess
              # Parse links only for HTML pages whose host matches the optional domain limit.
              if response.content_type == 'text/html' &&
                 (@opts[:scan_domain_limit].nil? || doc.url.host.downcase.match(@opts[:scan_domain_limit]))
                doc.get_links
                doc.get_images if @opts[:scan_images]
                doc.get_other_files if @opts[:scan_other_files]

                doc.documents.each do |found|
                  add_to_queue found
                end
              end
            when Net::HTTPRedirection
              # Queue the redirect target from the Location header, if redirections are enabled.
              add_to_queue(Document.new(doc.normalize_url(Document.new(response['location']).url))) if @opts[:follow_HTTP_redirection]
            end
          end
        rescue Timeout::Error, StandardError
          doc.as_downloaded = false
        end
        yield doc if block_given?
      end
      t.join
    end
  end

  @queue
end
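
As the source shows, a page's outgoing links are parsed only when its host matches :scan_domain_limit (when that option is set), and the comparison uses String#match, so a Regexp works as the limit. A sketch of pinning a crawl to one domain, with a hypothetical host pattern:

spider = Rwspider::Client.new(:scan_domain_limit => /(^|\.)rwspider\.com$/,
                              :scan_documents_limit => 50)
spider.start('http://www.rwspider.com') do |d|
  puts d.url.normalize
end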