Class: Rwspider::Client

Inherits:
Object
Defined in:
lib/rwspider/client.rb

Constant Summary

DEFAULT_OPTIONS =

follow_HTTP_redirection: set to true to follow HTTP redirections

timeout: the timeout, in seconds, for the analysis of a single URL

{
  :useragent => 'RW Spider/' + Rwspider::VERSION,
  :robot_name => 'rwspider',
  :scan_documents_limit => 100,
  :scan_domain_limit => nil,
  :scan_images => false,
  :scan_other_files => false,
  :follow_robotstxt_directive => true,
  :follow_HTTP_redirection => true,
  :timeout => 5
}
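
A minimal sketch of overriding a few of these defaults when building a client (the keys come from DEFAULT_OPTIONS above; anything left out keeps its default value, assuming the constructor merges the supplied options over the defaults):

opts = {
  :scan_documents_limit => 20,   # stop after 20 downloaded documents
  :scan_images          => true, # also collect image URLs
  :timeout              => 10    # allow up to 10 seconds per URL
}
spider = Rwspider::Client.new(opts)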

Instance Attribute Summary

Instance Method Summary

Constructor Details

#initialize(options = {}) ⇒ Client

Creates a new Client for a crawl job; the available options are described under DEFAULT_OPTIONS.

spider = Rwspider::Client.new(opts)



# File 'lib/rwspider/client.rb', line 83

def initialize(options = {})
  load_options options
  @robotstxt_cache = Hash.new    # cache of fetched robots.txt directives
  @main_hostname = ''
  @scanned_documents = 0
  @queue = Rwspider::Queue.new   # documents waiting to be crawled
end

Instance Attribute Details

#opts ⇒ Object

Hash of options for the spider job



# File 'lib/rwspider/client.rb', line 31

def opts
  @opts
end
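
Because opts is exposed as a plain attribute reader, the option set can be inspected after construction. A small sketch, assuming the constructor merges the supplied options over DEFAULT_OPTIONS:

spider = Rwspider::Client.new(:timeout => 10)
puts spider.opts[:timeout]     # => 10, the overridden value
puts spider.opts[:robot_name]  # => "rwspider", if the defaults were merged in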

Instance Method Details

#start(start_url) ⇒ Object

Starts crawling from the given URL.

Rwspider::Client#start yields an instance of the Rwspider::Document class for each page downloaded and, when execution finishes, returns the collection of all Rwspider::Document instances processed.

spider = Rwspider::Client.new
spider.start('http://www.rwspider.com') do |d|
  puts 'Current URL ' + d.url.normalize.to_s
end

arr = spider.start('http://www.rwspider.com')
arr.each do |d|
  puts 'Current URL ' + d.url.normalize.to_s
end
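
Each yielded document records the outcome of its fetch (the source below sets download_time, as_downloaded and http_response on it). A post-crawl report is then a short sketch, assuming those attributes are readable on Rwspider::Document:

docs = spider.start('http://www.rwspider.com')
docs.each do |d|
  status = d.as_downloaded ? format('%.2fs', d.download_time) : 'failed'
  puts "#{d.url.normalize} (#{status})"
end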


# File 'lib/rwspider/client.rb', line 107

def start(start_url)
  @queue << Rwspider::Document.new(start_url)

  @queue.each do |link|
    @main_url = link.url
    if @opts[:scan_documents_limit].nil? || @scanned_documents < @opts[:scan_documents_limit]
      set_as_visited link
      # Remember the host of the first URL as the crawl's main hostname.
      @main_hostname = link.url.host.downcase if @main_hostname.length == 0

      t = Thread.new(link) do |doc|
        begin
          Timeout.timeout(@opts[:timeout]) do
            beginning = Time.now
            response = get_uri(doc.url)
            doc.download_time = Time.now - beginning
            doc.as_downloaded = true
            doc.http_response = response

            case response
            when Net::HTTPSuccess
              # Parse links only for HTML pages whose host matches the optional domain limit.
              if response.content_type == 'text/html' &&
                 (@opts[:scan_domain_limit].nil? || doc.url.host.downcase.match(@opts[:scan_domain_limit]))
                doc.get_links
                doc.get_images if @opts[:scan_images]
                doc.get_other_files if @opts[:scan_other_files]

                doc.documents.each do |found|
                  add_to_queue found
                end
              end
            when Net::HTTPRedirection
              # Queue the redirect target from the Location header, if redirections are enabled.
              add_to_queue(Document.new(doc.normalize_url(Document.new(response['location']).url))) if @opts[:follow_HTTP_redirection]
            end
          end
        rescue Timeout::Error, StandardError
          doc.as_downloaded = false
        end
        yield doc if block_given?
      end
      t.join
    end
  end

  @queue
end
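
As the source shows, a page's outgoing links are parsed only when its host matches :scan_domain_limit (when that option is set), and the comparison uses String#match, so a Regexp works as the limit. A sketch of pinning a crawl to one domain, with a hypothetical host pattern:

spider = Rwspider::Client.new(:scan_domain_limit => /(^|\.)rwspider\.com$/,
                              :scan_documents_limit => 50)
spider.start('http://www.rwspider.com') do |d|
  puts d.url.normalize
end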