Class: Rwspider::Client
- Inherits:
-
Object
- Object
- Rwspider::Client
- Defined in:
- lib/rwspider/client.rb
Constant Summary collapse
- DEFAULT_OPTIONS =
follow_HTTP_redirection: Set to
true
to follow HTTP redirections. timeout: The timeout of a single URL analysis.
{ :useragent => 'RW Spider/' + Rwspider::VERSION, :robot_name => 'rwspider', :scan_documents_limit => 100, :scan_domain_limit => nil, :scan_images => false, :scan_other_files => false, :follow_robotstxt_directive => true, :follow_HTTP_redirection => true, :timeout => 5 }
Instance Attribute Summary collapse
-
#opts ⇒ Object
Hash of options for the spider job.
Instance Method Summary collapse
-
#initialize(options = {}) ⇒ Client
constructor
spider = Rwspider::Client::new(opts).
-
#start(start_url) ⇒ Object
Start the crawling from the
URL
.
Constructor Details
Instance Attribute Details
#opts ⇒ Object
Hash of options for the spider job
# Hash of options for the spider job.
#
# @return [Hash] the options currently configured for this client
def opts
  @opts
end
Instance Method Details
#start(start_url) ⇒ Object
Start the crawling from the URL
.
Rwspider::Client::start yields an instance of the Rwspider::Document class for each page downloaded. At the end of execution it returns an Array
of Rwspider::Document instances.
Rwspider::Client::start('http://www.rwspider.com') { |d|
puts 'Current URL ' + d.url.normalize.to_s
}
arr = Rwspider::Client::start('http://www.rwspider.com')
arr.each { |d|
puts 'Current URL ' + d.url.normalize.to_s
}
# Starts the crawl from +start_url+.
#
# Each downloaded page is yielded to the caller's block (if one is given)
# as a Rwspider::Document. The crawl queue grows while it is iterated:
# add_to_queue appends new documents and Array#each picks them up, which
# is how the crawl advances beyond the first URL.
#
# @param start_url [String] the URL the crawl begins from
# @return [Array<Rwspider::Document>] every document enqueued during the crawl
def start(start_url)
  @queue << Rwspider::Document.new(start_url)
  @queue.each do |link|
    @main_url = link.url
    # Respect the configured document limit (nil means unlimited).
    # NOTE(review): @scanned_documents is presumably maintained by
    # set_as_visited — confirm against lib/rwspider/client.rb.
    if @opts[:scan_documents_limit].nil? || @scanned_documents < @opts[:scan_documents_limit]
      set_as_visited link
      # Remember the first hostname seen; it becomes the crawl's main host.
      @main_hostname = link.url.host.downcase if @main_hostname.empty?
      # Pass the document into the thread explicitly so the block parameter
      # gets its own binding instead of shadowing the loop variable.
      worker = Thread.new(link) do |doc|
        begin
          Timeout.timeout(@opts[:timeout]) do
            started_at = Time.now
            response = get_uri(doc.url)
            doc.download_time = Time.now - started_at
            doc.as_downloaded = true
            doc.http_response = response
            case response
            when Net::HTTPSuccess
              # Only parse HTML, and only when the host matches the optional
              # domain restriction.
              if response.content_type == 'text/html' &&
                 (@opts[:scan_domain_limit].nil? || doc.url.host.downcase.match(@opts[:scan_domain_limit]))
                doc.get_links
                doc.get_images if @opts[:scan_images]
                doc.get_other_files if @opts[:scan_other_files]
                doc.documents.each do |found|
                  add_to_queue found
                end
              end
            when Net::HTTPRedirection
              # Follow the Location header when redirection following is on.
              add_to_queue(Document.new(doc.normalize_url(Document.new(response['location']).url))) if @opts[:follow_HTTP_redirection]
            end
          end
        rescue Timeout::Error
          doc.as_downloaded = false
        rescue StandardError
          # Any fetch/parse failure marks the document as not downloaded;
          # the crawl itself keeps going.
          doc.as_downloaded = false
        end
        yield doc if block_given?
      end
      # The worker is joined immediately, so downloads are sequential; the
      # thread exists only to bound each fetch with its own Timeout scope.
      worker.join
    end
  end
  @queue
end
# File 'lib/rwspider/client.rb', line 107 def start (start_url) @queue << Rwspider::Document.new(start_url) @queue.each do |link| @main_url = link.url if @opts[:scan_documents_limit].nil? || @scanned_documents < @opts[:scan_documents_limit] set_as_visited link @main_hostname = link.url.host.downcase if @main_hostname.length == 0 t = Thread.new(link) { |link| begin Timeout::timeout(@opts[:timeout]){ beginning = Time.now response = get_uri(link.url) link.download_time = Time.now - beginning link.as_downloaded = true link.http_response = response case response when Net::HTTPSuccess then if response.content_type == 'text/html' && (@opts[:scan_domain_limit].nil? || link.url.host.downcase.match(@opts[:scan_domain_limit]) ) link.get_links link.get_images if @opts[:scan_images] link.get_other_files if @opts[:scan_other_files] link.documents.each do |doc| add_to_queue doc end end when Net::HTTPRedirection then add_to_queue(Document.new(link.normalize_url(Document.new(response['location']).url))) if @opts[:follow_HTTP_redirection] end } rescue Timeout::Error => e link.as_downloaded = false rescue StandardError => e link.as_downloaded = false end yield link if block_given? } t.join end end return @queue end |