Class: Rwspider::Document
- Inherits:
-
Object
- Object
- Rwspider::Document
- Includes:
- URI
- Defined in:
- lib/rwspider/document.rb
Instance Attribute Summary collapse
-
#as_downloaded ⇒ Object
Returns
true
if the Rwspider::Document::url was downloaded correctly. -
#as_visited ⇒ Object
Returns
true
if the Rwspider::Document::url was requested. -
#documents ⇒ Object
An
Array
of Rwspider::Document found in the HTML code of the current Rwspider::Document. -
#download_time ⇒ Object
The time spent to download the Rwspider::Document.
-
#http_response ⇒ Object
An instance of Net::HTTPResponse that contains the response returned from the web server.
-
#inbound_links ⇒ Object
readonly
An
Array
of String
that contains the URLs of the documents in which a link to the current Rwspider::Document was found. -
#url ⇒ Object
instance of
URI
.
Instance Method Summary collapse
-
#get_images ⇒ Object
was different from ‘text/html’.
-
#get_links ⇒ Object
was different from ‘text/html’.
-
#get_other_files ⇒ Object
was different from ‘text/html’.
-
#initialize(url) ⇒ Document
constructor
doc = Rwspider::Document::new(‘www.rwspider.com’).
-
#normalize_url(var) ⇒ Object
doc.normalize_url(URI.parse(‘/sitemap.html’)).
-
#parse(url) ⇒ Object
doc.parse(‘www.rwspider.com/sitemap.html’).
Constructor Details
#initialize(url) ⇒ Document
doc = Rwspider::Document::new(‘www.rwspider.com’)
49 50 51 52 53 54 55 56 57 58 |
# File 'lib/rwspider/document.rb', line 49
#
# Builds a document for +url+: the value is handed to #parse, the list of
# inbound links and the list of child documents start empty, and the
# document is flagged as not yet visited.
def initialize(url)
  parse(url)

  # [tag name, attribute name] pairs. get_links, get_images and
  # get_other_files read indexes 0, 1 and 2 respectively.
  @tag_type = [
    ['a', 'href'],
    ['img', 'src'],
    ['link', 'href']
  ]

  @inbound_links = []
  @documents = []
  @as_visited = false
end
Instance Attribute Details
#as_downloaded ⇒ Object
Returns true
if the Rwspider::Document::url was downloaded correctly
39 40 41 |
# File 'lib/rwspider/document.rb', line 39 def as_downloaded @as_downloaded end |
#as_visited ⇒ Object
Returns true
if the Rwspider::Document::url was requested
27 28 29 |
# File 'lib/rwspider/document.rb', line 27 def as_visited @as_visited end |
#documents ⇒ Object
An Array
of Rwspider::Document found in the HTML code of the current Rwspider::Document
33 34 35 |
# File 'lib/rwspider/document.rb', line 33 def documents @documents end |
#download_time ⇒ Object
The time spent to download the Rwspider::Document
36 37 38 |
# File 'lib/rwspider/document.rb', line 36 def download_time @download_time end |
#http_response ⇒ Object
An instance of Net::HTTPResponse that contains the response returned from the web server
30 31 32 |
# File 'lib/rwspider/document.rb', line 30 def http_response @http_response end |
#inbound_links ⇒ Object (readonly)
An Array
of String
that contains the URLs of the documents in which a link to the current Rwspider::Document was found
42 43 44 |
# File 'lib/rwspider/document.rb', line 42 def inbound_links @inbound_links end |
#url ⇒ Object
instance of URI
24 25 26 |
# File 'lib/rwspider/document.rb', line 24 def url @url end |
Instance Method Details
#get_images ⇒ Object
was different from ‘text/html’.
101 102 103 |
# File 'lib/rwspider/document.rb', line 101
#
# Delegates to get_document with the tag/attribute pair stored at index 1
# of @tag_type (set to ['img', 'src'] by the constructor).
def get_images
  image_spec = @tag_type[1]
  get_document(image_spec)
end
#get_links ⇒ Object
was different from ‘text/html’.
85 86 87 |
# File 'lib/rwspider/document.rb', line 85
#
# Delegates to get_document with the tag/attribute pair stored at index 0
# of @tag_type (set to ['a', 'href'] by the constructor).
def get_links
  anchor_spec = @tag_type[0]
  get_document(anchor_spec)
end
#get_other_files ⇒ Object
was different from ‘text/html’.
118 119 120 |
# File 'lib/rwspider/document.rb', line 118
#
# Delegates to get_document with the tag/attribute pair stored at index 2
# of @tag_type (set to ['link', 'href'] by the constructor).
def get_other_files
  link_spec = @tag_type[2]
  get_document(link_spec)
end
#normalize_url(var) ⇒ Object
doc.normalize_url(URI.parse(‘/sitemap.html’))
127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 |
# File 'lib/rwspider/document.rb', line 127
#
# Turns a link found in this document into an absolute URL string.
#
#   doc.normalize_url(URI.parse('/sitemap.html'))
#
# @param var [URI::Generic] the parsed link (absolute or relative).
# @return [String, nil] "scheme://host/path?query", or nil when the link
#   is a mailto:/javascript: link or an absolute URI without a host
#   (e.g. tel:), none of which can be rebuilt as scheme://host/path.
#
# Relative links are resolved against this document's own #url: links
# starting with '/' are attached to the host root, all others to the
# directory part of the document's path. The port, user info and fragment
# of either URI are not carried over into the result.
def normalize_url(var)
  scheme = var.scheme
  # Compare both excluded schemes case-insensitively.
  return nil if scheme && %w[mailto javascript].include?(scheme.downcase)

  querystring = var.query.nil? ? '' : '?' + var.query

  unless var.relative?
    # Non-hierarchical URIs (tel:, data:, ...) have no host to build on;
    # previously this raised TypeError on String + nil.
    return nil if var.host.nil?

    return scheme + '://' + var.host + var.path + querystring
  end

  path = var.path
  site_root = url.scheme + '://' + url.host
  return site_root + path + querystring if path.start_with?('/')

  # Directory of the current document, up to and including the last '/'.
  # Falls back to '/' when the document URL has no path at all.
  base_path = url.path.to_s
  last_slash = base_path.rindex('/')
  main_path = last_slash ? base_path[0..last_slash] : '/'

  site_root + main_path + path + querystring
end
#parse(url) ⇒ Object
doc.parse(‘www.rwspider.com/sitemap.html’)
65 66 67 68 69 70 71 |
# File 'lib/rwspider/document.rb', line 65
#
# Parses +url+ and stores the result in @url.
#
#   doc.parse('www.rwspider.com/sitemap.html')
#
# Backslashes are converted to forward slashes before parsing.
#
# @param url [String] the address to parse.
# @return [URI::Generic, nil] the parsed URI, or nil when +url+ cannot be
#   parsed (in which case @url keeps its previous value).
def parse(url)
  @url = URI.parse(url.tr('\\', '/'))
rescue StandardError
  # Best-effort: a malformed or non-String url yields nil. Only
  # StandardError is rescued so that Interrupt, SystemExit and other
  # non-standard exceptions are no longer swallowed.
  nil
end