Class: Sunbro::Page
- Inherits:
-
Object
- Object
- Sunbro::Page
- Defined in:
- lib/sunbro/page.rb
Instance Attribute Summary collapse
-
#body ⇒ Object
readonly
The raw HTTP response body of the page.
-
#code ⇒ Object
Integer response code of the page.
-
#depth ⇒ Object
Depth of this page from the root of the crawl.
-
#error ⇒ Object
readonly
Exception object, if one was raised during HTTP#fetch_page.
-
#headers ⇒ Object
readonly
Headers of the HTTP response.
-
#redirect_from ⇒ Object
Returns the value of attribute redirect_from.
-
#redirect_to ⇒ Object
readonly
URL of the page this one redirected to, if any.
-
#referer ⇒ Object
URL of the page that brought us to this page.
-
#response_time ⇒ Object
Response time of the request for this page in milliseconds.
-
#url ⇒ Object
The URL of the page.
-
#visited ⇒ Object
Boolean indicating whether or not this page has been visited in PageStore#shortest_paths!.
Class Method Summary collapse
Instance Method Summary collapse
-
#base ⇒ Object
Base URI from the HTML doc head element www.w3.org/TR/html4/struct/links.html#edef-BASE.
-
#content_type ⇒ Object
The content-type returned by the HTTP request for this page.
-
#cookies ⇒ Object
Array of cookies received with this page as WEBrick::Cookie objects.
-
#discard_doc! ⇒ Object
Delete the Nokogiri document and response body to conserve memory.
-
#doc ⇒ Object
Nokogiri document for the HTML body.
-
#fetched? ⇒ Boolean
Was the page successfully fetched?
true if the page was fetched with no error, false otherwise. -
#html? ⇒ Boolean
Returns
true if the page is an HTML document, returns false otherwise. -
#image? ⇒ Boolean
Returns
true if the page is an image, returns false otherwise. -
#in_domain?(uri) ⇒ Boolean
Returns
true if uri is in the same domain as the page, returns false otherwise. -
#initialize(url, params = {}) ⇒ Page
constructor
Create a new page.
- #is_valid? ⇒ Boolean
- #marshal_dump ⇒ Object
- #marshal_load(ary) ⇒ Object
-
#not_found? ⇒ Boolean
Returns
true if the page was not found (returned 404 code), returns false otherwise. - #present? ⇒ Boolean
-
#redirect? ⇒ Boolean
Returns
true if the page is an HTTP redirect, returns false otherwise. -
#to_absolute(link) ⇒ Object
Converts relative URL link into an absolute URL based on the location of the page.
- #to_hash ⇒ Object
-
#xml? ⇒ Boolean
Returns
true if the page is an XML document, returns false otherwise.
Constructor Details
#initialize(url, params = {}) ⇒ Page
Create a new page
32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 |
# File 'lib/sunbro/page.rb', line 32
#
# Create a new page.
#
# @param url the location of the page (presumably a URI, since other
#   methods call +merge+ and +host+ on it -- confirm against callers)
# @param params [Hash] optional attributes; recognised keys are :code,
#   :headers, :aka, :referer, :depth, :redirect_to, :response_time,
#   :error, :force_format, :body and :redirect_from
def initialize(url, params = {})
  @url = url
  @code = params[:code]

  # Always carry a 'content-type' entry so #content_type can call +first+.
  response_headers = params[:headers] || {}
  response_headers['content-type'] ||= ['']
  @headers = response_headers

  @aliases = Array(params[:aka]).compact
  @referer = params[:referer]
  @depth = params[:depth] || 0

  # Resolved while @force_format and @body are still unset; keep this
  # statement ahead of those assignments.
  @redirect_to = to_absolute(params[:redirect_to])

  @response_time = params[:response_time]
  @error = params[:error]
  # A page counts as fetched as soon as a response code was supplied.
  @fetched = !params[:code].nil?
  @force_format = params[:force_format]
  @body = params[:body]
  @redirect_from = params[:redirect_from]
end
Instance Attribute Details
#body ⇒ Object (readonly)
The raw HTTP response body of the page
7 8 9 |
# File 'lib/sunbro/page.rb', line 7
#
# The raw HTTP response body of the page.
#
# @return the value stored in @body
def body
  @body
end
#code ⇒ Object
Integer response code of the page
16 17 18 |
# File 'lib/sunbro/page.rb', line 16
#
# Integer response code of the page.
#
# @return the value stored in @code
def code
  @code
end
#depth ⇒ Object
Depth of this page from the root of the crawl. This is not necessarily the shortest path; use PageStore#shortest_paths! to find that value.
21 22 23 |
# File 'lib/sunbro/page.rb', line 21
#
# Depth of this page from the root of the crawl. This is not necessarily
# the shortest path; use PageStore#shortest_paths! to find that value.
#
# @return the value stored in @depth
def depth
  @depth
end
#error ⇒ Object (readonly)
Exception object, if one was raised during HTTP#fetch_page
13 14 15 |
# File 'lib/sunbro/page.rb', line 13
#
# Exception object, if one was raised during HTTP#fetch_page.
#
# @return the value stored in @error
def error
  @error
end
#headers ⇒ Object (readonly)
Headers of the HTTP response
9 10 11 |
# File 'lib/sunbro/page.rb', line 9
#
# Headers of the HTTP response.
#
# @return the value stored in @headers
def headers
  @headers
end
#redirect_from ⇒ Object
Returns the value of attribute redirect_from.
27 28 29 |
# File 'lib/sunbro/page.rb', line 27
#
# Returns the value of attribute redirect_from.
#
# @return the value stored in @redirect_from
def redirect_from
  @redirect_from
end
#redirect_to ⇒ Object (readonly)
URL of the page this one redirected to, if any
11 12 13 |
# File 'lib/sunbro/page.rb', line 11
#
# URL of the page this one redirected to, if any.
#
# @return the value stored in @redirect_to
def redirect_to
  @redirect_to
end
#referer ⇒ Object
URL of the page that brought us to this page
23 24 25 |
# File 'lib/sunbro/page.rb', line 23
#
# URL of the page that brought us to this page.
#
# @return the value stored in @referer
def referer
  @referer
end
#response_time ⇒ Object
Response time of the request for this page in milliseconds
25 26 27 |
# File 'lib/sunbro/page.rb', line 25
#
# Response time of the request for this page in milliseconds.
#
# @return the value stored in @response_time
def response_time
  @response_time
end
#url ⇒ Object
The URL of the page
5 6 7 |
# File 'lib/sunbro/page.rb', line 5
#
# The URL of the page.
#
# @return the value stored in @url
def url
  @url
end
#visited ⇒ Object
Boolean indicating whether or not this page has been visited in PageStore#shortest_paths!
18 19 20 |
# File 'lib/sunbro/page.rb', line 18
#
# Boolean indicating whether or not this page has been visited in
# PageStore#shortest_paths!.
#
# @return the value stored in @visited
def visited
  @visited
end
Class Method Details
.from_hash(hash) ⇒ Object
209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 |
# File 'lib/sunbro/page.rb', line 209
#
# Rebuild a page from a string-keyed hash such as the one #to_hash emits.
#
# The page is constructed from hash['url'] and the remaining entries are
# written straight into instance variables. Headers are decoded with
# JSON.parse rather than JSON.load, since JSON.load is intended for trusted
# input only. When hash['headers'] is missing or blank, @headers is left as
# the constructor set it instead of being overwritten with nil.
#
# @param hash [Hash] string-keyed attributes ('url', 'headers', 'body',
#   'code', 'error', 'visited', 'referer', 'redirect_to', 'redirect_from',
#   'response_time', 'fetched')
# @return [Page] the reconstructed page
def self.from_hash(hash)
  page = new(URI(hash['url']))

  # NOTE(review): +present?+ is not core Ruby; presumably ActiveSupport is loaded.
  to_uri = ->(value) { value.present? ? URI(value) : nil }

  attributes = {
    '@body' => hash['body'],
    '@code' => hash['code'].to_i,
    '@error' => hash['error'],
    '@visited' => hash['visited'],
    '@referer' => hash['referer'],
    '@redirect_to' => to_uri.call(hash['redirect_to']),
    '@redirect_from' => to_uri.call(hash['redirect_from']),
    '@response_time' => hash['response_time'].to_i,
    '@fetched' => hash['fetched']
  }

  raw_headers = hash['headers']
  unless raw_headers.nil? || raw_headers.to_s.strip.empty?
    parsed_headers = JSON.parse(raw_headers)
    attributes['@headers'] = parsed_headers if parsed_headers
  end

  attributes.each { |ivar, value| page.instance_variable_set(ivar, value) }
  page
end
Instance Method Details
#base ⇒ Object
Base URI from the HTML doc head element www.w3.org/TR/html4/struct/links.html#edef-BASE
148 149 150 151 152 153 154 155 156 |
# File 'lib/sunbro/page.rb', line 148
#
# Base URI from the HTML doc head element
# www.w3.org/TR/html4/struct/links.html#edef-BASE
#
# The value is looked up from //head/base/@href the first time a truthy
# result is needed and kept in @base afterwards.
#
# @return the base URI, or nil when there is no document, the href cannot
#   be turned into a URI, or the resulting URI is empty
def base
  unless @base
    document = doc
    @base =
      if document
        href = document.search('//head/base/@href')
        begin
          URI(href.to_s) unless href.nil?
        rescue StandardError
          # An unparsable href is treated the same as no base element.
          nil
        end
      end
  end

  @base && @base.to_s.empty? ? nil : @base
end
#content_type ⇒ Object
The content-type returned by the HTTP request for this page
100 101 102 |
# File 'lib/sunbro/page.rb', line 100
#
# The content-type returned by the HTTP request for this page.
#
# @return the first entry of the 'content-type' header
def content_type
  type_values = headers['content-type']
  type_values.first
end
#cookies ⇒ Object
Array of cookies received with this page as WEBrick::Cookie objects.
93 94 95 |
# File 'lib/sunbro/page.rb', line 93
#
# Array of cookies received with this page as WEBrick::Cookie objects.
#
# The method name and the WEBrick::Cookie method name were lost from this
# listing; they are restored here as +cookies+ and +parse_set_cookies+.
#
# @return [Array] the parsed cookies, or an empty array when the
#   'Set-Cookie' header cannot be parsed
def cookies
  # NOTE(review): the constructor stores 'content-type' under a lowercase
  # key; confirm that 'Set-Cookie' is really the key used for this header.
  WEBrick::Cookie.parse_set_cookies(@headers['Set-Cookie'])
rescue StandardError
  # Best effort: any parsing failure yields "no cookies".
  []
end
#discard_doc! ⇒ Object
Delete the Nokogiri document and response body to conserve memory
78 79 80 |
# File 'lib/sunbro/page.rb', line 78
#
# Delete the Nokogiri document and response body to conserve memory.
#
# @return [nil]
def discard_doc!
  @body = nil
  @doc = nil
end
#doc ⇒ Object
Nokogiri document for the HTML body
53 54 55 56 57 58 59 60 61 62 63 64 65 |
# File 'lib/sunbro/page.rb', line 53
#
# Nokogiri document for the HTML body.
#
# Images are never parsed. Otherwise the body is parsed as XML or HTML
# according to should_parse_as?, falling back to Nokogiri.parse when a body
# exists. A truthy result is cached in @doc.
#
# @return the parsed document, or nil when nothing was parsed
def doc
  return @doc if @doc
  return (@doc = nil) if image?

  @doc =
    if should_parse_as?(:xml)
      Nokogiri::XML(@body, @url.to_s)
    elsif should_parse_as?(:html)
      Nokogiri::HTML(@body, @url.to_s)
    elsif @body
      Nokogiri.parse(@body, @url.to_s)
    end
end
#fetched? ⇒ Boolean
Was the page successfully fetched? true if the page was fetched with no error, false otherwise.
86 87 88 |
# File 'lib/sunbro/page.rb', line 86
#
# Was the page successfully fetched?
#
# @return the value stored in @fetched: true if the page was fetched with
#   no error, false otherwise
def fetched?
  @fetched
end
#html? ⇒ Boolean
Returns true if the page is a HTML document, returns false otherwise.
116 117 118 |
# File 'lib/sunbro/page.rb', line 116
#
# Returns true if the page is a HTML document, returns false otherwise.
#
# The "+" in application/xhtml+xml is escaped so that it matches literally;
# unescaped it is a regexp quantifier and the real XHTML MIME type would
# never match.
#
# @return [Boolean]
def html?
  !!(content_type =~ %r{^(text/html|application/xhtml\+xml)\b})
end
#image? ⇒ Boolean
Returns true if the page is an image, returns false otherwise.
108 109 110 |
# File 'lib/sunbro/page.rb', line 108
#
# Returns true if the page is an image, returns false otherwise.
#
# @return [Boolean] whether the content type starts with "image/"
#   followed by a word character
def image?
  image_mime = %r{^(image/)\b}
  !(content_type =~ image_mime).nil?
end
#in_domain?(uri) ⇒ Boolean
Returns true if uri is in the same domain as the page, returns false otherwise
181 182 183 |
# File 'lib/sunbro/page.rb', line 181
#
# Returns true if uri is in the same domain as the page, returns false
# otherwise.
#
# @param uri an object responding to +host+
# @return [Boolean] whether both hosts are equal
def in_domain?(uri)
  candidate_host = uri.host
  candidate_host == @url.host
end
#is_valid? ⇒ Boolean
67 68 69 |
# File 'lib/sunbro/page.rb', line 67
#
# Whether the page is worth processing: its URL is not "about:blank", it
# was not a 404, and it is present.
#
# The URL is compared via +to_s+ because a URI object is never equal to a
# String, which made the "about:blank" guard ineffective for URI urls.
#
# @return a falsy value when the page is blank, not found or not present;
#   otherwise whatever #present? returns
def is_valid?
  (url.to_s != 'about:blank') && !not_found? && present?
end
#marshal_dump ⇒ Object
185 186 187 |
# File 'lib/sunbro/page.rb', line 185
#
# State handed to Marshal when the page is serialised.
#
# @return [Array] the values of @url, @headers, @body, @links, @code,
#   @visited, @depth, @referer, @redirect_to, @response_time and @fetched,
#   in that order
def marshal_dump
  %i[
    @url @headers @body @links @code @visited
    @depth @referer @redirect_to @response_time @fetched
  ].map { |ivar| instance_variable_get(ivar) }
end
#marshal_load(ary) ⇒ Object
189 190 191 |
# File 'lib/sunbro/page.rb', line 189
#
# Restore the page state from the array produced by #marshal_dump.
#
# @param ary [Array] values for @url, @headers, @body, @links, @code,
#   @visited, @depth, @referer, @redirect_to, @response_time and @fetched,
#   in that order
def marshal_load(ary)
  @url, @headers, @body, @links, @code, @visited,
    @depth, @referer, @redirect_to, @response_time, @fetched = ary
end
#not_found? ⇒ Boolean
Returns true if the page was not found (returned 404 code), returns false otherwise.
140 141 142 |
# File 'lib/sunbro/page.rb', line 140
#
# Returns true if the page was not found (returned 404 code), returns false
# otherwise.
#
# @return [Boolean]
def not_found?
  @code == 404
end
#present? ⇒ Boolean
71 72 73 |
# File 'lib/sunbro/page.rb', line 71
#
# Whether the page carries usable content: no error, a response code, a
# present body and a parsed document.
#
# @return false when an error is set; the falsy code when no code is set;
#   the falsy result of body.present? when the body is not present;
#   otherwise the value of #doc
def present?
  return false if error

  status = code
  return status unless status

  has_body = body.present?
  return has_body unless has_body

  doc
end
#redirect? ⇒ Boolean
Returns true if the page is a HTTP redirect, returns false otherwise.
132 133 134 |
# File 'lib/sunbro/page.rb', line 132
#
# Returns true if the page is a HTTP redirect, returns false otherwise.
#
# The range now reaches 308 (Permanent Redirect), which the previous
# 300..307 range left out.
#
# @return [Boolean]
def redirect?
  (300..308).include?(@code)
end
#to_absolute(link) ⇒ Object
Converts relative URL link into an absolute URL based on the location of the page
163 164 165 166 167 168 169 170 171 172 173 174 175 |
# File 'lib/sunbro/page.rb', line 163
#
# Converts relative URL link into an absolute URL based on the location of
# the page.
#
# URI.encode and URI.decode were removed in Ruby 3.0, so escaping goes
# through the RFC 2396 parser, which is what those methods delegated to.
#
# @param link the link to resolve; anything responding to +to_s+, or nil
# @return the absolute URI, or nil when link is nil
def to_absolute(link)
  return nil if link.nil?

  # Newer uri versions expose the RFC 2396 parser under its own constant;
  # on older ones DEFAULT_PARSER is that parser.
  parser = defined?(URI::RFC2396_PARSER) ? URI::RFC2396_PARSER : URI::DEFAULT_PARSER

  # Remove a trailing anchor, then normalise escaping by unescaping and
  # re-escaping the remainder.
  without_anchor = link.to_s.gsub(/#[a-zA-Z0-9_-]*$/, '')
  relative = URI(parser.escape(parser.unescape(without_anchor)))

  # Prefer the document's base URI when it declares one.
  absolute = base ? base.merge(relative) : @url.merge(relative)
  absolute.path = '/' if absolute.path.empty?
  absolute
end
#to_hash ⇒ Object
193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 |
# File 'lib/sunbro/page.rb', line 193
#
# Serialise the page into a string-keyed hash.
#
# The URL, error, referer and redirect targets are stored as strings, the
# headers as a JSON document, and every entry whose value is nil is left
# out of the result.
#
# @return [Hash] the non-nil attributes of the page
def to_hash
  # Stringify truthy values only; falsy ones become nil and are dropped below.
  stringify = ->(value) { value ? value.to_s : nil }

  attributes = {}
  attributes['url'] = @url.to_s
  attributes['headers'] = headers.to_json
  attributes['body'] = @body
  attributes['code'] = @code
  attributes['error'] = stringify.call(@error)
  attributes['visited'] = @visited
  attributes['referer'] = stringify.call(@referer)
  attributes['redirect_to'] = stringify.call(@redirect_to)
  attributes['redirect_from'] = stringify.call(@redirect_from)
  attributes['response_time'] = @response_time
  attributes['fetched'] = @fetched

  attributes.reject { |_key, value| value.nil? }
end
#xml? ⇒ Boolean
Returns true if the page is a XML document, returns false otherwise.
124 125 126 |
# File 'lib/sunbro/page.rb', line 124
#
# Returns true if the page is a XML document, returns false otherwise.
#
# @return [Boolean] whether the content type starts with text/xml or
#   application/xml
def xml?
  xml_mime = %r{^(text/xml|application/xml)\b}
  !(content_type =~ xml_mime).nil?
end