Class: Polipus::Page
- Inherits:
-
Object
- Object
- Polipus::Page
- Defined in:
- lib/polipus/page.rb
Instance Attribute Summary collapse
-
#aliases ⇒ Object
Returns the value of attribute aliases.
-
#body ⇒ Object
readonly
The raw HTTP response body of the page.
-
#code ⇒ Object
Integer response code of the page.
-
#depth ⇒ Object
Depth of this page from the root of the crawl.
-
#domain_aliases ⇒ Object
Returns the value of attribute domain_aliases.
-
#error ⇒ Object
readonly
Exception object, if one was raised during HTTP#fetch_page.
-
#fetched_at ⇒ Object
Returns the value of attribute fetched_at.
-
#headers ⇒ Object
readonly
Headers of the HTTP response.
-
#redirect_to ⇒ Object
readonly
URL of the page this one redirected to, if any.
-
#referer ⇒ Object
URL of the page that brought us to this page.
-
#response_time ⇒ Object
Response time of the request for this page in milliseconds.
-
#storable ⇒ Object
Whether the current page should be stored. Default: true.
-
#url ⇒ Object
readonly
The URL of the page.
-
#user_data ⇒ Object
OpenStruct that holds user-defined data.
Class Method Summary collapse
Instance Method Summary collapse
-
#base ⇒ Object
Base URI from the HTML doc head element www.w3.org/TR/html4/struct/links.html#edef-BASE.
-
#content_type ⇒ Object
The content-type returned by the HTTP request for this page.
-
#discard_doc! ⇒ Object
Delete the Nokogiri document and response body to conserve memory.
-
#discard_links! ⇒ Object
Discard links, a next call of page.links will return an empty array.
-
#doc ⇒ Object
Nokogiri document for the HTML body.
- #expired?(ttl) ⇒ Boolean
-
#fetched? ⇒ Boolean
Was the page successfully fetched? true if the page was fetched with no error, false otherwise.
-
#html? ⇒ Boolean
Returns true if the page is a HTML document, returns false otherwise.
-
#in_domain?(uri) ⇒ Boolean
Returns true if uri is in the same domain as the page, returns false otherwise.
-
#initialize(url, params = {}) ⇒ Page
constructor
Create a new page.
-
#links ⇒ Object
Array of distinct A tag HREFs from the page.
-
#not_found? ⇒ Boolean
Returns true if the page was not found (returned 404 code), returns false otherwise.
-
#redirect? ⇒ Boolean
Returns true if the page is a HTTP redirect, returns false otherwise.
-
#storable? ⇒ Boolean
Returns true if the page is marked as storable, false otherwise. Default is true.
-
#success? ⇒ Boolean
Returns true if the page is a HTTP success, returns false otherwise.
-
#to_absolute(link) ⇒ Object
Converts relative URL link into an absolute URL based on the location of the page.
- #to_hash ⇒ Object
- #to_json ⇒ Object
Constructor Details
#initialize(url, params = {}) ⇒ Page
Create a new page
44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 |
# File 'lib/polipus/page.rb', line 44
#
# Create a new page.
#
# @param url [String, URI] location of the page; coerced with Kernel#URI
# @param params [Hash] optional response details, read by symbol key:
#   :code, :headers, :aka, :referer, :depth, :redirect_to, :response_time,
#   :body, :error, :domain_aliases, :fetched_at
def initialize(url, params = {})
  @url = URI(url)
  @code = params[:code]
  @headers = params[:headers] || {}
  # content_type calls #first on this entry, so make sure it always exists.
  @headers['content-type'] ||= ['']
  @aliases = Array(params[:aka]).compact
  @referer = params[:referer]
  @depth = params[:depth] || 0
  # NOTE(review): to_absolute runs before @body is assigned below, so a
  # non-nil :redirect_to is resolved while the body is still unset -- confirm
  # this ordering is intended.
  @redirect_to = to_absolute(params[:redirect_to])
  @response_time = params[:response_time]
  @body = params[:body]
  @error = params[:error]
  # A page counts as fetched as soon as a response code is known.
  @fetched = !@code.nil?
  @user_data = OpenStruct.new
  # Plain `||` (not `||=`) so the caller's params hash is left untouched.
  @domain_aliases = params[:domain_aliases] || []
  @storable = true
  @fetched_at = params[:fetched_at]
end
Instance Attribute Details
#aliases ⇒ Object
Returns the value of attribute aliases
31 32 33 |
# File 'lib/polipus/page.rb', line 31
# Reader for the aliases attribute (the @aliases instance variable).
def aliases
  @aliases
end
#body ⇒ Object (readonly)
The raw HTTP response body of the page
13 14 15 |
# File 'lib/polipus/page.rb', line 13
# Reader for the raw HTTP response body (the @body instance variable).
def body
  @body
end
#code ⇒ Object
Integer response code of the page
21 22 23 |
# File 'lib/polipus/page.rb', line 21
# Reader for the response code (the @code instance variable).
def code
  @code
end
#depth ⇒ Object
Depth of this page from the root of the crawl.
23 24 25 |
# File 'lib/polipus/page.rb', line 23
# Reader for the crawl depth of this page (the @depth instance variable).
def depth
  @depth
end
#domain_aliases ⇒ Object
Returns the value of attribute domain_aliases
33 34 35 |
# File 'lib/polipus/page.rb', line 33
# Reader for the domain_aliases attribute (the @domain_aliases instance variable).
def domain_aliases
  @domain_aliases
end
#error ⇒ Object (readonly)
Exception object, if one was raised during HTTP#fetch_page
19 20 21 |
# File 'lib/polipus/page.rb', line 19
# Reader for the stored error object (the @error instance variable).
def error
  @error
end
#fetched_at ⇒ Object
Returns the value of attribute fetched_at
39 40 41 |
# File 'lib/polipus/page.rb', line 39
# Reader for the fetched_at attribute (the @fetched_at instance variable).
def fetched_at
  @fetched_at
end
#headers ⇒ Object (readonly)
Headers of the HTTP response
15 16 17 |
# File 'lib/polipus/page.rb', line 15
# Reader for the HTTP response headers (the @headers instance variable).
def headers
  @headers
end
#redirect_to ⇒ Object (readonly)
URL of the page this one redirected to, if any
17 18 19 |
# File 'lib/polipus/page.rb', line 17
# Reader for the redirect target, if any (the @redirect_to instance variable).
def redirect_to
  @redirect_to
end
#referer ⇒ Object
URL of the page that brought us to this page
25 26 27 |
# File 'lib/polipus/page.rb', line 25
# Reader for the referring page URL (the @referer instance variable).
def referer
  @referer
end
#response_time ⇒ Object
Response time of the request for this page in milliseconds
27 28 29 |
# File 'lib/polipus/page.rb', line 27
# Reader for the request's response time (the @response_time instance variable).
def response_time
  @response_time
end
#storable ⇒ Object
Whether the current page should be stored. Default: true
37 38 39 |
# File 'lib/polipus/page.rb', line 37
# Reader for the storable flag (the @storable instance variable).
def storable
  @storable
end
#url ⇒ Object (readonly)
The URL of the page
11 12 13 |
# File 'lib/polipus/page.rb', line 11
# Reader for the page URL (the @url instance variable).
def url
  @url
end
#user_data ⇒ Object
OpenStruct that holds user-defined data
29 30 31 |
# File 'lib/polipus/page.rb', line 29
# Reader for the user-defined data container (the @user_data instance variable).
def user_data
  @user_data
end
Class Method Details
.from_hash(hash) ⇒ Object
245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 |
# File 'lib/polipus/page.rb', line 245
#
# Rebuild a page from a string-keyed hash shaped like the output of #to_hash.
#
# @param hash [Hash] page attributes keyed by 'url', 'headers', 'body',
#   'links', 'code', 'depth', 'referer', 'redirect_to', 'response_time',
#   'fetched', 'user_data', 'fetched_at' and 'error'
# @return [Page] a page whose instance variables are restored from +hash+
def self.from_hash(hash)
  page = new(URI(hash['url']))

  stored_headers = hash['headers']
  stored_redirect = hash['redirect_to']
  stored_user_data = hash['user_data']

  {
    # SECURITY: Marshal.load can instantiate arbitrary objects; only pass
    # hashes that come from a trusted store.
    '@headers' => (stored_headers && !stored_headers.empty?) ? Marshal.load(stored_headers) : { 'content-type' => [''] },
    '@body' => hash['body'],
    '@links' => hash['links'] ? hash['links'].map { |link| URI(link) } : [],
    '@code' => hash['code'].to_i,
    '@depth' => hash['depth'].to_i,
    '@referer' => hash['referer'],
    '@redirect_to' => (stored_redirect && !stored_redirect.empty?) ? URI(stored_redirect) : nil,
    '@response_time' => hash['response_time'].to_i,
    '@fetched' => hash['fetched'],
    # Keep an empty OpenStruct when no user data was stored, so that
    # page.user_data.some_field keeps working after a round trip.
    '@user_data' => stored_user_data ? OpenStruct.new(stored_user_data) : OpenStruct.new,
    '@fetched_at' => hash['fetched_at'],
    '@error' => hash['error']
  }.each do |var, value|
    page.instance_variable_set(var, value)
  end
  page
end
.from_json(json) ⇒ Object
266 267 268 269 |
# File 'lib/polipus/page.rb', line 266
#
# Parse a JSON document and rebuild the page from the resulting hash.
#
# @param json [String] JSON text to parse with JSON.parse
# @return [Object] whatever from_hash returns for the parsed hash
def self.from_json(json)
  from_hash(JSON.parse(json))
end
Instance Method Details
#base ⇒ Object
Base URI from the HTML doc head element www.w3.org/TR/html4/struct/links.html#edef-BASE
157 158 159 160 161 162 163 164 165 |
# File 'lib/polipus/page.rb', line 157
#
# Base URI taken from the //head/base/@href node of the parsed document.
# The result is cached in @base once a truthy value has been computed.
#
# @return [URI, nil] the base URI, or nil when there is no document, the
#   href cannot be turned into a URI, or the resulting URI is empty
def base
  unless @base
    @base =
      if doc
        href = doc.search('//head/base/@href')
        begin
          URI(href.to_s) unless href.nil?
        rescue StandardError
          # best effort: any failure while building the URI means "no base"
          nil
        end
      end
  end
  # An empty URI is cached but reported as "no base".
  @base.to_s.empty? ? nil : @base
end
#content_type ⇒ Object
The content-type returned by the HTTP request for this page
117 118 119 |
# File 'lib/polipus/page.rb', line 117
#
# The content-type returned by the HTTP request for this page.
#
# The header value is normally an array; a bare string or a missing entry
# is tolerated as well instead of raising NoMethodError.
#
# @return [String] first content-type value, or '' when there is none
def content_type
  Array(headers['content-type']).first.to_s
end
#discard_doc! ⇒ Object
Delete the Nokogiri document and response body to conserve memory
101 102 103 104 |
# File 'lib/polipus/page.rb', line 101
#
# Drop the parsed document and the response body to conserve memory.
#
# @return [nil]
def discard_doc!
  # Call links first so the page links are parsed before the document goes away.
  links
  @body = nil
  @doc = nil
end
#discard_links! ⇒ Object
Discard links, a next call of page.links will return an empty array
94 95 96 |
# File 'lib/polipus/page.rb', line 94
#
# Replace the stored links with an empty array.
#
# @return [Array] the new, empty links array
def discard_links!
  @links = []
end
#doc ⇒ Object
Nokogiri document for the HTML body
83 84 85 86 87 88 89 |
# File 'lib/polipus/page.rb', line 83
#
# Nokogiri document for the HTML body, parsed on first use and cached in @doc.
#
# Side effect: @body is replaced by a UTF-8 transcoded copy (a missing body
# becomes ''), whether or not the page is HTML.
#
# @return [Object, nil] the parsed document, or nil when html? is falsy
def doc
  return @doc if @doc

  # Transcode from binary to UTF-8, replacing invalid or undefined sequences
  # with the empty string.
  @body = (@body || '').encode('utf-8', 'binary', invalid: :replace, undef: :replace, replace: '')
  return unless html?

  @doc = Nokogiri::HTML(@body.toutf8, nil, 'utf-8')
end
#expired?(ttl) ⇒ Boolean
240 241 242 243 |
# File 'lib/polipus/page.rb', line 240
#
# Tell whether the page was fetched more than +ttl+ ago, comparing
# fetched_at with Time.now.to_i.
#
# @param ttl [Integer] amount subtracted from the current epoch time
# @return [Boolean] false when fetched_at is nil
def expired?(ttl)
  stamp = fetched_at
  return false if stamp.nil?

  cutoff = Time.now.to_i - ttl
  cutoff > stamp
end
#fetched? ⇒ Boolean
Was the page successfully fetched? true
if the page was fetched with no error, false
otherwise.
110 111 112 |
# File 'lib/polipus/page.rb', line 110
#
# Was the page fetched? Returns the @fetched flag as stored.
def fetched?
  @fetched
end
#html? ⇒ Boolean
Returns true
if the page is a HTML document, returns false
otherwise.
125 126 127 |
# File 'lib/polipus/page.rb', line 125
#
# Tell whether the page is an HTML (or XHTML) document, judged by its
# content type.
#
# @return [Integer, nil] 0 (truthy) when content_type starts with
#   text/html or application/xhtml+xml, nil otherwise
def html?
  # The "+" must be escaped: unescaped it is a quantifier on "l", so the
  # real "application/xhtml+xml" MIME type would never match.
  content_type =~ %r{\A(text/html|application/xhtml\+xml)\b}
end
#in_domain?(uri) ⇒ Boolean
Returns true
if uri is in the same domain as the page, returns false
otherwise
201 202 203 204 |
# File 'lib/polipus/page.rb', line 201
#
# Tell whether +uri+ points at this page's host or at one of the
# configured domain aliases.
#
# @param uri [#host] object responding to #host, compared against @url.host
# @return [Boolean]
def in_domain?(uri)
  @domain_aliases ||= []
  host = uri.host
  return true if host == @url.host

  @domain_aliases.include?(host)
end
#links ⇒ Object
Array of distinct A tag HREFs from the page
66 67 68 69 70 71 72 73 74 75 76 77 78 |
# File 'lib/polipus/page.rb', line 66
#
# Array of distinct A tag HREFs from the page, made absolute and limited
# to links for which in_domain? is truthy. The result is cached in @links.
#
# @return [Array] the collected links; empty when there is no document
def links
  return @links.to_a unless @links.nil?

  @links = Set.new
  return [] unless doc

  doc.search('//a[@href]').each do |anchor|
    href = anchor['href']
    next if href.nil? || href.empty?

    target =
      begin
        to_absolute(href)
      rescue StandardError
        # best effort: skip any href that cannot be made absolute
        next
      end
    @links << target if target && in_domain?(target)
  end
  @links.to_a
end
#not_found? ⇒ Boolean
Returns true
if the page was not found (returned 404 code), returns false
otherwise.
149 150 151 |
# File 'lib/polipus/page.rb', line 149
#
# Tell whether the response code is 404.
#
# @return [Boolean]
def not_found?
  @code == 404
end
#redirect? ⇒ Boolean
Returns true
if the page is a HTTP redirect, returns false
otherwise.
133 134 135 |
# File 'lib/polipus/page.rb', line 133
#
# Tell whether the response code is an HTTP redirect (300 up to, but not
# including, 400).
#
# @return [Boolean]
def redirect?
  redirect_codes = 300...400
  redirect_codes.include?(@code)
end
#storable? ⇒ Boolean
Returns true
if the page is marked as storable, false
otherwise. Default is true
236 237 238 |
# File 'lib/polipus/page.rb', line 236
#
# Tell whether the page is marked as storable. Returns the @storable flag
# as stored.
def storable?
  @storable
end
#success? ⇒ Boolean
Returns true
if the page is a HTTP success, returns false
otherwise.
141 142 143 |
# File 'lib/polipus/page.rb', line 141
#
# Tell whether the response code is an HTTP success (200 through 206).
#
# @return [Boolean]
def success?
  success_codes = 200..206
  success_codes.include?(@code)
end
#to_absolute(link) ⇒ Object
Converts relative URL link into an absolute URL based on the location of the page
171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 |
# File 'lib/polipus/page.rb', line 171
#
# Converts relative URL +link+ into an absolute URL based on the location
# of the page (or on the document's base URI when there is one).
#
# @param link [#to_s, nil] the link to resolve
# @return [URI, nil] the absolute URI; nil when +link+ is nil or cannot be
#   parsed as a URI
def to_absolute(link)
  return nil if link.nil?

  valid_link = link.to_s.encode('utf-8', 'binary', invalid: :replace, undef: :replace, replace: '')

  # URI.encode / URI.decode were removed in Ruby 3.0; the RFC 2396 parser
  # provides the same escape / unescape behaviour on old and new Rubies.
  parser = defined?(URI::RFC2396_PARSER) ? URI::RFC2396_PARSER : URI::DEFAULT_PARSER

  relative =
    begin
      # remove anchor, then normalise percent-escaping before parsing
      without_anchor = valid_link.gsub(/#[a-zA-Z0-9_-]*$/, '')
      URI(parser.escape(parser.unescape(without_anchor)))
    rescue URI::Error
      return nil
    end

  absolute = (base || @url).merge(relative)
  # Opaque URIs (e.g. mailto:) may have no path at all; leave those alone.
  absolute.path = '/' if absolute.path && absolute.path.empty?
  absolute
end
#to_hash ⇒ Object
206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 |
# File 'lib/polipus/page.rb', line 206
#
# Serialise the page into a string-keyed hash.
#
# Headers are stored as a Marshal dump; links, referer, redirect target and
# error are stored via to_s.
#
# @return [Hash]
def to_hash
  # Entries are filled in a fixed order: 'body' is read before links is
  # called.
  dump = {}
  dump['url'] = @url.to_s
  dump['headers'] = Marshal.dump(@headers)
  dump['body'] = @body
  dump['links'] = links.map { |link| link.to_s }
  dump['code'] = @code
  dump['depth'] = @depth
  dump['referer'] = @referer.to_s
  dump['redirect_to'] = @redirect_to.to_s
  dump['response_time'] = @response_time
  dump['fetched'] = @fetched
  dump['user_data'] = @user_data.nil? ? {} : @user_data.marshal_dump
  dump['fetched_at'] = @fetched_at
  dump['error'] = @error.to_s
  dump
end
#to_json ⇒ Object
224 225 226 227 228 229 |
# File 'lib/polipus/page.rb', line 224
#
# Serialise the page to JSON, leaving out nil and empty values, and the
# headers entry when the content type is empty.
#
# @param args optional arguments forwarded to Hash#to_json. The JSON
#   generator passes its state when a page is nested inside another
#   structure, so the method must accept them.
# @return [String] JSON text
def to_json(*args)
  th = to_hash.dup
  # reject! instead of deleting keys from the hash while iterating over it
  th.reject! { |_key, value| value.nil? || (value.respond_to?(:empty?) && value.empty?) }
  th.delete('headers') if content_type.empty?
  th.to_json(*args)
end