Class: Polipus::Page
- Inherits:
-
Object
- Object
- Polipus::Page
- Defined in:
- lib/polipus/page.rb
Instance Attribute Summary collapse
-
#aliases ⇒ Object
Returns the value of attribute aliases.
-
#body ⇒ Object
readonly
The raw HTTP response body of the page.
-
#code ⇒ Object
Integer response code of the page.
-
#depth ⇒ Object
Depth of this page from the root of the crawl.
-
#domain_aliases ⇒ Object
Returns the value of attribute domain_aliases.
-
#error ⇒ Object
readonly
Exception object, if one was raised during HTTP#fetch_page.
-
#fetched_at ⇒ Object
Returns the value of attribute fetched_at.
-
#headers ⇒ Object
readonly
Headers of the HTTP response.
-
#redirect_to ⇒ Object
readonly
URL of the page this one redirected to, if any.
-
#referer ⇒ Object
URL of the page that brought us to this page.
-
#response_time ⇒ Object
Response time of the request for this page in milliseconds.
-
#storable ⇒ Object
Whether the current page should be stored. Default: true.
-
#url ⇒ Object
readonly
The URL of the page.
-
#user_data ⇒ Object
OpenStruct that holds user-defined data.
Class Method Summary collapse
Instance Method Summary collapse
-
#base ⇒ Object
Base URI from the HTML doc head element www.w3.org/TR/html4/struct/links.html#edef-BASE.
-
#content_type ⇒ Object
The content-type returned by the HTTP request for this page.
-
#discard_doc! ⇒ Object
Delete the Nokogiri document and response body to conserve memory.
-
#discard_links! ⇒ Object
Discard links; the next call to page.links will return an empty array.
-
#doc ⇒ Object
Nokogiri document for the HTML body.
- #expired?(ttl) ⇒ Boolean
-
#fetched? ⇒ Boolean
Was the page successfully fetched? true if the page was fetched with no error, false otherwise.
-
#html? ⇒ Boolean
Returns true if the page is an HTML document, returns false otherwise.
-
#in_domain?(uri) ⇒ Boolean
Returns true if uri is in the same domain as the page, returns false otherwise.
-
#initialize(url, params = {}) ⇒ Page
constructor
Create a new page.
-
#links ⇒ Object
Array of distinct A tag HREFs from the page.
-
#not_found? ⇒ Boolean
Returns true if the page was not found (returned 404 code), returns false otherwise.
-
#redirect? ⇒ Boolean
Returns true if the page is an HTTP redirect, returns false otherwise.
-
#storable? ⇒ Boolean
Returns true if the page is marked as storable, false otherwise. Default is true.
-
#success? ⇒ Boolean
Returns true if the page is an HTTP success, returns false otherwise.
-
#to_absolute(link) ⇒ Object
Converts relative URL link into an absolute URL based on the location of the page.
- #to_hash ⇒ Object
- #to_json ⇒ Object
Constructor Details
#initialize(url, params = {}) ⇒ Page
Create a new page
42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 |
# File 'lib/polipus/page.rb', line 42
#
# Create a new page.
#
# @param url [String, URI] address of the page; converted with Kernel#URI
# @param params [Hash] optional attributes, read by symbol key:
#   :code, :headers, :aka, :referer, :depth, :redirect_to, :response_time,
#   :body, :error, :domain_aliases, :fetched_at
def initialize(url, params = {})
  @url = URI(url)
  @code = params[:code]
  @headers = params[:headers] || {}
  # Guarantee a content-type entry so #content_type can always call #first.
  @headers['content-type'] ||= ['']
  @aliases = Array(params[:aka]).compact
  @referer = params[:referer]
  @depth = params[:depth] || 0
  @redirect_to = to_absolute(params[:redirect_to])
  @response_time = params[:response_time]
  @body = params[:body]
  @error = params[:error]
  # The page counts as fetched as soon as a response code is present.
  @fetched = !params[:code].nil?
  @user_data = OpenStruct.new
  # Plain `||` rather than `||=`: the caller's params hash must not be
  # written to (and may be frozen).
  @domain_aliases = params[:domain_aliases] || []
  @storable = true
  @fetched_at = params[:fetched_at]
end
Instance Attribute Details
#aliases ⇒ Object
Returns the value of attribute aliases.
29 30 31 |
# File 'lib/polipus/page.rb', line 29
#
# Reader for the @aliases instance variable.
#
# @return [Object] the current value of @aliases
def aliases
  @aliases
end
#body ⇒ Object (readonly)
The raw HTTP response body of the page
11 12 13 |
# File 'lib/polipus/page.rb', line 11
#
# The raw HTTP response body of the page.
#
# @return [Object] the current value of @body
def body
  @body
end
#code ⇒ Object
Integer response code of the page
19 20 21 |
# File 'lib/polipus/page.rb', line 19
#
# Integer response code of the page.
#
# @return [Object] the current value of @code
def code
  @code
end
#depth ⇒ Object
Depth of this page from the root of the crawl.
21 22 23 |
# File 'lib/polipus/page.rb', line 21
#
# Depth of this page from the root of the crawl.
#
# @return [Object] the current value of @depth
def depth
  @depth
end
#domain_aliases ⇒ Object
Returns the value of attribute domain_aliases.
31 32 33 |
# File 'lib/polipus/page.rb', line 31
#
# Reader for the @domain_aliases instance variable.
#
# @return [Object] the current value of @domain_aliases
def domain_aliases
  @domain_aliases
end
#error ⇒ Object (readonly)
Exception object, if one was raised during HTTP#fetch_page
17 18 19 |
# File 'lib/polipus/page.rb', line 17
#
# Exception object, if one was raised during HTTP#fetch_page.
#
# @return [Object] the current value of @error
def error
  @error
end
#fetched_at ⇒ Object
Returns the value of attribute fetched_at.
37 38 39 |
# File 'lib/polipus/page.rb', line 37
#
# Reader for the @fetched_at instance variable.
#
# @return [Object] the current value of @fetched_at
def fetched_at
  @fetched_at
end
#headers ⇒ Object (readonly)
Headers of the HTTP response
13 14 15 |
# File 'lib/polipus/page.rb', line 13
#
# Headers of the HTTP response.
#
# @return [Object] the current value of @headers
def headers
  @headers
end
#redirect_to ⇒ Object (readonly)
URL of the page this one redirected to, if any
15 16 17 |
# File 'lib/polipus/page.rb', line 15
#
# URL of the page this one redirected to, if any.
#
# @return [Object] the current value of @redirect_to
def redirect_to
  @redirect_to
end
#referer ⇒ Object
URL of the page that brought us to this page
23 24 25 |
# File 'lib/polipus/page.rb', line 23
#
# URL of the page that brought us to this page.
#
# @return [Object] the current value of @referer
def referer
  @referer
end
#response_time ⇒ Object
Response time of the request for this page in milliseconds
25 26 27 |
# File 'lib/polipus/page.rb', line 25
#
# Response time of the request for this page in milliseconds.
#
# @return [Object] the current value of @response_time
def response_time
  @response_time
end
#storable ⇒ Object
Whether the current page should be stored. Default: true
35 36 37 |
# File 'lib/polipus/page.rb', line 35
#
# Whether the current page should be stored.
#
# @return [Object] the current value of @storable
def storable
  @storable
end
#url ⇒ Object (readonly)
The URL of the page
9 10 11 |
# File 'lib/polipus/page.rb', line 9
#
# The URL of the page.
#
# @return [Object] the current value of @url
def url
  @url
end
#user_data ⇒ Object
OpenStruct that holds user-defined data
27 28 29 |
# File 'lib/polipus/page.rb', line 27
#
# Container for user-defined data attached to the page.
#
# @return [Object] the current value of @user_data
def user_data
  @user_data
end
Class Method Details
.from_hash(hash) ⇒ Object
228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 |
# File 'lib/polipus/page.rb', line 228
#
# Rebuild a page from a string-keyed hash using the same keys #to_hash emits.
#
# NOTE(review): 'headers' is fed to Marshal.load, which can instantiate
# arbitrary objects; only pass hashes that come from a trusted store.
#
# @param hash [Hash] string-keyed attributes; 'url' is passed to Kernel#URI
# @return [Page] a new instance whose instance variables are set from +hash+
def self.from_hash(hash)
  page = new(URI(hash['url']))
  redirect = hash['redirect_to']
  {
    '@headers' => hash['headers'] ? Marshal.load(hash['headers']) : { 'content-type' => [''] },
    '@body' => hash['body'],
    '@links' => hash['links'] ? hash['links'].map { |link| URI(link) } : [],
    '@code' => hash['code'].to_i,
    '@depth' => hash['depth'].to_i,
    '@referer' => hash['referer'],
    '@redirect_to' => (redirect && !redirect.empty?) ? URI(redirect) : nil,
    '@response_time' => hash['response_time'].to_i,
    '@fetched' => hash['fetched'],
    # Always provide an OpenStruct, as #initialize does, so callers can use
    # page.user_data.foo even when no user data was serialized.
    '@user_data' => OpenStruct.new(hash['user_data'] || {}),
    '@fetched_at' => hash['fetched_at'],
    '@error' => hash['error']
  }.each do |var, value|
    page.instance_variable_set(var, value)
  end
  page
end
.from_json(json) ⇒ Object
249 250 251 252 |
# File 'lib/polipus/page.rb', line 249
#
# Build a page from JSON text by parsing it and delegating to .from_hash.
#
# @param json [String] JSON document
# @return [Object] whatever .from_hash returns for the parsed data
def self.from_json(json)
  from_hash(JSON.parse(json))
end
Instance Method Details
#base ⇒ Object
Base URI from the HTML doc head element www.w3.org/TR/html4/struct/links.html#edef-BASE
152 153 154 155 156 157 158 159 160 |
# File 'lib/polipus/page.rb', line 152
#
# Base URI taken from the //head/base/@href node of the parsed document.
# The result is cached in @base once it is truthy.
#
# @return [URI, nil] the base URI; nil when there is no document, the href
#   cannot be turned into a URI, or the resulting URI is an empty string
def base
  unless @base
    @base =
      if doc
        href = doc.search('//head/base/@href')
        begin
          URI(href.to_s) unless href.nil?
        rescue StandardError
          # An href that does not parse is treated as "no base".
          nil
        end
      end
  end

  blank = @base && @base.to_s.empty?
  blank ? nil : @base
end
#content_type ⇒ Object
The content-type returned by the HTTP request for this page
112 113 114 |
# File 'lib/polipus/page.rb', line 112
#
# The content-type returned by the HTTP request for this page.
#
# @return [Object] first element of the 'content-type' header entry
def content_type
  values = headers['content-type']
  values.first
end
#discard_doc! ⇒ Object
Delete the Nokogiri document and response body to conserve memory
96 97 98 99 |
# File 'lib/polipus/page.rb', line 96
#
# Drop the parsed document and the response body to conserve memory.
#
# @return [nil]
def discard_doc!
  # Extract the links first: they cannot be computed once the body is gone.
  links
  @body = nil
  @doc = nil
end
#discard_links! ⇒ Object
Discard links; the next call to page.links will return an empty array
89 90 91 |
# File 'lib/polipus/page.rb', line 89
#
# Forget the page's links by replacing the cached collection with an empty
# array, so a later call to #links returns an empty array.
#
# @return [Array] the new, empty links array
def discard_links!
  @links = []
end
#doc ⇒ Object
Nokogiri document for the HTML body
81 82 83 84 |
# File 'lib/polipus/page.rb', line 81
#
# Nokogiri document for the HTML body, parsed on first use and cached in @doc.
#
# @return [Object, nil] the parsed document; nil when there is no body, the
#   page is not HTML, or parsing raised an error
def doc
  return @doc if @doc

  begin
    @doc = Nokogiri::HTML(@body) if @body && html?
  rescue StandardError
    # Parsing is best-effort: any failure yields "no document".
    nil
  end
end
#expired?(ttl) ⇒ Boolean
223 224 225 226 |
# File 'lib/polipus/page.rb', line 223
#
# Tell whether the page was fetched more than +ttl+ seconds ago.
#
# @param ttl [Integer] amount subtracted from the current epoch time
# @return [Boolean] false when fetched_at is nil; otherwise whether
#   (now - ttl) is greater than fetched_at
def expired?(ttl)
  return false if fetched_at.nil?

  cutoff = Time.now.to_i - ttl
  cutoff > fetched_at
end
#fetched? ⇒ Boolean
Was the page successfully fetched? true
if the page was fetched with no error, false
otherwise.
105 106 107 |
# File 'lib/polipus/page.rb', line 105
#
# Was the page successfully fetched?
#
# @return [Object] the current value of the @fetched flag
def fetched?
  @fetched
end
#html? ⇒ Boolean
Returns true
if the page is a HTML document, returns false
otherwise.
120 121 122 |
# File 'lib/polipus/page.rb', line 120
#
# Tell whether the page is an HTML document, judged by its content type.
#
# @return [Boolean] true when content_type starts with text/html or
#   application/xhtml+xml, false otherwise
def html?
  # The '+' must be escaped: unescaped it is a quantifier on the preceding
  # 'l', so the literal "application/xhtml+xml" would never match.
  !!(content_type =~ %r{^(text/html|application/xhtml\+xml)\b})
end
#in_domain?(uri) ⇒ Boolean
Returns true
if uri is in the same domain as the page, returns false
otherwise
184 185 186 187 |
# File 'lib/polipus/page.rb', line 184
#
# Tell whether +uri+ belongs to this page's domain: its host equals the
# page URL's host or is listed in @domain_aliases.
#
# @param uri [#host] URI to check
# @return [Boolean]
def in_domain?(uri)
  @domain_aliases ||= []
  host = uri.host
  return true if host == @url.host

  @domain_aliases.include?(host)
end
#links ⇒ Object
Array of distinct A tag HREFs from the page
64 65 66 67 68 69 70 71 72 73 74 75 76 |
# File 'lib/polipus/page.rb', line 64
#
# Array of distinct A tag HREFs from the page, made absolute and limited to
# URLs for which #in_domain? is true. The result is cached in @links.
#
# @return [Array] the collected links; empty when there is no document
def links
  return @links.to_a unless @links.nil?

  @links = Set.new
  return [] unless doc

  doc.search('//a[@href]').each do |anchor|
    href = anchor['href']
    next if href.nil? || href.empty?

    begin
      target = to_absolute(href)
    rescue StandardError
      # Skip hrefs that cannot be converted to an absolute URL.
      next
    end
    @links << target if in_domain?(target)
  end
  @links.to_a
end
#not_found? ⇒ Boolean
Returns true
if the page was not found (returned 404 code), returns false
otherwise.
144 145 146 |
# File 'lib/polipus/page.rb', line 144
#
# Tell whether the page was not found, i.e. its response code is 404.
#
# @return [Boolean]
def not_found?
  @code == 404
end
#redirect? ⇒ Boolean
Returns true
if the page is a HTTP redirect, returns false
otherwise.
128 129 130 |
# File 'lib/polipus/page.rb', line 128
#
# Tell whether the page is an HTTP redirect, judged by its response code.
#
# @return [Boolean] true when @code lies in 300..308, false otherwise
def redirect?
  # The upper bound is 308 so that "308 Permanent Redirect" is recognised.
  (300..308).include?(@code)
end
#storable? ⇒ Boolean
Returns true if the page is marked as storable, false otherwise. Default is true.
219 220 221 |
# File 'lib/polipus/page.rb', line 219
#
# Tell whether the page is marked as storable.
#
# @return [Object] the current value of the @storable flag
def storable?
  @storable
end
#success? ⇒ Boolean
Returns true
if the page is a HTTP success, returns false
otherwise.
136 137 138 |
# File 'lib/polipus/page.rb', line 136
#
# Tell whether the page is an HTTP success, judged by its response code.
#
# @return [Boolean] true when @code lies in 200..206, false otherwise
def success?
  success_codes = 200..206
  success_codes.include?(@code)
end
#to_absolute(link) ⇒ Object
Converts relative URL link into an absolute URL based on the location of the page
166 167 168 169 170 171 172 173 174 175 176 177 178 |
# File 'lib/polipus/page.rb', line 166
#
# Converts relative URL +link+ into an absolute URL based on the location of
# the page: it is merged into #base when one exists, otherwise into @url.
#
# @param link [#to_s, nil] the link to convert
# @return [URI, nil] the absolute URI, or nil when +link+ is nil
# @raise [URI::InvalidURIError] when the cleaned-up link cannot be parsed
def to_absolute(link)
  return nil if link.nil?

  # URI.encode / URI.decode were removed in Ruby 3.0. The RFC 2396 parser
  # implements the same escape/unescape; on Rubies that predate the
  # RFC2396_PARSER constant, DEFAULT_PARSER is that parser.
  parser = defined?(URI::RFC2396_PARSER) ? URI::RFC2396_PARSER : URI::DEFAULT_PARSER

  # remove anchor
  without_anchor = link.to_s.gsub(/#[a-zA-Z0-9_-]*$/, '')
  # Unescape then escape, so already-escaped input is not escaped twice.
  link = parser.escape(parser.unescape(without_anchor))

  relative = URI(link)
  absolute = base ? base.merge(relative) : @url.merge(relative)

  absolute.path = '/' if absolute.path.empty?

  absolute
end
#to_hash ⇒ Object
189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 |
# File 'lib/polipus/page.rb', line 189
#
# Serialize the page into a string-keyed hash.
#
# Headers are stored as a Marshal dump, links/referer/redirect_to as
# strings, and user data as the OpenStruct's marshal_dump ({} when nil).
#
# @return [Hash] string-keyed representation of the page
def to_hash
  custom = @user_data.nil? ? {} : @user_data.marshal_dump
  link_strings = links.map(&:to_s)

  {
    'url' => @url.to_s,
    'headers' => Marshal.dump(@headers),
    'body' => @body,
    'links' => link_strings,
    'code' => @code,
    'depth' => @depth,
    'referer' => @referer.to_s,
    'redirect_to' => @redirect_to.to_s,
    'response_time' => @response_time,
    'fetched' => @fetched,
    'user_data' => custom,
    'fetched_at' => @fetched_at,
    'error' => @error
  }
end
#to_json ⇒ Object
207 208 209 210 211 212 |
# File 'lib/polipus/page.rb', line 207
#
# Serialize the page to JSON, leaving out entries whose value is nil or
# empty, and the 'headers' entry when the content type is empty.
#
# @param args [Array] generator arguments, forwarded to Hash#to_json; they
#   are accepted so the page can be nested in other JSON structures, where
#   the generator calls to_json(state)
# @return [String] JSON text
def to_json(*args)
  # reject builds a new hash instead of deleting from one while iterating it.
  th = to_hash.reject { |_key, value| value.nil? || (value.respond_to?(:empty?) && value.empty?) }
  th.delete('headers') if content_type.empty?
  th.to_json(*args)
end