Class: Sunbro::Page

Inherits:
Object
  • Object
show all
Defined in:
lib/sunbro/page.rb

Instance Attribute Summary collapse

Class Method Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(url, params = {}) ⇒ Page

Create a new page



32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
# File 'lib/sunbro/page.rb', line 32

# Create a new page.
#
# @param url the page's URL; stored unchanged in @url
# @param params [Hash] optional details read by symbol key: :code,
#   :headers, :aka, :referer, :depth, :redirect_to, :response_time,
#   :error, :force_format, :body and :redirect_from
def initialize(url, params = {})
  @url = url

  # Response metadata. A missing header hash becomes an empty one and a
  # 'content-type' entry is always present afterwards.
  @code = params[:code]
  @headers = params[:headers] || {}
  @headers['content-type'] = [''] unless @headers['content-type']

  # Crawl bookkeeping.
  @aliases = Array(params[:aka]).compact
  @referer = params[:referer]
  @depth = params[:depth] || 0

  # Resolved here, before the body and format override are stored, so the
  # resolution sees the same state it always has.
  @redirect_to = to_absolute(params[:redirect_to])

  @response_time = params[:response_time]
  @error = params[:error]
  # A page counts as fetched as soon as a response code was supplied.
  @fetched = !@code.nil?
  @force_format = params[:force_format]
  @body = params[:body]
  @redirect_from = params[:redirect_from]
end

Instance Attribute Details

#body ⇒ Object (readonly)

The raw HTTP response body of the page



7
8
9
# File 'lib/sunbro/page.rb', line 7

# Reader for the raw HTTP response body of the page.
#
# @return the value held in @body (nil once #discard_doc! has run)
def body
  @body
end

#code ⇒ Object

Integer response code of the page



16
17
18
# File 'lib/sunbro/page.rb', line 16

# Reader for the response code of the page.
#
# @return the value held in @code; nil when no :code was supplied
def code
  @code
end

#depth ⇒ Object

Depth of this page from the root of the crawl. This is not necessarily the shortest path; use PageStore#shortest_paths! to find that value.



21
22
23
# File 'lib/sunbro/page.rb', line 21

# Reader for the depth of this page from the root of the crawl.
#
# @return the value held in @depth (the constructor defaults it to 0)
def depth
  @depth
end

#error ⇒ Object (readonly)

Exception object, if one was raised during HTTP#fetch_page



13
14
15
# File 'lib/sunbro/page.rb', line 13

# Reader for the error recorded for this page, if any.
#
# @return the value held in @error; nil when no :error was supplied
def error
  @error
end

#headers ⇒ Object (readonly)

Headers of the HTTP response



9
10
11
# File 'lib/sunbro/page.rb', line 9

# Reader for the headers of the HTTP response.
#
# @return the hash held in @headers; the constructor guarantees it has a
#   'content-type' entry
def headers
  @headers
end

#redirect_from ⇒ Object

Returns the value of attribute redirect_from.



27
28
29
# File 'lib/sunbro/page.rb', line 27

# Reader for the redirect_from attribute.
#
# @return the value held in @redirect_from; nil when none was supplied
def redirect_from
  @redirect_from
end

#redirect_to ⇒ Object (readonly)

URL of the page this one redirected to, if any



11
12
13
# File 'lib/sunbro/page.rb', line 11

# Reader for the URL this page redirected to, if any.
#
# @return the value held in @redirect_to; the constructor stores the
#   result of #to_absolute here, so it is nil when no redirect was given
def redirect_to
  @redirect_to
end

#referer ⇒ Object

URL of the page that brought us to this page



23
24
25
# File 'lib/sunbro/page.rb', line 23

# Reader for the URL of the page that brought us to this page.
#
# @return the value held in @referer; nil when none was supplied
def referer
  @referer
end

#response_time ⇒ Object

Response time of the request for this page in milliseconds



25
26
27
# File 'lib/sunbro/page.rb', line 25

# Reader for the response time of the request for this page.
#
# @return the value held in @response_time; nil when none was supplied
def response_time
  @response_time
end

#url ⇒ Object

The URL of the page



5
6
7
# File 'lib/sunbro/page.rb', line 5

# Reader for the URL of the page.
#
# @return the value held in @url, exactly as given to the constructor
def url
  @url
end

#visited ⇒ Object

Boolean indicating whether or not this page has been visited in PageStore#shortest_paths!



18
19
20
# File 'lib/sunbro/page.rb', line 18

# Reader for the visited flag.
#
# @return the value held in @visited; the constructor does not set it, so
#   it is nil until assigned elsewhere
def visited
  @visited
end

Class Method Details

.from_hash(hash) ⇒ Object



209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
# File 'lib/sunbro/page.rb', line 209

# Rebuild a page from a string-keyed hash such as the one produced by
# #to_hash.
#
# 'headers' is expected to be a JSON document. When it is missing, blank,
# or does not decode to a hash, the headers set up by the constructor are
# kept instead of being overwritten with nil, so #content_type stays usable.
# Decoded headers always get a 'content-type' entry, as in the constructor.
#
# @param hash [Hash] string-keyed attributes; 'url' is required
# @return the rebuilt page
# @raise [JSON::ParserError] if 'headers' is present but not valid JSON
def self.from_hash(hash)
  page = new(URI(hash['url']))

  attributes = {
    '@body'          => hash['body'],
    '@code'          => hash['code'].to_i,
    '@error'         => hash['error'],
    '@visited'       => hash['visited'],
    '@referer'       => hash['referer'],
    '@redirect_to'   => hash['redirect_to'].present? ? URI(hash['redirect_to']) : nil,
    '@redirect_from' => hash['redirect_from'].present? ? URI(hash['redirect_from']) : nil,
    '@response_time' => hash['response_time'].to_i,
    '@fetched'       => hash['fetched']
  }

  # JSON.parse only builds plain data structures, which is all a header
  # hash needs.
  raw_headers = hash['headers']
  headers = JSON.parse(raw_headers) if raw_headers.present?
  if headers.is_a?(Hash)
    headers['content-type'] ||= ['']
    attributes['@headers'] = headers
  end

  attributes.each { |name, value| page.instance_variable_set(name, value) }
  page
end

Instance Method Details

#base ⇒ Object

Base URI taken from the `base` element in the HTML document's head (see https://www.w3.org/TR/html4/struct/links.html#edef-BASE).



148
149
150
151
152
153
154
155
156
# File 'lib/sunbro/page.rb', line 148

# Base URI declared by the document's <base href> element.
#
# The lookup result is kept in @base once it is truthy. Any error raised
# while building the URI is swallowed and treated as "no base".
#
# @return the parsed base URI, or nil when there is no document, no usable
#   href, or the href is empty
def base
  @base ||= if doc
    base_href = doc.search('//head/base/@href')
    begin
      base_href.nil? ? nil : URI(base_href.to_s)
    rescue StandardError
      nil
    end
  end

  # An empty href still parses to a URI; report it as absent.
  @base unless @base && @base.to_s.empty?
end

#content_type ⇒ Object

The content-type returned by the HTTP request for this page



100
101
102
# File 'lib/sunbro/page.rb', line 100

# The content-type returned by the HTTP request for this page.
#
# @return the first value stored under the 'content-type' header
def content_type
  type_values = headers['content-type']
  type_values.first
end

#cookies ⇒ Object

Array of cookies received with this page as WEBrick::Cookie objects.



93
94
95
# File 'lib/sunbro/page.rb', line 93

# Cookies received with this page, parsed from the 'Set-Cookie' header
# with WEBrick::Cookie.parse_set_cookies.
#
# Best effort: any StandardError raised while reading or parsing the
# header yields an empty array.
#
# @return [Array] the parsed cookies, or [] on any failure
def cookies
  raw_cookies = @headers['Set-Cookie']
  WEBrick::Cookie.parse_set_cookies(raw_cookies)
rescue StandardError
  []
end

#discard_doc! ⇒ Object

Delete the Nokogiri document and response body to conserve memory



78
79
80
# File 'lib/sunbro/page.rb', line 78

# Drop the response body and the parsed document to conserve memory.
#
# @return [nil]
def discard_doc!
  @body = nil
  @doc = nil
end

#doc ⇒ Object

Nokogiri document for the HTML body



53
54
55
56
57
58
59
60
61
62
63
64
65
# File 'lib/sunbro/page.rb', line 53

# Nokogiri document for the page body, built on first use and cached in
# @doc while it is truthy.
#
# Images are never parsed. Otherwise the parser is chosen by asking
# should_parse_as? for :xml first and :html second; any other page with a
# body goes through Nokogiri.parse.
#
# @return the parsed document, or nil for images and for pages that match
#   no format and have no body
def doc
  return @doc if @doc
  return @doc = nil if image?

  @doc =
    if should_parse_as?(:xml)
      Nokogiri::XML(@body, @url.to_s)
    elsif should_parse_as?(:html)
      Nokogiri::HTML(@body, @url.to_s)
    elsif @body
      Nokogiri.parse(@body, @url.to_s)
    end
end

#fetched? ⇒ Boolean

Was the page successfully fetched? true if the page was fetched with no error, false otherwise.

Returns:

  • (Boolean)


86
87
88
# File 'lib/sunbro/page.rb', line 86

# Whether the page was fetched.
#
# @return the value held in @fetched; the constructor sets it to true
#   exactly when a :code was supplied
def fetched?
  @fetched
end

#html? ⇒ Boolean

Returns true if the page is an HTML document, returns false otherwise.

Returns:

  • (Boolean)


116
117
118
# File 'lib/sunbro/page.rb', line 116

# Whether the page's content type marks it as an HTML document.
#
# Matches content types starting with text/html or application/xhtml+xml.
# The '+' is escaped so it is matched literally; unescaped it acts as a
# quantifier and the real XHTML MIME type never matches.
#
# @return [Boolean] true for HTML/XHTML content types, false otherwise
def html?
  !!(content_type =~ %r{^(text/html|application/xhtml\+xml)\b})
end

#image? ⇒ Boolean

Returns true if the page is an image, returns false otherwise.

Returns:

  • (Boolean)


108
109
110
# File 'lib/sunbro/page.rb', line 108

# Whether the page's content type marks it as an image.
#
# @return [Boolean] true when the content type starts with "image/"
#   followed by a word character, false otherwise
def image?
  match_offset = content_type =~ %r{^(image/)\b}
  !match_offset.nil?
end

#in_domain?(uri) ⇒ Boolean

Returns true if uri is in the same domain as the page, returns false otherwise

Returns:

  • (Boolean)


181
182
183
# File 'lib/sunbro/page.rb', line 181

# Whether +uri+ has the same host as this page.
#
# @param uri an object responding to #host
# @return [Boolean] true when both hosts compare equal
def in_domain?(uri)
  candidate_host = uri.host
  page_host = @url.host
  candidate_host == page_host
end

#is_valid? ⇒ Boolean

Returns:

  • (Boolean)


67
68
69
# File 'lib/sunbro/page.rb', line 67

# Whether the page is usable: its URL is not about:blank, it was not a
# 404, and #present? holds.
#
# The URL is compared through #to_s. A URI object never equals a String,
# so comparing the URL directly would let about:blank pages through.
#
# @return false for about:blank or 404 pages, otherwise the result of
#   #present?
def is_valid?
  (url.to_s != 'about:blank') && !not_found? && present?
end

#marshal_dump ⇒ Object



185
186
187
# File 'lib/sunbro/page.rb', line 185

# State captured for Marshal, as a positional array.
#
# The order must match what #marshal_load expects: url, headers, body,
# links, code, visited, depth, referer, redirect_to, response_time,
# fetched.
#
# @return [Array] the eleven instance variable values, nil where unset
def marshal_dump
  %i[@url @headers @body @links @code @visited @depth @referer @redirect_to @response_time @fetched]
    .map { |ivar| instance_variable_get(ivar) }
end

#marshal_load(ary) ⇒ Object



189
190
191
# File 'lib/sunbro/page.rb', line 189

# Restore state from the positional array produced by #marshal_dump.
#
# Positions are: url, headers, body, links, code, visited, depth, referer,
# redirect_to, response_time, fetched. Missing trailing entries leave the
# matching instance variables nil.
#
# @param ary [Array] the dumped values
# @return [Array] +ary+ itself
def marshal_load(ary)
  slots = %i[@url @headers @body @links @code @visited @depth @referer @redirect_to @response_time @fetched]
  slots.each_with_index { |ivar, position| instance_variable_set(ivar, ary[position]) }
  ary
end

#not_found? ⇒ Boolean

Returns true if the page was not found (returned 404 code), returns false otherwise.

Returns:

  • (Boolean)


140
141
142
# File 'lib/sunbro/page.rb', line 140

# Whether the page was not found, i.e. the response code is 404.
#
# @return [Boolean] true when @code equals 404, false otherwise
def not_found?
  @code == 404
end

#present? ⇒ Boolean

Returns:

  • (Boolean)


71
72
73
# File 'lib/sunbro/page.rb', line 71

# Whether the page carries usable content: no error, a response code, a
# non-blank body and a parsed document.
#
# The checks run in that order and stop at the first that fails, so #doc
# is only built when everything before it holds.
#
# @return the document from #doc when every check passes, otherwise the
#   first falsy check result (false or nil)
def present?
  return false if error

  status = code
  return status unless status

  has_body = body.present?
  return has_body unless has_body

  doc
end

#redirect? ⇒ Boolean

Returns true if the page is an HTTP redirect, returns false otherwise.

Returns:

  • (Boolean)


132
133
134
# File 'lib/sunbro/page.rb', line 132

# Whether the page is an HTTP redirect.
#
# Covers status codes 300 through 308, so 308 Permanent Redirect is
# recognised alongside the older redirect codes.
#
# @return [Boolean] true when @code is within 300..308, false otherwise
#   (including when no code is set)
def redirect?
  (300..308).include?(@code)
end

#to_absolute(link) ⇒ Object

Converts relative URL link into an absolute URL based on the location of the page



163
164
165
166
167
168
169
170
171
172
173
174
175
# File 'lib/sunbro/page.rb', line 163

# Converts relative URL +link+ into an absolute URL based on the location
# of the page.
#
# A trailing fragment made of letters, digits, '_' or '-' is removed, the
# link's percent-escaping is normalised, and the result is resolved against
# #base when the document declares one, otherwise against the page URL. An
# empty path becomes '/'.
#
# @param link the link to resolve; anything responding to #to_s, or nil
# @return the absolute URI, or nil when +link+ is nil
# @raise [URI::InvalidURIError] if the cleaned link is not a valid URI
def to_absolute(link)
  return nil if link.nil?

  # URI.encode / URI.decode were removed in Ruby 3.0. The RFC 2396 parser
  # provides the same escaping rules; newer uri versions expose it as a
  # dedicated constant, older ones as the default parser.
  parser = defined?(URI::RFC2396_PARSER) ? URI::RFC2396_PARSER : URI::DEFAULT_PARSER

  # remove anchor
  without_anchor = link.to_s.gsub(/#[a-zA-Z0-9_-]*$/, '')
  # Unescape first so already-escaped input is not escaped twice.
  link = parser.escape(parser.unescape(without_anchor))

  relative = URI(link)
  document_base = base
  absolute = (document_base || @url).merge(relative)

  absolute.path = '/' if absolute.path.empty?

  absolute
end

#to_hash ⇒ Object



193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
# File 'lib/sunbro/page.rb', line 193

# Serialise the page into a string-keyed hash.
#
# Headers are stored as a JSON string; the URL, error, referer and redirect
# fields are stringified when set. Entries whose value is nil are left out,
# while false values are kept.
#
# @return [Hash] the non-nil attributes of the page
def to_hash
  stringify = ->(value) { value ? value.to_s : nil }

  entries = [
    ['url',           @url.to_s],
    ['headers',       headers.to_json],
    ['body',          @body],
    ['code',          @code],
    ['error',         stringify.call(@error)],
    ['visited',       @visited],
    ['referer',       stringify.call(@referer)],
    ['redirect_to',   stringify.call(@redirect_to)],
    ['redirect_from', stringify.call(@redirect_from)],
    ['response_time', @response_time],
    ['fetched',       @fetched]
  ]

  entries.each_with_object({}) do |(key, value), result|
    result[key] = value unless value.nil?
  end
end

#xml? ⇒ Boolean

Returns true if the page is an XML document, returns false otherwise.

Returns:

  • (Boolean)


124
125
126
# File 'lib/sunbro/page.rb', line 124

# Whether the page's content type marks it as an XML document.
#
# @return [Boolean] true when the content type starts with text/xml or
#   application/xml, false otherwise
def xml?
  xml_type = %r{^(text/xml|application/xml)\b}
  (content_type =~ xml_type) ? true : false
end