Class: Polipus::Page

Inherits:
Object
  • Object
show all
Defined in:
lib/polipus/page.rb

Instance Attribute Summary collapse

Class Method Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(url, params = {}) ⇒ Page

Create a new page



41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
# File 'lib/polipus/page.rb', line 41

# Create a new page
#
# url    - String or URI of the page; converted with URI().
# params - optional Hash of response data. Keys read here: :code, :headers,
#          :aka, :referer, :depth, :redirect_to, :response_time, :body,
#          :error, :domain_aliases and :fetched_at.
def initialize(url, params = {})
  @url = URI(url)
  @code = params[:code]
  @headers = params[:headers] || {}
  # content_type calls .first on this entry, so it must always be present.
  @headers['content-type'] ||= ['']
  @aliases = Array(params[:aka]).compact
  @referer = params[:referer]
  @depth = params[:depth] || 0
  # Resolved while @body is still unset, so to_absolute finds no document
  # (hence no <base> element) and resolves the target against @url.
  @redirect_to = to_absolute(params[:redirect_to])
  @response_time = params[:response_time]
  @body = params[:body]
  @error = params[:error]
  # A page counts as fetched as soon as a response code is known.
  @fetched = !params[:code].nil?
  @user_data = OpenStruct.new
  # Read-only lookup: the default is never written back into the caller's
  # hash, so frozen or shared params hashes are safe to pass in.
  @domain_aliases = params[:domain_aliases] || []
  @storable = true
  @fetched_at = params[:fetched_at]
end

Instance Attribute Details

#aliases ⇒ Object

Returns the value of attribute aliases.



28
29
30
# File 'lib/polipus/page.rb', line 28

# Returns @aliases, which the constructor builds from params[:aka]
# (wrapped in an Array, nil entries removed).
def aliases
  @aliases
end

#body ⇒ Object (readonly)

The raw HTTP response body of the page



10
11
12
# File 'lib/polipus/page.rb', line 10

# Returns the raw response body (@body). It is nil when no :body was given
# to the constructor and after discard_doc! has run.
def body
  @body
end

#code ⇒ Object

Integer response code of the page



18
19
20
# File 'lib/polipus/page.rb', line 18

# Returns the response code (@code). The constructor stores params[:code]
# as given; from_hash stores the hash value converted with to_i.
def code
  @code
end

#depth ⇒ Object

Depth of this page from the root of the crawl.



20
21
22
# File 'lib/polipus/page.rb', line 20

# Returns @depth; the constructor defaults it to 0 when params[:depth]
# is absent.
def depth
  @depth
end

#domain_aliases ⇒ Object

Returns the value of attribute domain_aliases.



30
31
32
# File 'lib/polipus/page.rb', line 30

# Returns @domain_aliases: the extra host names that in_domain? accepts in
# addition to the page's own host.
def domain_aliases
  @domain_aliases
end

#error ⇒ Object (readonly)

Exception object, if one was raised during HTTP#fetch_page



16
17
18
# File 'lib/polipus/page.rb', line 16

# Returns @error: params[:error] as passed to the constructor, or the
# 'error' entry of the hash when the page was built by from_hash.
def error
  @error
end

#fetched_at ⇒ Object

Returns the value of attribute fetched_at.



36
37
38
# File 'lib/polipus/page.rb', line 36

# Returns @fetched_at. expired? compares it with Time.now.to_i, so it
# presumably holds epoch seconds -- confirm against the code that sets it.
def fetched_at
  @fetched_at
end

#headers ⇒ Object (readonly)

Headers of the HTTP response



12
13
14
# File 'lib/polipus/page.rb', line 12

# Returns the response headers hash (@headers). The constructor guarantees a
# 'content-type' entry, defaulting it to [''].
def headers
  @headers
end

#redirect_to ⇒ Object (readonly)

URL of the page this one redirected to, if any



14
15
16
# File 'lib/polipus/page.rb', line 14

# Returns @redirect_to: the redirect target made absolute by to_absolute in
# the constructor, or nil when no target was given.
def redirect_to
  @redirect_to
end

#referer ⇒ Object

URL of the page that brought us to this page



22
23
24
# File 'lib/polipus/page.rb', line 22

# Returns @referer (params[:referer] as passed to the constructor).
def referer
  @referer
end

#response_time ⇒ Object

Response time of the request for this page in milliseconds



24
25
26
# File 'lib/polipus/page.rb', line 24

# Returns @response_time (params[:response_time] as passed to the
# constructor; from_hash stores the hash value converted with to_i).
def response_time
  @response_time
end

#storable ⇒ Object

Whether the current page should be stored. Default: true.



34
35
36
# File 'lib/polipus/page.rb', line 34

# Returns @storable; the constructor sets it to true.
def storable
  @storable
end

#url ⇒ Object (readonly)

The URL of the page



8
9
10
# File 'lib/polipus/page.rb', line 8

# Returns @url, the URI object the constructor builds from its url argument.
def url
  @url
end

#user_data ⇒ Object

OpenStruct that holds user-defined data



26
27
28
# File 'lib/polipus/page.rb', line 26

# Returns @user_data. The constructor creates it as an empty OpenStruct;
# from_hash sets it to nil when the hash has no 'user_data' entry.
def user_data
  @user_data
end

Class Method Details

.from_hash(hash) ⇒ Object



227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
# File 'lib/polipus/page.rb', line 227

# Rebuilds a page from the string-keyed hash layout produced by to_hash.
#
# hash - Hash with a 'url' entry and, optionally, 'headers', 'body', 'links',
#        'code', 'depth', 'referer', 'redirect_to', 'response_time',
#        'fetched', 'user_data', 'fetched_at' and 'error'.
#
# Returns the new page with its instance variables overwritten from the hash.
def self.from_hash(hash)
  page = new(URI(hash['url']))

  # NOTE(review): Marshal.load executes on whatever bytes the hash carries;
  # only feed this method data written by to_hash from a trusted store.
  stored_headers = hash['headers']
  page.instance_variable_set(
    '@headers',
    stored_headers ? Marshal.load(stored_headers) : { 'content-type' => [''] }
  )
  page.instance_variable_set('@body', hash['body'])

  stored_links = hash['links']
  page.instance_variable_set(
    '@links',
    stored_links ? stored_links.map { |link| URI(link) } : []
  )
  page.instance_variable_set('@code', hash['code'].to_i)
  page.instance_variable_set('@depth', hash['depth'].to_i)
  page.instance_variable_set('@referer', hash['referer'])

  # An empty string means "no redirect" (to_hash writes nil.to_s).
  target = hash['redirect_to']
  has_target = target && !target.empty?
  page.instance_variable_set('@redirect_to', has_target ? URI(target) : nil)
  page.instance_variable_set('@response_time', hash['response_time'].to_i)
  page.instance_variable_set('@fetched', hash['fetched'])

  stored_user_data = hash['user_data']
  page.instance_variable_set(
    '@user_data',
    stored_user_data ? OpenStruct.new(stored_user_data) : nil
  )
  page.instance_variable_set('@fetched_at', hash['fetched_at'])
  page.instance_variable_set('@error', hash['error'])

  page
end

.from_json(json) ⇒ Object



248
249
250
251
# File 'lib/polipus/page.rb', line 248

# Parses a JSON document and hands the resulting hash to from_hash.
#
# json - String containing the JSON form of a page.
#
# Returns whatever from_hash returns for the parsed hash.
def self.from_json(json)
  from_hash(JSON.parse(json))
end

Instance Method Details

#base ⇒ Object

Base URI from the HTML doc head element www.w3.org/TR/html4/struct/links.html#edef-BASE



151
152
153
154
155
156
157
158
159
# File 'lib/polipus/page.rb', line 151

# Base URI declared by the document's <head><base href="..."> element.
#
# The lookup result is cached in @base. Returns nil when there is no parsed
# document, when building the URI fails, or when the href is empty.
def base
  @base ||= begin
    if doc
      href = doc.search('//head/base/@href')
      begin
        href.nil? ? nil : URI(href.to_s)
      rescue StandardError
        # Best effort: an unparsable href counts as "no base".
        nil
      end
    end
  end

  blank = @base && @base.to_s.empty?
  blank ? nil : @base
end

#content_type ⇒ Object

The content-type returned by the HTTP request for this page



111
112
113
# File 'lib/polipus/page.rb', line 111

# First value of the 'content-type' response header.
def content_type
  values = headers['content-type']
  values.first
end

#discard_doc! ⇒ Object

Delete the Nokogiri document and response body to conserve memory



95
96
97
98
# File 'lib/polipus/page.rb', line 95

# Drops the parsed document and the raw body so they can be garbage
# collected. Always returns nil.
def discard_doc!
  # Extract the links first: they cannot be computed once the body is gone.
  links
  @body = nil
  @doc = nil
end

#discard_links! ⇒ Object

Discard links, a next call of page.links will return an empty array



88
89
90
# File 'lib/polipus/page.rb', line 88

# Replaces the cached link collection with an empty array. Because links
# returns @links whenever it is non-nil, later calls yield [] without
# parsing the document again.
def discard_links!
  @links = []
end

#doc ⇒ Object

Nokogiri document for the HTML body



80
81
82
83
# File 'lib/polipus/page.rb', line 80

# Nokogiri document for the HTML body, parsed on first use and cached.
#
# Returns nil when there is no body, when the page is not HTML, or when
# parsing raises.
def doc
  return @doc if @doc

  begin
    @doc = Nokogiri::HTML(@body) if @body && html?
  rescue StandardError
    # Best effort: a body that cannot be parsed yields no document.
    nil
  end
end

#expired?(ttl) ⇒ Boolean

Returns:

  • (Boolean)


222
223
224
225
# File 'lib/polipus/page.rb', line 222

# Tells whether the page was fetched more than ttl ago.
#
# ttl - age limit, subtracted from Time.now.to_i (so expressed in seconds).
#
# Returns false when fetched_at is nil; otherwise true exactly when
# fetched_at is older than the current time minus ttl.
def expired?(ttl)
  stamp = fetched_at
  return false if stamp.nil?

  cutoff = Time.now.to_i - ttl
  cutoff > stamp
end

#fetched? ⇒ Boolean

Was the page successfully fetched? true if the page was fetched with no error, false otherwise.

Returns:

  • (Boolean)


104
105
106
# File 'lib/polipus/page.rb', line 104

# Returns @fetched. The constructor sets it to true exactly when
# params[:code] is not nil; from_hash copies the hash's 'fetched' entry.
def fetched?
  @fetched
end

#html? ⇒ Boolean

Returns true if the page is a HTML document, returns false otherwise.

Returns:

  • (Boolean)


119
120
121
# File 'lib/polipus/page.rb', line 119

# Tells whether the response's content type marks the page as an HTML or
# XHTML document.
#
# Returns the match offset (truthy) when content_type begins with
# "text/html" or "application/xhtml+xml", and nil otherwise.
def html?
  # The "+" has to be escaped: left bare it is a quantifier on the preceding
  # "l", so the real media type "application/xhtml+xml" would never match.
  content_type =~ %r{^(text/html|application/xhtml\+xml)\b}
end

#in_domain?(uri) ⇒ Boolean

Returns true if uri is in the same domain as the page, returns false otherwise

Returns:

  • (Boolean)


183
184
185
186
# File 'lib/polipus/page.rb', line 183

# Tells whether uri points at this page's host or at one of the configured
# domain aliases.
#
# uri - object responding to host (a URI).
#
# Returns true or false.
def in_domain?(uri)
  # Pages restored without aliases may have @domain_aliases unset.
  @domain_aliases ||= []

  candidate = uri.host
  return true if candidate == @url.host

  @domain_aliases.include?(candidate)
end

#links ⇒ Object

Array of distinct A tag HREFs from the page



63
64
65
66
67
68
69
70
71
72
73
74
75
# File 'lib/polipus/page.rb', line 63

# Distinct absolute URIs of the page's <a href> elements that in_domain?
# accepts.
#
# The result is cached in @links, so the document is scanned once. Returns
# an Array, empty when there is no parsed document.
def links
  return @links.to_a unless @links.nil?

  @links = Set.new
  page_doc = doc
  return [] unless page_doc

  page_doc.search('//a[@href]').each do |anchor|
    href = anchor['href']
    next if href.nil? || href.empty?

    target = begin
      to_absolute(href)
    rescue StandardError
      # Skip hrefs that cannot be turned into a URI.
      next
    end
    @links << target if in_domain?(target)
  end

  @links.to_a
end

#not_found? ⇒ Boolean

Returns true if the page was not found (returned 404 code), returns false otherwise.

Returns:

  • (Boolean)


143
144
145
# File 'lib/polipus/page.rb', line 143

# True when the response code is 404, false otherwise.
def not_found?
  @code == 404
end

#redirect? ⇒ Boolean

Returns true if the page is a HTTP redirect, returns false otherwise.

Returns:

  • (Boolean)


127
128
129
# File 'lib/polipus/page.rb', line 127

# True when the response code is an HTTP redirect (300 through 308), false
# otherwise, including when no code is set.
def redirect?
  # 308 Permanent Redirect (RFC 7538) belongs to the redirect family too.
  (300..308).include?(@code)
end

#storable? ⇒ Boolean

Returns true if the page is marked as storable, false otherwise. Default is true.

Returns:

  • (Boolean)


218
219
220
# File 'lib/polipus/page.rb', line 218

# Predicate form of the storable flag: returns @storable unchanged (the
# constructor initialises it to true).
def storable?
  @storable
end

#success? ⇒ Boolean

Returns true if the page is a HTTP success, returns false otherwise.

Returns:

  • (Boolean)


135
136
137
# File 'lib/polipus/page.rb', line 135

# True when the response code lies between 200 and 206 inclusive, false
# otherwise, including when no code is set.
def success?
  success_codes = 200..206
  success_codes.cover?(@code)
end

#to_absolute(link) ⇒ Object

Converts relative URL link into an absolute URL based on the location of the page



165
166
167
168
169
170
171
172
173
174
175
176
177
# File 'lib/polipus/page.rb', line 165

# Converts relative URL *link* into an absolute URL based on the location of
# the page (or on the document's <base> element when base returns one).
#
# link - String, or object responding to to_s, holding a URL; may be nil.
#
# Returns a URI, or nil when link is nil. Errors raised by URI() or
# URI#merge for malformed input are not caught here.
def to_absolute(link)
  return nil if link.nil?

  # URI.encode / URI.decode were removed in Ruby 3.0. They delegated to the
  # RFC 2396 parser, which is still reachable on old and new rubies alike.
  parser = defined?(URI::RFC2396_PARSER) ? URI::RFC2396_PARSER : URI::DEFAULT_PARSER

  # remove anchor (only fragments made of letters, digits, "_" and "-"),
  # then normalise percent-encoding by unescaping and escaping again
  link = parser.escape(parser.unescape(link.to_s.gsub(/#[a-zA-Z0-9_-]*$/, '')))

  relative = URI(link)
  absolute = base ? base.merge(relative) : @url.merge(relative)

  # "http://host" and "http://host/" name the same resource; use the latter.
  absolute.path = '/' if absolute.path.empty?

  absolute
end

#to_hash ⇒ Object



188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
# File 'lib/polipus/page.rb', line 188

# Builds the string-keyed hash form of the page that from_hash reads back.
#
# Headers are stored as a Marshal dump; URIs, the referer and the error are
# stored via to_s, so nil values become empty strings.
def to_hash
  record = {}
  record['url'] = @url.to_s
  record['headers'] = Marshal.dump(@headers)
  record['body'] = @body
  record['links'] = links.map { |link| link.to_s }
  record['code'] = @code
  record['depth'] = @depth
  record['referer'] = @referer.to_s
  record['redirect_to'] = @redirect_to.to_s
  record['response_time'] = @response_time
  record['fetched'] = @fetched
  record['user_data'] = if @user_data.nil?
                          {}
                        else
                          @user_data.marshal_dump
                        end
  record['fetched_at'] = @fetched_at
  record['error'] = @error.to_s
  record
end

#to_json ⇒ Object



206
207
208
209
210
211
# File 'lib/polipus/page.rb', line 206

# Serialises the page to a JSON string.
#
# Entries of to_hash whose value is nil or empty are left out, and the
# 'headers' entry is left out when the content type is blank.
#
# args - optional generator arguments. They are accepted and forwarded so
#        that JSON.generate, or to_json on a container holding a page, can
#        pass their state through instead of raising ArgumentError.
#
# Returns a String.
def to_json(*args)
  # Build a filtered copy rather than deleting from the hash while iterating.
  th = to_hash.reject { |_key, v| v.nil? || (v.respond_to?(:empty?) && v.empty?) }
  th.delete('headers') if content_type.empty?
  th.to_json(*args)
end