Class: Polipus::Page

Inherits:
Object
  • Object
show all
Defined in:
lib/polipus/page.rb

Instance Attribute Summary collapse

Class Method Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(url, params = {}) ⇒ Page

Create a new page



44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
# File 'lib/polipus/page.rb', line 44

# Create a new page.
#
# @param url [String, URI] address of the page; coerced with Kernel#URI
# @param params [Hash] optional attributes read by symbol key:
#   :code, :headers, :aka, :referer, :depth, :redirect_to, :response_time,
#   :body, :error, :domain_aliases, :fetched_at
def initialize(url, params = {})
  @url = URI(url)
  @code = params[:code]
  # The caller's :headers hash is kept by reference and gains a
  # 'content-type' entry when it has none.
  @headers = params[:headers] || {}
  @headers['content-type'] ||= ['']
  @aliases = Array(params[:aka]).compact
  @referer = params[:referer]
  @depth = params[:depth] || 0
  @redirect_to = to_absolute(params[:redirect_to])
  @response_time = params[:response_time]
  @body = params[:body]
  @error = params[:error]
  # A page counts as fetched as soon as a response code is present.
  @fetched = !params[:code].nil?
  @user_data = OpenStruct.new
  # Plain `||` rather than `||=`: the default must not be written back into
  # the caller's hash (which may be frozen or reused for other pages).
  @domain_aliases = params[:domain_aliases] || []
  @storable = true
  @fetched_at = params[:fetched_at]
end

Instance Attribute Details

#aliasesObject

Returns the value of attribute aliases.



31
32
33
# File 'lib/polipus/page.rb', line 31

# Returns the value of the aliases attribute (@aliases).
def aliases
  @aliases
end

#bodyObject (readonly)

The raw HTTP response body of the page



13
14
15
# File 'lib/polipus/page.rb', line 13

# Returns the raw HTTP response body of the page (@body).
def body
  @body
end

#codeObject

Integer response code of the page



21
22
23
# File 'lib/polipus/page.rb', line 21

# Returns the response code of the page (@code).
def code
  @code
end

#depthObject

Depth of this page from the root of the crawl.



23
24
25
# File 'lib/polipus/page.rb', line 23

# Returns the depth of this page from the root of the crawl (@depth).
def depth
  @depth
end

#domain_aliasesObject

Returns the value of attribute domain_aliases.



33
34
35
# File 'lib/polipus/page.rb', line 33

# Returns the value of the domain_aliases attribute (@domain_aliases).
def domain_aliases
  @domain_aliases
end

#errorObject (readonly)

Exception object, if one was raised during HTTP#fetch_page



19
20
21
# File 'lib/polipus/page.rb', line 19

# Returns the stored error for this page (@error), if any.
def error
  @error
end

#fetched_atObject

Returns the value of attribute fetched_at.



39
40
41
# File 'lib/polipus/page.rb', line 39

# Returns the value of the fetched_at attribute (@fetched_at).
def fetched_at
  @fetched_at
end

#headersObject (readonly)

Headers of the HTTP response



15
16
17
# File 'lib/polipus/page.rb', line 15

# Returns the headers of the HTTP response (@headers).
def headers
  @headers
end

#redirect_toObject (readonly)

URL of the page this one redirected to, if any



17
18
19
# File 'lib/polipus/page.rb', line 17

# Returns the URL of the page this one redirected to, if any (@redirect_to).
def redirect_to
  @redirect_to
end

#refererObject

URL of the page that brought us to this page



25
26
27
# File 'lib/polipus/page.rb', line 25

# Returns the URL of the page that brought us to this page (@referer).
def referer
  @referer
end

#response_timeObject

Response time of the request for this page in milliseconds



27
28
29
# File 'lib/polipus/page.rb', line 27

# Returns the stored response time of the request for this page
# (@response_time).
def response_time
  @response_time
end

#storableObject

Whether the current page should be stored. Default: true.



37
38
39
# File 'lib/polipus/page.rb', line 37

# Returns the flag saying whether this page should be stored (@storable).
def storable
  @storable
end

#urlObject (readonly)

The URL of the page



11
12
13
# File 'lib/polipus/page.rb', line 11

# Returns the URL of the page (@url).
def url
  @url
end

#user_dataObject

An OpenStruct that holds user-defined data



29
30
31
# File 'lib/polipus/page.rb', line 29

# Returns the container for user-defined data (@user_data).
def user_data
  @user_data
end

Class Method Details

.from_hash(hash) ⇒ Object



245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
# File 'lib/polipus/page.rb', line 245

# Builds a page from a string-keyed hash, overwriting the new instance's
# state with the values found in +hash+.
#
# @param hash [Hash] string-keyed attributes ('url', 'headers', 'body', ...)
# @return the page created by +new+, with its instance variables replaced
def self.from_hash(hash)
  page = new(URI(hash['url']))

  packed_headers = hash['headers']
  headers =
    if packed_headers && !packed_headers.empty?
      # NOTE(review): Marshal.load must only ever see trusted data; confirm
      # that stored hashes cannot be supplied by an outside party.
      Marshal.load(packed_headers)
    else
      { 'content-type' => [''] }
    end
  body = hash['body']
  link_strings = hash['links']
  links = link_strings ? link_strings.map { |link| URI(link) } : []
  code = hash['code'].to_i
  depth = hash['depth'].to_i
  referer = hash['referer']
  redirect_source = hash['redirect_to']
  redirect_to =
    if redirect_source && !redirect_source.empty?
      URI(redirect_source)
    end
  response_time = hash['response_time'].to_i
  fetched = hash['fetched']
  raw_user_data = hash['user_data']
  user_data = raw_user_data ? OpenStruct.new(raw_user_data) : nil
  fetched_at = hash['fetched_at']
  error = hash['error']

  [
    ['@headers', headers],
    ['@body', body],
    ['@links', links],
    ['@code', code],
    ['@depth', depth],
    ['@referer', referer],
    ['@redirect_to', redirect_to],
    ['@response_time', response_time],
    ['@fetched', fetched],
    ['@user_data', user_data],
    ['@fetched_at', fetched_at],
    ['@error', error]
  ].each { |ivar, value| page.instance_variable_set(ivar, value) }

  page
end

.from_json(json) ⇒ Object



266
267
268
269
# File 'lib/polipus/page.rb', line 266

# Parses +json+ and hands the resulting hash to from_hash.
#
# @param json [String] JSON document
# @return whatever from_hash returns for the parsed hash
def self.from_json(json)
  from_hash(JSON.parse(json))
end

Instance Method Details

#baseObject

Base URI from the HTML doc head element www.w3.org/TR/html4/struct/links.html#edef-BASE



157
158
159
160
161
162
163
164
165
# File 'lib/polipus/page.rb', line 157

# Base URI taken from the document's <head><base href> element.
#
# The value is computed while @base is unset and the result is cached in
# @base. Any error raised while building the URI is swallowed.
#
# @return [URI, nil] the base URI, or nil when there is none or it is empty
def base
  unless @base
    @base =
      if doc
        base_href = doc.search('//head/base/@href')
        begin
          base_href.nil? ? nil : URI(base_href.to_s)
        rescue StandardError
          nil
        end
      end
  end

  # An empty URI (no base element present) is reported as "no base".
  @base && @base.to_s.empty? ? nil : @base
end

#content_typeObject

The content-type returned by the HTTP request for this page



117
118
119
# File 'lib/polipus/page.rb', line 117

# The content-type returned by the HTTP request for this page: the first
# value stored under the 'content-type' header.
def content_type
  type_values = headers['content-type']
  type_values.first
end

#discard_doc!Object

Delete the Nokogiri document and response body to conserve memory



101
102
103
104
# File 'lib/polipus/page.rb', line 101

# Drops the parsed document and the response body to conserve memory.
# Links are extracted first so they remain available afterwards.
#
# @return [nil]
def discard_doc!
  links # must run while the document still exists
  @body = nil
  @doc = nil
end

#discard_links!Object

Discard links; the next call to page.links will return an empty array



94
95
96
# File 'lib/polipus/page.rb', line 94

# Replaces the cached links (@links) with an empty array and returns it.
def discard_links!
  @links = []
end

#docObject

Nokogiri document for the HTML body



83
84
85
86
87
88
89
# File 'lib/polipus/page.rb', line 83

# Nokogiri document for the HTML body.
#
# Returns the cached @doc when one exists; otherwise parses @body when the
# page is HTML (per html?) and caches the result. Returns nil when the
# final condition does not hold.
def doc
  return @doc if @doc
  @body ||= ''
  # This re-encoding runs before the html? check below, so @body is
  # overwritten for every page on which doc is called.
  # NOTE(review): transcoding from 'binary' with undef: :replace and an empty
  # replacement looks like it drops every non-ASCII byte — confirm intended.
  @body = @body.encode('utf-8', 'binary', invalid: :replace,
                                          undef: :replace, replace: '')
  @doc = Nokogiri::HTML(@body.toutf8, nil, 'utf-8') if @body && html?
end

#expired?(ttl) ⇒ Boolean

Returns:

  • (Boolean)


240
241
242
243
# File 'lib/polipus/page.rb', line 240

# Tells whether the page is older than +ttl+ seconds.
#
# @param ttl [Integer] maximum age, subtracted from the current epoch time
# @return [Boolean] false when fetched_at is nil; otherwise whether
#   (now - ttl) is greater than fetched_at
def expired?(ttl)
  stamp = fetched_at
  return false if stamp.nil?

  cutoff = Time.now.to_i - ttl
  cutoff > stamp
end

#fetched?Boolean

Was the page successfully fetched? true if the page was fetched with no error, false otherwise.

Returns:

  • (Boolean)


110
111
112
# File 'lib/polipus/page.rb', line 110

# Returns the stored fetched flag (@fetched).
def fetched?
  @fetched
end

#html?Boolean

Returns true if the page is an HTML document, returns false otherwise.

Returns:

  • (Boolean)


125
126
127
# File 'lib/polipus/page.rb', line 125

# Tells whether the page is an HTML document, judged by its content type.
#
# @return [Integer, nil] the match position (0, which is truthy) when the
#   content type starts with text/html or application/xhtml+xml, nil otherwise
def html?
  # The '+' must be escaped: unescaped it is a quantifier on the preceding
  # 'l', so the real MIME type "application/xhtml+xml" would never match.
  content_type =~ %r{^(text/html|application/xhtml\+xml)\b}
end

#in_domain?(uri) ⇒ Boolean

Returns true if uri is in the same domain as the page, returns false otherwise

Returns:

  • (Boolean)


201
202
203
204
# File 'lib/polipus/page.rb', line 201

# Tells whether +uri+ belongs to this page's domain: its host equals the
# page's host or is listed in the domain aliases.
#
# @param uri [URI] the address to check
# @return [Boolean]
def in_domain?(uri)
  @domain_aliases ||= []
  candidate = uri.host
  return true if candidate == @url.host

  @domain_aliases.include?(candidate)
end

#links ⇒ Object

Array of distinct A tag HREFs from the page



66
67
68
69
70
71
72
73
74
75
76
77
78
# File 'lib/polipus/page.rb', line 66

# Array of distinct A tag HREFs from the page, limited to in-domain
# addresses. The result is cached in @links after the first call.
#
# @return [Array] absolute link targets; empty when there is no document
def links
  return @links.to_a unless @links.nil?

  @links = Set.new
  page_doc = doc
  return [] unless page_doc

  page_doc.search('//a[@href]').each do |anchor|
    href = anchor['href']
    next if href.nil? || href.empty?

    target =
      begin
        to_absolute(href)
      rescue StandardError
        next # links that cannot be resolved are skipped
      end
    @links << target if target && in_domain?(target)
  end

  @links.to_a
end

#not_found?Boolean

Returns true if the page was not found (returned 404 code), returns false otherwise.

Returns:

  • (Boolean)


149
150
151
# File 'lib/polipus/page.rb', line 149

# Tells whether the page was not found, i.e. its response code is 404.
#
# @return [Boolean]
def not_found?
  @code == 404
end

#redirect?Boolean

Returns true if the page is an HTTP redirect, returns false otherwise.

Returns:

  • (Boolean)


133
134
135
# File 'lib/polipus/page.rb', line 133

# Tells whether the page is an HTTP redirect, i.e. its response code lies
# in 300...400 (400 excluded).
#
# @return [Boolean]
def redirect?
  redirect_codes = 300...400
  redirect_codes.cover?(@code)
end

#storable?Boolean

Returns true if the page is marked as storable, false otherwise. Default is true.

Returns:

  • (Boolean)


236
237
238
# File 'lib/polipus/page.rb', line 236

# Returns the stored storable flag (@storable).
def storable?
  @storable
end

#success?Boolean

Returns true if the page is an HTTP success, returns false otherwise.

Returns:

  • (Boolean)


141
142
143
# File 'lib/polipus/page.rb', line 141

# Tells whether the page is an HTTP success, i.e. its response code lies
# in 200..206 (both ends included).
#
# @return [Boolean]
def success?
  success_codes = 200..206
  success_codes.cover?(@code)
end

#to_absolute(link) ⇒ Object

Converts relative URL link into an absolute URL based on the location of the page



171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
# File 'lib/polipus/page.rb', line 171

# Converts a (possibly relative) link into an absolute URL, resolved against
# the document's base URI when there is one and against the page URL
# otherwise.
#
# @param link [String, URI, nil] the link to resolve
# @return [URI, nil] the absolute URL, or nil when +link+ is nil or cannot
#   be parsed as a URI
def to_absolute(link)
  return nil if link.nil?

  valid_link = link.to_s.encode('utf-8', 'binary', invalid: :replace,
                                                   undef: :replace, replace: '')

  # URI.encode / URI.decode were removed in Ruby 3.0. They were thin wrappers
  # around the RFC 2396 parser's escape / unescape, so call that parser
  # directly; this behaves the same on every Ruby version.
  parser = defined?(URI::RFC2396_PARSER) ? URI::RFC2396_PARSER : URI::DEFAULT_PARSER

  relative =
    begin
      # remove anchor, then normalise the percent-escaping
      without_anchor = valid_link.gsub(/#[a-zA-Z0-9_-]*$/, '')
      URI(parser.escape(parser.unescape(without_anchor)))
    rescue URI::Error
      return nil
    end

  absolute = base ? base.merge(relative) : @url.merge(relative)

  # Opaque URIs (e.g. mailto:) have no path at all; leave them untouched
  # instead of failing on nil.
  path = absolute.path
  absolute.path = '/' if path && path.empty?

  absolute
end

#to_hashObject



206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
# File 'lib/polipus/page.rb', line 206

# Serialises the page into a string-keyed hash.
#
# Entries are filled in a fixed order; in particular 'body' is captured
# before links is called.
#
# @return [Hash] the page attributes, with headers dumped through Marshal
#   and URL-like values converted to strings
def to_hash
  snapshot = {}
  snapshot['url'] = @url.to_s
  snapshot['headers'] = Marshal.dump(@headers)
  snapshot['body'] = @body
  snapshot['links'] = links.map { |link| link.to_s }
  snapshot['code'] = @code
  snapshot['depth'] = @depth
  snapshot['referer'] = @referer.to_s
  snapshot['redirect_to'] = @redirect_to.to_s
  snapshot['response_time'] = @response_time
  snapshot['fetched'] = @fetched
  snapshot['user_data'] =
    if @user_data.nil?
      {}
    else
      @user_data.marshal_dump
    end
  snapshot['fetched_at'] = @fetched_at
  snapshot['error'] = @error.to_s
  snapshot
end

#to_jsonObject



224
225
226
227
228
229
# File 'lib/polipus/page.rb', line 224

# Serialises the page to JSON, leaving out entries whose value is nil or
# empty, and the headers entry when the content type is empty.
#
# Accepts and forwards the optional generator arguments, following the
# to_json(*args) convention of the JSON library, so the method can also be
# called with a generator state.
#
# @param args optional arguments forwarded to Hash#to_json
# @return [String] the JSON document
def to_json(*args)
  # Build a pruned copy instead of deleting from the hash while iterating it.
  compact = to_hash.reject do |_key, value|
    value.nil? || (value.respond_to?(:empty?) && value.empty?)
  end
  compact.delete('headers') if content_type.empty?
  compact.to_json(*args)
end