Class: Mechanize::Page

Inherits:
File
  • Object
show all
Extended by:
Forwardable, ElementMatcher
Defined in:
lib/mechanize/page.rb,
lib/mechanize/inspect.rb,
lib/mechanize/page/base.rb,
lib/mechanize/page/image.rb,
lib/mechanize/page/label.rb,
lib/mechanize/monkey_patch.rb

Overview

This class encapsulates an HTML page. If Mechanize finds a content type of ‘text/html’, this class will be instantiated and returned.

Example:

require 'mechanize'

agent = Mechanize.new
agent.get('http://google.com/').class # => Mechanize::Page

Defined Under Namespace

Classes: Base, Frame, Image, Label, Link, MetaRefresh

Instance Attribute Summary collapse

Attributes inherited from File

#body, #code, #filename, #response, #uri

Class Method Summary collapse

Instance Method Summary collapse

Methods included from ElementMatcher

elements_with

Methods inherited from File

#save_as

Constructor Details

#initialize(uri = nil, response = nil, body = nil, code = nil, mech = nil) ⇒ Page

Returns a new instance of Page.



23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
# File 'lib/mechanize/page.rb', line 23

# Creates a Page for an HTML (or XHTML) response.
#
# uri, response, body, code:: passed through unchanged to the superclass
#                             constructor.
# mech:: the owning Mechanize agent, or nil.
#
# While constructing, candidate encodings are accumulated in @encodings from
# (in order) the charset detected in +body+, charset parameters found in the
# response headers, <meta> tags in +body+, and finally the agent's
# default_encoding.  The last content-type declared by a <meta> tag, if any,
# is remembered in @meta_content_type.
#
# Raises Mechanize::ContentTypeError unless the response content-type starts
# with text/html or application/xhtml+xml, and RuntimeError ('no') when
# +mech+ is given but is not a Mechanize.
#
# NOTE(review): +response+ defaults to nil, but it is indexed here and handed
# to response_header_charset, so a nil response is not actually supported.
def initialize(uri=nil, response=nil, body=nil, code=nil, mech=nil)
  content_type = response['content-type']

  # Both alternatives must be anchored at the start of the header value.  The
  # previous pattern /^(a)|(b)/ anchored only the first alternative, so any
  # value merely containing "application/xhtml+xml" was accepted.
  raise Mechanize::ContentTypeError, content_type unless
    content_type =~ /\A(?:text\/html|application\/xhtml\+xml)/i

  @meta_content_type = nil
  @encoding = nil
  @encodings = [nil]
  raise 'no' if mech and not Mechanize === mech
  @mech = mech

  reset

  @encodings << Mechanize::Util.detect_charset(body) if body

  @encodings.concat self.class.response_header_charset(response)

  if body
    # Force the encoding to be 8BIT so we can perform regular expressions.
    # We'll set it to the detected encoding later
    body.force_encoding 'ASCII-8BIT' if body.respond_to? :force_encoding

    @encodings.concat self.class.meta_charset body

    meta_content_type = self.class.meta_content_type body
    @meta_content_type = meta_content_type if meta_content_type
  end

  @encodings << mech.default_encoding if mech and mech.default_encoding

  super uri, response, body, code
end

Instance Attribute Details

#encodingsObject (readonly)

Possible encodings for this page based on HTTP headers and meta elements



21
22
23
# File 'lib/mechanize/page.rb', line 21

# Reader for @encodings, the list of candidate encodings that the
# constructor collects for this page.  Entries may be nil (the list is
# seeded with a nil entry).
def encodings
  @encodings
end

#mechObject

Returns the value of attribute mech.



16
17
18
# File 'lib/mechanize/page.rb', line 16

# Reader for @mech, which the constructor assigns from its +mech+ argument
# (a Mechanize instance or nil).
def mech
  @mech
end

Class Method Details

.charset(content_type) ⇒ Object



336
337
338
339
340
# File 'lib/mechanize/page.rb', line 336

# Extracts the charset parameter from a content-type style string.
#
# content_type:: a header or meta content value such as
#                "text/html; charset=UTF-8"; may be nil.
#
# Returns the charset name, or nil when +content_type+ is nil, carries no
# charset parameter, or names the placeholder charset 'none'.
def self.charset content_type
  # Callers may pass the result of a failed regexp capture.
  return nil unless content_type

  # Tolerate whitespace around '=' and an optionally quoted value so that
  # charset="utf-8" yields utf-8 rather than "utf-8" (quotes included).
  charset = content_type[/charset\s*=\s*["']?([^;"'\s]+)/i, 1]
  return nil if charset == 'none'
  charset
end

.meta_charset(body) ⇒ Object

Retrieves all charsets from meta tags in body



354
355
356
357
358
359
360
361
362
363
364
365
366
367
# File 'lib/mechanize/page.rb', line 354

# Collects every charset declared by <meta> tags in +body+.
#
# Two declaration forms are recognised for each tag:
#
# * a quoted charset attribute, e.g. <meta charset="utf-8">
# * an http-equiv="content-type" tag whose content attribute carries a
#   charset parameter (extracted via charset)
#
# Returns an Array of charset names in document order; tags that declare no
# charset contribute nothing.
def self.meta_charset body
  body.scan(/<meta .*?>/i).map do |meta|
    # Non-greedy capture: stop at the closing quote of the charset value
    # instead of running on to the last matching quote in the tag.
    declared = meta[/charset\s*=\s*(["'])?\s*(.+?)\s*\1/i, 2]
    next declared if declared

    next unless meta =~ /http-equiv\s*=\s*(["'])?content-type\1/i

    # An http-equiv tag may lack a usable content attribute; skip it rather
    # than handing nil to charset.
    content = meta[/content\s*=\s*(["'])?(.*?)\1/i, 2]
    charset content if content
  end.compact
end

.meta_content_type(body) ⇒ Object

Retrieves the last content-type set by a meta tag in body



372
373
374
375
376
377
378
379
380
381
382
# File 'lib/mechanize/page.rb', line 372

# Finds the content-type declared by the last
# <meta http-equiv="content-type"> tag in +body+.
#
# Returns the value of that tag's content attribute, or nil when no such
# tag exists or the last one has no usable content attribute.
def self.meta_content_type body
  # Walk the tags back to front so the first hit is the last declaration.
  body.scan(/<meta .*?>/i).reverse_each do |meta|
    next unless meta =~ /http-equiv\s*=\s*(["'])?content-type\1/i

    # Allow whitespace around '=' exactly as meta_charset does for the same
    # attribute.
    return meta[/content\s*=\s*(["'])?(.*?)\1/i, 2]
  end

  nil
end

.response_header_charset(response) ⇒ Object



342
343
344
345
346
347
348
349
# File 'lib/mechanize/page.rb', line 342

# Collects the charsets announced by the content-type header(s) of
# +response+.
#
# response:: any object whose #each yields header name / value pairs.
#
# Returns an Array with one entry per content-type value that mentions a
# charset; an entry is nil when charset cannot extract a usable name.
def self.response_header_charset response
  charsets = []
  response.each do |header, value|
    # Only content-type describes the body encoding; other headers that
    # happen to contain the word "charset" must not contribute candidates.
    next unless header.to_s.downcase == 'content-type'
    next unless value =~ /charset/i
    charsets << charset(value)
  end
  charsets
end

Instance Method Details

#baseObject

:method: bases_with(criteria)

Find all base tags matching criteria. Example:

page.bases_with(:href => /foo/).each do |base|
  puts base.href
end


217
# File 'lib/mechanize/page.rb', line 217

# Class-level macro call; elements_with is listed among the methods that come
# from ElementMatcher.
# NOTE(review): presumably generates the +base+ / +bases_with+ finders
# documented in this section -- confirm in ElementMatcher.
elements_with :base

#basesObject

Return a list of all base tags



288
289
290
291
# File 'lib/mechanize/page.rb', line 288

# Returns a Base wrapper for every <base> element found by search, building
# the list once and caching it in @bases.
def bases
  return @bases if @bases

  @bases = search('base').map do |element|
    Base.new(element, @mech, self)
  end
end

#canonical_uriObject

Returns the canonical URI for the page if it contains a link tag with rel=“canonical” and an href attribute; otherwise returns nil.



143
144
145
146
147
148
149
150
151
# File 'lib/mechanize/page.rb', line 143

# Looks up the page's <link rel="canonical" href="..."> element and returns
# its href as a URI, or nil when there is no such element.  An href that URI
# rejects is escaped with Mechanize::Util.uri_escape and parsed again.
def canonical_uri
  if (canonical = at('link[@rel="canonical"][@href]'))
    href = canonical['href']

    URI href
  end
rescue URI::InvalidURIError
  URI Mechanize::Util.uri_escape href
end

#content_typeObject

Get the content type



154
155
156
# File 'lib/mechanize/page.rb', line 154

# Returns the content type of the page: the one recorded from a <meta> tag
# (@meta_content_type) when present, otherwise the response's content-type
# header.
def content_type
  return @meta_content_type if @meta_content_type

  response['content-type']
end

#detected_encodingObject



71
72
73
# File 'lib/mechanize/page.rb', line 71

# Returns whatever Mechanize::Util.detect_charset reports for the page body.
def detected_encoding
  detector = Mechanize::Util
  detector.detect_charset(body)
end

#encodingObject



91
92
93
# File 'lib/mechanize/page.rb', line 91

# Returns the encoding reported by the parsed document, or nil when the
# parser object does not respond to #encoding (including when there is no
# parser at all).
def encoding
  return nil unless parser.respond_to?(:encoding)

  parser.encoding
end

#encoding=(encoding) ⇒ Object



75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
# File 'lib/mechanize/page.rb', line 75

# Assigns the encoding to use for this page and clears cached state via
# reset.  Returns +encoding+.
#
# If a parser is still present after reset and its encoding differs
# (case-insensitively) from +encoding+, the parser is dropped so that it is
# rebuilt lazily with the new encoding.
#
# NOTE(review): reset as shown in this file already sets @parser to nil, so
# the comparison below only runs when reset is overridden.
def encoding=(encoding)
  reset
  @encoding = encoding

  if @parser
    current = @parser.encoding
    current &&= current.downcase
    requested = encoding && encoding.downcase

    # lazy reinitialize the parser with the new encoding
    @parser = nil unless current == requested
  end

  encoding
end

#encoding_error?(parser = nil) ⇒ Boolean

Returns whether the parser result contains encoding-related errors. A false result only means that the parser reported no encoding errors; it does not guarantee that the encoding is valid.

Returns:

  • (Boolean)


97
98
99
100
101
102
103
104
105
# File 'lib/mechanize/page.rb', line 97

# Tells whether any error recorded by +parser+ (defaulting to this page's
# own parser) has a message that looks encoding-related.
#
# Returns false when the parser recorded no errors at all.
def encoding_error?(parser=nil)
  parser ||= self.parser
  problems = parser.errors
  return false if problems.empty?

  problems.any? do |problem|
    problem.message =~ /(indicate\ encoding)|
                      (Invalid\ char)|
                      (input\ conversion\ failed)/x
  end
end

#formObject

:method: forms_with(criteria)

Find all forms matching criteria. Example:

page.forms_with(:action => '/post/login.php').each do |f|
  ...
end


181
# File 'lib/mechanize/page.rb', line 181

# Class-level macro call; elements_with is listed among the methods that come
# from ElementMatcher.
# NOTE(review): presumably generates the +form+ / +forms_with+ finders
# documented in this section -- confirm in ElementMatcher.
elements_with :form

#formsObject

Return a list of all form tags



267
268
269
270
271
272
273
# File 'lib/mechanize/page.rb', line 267

# Returns a Mechanize::Form for every <form> element found by search,
# building the list once and caching it in @forms.  A form without an action
# is given this page's URI (as a String) as its action.
def forms
  return @forms if @forms

  @forms = search('form').map do |node|
    Mechanize::Form.new(node, @mech, self).tap do |form|
      form.action = @uri.to_s unless form.action
    end
  end
end

#frameObject

:method: frames_with(criteria)

Find all frame tags matching criteria. Example:

page.frames_with(:src => /foo/).each do |frame|
  p frame.src
end


235
# File 'lib/mechanize/page.rb', line 235

# Class-level macro call; elements_with is listed among the methods that come
# from ElementMatcher.
# NOTE(review): presumably generates the +frame+ / +frames_with+ finders
# documented in this section -- confirm in ElementMatcher.
elements_with :frame

#framesObject

Return a list of all frame tags



295
296
297
298
# File 'lib/mechanize/page.rb', line 295

# Returns a Frame wrapper for every <frame> element found by search,
# building the list once and caching it in @frames.
def frames
  return @frames if @frames

  @frames = search('frame').map do |element|
    Frame.new(element, @mech, self)
  end
end

#iframeObject

:method: iframes_with(criteria)

Find all iframe tags matching criteria. Example:

page.iframes_with(:src => /foo/).each do |iframe|
  p iframe.src
end


253
# File 'lib/mechanize/page.rb', line 253

# Class-level macro call; elements_with is listed among the methods that come
# from ElementMatcher.
# NOTE(review): presumably generates the +iframe+ / +iframes_with+ finders
# documented in this section -- confirm in ElementMatcher.
elements_with :iframe

#iframesObject

Return a list of all iframe tags



302
303
304
305
# File 'lib/mechanize/page.rb', line 302

# Returns a Frame wrapper for every <iframe> element found by search,
# building the list once and caching it in @iframes.
def iframes
  return @iframes if @iframes

  @iframes = search('iframe').map do |element|
    Frame.new(element, @mech, self)
  end
end

#image_urlsObject



314
315
316
# File 'lib/mechanize/page.rb', line 314

# Returns the distinct URLs of the page's images (in first-seen order),
# computed once and cached in @image_urls.
def image_urls
  return @image_urls if @image_urls

  urls = images.map { |image| image.url }
  @image_urls = urls.uniq
end

#imagesObject

Return a list of all img tags



309
310
311
312
# File 'lib/mechanize/page.rb', line 309

# Returns an Image wrapper for every <img> element found by search, building
# the list once and caching it in @images.
def images
  return @images if @images

  @images = search('img').map do |element|
    Image.new(element, self)
  end
end

#labelsObject

Return a list of all label tags



320
321
322
323
# File 'lib/mechanize/page.rb', line 320

# Returns a Label wrapper for every <label> element found by search,
# building the list once and caching it in @labels.
def labels
  return @labels if @labels

  @labels = search('label').map do |element|
    Label.new(element, self)
  end
end

#labels_hashObject



325
326
327
328
329
330
331
332
333
334
# File 'lib/mechanize/page.rb', line 325

# Returns a Hash mapping the value of each label's 'for' attribute to the
# label itself, built once and cached in @labels_hash.  Labels whose #for is
# falsy are left out; a later label with the same key replaces an earlier
# one.
def labels_hash
  @labels_hash ||= labels.each_with_object({}) do |label, by_target|
    by_target[label.node['for']] = label if label.for
  end
end

:method: links_with(criteria)

Find all links matching criteria. Example:

page.links_with(:href => /foo/).each do |link|
  puts link.href
end


199
# File 'lib/mechanize/page.rb', line 199

# Class-level macro call; elements_with is listed among the methods that come
# from ElementMatcher.
# NOTE(review): presumably generates the +link+ / +links_with+ finders
# documented in this section -- confirm in ElementMatcher.
elements_with :link

Return a list of all link and area tags



257
258
259
260
261
262
263
# File 'lib/mechanize/page.rb', line 257

# Returns a Link wrapper for every <a> element followed by every <area>
# element found by search, building the list once and caching it in @links.
def links
  return @links if @links

  per_tag = %w{ a area }.map do |tag_name|
    search(tag_name).map { |element| Link.new(element, @mech, self) }
  end
  @links = per_tag.flatten
end

#meta_charsetObject



67
68
69
# File 'lib/mechanize/page.rb', line 67

# Instance-level convenience: runs the class-level meta_charset over this
# page's body.
def meta_charset
  page_class = self.class
  page_class.meta_charset(body)
end

#meta_refreshObject

Return a list of all meta refresh elements



278
279
280
281
282
283
284
# File 'lib/mechanize/page.rb', line 278

# Returns the MetaRefresh objects built from the page's <meta> elements,
# cached in @meta_refresh.  Elements are searched anywhere in the document
# when the agent's follow_meta_refresh is :anywhere, otherwise only directly
# under <head>.  Nodes for which MetaRefresh.from_node returns nil are
# dropped.
def meta_refresh
  query =
    if @mech.follow_meta_refresh == :anywhere
      'meta'
    else
      'head > meta'
    end

  return @meta_refresh if @meta_refresh

  candidates = search(query).map do |node|
    MetaRefresh.from_node(node, self, uri)
  end
  @meta_refresh = candidates.compact
end

#parserObject Also known as: root



107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
# File 'lib/mechanize/page.rb', line 107

# Returns the parsed document for this page, building and caching it in
# @parser on first use.  Returns nil when the page has no body.
#
# The encoding handed to the agent's html_parser is chosen as follows:
#
# 1. the encoding explicitly assigned to the page (@encoding), if any;
# 2. otherwise the agent's default_encoding when force_default_encoding is
#    set;
# 3. otherwise each candidate in @encodings, last-added first, stopping at
#    the first parse that reports no encoding error.  If every candidate
#    reports one, the result of the final attempt is kept.
def parser
  return @parser if @parser
  return nil unless @body

  if @encoding
    @parser = @mech.html_parser.parse(html_body, nil, @encoding)
    return @parser
  end

  if mech.force_default_encoding
    @parser = @mech.html_parser.parse(html_body, nil, @mech.default_encoding)
    return @parser
  end

  @encodings.reverse_each do |candidate|
    @parser = @mech.html_parser.parse(html_body, nil, candidate)

    break unless encoding_error? @parser
  end

  @parser
end

#pretty_print(q) ⇒ Object



15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
# File 'lib/mechanize/inspect.rb', line 15

# Pretty-printer hook: emits this page to the pretty-printer +q+ as an
# object group containing, in order, the url, meta_refresh entries, title,
# iframes, frames, links and forms, each in its own '{name ... }' group.
def pretty_print(q)
  q.object_group(self) do
    q.breakable
    q.group(1, '{url', '}') do
      q.breakable
      q.pp uri
    end

    q.breakable
    q.group(1, '{meta_refresh', '}') do
      meta_refresh.each do |refresh|
        q.breakable
        q.pp refresh
      end
    end

    q.breakable
    q.group(1, '{title', '}') do
      q.breakable
      q.pp title
    end

    q.breakable
    q.group(1, '{iframes', '}') do
      iframes.each do |iframe|
        q.breakable
        q.pp iframe
      end
    end

    q.breakable
    q.group(1, '{frames', '}') do
      frames.each do |frame|
        q.breakable
        q.pp frame
      end
    end

    q.breakable
    q.group(1, '{links', '}') do
      links.each do |link|
        q.breakable
        q.pp link
      end
    end

    q.breakable
    q.group(1, '{forms', '}') do
      forms.each do |form|
        q.breakable
        q.pp form
      end
    end
  end
end

#resetObject



128
129
130
131
132
133
134
135
136
137
138
139
# File 'lib/mechanize/page.rb', line 128

# Forgets every value memoized from the parsed document -- and the parser
# itself -- so that each is rebuilt on next access.  Returns nil.
def reset
  @bases = nil
  @forms = nil
  @frames = nil
  @iframes = nil
  # images and image_urls are memoized from the parser like the other
  # collections, so they must be dropped with it; otherwise they keep
  # pointing at nodes of the discarded document.
  @images = nil
  @image_urls = nil
  @links = nil
  @labels = nil
  @labels_hash = nil
  @meta_refresh = nil
  @parser = nil
  @title = nil
end

#response_header_charsetObject



63
64
65
# File 'lib/mechanize/page.rb', line 63

# Instance-level convenience: runs the class-level response_header_charset
# over this page's response.
def response_header_charset
  page_class = self.class
  page_class.response_header_charset(response)
end

#titleObject



55
56
57
58
59
60
61
# File 'lib/mechanize/page.rb', line 55

# Returns the combined inner text of the document's <title> element(s), or
# nil when there is no parser or the text is empty.  A non-nil title is
# cached in @title.
def title
  return @title if @title

  doc = parser
  return @title = nil unless doc

  text = doc.search('title').inner_text
  @title = text.empty? ? nil : text
end