Class: Mechanize::Page

Inherits:
File
  • Object
show all
Extended by:
Forwardable, ElementMatcher
Defined in:
lib/mechanize/page.rb

Overview

This class encapsulates an HTML page. If Mechanize finds a content type of 'text/html', this class will be instantiated and returned.

Example:

require 'mechanize'

agent = Mechanize.new
agent.get('http://google.com/').class # => Mechanize::Page

Defined Under Namespace

Classes: Base, Frame, Image, Label, Link, MetaRefresh

Constant Summary collapse

DEFAULT_RESPONSE =
{
  'content-type' => 'text/html',
}.freeze

Constants included from Parser

Mechanize::Parser::SPECIAL_FILENAMES

Instance Attribute Summary collapse

Attributes inherited from File

#body, #filename

Attributes included from Parser

#code, #response, #uri

Class Method Summary collapse

Instance Method Summary collapse

Methods included from ElementMatcher

elements_with

Methods inherited from File

#save, #save!

Methods included from Parser

#extract_filename, #fill_header, #find_free_name

Constructor Details

#initialize(uri = nil, response = nil, body = nil, code = nil, mech = nil) ⇒ Page

Returns a new instance of Page.


28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
# File 'lib/mechanize/page.rb', line 28

# Builds a page and collects the candidate encodings for its body.
#
# uri      - location of the page (passed through to super).
# response - header collection; DEFAULT_RESPONSE is used when nil.
# body     - raw page body, or nil. NOTE: its encoding is forced to
#            ASCII-8BIT in place.
# code     - response code (passed through to super).
# mech     - the owning Mechanize agent, or nil.
#
# Raises RuntimeError ('no') when +mech+ is given but is not a Mechanize.
def initialize(uri=nil, response=nil, body=nil, code=nil, mech=nil)
  response ||= DEFAULT_RESPONSE

  @meta_content_type = nil
  @encoding = nil
  # Candidates are appended in increasing priority: #parser walks this list
  # with reverse_each, so entries added later are tried first and the leading
  # nil is the last resort.
  @encodings = [nil]
  raise 'no' if mech and not Mechanize === mech
  @mech = mech

  # Start with every memoized element list and the parser cleared.
  reset

  # Candidate 1: charset detected from the body bytes.
  @encodings << Mechanize::Util.detect_charset(body) if body

  # Candidate 2: charset(s) announced by the content-type response header.
  @encodings.concat self.class.response_header_charset(response)

  if body
    # Force the encoding to be 8BIT so we can perform regular expressions.
    # We'll set it to the detected encoding later
    body.force_encoding(Encoding::ASCII_8BIT)

    # Candidate 3: charsets declared by <meta> tags in the body.
    @encodings.concat self.class.meta_charset body

    # A content type declared by a <meta> tag is remembered separately;
    # #content_type prefers it over the response header.
    meta_content_type = self.class.meta_content_type body
    @meta_content_type = meta_content_type if meta_content_type
  end

  # Candidate 4: the agent's default encoding, when one is configured.
  @encodings << mech.default_encoding if mech and mech.default_encoding

  super uri, response, body, code
end

Instance Attribute Details

#encodingsObject (readonly)

Possible encodings for this page based on HTTP headers and meta elements


26
27
28
# File 'lib/mechanize/page.rb', line 26

# Returns the list of candidate encodings gathered for this page. The
# constructor seeds it with nil and appends charsets found in the body,
# the response headers and the meta elements.
def encodings
  @encodings
end

#mechObject

Returns the value of attribute mech


21
22
23
# File 'lib/mechanize/page.rb', line 21

# Returns the value held in @mech. The constructor stores its +mech+
# argument there, so this is nil when the page was built without one.
def mech
  @mech
end

Class Method Details

.charset(content_type) ⇒ Object Also known as: charset_from_content_type


579
580
581
582
583
# File 'lib/mechanize/page.rb', line 579

# Extracts the charset parameter from a Content-Type style string.
#
# content_type - a String such as "text/html; charset=UTF-8", or nil.
#
# Returns the charset token as a String. Returns nil when +content_type+ is
# nil, when it has no ";"-introduced charset parameter, or when the
# parameter's value is exactly 'none'.
def charset content_type
  # A missing content type simply has no charset; without this guard the
  # subscript below raises NoMethodError on nil.
  return nil unless content_type

  charset = content_type[/;(?:\s*,)?\s*charset\s*=\s*([^()<>@,;:\\\"\/\[\]?={}\s]+)/i, 1]
  return nil if charset == 'none'
  charset
end

.meta_charset(body) ⇒ Object

Retrieves all charsets from meta tags in body


601
602
603
604
605
606
607
608
609
610
611
612
613
614
# File 'lib/mechanize/page.rb', line 601

# Retrieves all charsets declared by <meta ...> tags in +body+.
#
# body - String containing the page source.
#
# Two forms are recognised per tag:
#   * a charset attribute, e.g. <meta charset="utf-8">
#   * an http-equiv content-type tag, whose content attribute is handed to
#     charset to pull out the charset parameter
#
# NOTE(review): the back-reference to the optional quote group appears to
# require the value to be quoted; unquoted attributes look unsupported —
# confirm before relying on that.
#
# Returns an Array of charset Strings in document order (empty when none).
def self.meta_charset body
  body.scan(/<meta .*?>/i).map do |meta|
    # The value is captured lazily so it ends at its own closing quote. A
    # greedy capture ran on to the last matching quote in the tag and so
    # swallowed any attributes that followed the charset.
    if meta =~ /charset\s*=\s*(["'])?\s*(.+?)\s*\1/i then
      $2
    elsif meta =~ /http-equiv\s*=\s*(["'])?content-type\1/i then
      content = meta[/content\s*=\s*(["'])?(.*?)\1/i, 2]

      charset content if content
    end
  end.compact
end

.meta_content_type(body) ⇒ Object

Retrieves the last content-type set by a meta tag in body


619
620
621
622
623
624
625
626
627
628
629
# File 'lib/mechanize/page.rb', line 619

# Retrieves the content type set by the last http-equiv content-type
# <meta ...> tag in +body+.
#
# body - String containing the page source.
#
# Returns the tag's content attribute value as a String. Returns nil when
# no such tag exists, or when the last such tag has no content attribute
# the pattern can read.
def self.meta_content_type body
  # Walk the tags back to front so the last declaration wins.
  body.scan(/<meta .*?>/i).reverse_each do |meta|
    next unless meta =~ /http-equiv\s*=\s*(["'])?content-type\1/i

    # Allow whitespace around "=", matching how meta_charset reads the same
    # attribute.
    return meta[/content\s*=\s*(["'])?(.*?)\1/i, 2]
  end

  nil
end

.response_header_charset(response) ⇒ Object


588
589
590
591
592
593
594
595
596
# File 'lib/mechanize/page.rb', line 588

# Collects the charsets announced by the content-type header(s) of
# +response+.
#
# response - any object whose #each yields header name / value pairs.
#
# Returns an Array with one entry per 'content-type' header whose value
# mentions "charset"; an entry is whatever charset returns for that value.
def self.response_header_charset response
  found = []
  response.each do |name, value|
    found << charset(value) if name == 'content-type' && value =~ /charset/i
  end
  found
end

Instance Method Details

#baseObject

:method: bases_with

:call-seq: bases_with(criteria)

Find all base tags matching criteria. See forms_with for details of criteria, where for “form(s)” read “base tag(s)”.

Example:

page.bases_with(href: /foo/).each do |base|
  puts base.href
end

381
# File 'lib/mechanize/page.rb', line 381

elements_with :base

#basesObject

Return a list of all base tags


530
531
532
533
# File 'lib/mechanize/page.rb', line 530

# Returns a Base wrapper for every <base> node on the page. The list is
# built on first use and cached in @bases.
def bases
  return @bases if @bases

  @bases = search('base').map do |element|
    Base.new(element, @mech, self)
  end
end

#canonical_uriObject

Return the canonical URI for the page if there is a link tag with rel=“canonical” and an href attribute.


182
183
184
185
186
187
188
189
190
# File 'lib/mechanize/page.rb', line 182

# Returns the page's canonical URI, taken from the href of the first
# <link rel="canonical" href="..."> node, or nil when there is none.
#
# An href that URI() rejects is escaped with Mechanize::Util.uri_escape
# and parsed again.
def canonical_uri
  canonical = at('link[@rel="canonical"][@href]')
  return unless canonical

  href = canonical['href']
  URI(href)
rescue URI::InvalidURIError
  URI(Mechanize::Util.uri_escape(href))
end

#content_typeObject

Get the content type


193
194
195
# File 'lib/mechanize/page.rb', line 193

# Returns the content type of the page: the one declared by a meta tag
# when present, otherwise the response's 'content-type' header.
def content_type
  return @meta_content_type if @meta_content_type

  response['content-type']
end

#detected_encodingObject


75
76
77
# File 'lib/mechanize/page.rb', line 75

# Returns whatever Mechanize::Util.detect_charset reports for the page's
# current body. The detection is re-run on every call.
def detected_encoding
  Mechanize::Util.detect_charset(body)
end

#encodingObject


95
96
97
98
99
# File 'lib/mechanize/page.rb', line 95

# Returns the encoding reported by the page's parser, or nil when asking
# for it raises NoMethodError (for example because there is no parser).
def encoding
  document = parser
  document.encoding
rescue NoMethodError
  nil
end

#encoding=(encoding) ⇒ Object


79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
# File 'lib/mechanize/page.rb', line 79

# Sets the encoding to use the next time the page is parsed.
#
# encoding - encoding name (or nil to fall back to the automatic choice
#            made by #parser).
#
# Every memoized value, including the current parser, is dropped through
# #reset, so the document is parsed again on the next call to #parser.
#
# Returns the +encoding+ argument.
def encoding=(encoding)
  # reset already sets @parser to nil. The former "compare with the old
  # parser's encoding and clear @parser" branch therefore could never run
  # and has been removed.
  reset

  @encoding = encoding

  encoding
end

#encoding_error?(parser = nil) ⇒ Boolean

Return whether the parser result has errors related to encoding. A false result only indicates that the parser reported no encoding errors; it does not mean that the encoding is valid.

Returns:

  • (Boolean)

103
104
105
106
107
108
109
110
111
# File 'lib/mechanize/page.rb', line 103

# Tells whether the given parser result carries encoding-related errors.
#
# parser - a parsed document responding to #errors; defaults to the
#          page's own parser.
#
# Returns true when any error message mentions "indicate encoding",
# "Invalid char" or "input conversion failed", false otherwise.
def encoding_error?(parser=nil)
  parser ||= self.parser
  problems = parser.errors
  return false if problems.empty?

  problems.any? { |problem|
    # scrub replaces invalid bytes so the match itself cannot raise
    problem.message.scrub =~ /(indicate\ encoding)|
                            (Invalid\ char)|
                            (input\ conversion\ failed)/x
  }
end

#formObject

:method: forms_with

:call-seq:

forms_with(name)
forms_with(name: name_matcher, id: id_matcher, class: class_matcher,
           search: search_expression, xpath: xpath_expression, css: css_expression,
           action: action_matcher, ...)

Find all forms matching criteria. If a string is given, it is taken as a name attribute value. If a hash is given, forms are narrowed by the key-value pairs as follows.

:id, :dom_id: selects forms with a #dom_id value that matches this value.

:class, :dom_class: selects forms with a #dom_class value that matches this value. Note that class attribute values are compared literally as string, so forms_with(class: “a”) does not match a form with class=“a b”. Use forms_with(css: “form.a”) instead.

:search: only selects forms matching this selector expression.

:xpath: only selects forms matching this XPath expression.

:css: only selects forms matching this CSS selector expression.

:action, :method, etc.: narrows forms by a given attribute value using the === operator.

Example:

page.forms_with(css: '#content table.login_box form', method: /\APOST\z/i, ).each do |f|
  ...
end

304
# File 'lib/mechanize/page.rb', line 304

elements_with :form

#formsObject

Return a list of all form tags


509
510
511
512
513
514
515
# File 'lib/mechanize/page.rb', line 509

# Returns a Mechanize::Form for every <form> node on the page. A form
# without an action is given the page's own URI (as a String) as its
# action. The list is built on first use and cached in @forms.
def forms
  return @forms if @forms

  @forms = search('form').map do |element|
    Mechanize::Form.new(element, @mech, self).tap do |form|
      form.action ||= @uri.to_s
    end
  end
end

#frameObject

:method: frames_with

:call-seq: frames_with(criteria)

Find all frame tags matching criteria. See forms_with for details of criteria, where for “form(s)” read “frame tag(s)”.

Example:

page.frames_with(src: /foo/).each do |frame|
  p frame.src
end

419
# File 'lib/mechanize/page.rb', line 419

elements_with :frame

#framesObject

Return a list of all frame tags


537
538
539
540
# File 'lib/mechanize/page.rb', line 537

# Returns a Frame wrapper for every <frame> node on the page. The list is
# built on first use and cached in @frames.
def frames
  return @frames if @frames

  @frames = search('frame').map do |element|
    Frame.new(element, @mech, self)
  end
end

#iframeObject

:method: iframes_with

:call-seq: iframes_with(criteria)

Find all iframe tags matching criteria. See forms_with for details of criteria, where for “form(s)” read “iframe tag(s)”.

Example:

page.iframes_with(src: /foo/).each do |iframe|
  p iframe.src
end

457
# File 'lib/mechanize/page.rb', line 457

elements_with :iframe

#iframesObject

Return a list of all iframe tags


544
545
546
547
# File 'lib/mechanize/page.rb', line 544

# Returns a wrapper for every <iframe> node on the page. The nodes are
# wrapped in the same Frame class used for <frame> tags. The list is built
# on first use and cached in @iframes.
def iframes
  return @iframes if @iframes

  @iframes = search('iframe').map do |element|
    Frame.new(element, @mech, self)
  end
end

#imageObject

:method: images_with

:call-seq: images_with(criteria)

Find all images matching criteria. See forms_with for details of criteria, where for “form(s)” read “image(s)”.

Example:

page.images_with(src: /jpg\Z/).each do |img|
  img.fetch.save
end

495
# File 'lib/mechanize/page.rb', line 495

elements_with :image

#image_urlsObject


556
557
558
# File 'lib/mechanize/page.rb', line 556

# Returns the distinct URLs of the page's images, in first-seen order.
# The list is built on first use and cached in @image_urls.
def image_urls
  return @image_urls if @image_urls

  every_url = images.map(&:url)
  @image_urls = every_url.uniq
end

#imagesObject

Return a list of all img tags


551
552
553
554
# File 'lib/mechanize/page.rb', line 551

# Returns an Image wrapper for every <img> node on the page. The list is
# built on first use and cached in @images.
def images
  return @images if @images

  @images = search('img').map do |element|
    Image.new(element, self)
  end
end

#labelsObject

Return a list of all label tags


562
563
564
565
# File 'lib/mechanize/page.rb', line 562

# Returns a Label wrapper for every <label> node on the page. The list is
# built on first use and cached in @labels.
def labels
  return @labels if @labels

  @labels = search('label').map do |element|
    Label.new(element, self)
  end
end

#labels_hashObject


567
568
569
570
571
572
573
574
575
576
# File 'lib/mechanize/page.rb', line 567

# Returns a Hash mapping each label's "for" attribute value to its label
# object. Labels whose #for is nil or false are left out; when several
# labels share a key the last one wins. The hash is built on first use and
# cached in @labels_hash.
def labels_hash
  @labels_hash ||= labels.each_with_object({}) do |label, by_target|
    by_target[label.node['for']] = label if label.for
  end
end

:method: links_with

:call-seq:

links_with(criteria)

Find all links matching criteria. See forms_with for details of criteria, where for “form(s)” read “link(s)”.

Example:

page.links_with(href: /foo/).each do |link|
  puts link.href
end

343
# File 'lib/mechanize/page.rb', line 343

elements_with :link

Return a list of all link and area tags


499
500
501
502
503
504
505
# File 'lib/mechanize/page.rb', line 499

# Returns a Link wrapper for every <a> node followed by every <area> node
# on the page. The list is built on first use and cached in @links.
def links
  return @links if @links

  @links = %w{ a area }.flat_map do |tag_name|
    search(tag_name).map { |element| Link.new(element, @mech, self) }
  end
end

#meta_charsetObject


71
72
73
# File 'lib/mechanize/page.rb', line 71

# Returns the charsets declared by meta tags in this page's body, by
# handing the body to the class-level meta_charset.
def meta_charset
  self.class.meta_charset(body)
end

#meta_refreshObject

Return a list of all meta refresh elements


520
521
522
523
524
525
526
# File 'lib/mechanize/page.rb', line 520

# Returns the meta refresh elements of the page. Every <meta> node is
# considered when the agent's follow_meta_refresh setting is :anywhere,
# otherwise only direct children of <head>. Nodes for which
# MetaRefresh.from_node returns nil are dropped. The list is built on first
# use and cached in @meta_refresh.
def meta_refresh
  selector =
    if @mech.follow_meta_refresh == :anywhere
      'meta'
    else
      'head > meta'
    end

  @meta_refresh ||= search(selector).map { |element|
    MetaRefresh.from_node(element, self)
  }.compact
end

#parserObject Also known as: root

:method: at_xpath

Shorthand for parser.at_xpath.

See also Nokogiri::XML::Node#at_xpath for details.


239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
# File 'lib/mechanize/page.rb', line 239

# Returns the parsed document for this page, parsing the body on first use
# and caching the result in @parser. Returns nil when the page has no body.
#
# The encoding handed to the HTML parser is chosen as follows:
#   1. an encoding set explicitly on the page (@encoding);
#   2. the agent's default encoding, when force_default_encoding is on;
#   3. otherwise each candidate in @encodings, last added first, stopping
#      at the first parse that reports no encoding error.
def parser
  return @parser if @parser
  return unless @body

  # nil when the page has no URI, otherwise its String form.
  url = @uri && @uri.to_s

  if @encoding
    @parser = mech.html_parser.parse html_body, url, @encoding
  elsif mech.force_default_encoding
    @parser = mech.html_parser.parse html_body, url, @mech.default_encoding
  else
    # If every candidate produces encoding errors, the result of the final
    # attempt (the first entry of @encodings) is kept.
    @encodings.reverse_each do |encoding|
      @parser = mech.html_parser.parse html_body, url, encoding

      break unless encoding_error? @parser
    end
  end

  @parser
end

#pretty_print(q) ⇒ Object

:nodoc:


136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
# File 'lib/mechanize/page.rb', line 136

# Pretty-printer hook: emits the page's url, meta refreshes, title,
# iframes, frames, links and forms, each inside its own labelled group.
def pretty_print(q) # :nodoc:
  q.object_group(self) do
    q.breakable
    q.group(1, '{url', '}') do
      q.breakable
      q.pp(uri)
    end
    q.breakable
    q.group(1, '{meta_refresh', '}') do
      meta_refresh.each do |refresh|
        q.breakable
        q.pp(refresh)
      end
    end
    q.breakable
    q.group(1, '{title', '}') do
      q.breakable
      q.pp(title)
    end
    q.breakable
    q.group(1, '{iframes', '}') do
      iframes.each do |iframe|
        q.breakable
        q.pp(iframe)
      end
    end
    q.breakable
    q.group(1, '{frames', '}') do
      frames.each do |frame|
        q.breakable
        q.pp(frame)
      end
    end
    q.breakable
    q.group(1, '{links', '}') do
      links.each do |link|
        q.breakable
        q.pp(link)
      end
    end
    q.breakable
    q.group(1, '{forms', '}') do
      forms.each do |form|
        q.breakable
        q.pp(form)
      end
    end
  end
end

#resetObject


167
168
169
170
171
172
173
174
175
176
177
178
# File 'lib/mechanize/page.rb', line 167

# Drops every value memoized from the parsed document, together with the
# parser itself, so that each is rebuilt on next use.
#
# Returns nil.
def reset
  @bases = nil
  @forms = nil
  @frames = nil
  @iframes = nil
  # #images and #image_urls memoize into these as well; without clearing
  # them they would keep pointing at nodes from a discarded parse.
  @images = nil
  @image_urls = nil
  @links = nil
  @labels = nil
  @labels_hash = nil
  @meta_refresh = nil
  @parser = nil
  @title = nil
end

#response_header_charsetObject


67
68
69
# File 'lib/mechanize/page.rb', line 67

# Returns the charsets announced by this page's response headers, by
# handing the response to the class-level response_header_charset.
def response_header_charset
  self.class.response_header_charset(response)
end

#titleObject


59
60
61
62
63
64
65
# File 'lib/mechanize/page.rb', line 59

# Returns the text of the page's first <title> element (looked up under
# /html/head, /html, /head or the document root), or nil when the page has
# no parser or the title text is empty. A non-nil result is cached in
# @title.
def title
  @title ||= begin
    document = parser
    if document
      text = document.xpath('string(((/html/head | /html | /head | /)/title)[1])').to_s
      text unless text.empty?
    end
  end
end