Class: HTML5::HTMLParser

Inherits:
Object
  • Object
show all
Defined in:
lib/html5/html5parser.rb

Overview

HTML parser. Generates a tree structure from a stream of (possibly malformed) HTML.

Direct Known Subclasses

XMLParser

Constant Summary collapse

# Names of every parser phase. #initialize capitalises each name, appends
# "Phase" and instantiates the matching HTML5 class.
@@phases = %w[
  initial beforeHtml beforeHead inHead afterHead
  inBody inTable inCaption inColumnGroup inTableBody inRow inCell
  inSelect inSelectInTable
  afterBody inFrameset afterFrameset afterAfterBody afterAfterFrameset
  inForeignContent
]
# Element name => phase that #reset_insertion_mode switches to when it meets
# that element on the stack of open elements.
# NOTE(review): 'head' maps to :inBody rather than a head phase -- presumably
# deliberate; confirm against the specification.
@@new_modes = {
  'select' => :inSelect,

  # table cells and rows
  'td' => :inCell,
  'th' => :inCell,
  'tr' => :inRow,

  # table sections
  'tbody' => :inTableBody,
  'thead' => :inTableBody,
  'tfoot' => :inTableBody,

  # remaining table structure
  'caption' => :inCaption,
  'colgroup' => :inColumnGroup,
  'table' => :inTable,

  # document structure
  'head' => :inBody,
  'body' => :inBody,
  'frameset' => :inFrameset
}

Instance Attribute Summary collapse

Class Method Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(options = {}) ⇒ HTMLParser

:strict - raise an exception when a parse error is encountered

:tree - a treebuilder class controlling the type of tree that will be returned. Built-in treebuilders can be accessed through HTML5::TreeBuilders



42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
# File 'lib/html5/html5parser.rb', line 42

# Builds a parser.
#
# Every key of +options+ is stored in the instance variable of the same
# name, so options can override the defaults set here (@strict, @errors,
# @tokenizer, @tree). @tree must end up holding a tree builder *class*; it
# is instantiated here and shared with every phase object.
def initialize(options = {})
  # Defaults, applied before the caller's options.
  @strict    = false
  @errors    = []
  @tokenizer = HTMLTokenizer
  @tree      = TreeBuilders::REXML::TreeBuilder

  options.each do |key, setting|
    instance_variable_set("@#{key}", setting)
  end

  # Make sure both flags exist (as nil) when the caller did not supply them.
  ["@lowercase_attr_name", "@lowercase_element_name"].each do |ivar|
    instance_variable_set(ivar, nil) unless instance_variable_defined?(ivar)
  end

  @tree = @tree.new

  # One phase object per name, keyed by symbol: "inBody" => :inBody => InBodyPhase.
  phase_table = {}
  @@phases.each do |phase_name|
    class_name = phase_name.sub(/./) { |first| first.upcase } + 'Phase'
    phase_table[phase_name.to_sym] = HTML5.const_get(class_name).new(self, @tree)
  end
  @phases = phase_table
end

Instance Attribute Details

#errorsObject (readonly)

Returns the value of attribute errors.



21
22
23
# File 'lib/html5/html5parser.rb', line 21

# Errors recorded so far; #parse_error appends [stream position, code, data] entries.
def errors; @errors; end

#first_start_tagObject

Returns the value of attribute first_start_tag.



19
20
21
# File 'lib/html5/html5parser.rb', line 19

# Current value of @first_start_tag; #_parse resets it to false at the start of each run.
def first_start_tag; @first_start_tag; end

#inner_htmlObject

Returns the value of attribute inner_html.



19
20
21
# File 'lib/html5/html5parser.rb', line 19

# false for a whole-document parse, otherwise the downcased container name set by #_parse.
def inner_html; @inner_html; end

#insert_from_tableObject

Returns the value of attribute insert_from_table.



19
20
21
# File 'lib/html5/html5parser.rb', line 19

# Reader for @insert_from_table.
# NOTE(review): nothing visible in this class assigns it -- presumably the phase objects do; confirm.
def insert_from_table; @insert_from_table; end

#last_phaseObject

Returns the value of attribute last_phase.



19
20
21
# File 'lib/html5/html5parser.rb', line 19

# Current value of @last_phase; #_parse clears it to nil before processing tokens.
def last_phase; @last_phase; end

#phaseObject

Returns the value of attribute phase.



19
20
21
# File 'lib/html5/html5parser.rb', line 19

# The phase object that currently receives tokens from #_parse.
def phase; @phase; end

#phasesObject (readonly)

Returns the value of attribute phases.



21
22
23
# File 'lib/html5/html5parser.rb', line 21

# Hash of phase-name symbol => phase instance, built once in #initialize.
def phases; @phases; end

#secondary_phaseObject

Returns the value of attribute secondary_phase.



19
20
21
# File 'lib/html5/html5parser.rb', line 19

# Reader for @secondary_phase.
# NOTE(review): nothing visible in this class assigns it -- presumably the phase objects do; confirm.
def secondary_phase; @secondary_phase; end

#tokenizerObject (readonly)

Returns the value of attribute tokenizer.



21
22
23
# File 'lib/html5/html5parser.rb', line 21

# The tokenizer: a class after #initialize, replaced by an instance of it once #_parse has run.
def tokenizer; @tokenizer; end

#treeObject (readonly)

Returns the value of attribute tree.



21
22
23
# File 'lib/html5/html5parser.rb', line 21

# The tree builder instance created in #initialize and shared with every phase.
def tree; @tree; end

Class Method Details

.parse(stream, options = {}) ⇒ Object



23
24
25
26
# File 'lib/html5/html5parser.rb', line 23

# Convenience entry point: builds a parser from +options+ and parses +stream+
# as a complete document.
#
# stream  - whatever #parse accepts (a filelike object or string).
# options - constructor options, plus :encoding, which is removed before the
#           constructor sees it and passed to #parse instead.
#
# Returns whatever #parse returns.
def self.parse(stream, options = {})
  # Work on a copy so the caller's hash is left intact (it may be reused for
  # further parses, or be frozen).
  options = options.dup
  encoding = options.delete(:encoding)
  new(options).parse(stream, encoding)
end

.parse_fragment(stream, options = {}) ⇒ Object



28
29
30
31
32
# File 'lib/html5/html5parser.rb', line 28

# Convenience entry point: builds a parser from +options+ and parses +stream+
# as a fragment.
#
# stream  - whatever #parse_fragment accepts (a filelike object or string).
# options - constructor options, plus :container (defaults to 'div' when
#           absent or nil) and :encoding; both are removed before the
#           constructor sees them and passed to #parse_fragment instead.
#
# Returns whatever #parse_fragment returns.
def self.parse_fragment(stream, options = {})
  # Work on a copy so the caller's hash is left intact (it may be reused for
  # further parses, or be frozen).
  options = options.dup
  container = options.delete(:container) || 'div'
  encoding = options.delete(:encoding)
  new(options).parse_fragment(stream, container, encoding)
end

Instance Method Details

#_(string) ⇒ Object



244
# File 'lib/html5/html5parser.rb', line 244

# Identity function: hands +string+ back untouched.
# NOTE(review): presumably a gettext-style hook for message strings -- confirm.
def _(string)
  string
end

#_parse(stream, inner_html, encoding, container = 'div') ⇒ Object



62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
# File 'lib/html5/html5parser.rb', line 62

# Shared driver behind #parse and #parse_fragment.
#
# stream     - handed to the tokenizer as-is.
# inner_html - truthy for fragment parsing, falsy for a whole document.
# encoding   - forwarded to the tokenizer as :encoding.
# container  - name of the fragment's context element; only read when
#              +inner_html+ is truthy.
#
# Resets the tree and the error list, creates a fresh tokenizer, selects the
# starting phase, feeds every normalized token to whatever phase is current
# at that moment, and finally signals EOF. Returns the result of the current
# phase's process_eof.
def _parse(stream, inner_html, encoding, container = 'div')
  @tree.reset
  @first_start_tag = false
  @errors = []

  # After an earlier run @tokenizer holds an instance; go back to its class
  # so a fresh tokenizer can be built for this stream.
  @tokenizer = @tokenizer.class unless Class === @tokenizer
  @tokenizer = @tokenizer.new(
    stream,
    :encoding => encoding,
    :parseMeta => !inner_html,
    :lowercase_attr_name => @lowercase_attr_name,
    :lowercase_element_name => @lowercase_element_name
  )

  if inner_html
    @inner_html = container.downcase

    # The context element decides how the tokenizer treats text content.
    @tokenizer.content_model_flag =
      case @inner_html
      when 'title', 'textarea'
        :RCDATA
      when 'style', 'script', 'xmp', 'iframe', 'noembed', 'noframes', 'noscript'
        :CDATA
      when 'plaintext'
        :PLAINTEXT
      else
        :PCDATA
      end

    @phase = @phases[:beforeHtml]
    @phase.insert_html_element
    reset_insertion_mode
  else
    @inner_html = false
    @phase = @phases[:initial]
  end

  # We only seem to have InBodyPhase testcases where the following is
  # relevant ... need others too
  @last_phase = nil

  @tokenizer.each do |raw_token|
    token = normalize_token(raw_token)
    kind = token[:type]
    handler = "process#{kind}"

    # @phase is re-read for every token because handlers may switch phases.
    case kind
    when :Characters, :SpaceCharacters, :Comment
      @phase.send(handler, token[:data])
    when :StartTag
      @phase.send(handler, token[:name], token[:data], token[:self_closing])
    when :EndTag
      @phase.send(handler, token[:name])
    when :Doctype
      @phase.send(handler, token[:name], token[:publicId],
                  token[:systemId], token[:correct])
    else
      # Any other token type is reported as a parse error.
      parse_error(token[:data], token[:datavars])
    end
  end

  # The token stream is exhausted: that is EOF.
  @phase.process_eof
end

#normalize_token(token) ⇒ Object

HTML5 specific normalizations to the token stream



156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
# File 'lib/html5/html5parser.rb', line 156

# HTML5-specific clean-up of a single token. The token hash is modified in
# place and also returned.
#
# * :EmptyTag becomes :StartTag; a parse error is reported first when the
#   name (checked before downcasing) is not in VOID_ELEMENTS.
# * :StartTag names are downcased. A non-empty attribute list is turned
#   into a Hash with downcased names in which the first occurrence of a
#   duplicated attribute wins; an empty list is left untouched.
# * :EndTag tokens carrying attributes are reported, then the name is
#   downcased.
# * Any other token passes through unchanged.
def normalize_token(token)
  if token[:type] == :EmptyTag
    # When a solidus (/) is encountered within a tag name what happens
    # depends on whether the current tag name matches that of a void
    # element.  If it matches a void element atheists did the wrong
    # thing and if it doesn't it's wrong for everyone.
    parse_error("incorrectly-placed-solidus") unless VOID_ELEMENTS.include?(token[:name])
    token[:type] = :StartTag
  end

  case token[:type]
  when :StartTag
    token[:name] = token[:name].downcase

    attributes = token[:data]
    unless attributes.empty?
      # Reversing makes the FIRST duplicate the last one written, so that
      # [["x", "y"], ["x", "z"]] becomes {"x" => "y"}.
      pairs = attributes.reverse.map { |attr_name, attr_value| [attr_name.downcase, attr_value] }
      token[:data] = Hash[pairs]
    end
  when :EndTag
    parse_error("attributes-in-end-tag") unless token[:data].empty?
    token[:name] = token[:name].downcase
  end

  token
end

#parse(stream, encoding = nil) ⇒ Object

Parse an HTML document into a well-formed tree

stream - a filelike object or string containing the HTML to be parsed

The optional encoding parameter must be a string that indicates the encoding. If specified, that encoding will be used, regardless of any BOM or later declaration (such as in a meta element)



128
129
130
131
# File 'lib/html5/html5parser.rb', line 128

# Parses +stream+ as a complete HTML document.
#
# stream   - handed to #_parse unchanged.
# encoding - optional; forwarded to #_parse (nil when omitted).
#
# Returns the tree builder's document.
def parse(stream, encoding = nil)
  inner_html = false # whole-document mode, not a fragment
  _parse(stream, inner_html, encoding)
  @tree.get_document
end

#parse_error(code = 'XXX-undefined-error', data = {}) ⇒ Object

Raises:

ParseError - when the parser is in strict mode (@strict is set)



149
150
151
152
153
# File 'lib/html5/html5parser.rb', line 149

# Records a parse error and, in strict mode, aborts the parse.
#
# code - error identifier (defaults to 'XXX-undefined-error').
# data - extra details stored alongside the code (defaults to {}).
#
# The entry appended to @errors is [tokenizer stream position, code, data].
# Raises ParseError, after recording the entry, when @strict is truthy.
def parse_error(code = 'XXX-undefined-error', data = {})
  # XXX The idea is to make data mandatory.
  position = @tokenizer.stream.position
  @errors << [position, code, data]
  raise ParseError if @strict
end

#parse_fragment(stream, container = 'div', encoding = nil) ⇒ Object

container - name of the element whose inner_html property is being set; if nil, defaults to ‘div’

stream - a filelike object or string containing the HTML to be parsed

The optional encoding parameter must be a string that indicates the encoding. If specified, that encoding will be used, regardless of any BOM or later declaration (such as in a meta element)



144
145
146
147
# File 'lib/html5/html5parser.rb', line 144

# Parses +stream+ as an HTML fragment.
#
# stream    - handed to #_parse unchanged.
# container - name of the element the fragment is parsed inside; nil is
#             treated as 'div'.
# encoding  - optional; forwarded to #_parse (nil when omitted).
#
# Returns the tree builder's fragment.
def parse_fragment(stream, container = 'div', encoding = nil)
  # An explicit nil would otherwise reach container.downcase in #_parse and
  # raise NoMethodError.
  _parse(stream, true, encoding, container || 'div')
  @tree.get_fragment
end

#reset_insertion_modeObject



206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
# File 'lib/html5/html5parser.rb', line 206

# Chooses @phase from the stack of open elements.
#
# Walks @tree.open_elements from the most recently opened element outwards
# and stops at the first element that determines a phase:
# * a name listed in @@new_modes selects the mapped phase;
# * 'html' selects :beforeHead when the tree has no head pointer yet,
#   otherwise :afterHead;
# * once the first (outermost) open element has been reached, :inBody is
#   the fallback.
# For the outermost element the name used is @inner_html, unless that
# element is a 'td' or 'th'.
def reset_insertion_mode
  # The name of this method is mostly historical. (It's also used in the
  # specification.)
  reached_root = false

  @tree.open_elements.reverse.each do |element|
    name = element.name

    if element == @tree.open_elements.first
      reached_root = true
      # XXX
      # assert @inner_html
      name = @inner_html unless ['td', 'th'].include?(name)
    end

    # XXX 'select', 'colgroup', 'head' and 'frameset' should only turn up
    # here in the inner_html case (assert @inner_html).

    mode =
      if @@new_modes.has_key?(name)
        @@new_modes[name]
      elsif name == 'html'
        @tree.head_pointer.nil? ? :beforeHead : :afterHead
      elsif reached_root
        :inBody
      end

    # Nothing decided by this element: look at the next one out.
    next if mode.nil?

    @phase = @phases[mode]
    break
  end
end