Class: HTML5::HTMLInputStream

Inherits:
Object
  • Object
show all
Defined in:
lib/html5/inputstream.rb

Overview

This class takes care of character encoding and removing or replacing incorrect byte-sequences and also provides column and line tracking.

Constant Summary collapse

VALID_CHAR =

see /usr/lib/ruby/1.9.1/rexml/text.rb

[
  0x9, 0xA, 0xD,         # tab, line feed, carriage return
  (0x20..0xD7FF),        # space up to the last code point before the surrogate range
  (0xE000..0xFFFD),      # after the surrogate range, stopping short of U+FFFE and U+FFFF
  (0x10000..0x10FFFF)    # code points above U+FFFF
]
# Byte-oriented pattern (n flag) written in extended syntax (x flag) whose
# alternatives each describe one well-formed multi-byte UTF-8 sequence.
# The char method matches a short slice of its buffer against it and uses $1.
# NOTE(review): the first alternative is empty and no single-byte (ASCII)
# alternative is listed - confirm nothing was dropped from the pattern.
# NOTE(review): ^ and $ anchor at line boundaries, not at the ends of the
# whole string - confirm that is the intent.
VALID_XML_CHARS =
/^(
  | [\xC2-\xDF][\x80-\xBF]             # non-overlong 2-byte
  |  \xE0[\xA0-\xBF][\x80-\xBF]        # excluding overlongs
  | [\xE1-\xEC\xEE][\x80-\xBF]{2}      # straight 3-byte
  |  \xED[\x80-\x9F][\x80-\xBF]        # excluding surrogates
  |  \xF0[\x90-\xBF][\x80-\xBF]{2}     # planes 1-3
  | [\xF1-\xF3][\x80-\xBF]{3}          # planes 4-15
  |  \xF4[\x80-\x8F][\x80-\xBF]{2}     # plane 16
)*$/nx

Instance Attribute Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(source, options = {}) ⇒ HTMLInputStream

Initialises the HTMLInputStream.

HTMLInputStream(source, [encoding]) -> Normalized stream from source for use by the HTML5Lib.

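source can be either an IO-like object (anything that responds to read) or a string. Note that a string is always wrapped in a StringIO and treated as content; a local filename is not opened.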

The optional encoding option (an entry in the options hash) must be a string that indicates the encoding. If specified, that encoding will be used, regardless of any BOM or later declaration (such as in a meta element).

parse_meta - Look for a <meta> element containing encoding information (an entry in the options hash; defaults to true)



59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
# File 'lib/html5/inputstream.rb', line 59

# Builds a normalized input stream over +source+.
#
# source  - an IO-like object (anything responding to +read+) or a String;
#           it is handed to open_stream.
# options - Hash whose entries are copied verbatim into instance variables
#           of the same name. The defaults assigned below are @encoding
#           (nil), @parse_meta (true) and @chardet (true).
#
# When @encoding is nil the character encoding comes from detect_encoding;
# otherwise the supplied value is used as-is.
def initialize(source, options = {})
  @encoding   = nil
  @parse_meta = true
  @chardet    = true

  options.each { |name, value| instance_variable_set("@#{name}", value) }

  # partial Ruby 1.9 support
  # Best effort only: this retags the caller's string in place and any
  # failure is deliberately ignored.
  if @encoding && source.respond_to?(:force_encoding)
    source.force_encoding(@encoding) rescue nil
  end

  # Raw Stream
  @raw_stream = open_stream(source)

  # Encoding Information
  # Number of bytes to use when looking for a meta element with
  # encoding information
  @NUM_BYTES_META = 512
  # Number of bytes to use when detecting the encoding with chardet
  @NUM_BYTES_CHARDET = 256
  # Number of bytes to use when reading content
  @NUM_BYTES_BUFFER = 1024

  # Encoding to use if no other information can be found
  @DEFAULT_ENCODING = 'windows-1252'

  # Detect encoding iff no explicit "transport level" encoding is supplied.
  # This must run after the constants above are set, because
  # detect_encoding reads them.
  @char_encoding = @encoding.nil? ? detect_encoding : @encoding

  # Read bytes from stream decoding them into Unicode
  @buffer = @raw_stream.read(@NUM_BYTES_BUFFER) || ''
  if @char_encoding == 'windows-1252'
    @win1252 = true
  elsif @char_encoding != 'utf-8'
    begin
      # Any other encoding is converted in one go, so slurp the whole stream.
      @buffer << @raw_stream.read unless @raw_stream.eof?
      @buffer =
        begin
          require 'iconv'
          Iconv.iconv('utf-8', @char_encoding, @buffer).first
        rescue LoadError
          # iconv is no longer part of the standard library on newer Rubies;
          # fall back to the built-in transcoder instead of crashing.
          @buffer.encode('utf-8', @char_encoding)
        end
    rescue
      # Conversion failed: treat the bytes as windows-1252 instead.
      @win1252 = true
    end
  end

  @queue = []
  @errors = []

  # Reset position in the list to read from
  @tell = 0
  @line = @col = 0
  @line_lengths = []
end

Instance Attribute Details

#char_encodingObject

Returns the value of attribute char_encoding.



13
14
15
# File 'lib/html5/inputstream.rb', line 13

# Reader for @char_encoding, which initialize sets to either the
# caller-supplied encoding or the result of detect_encoding.
def char_encoding
  @char_encoding
end

#errorsObject

Returns the value of attribute errors.



13
14
15
# File 'lib/html5/inputstream.rb', line 13

# Reader for @errors, the Array to which char appends "null-character"
# whenever it meets a NUL in the input.
def errors
  @errors
end

#queueObject

Returns the value of attribute queue.



13
14
15
# File 'lib/html5/inputstream.rb', line 13

# Reader for @queue, the Array of pushed-back characters that char
# consults before it reads from the buffer.
def queue
  @queue
end

Instance Method Details

#charObject

Read one character from the stream or queue if available. Return EOF when EOF is reached.



289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
# File 'lib/html5/inputstream.rb', line 289

# Read one character from the pushback queue or, when the queue is empty,
# from the buffered stream.
#
# Returns a String holding the character, or the Symbol :EOF once the
# buffer is exhausted. NUL is reported in @errors as "null-character" and
# replaced by U+FFFD; CR and CR LF are collapsed into a single "\n".
# @line, @col and @line_lengths are updated in the String branch and in
# the 0x01..0x7F Integer branch.
def char
  # Pushed-back characters always win over fresh input.
  return @queue.shift unless @queue.empty?

  if @tell + 3 > @buffer.length && !@raw_stream.eof?
    # Fewer than three unread units remain: drop the consumed prefix and
    # append the next block. Either side may be nil (slice past the end,
    # read returning nil), so both are defaulted to ''.
    @buffer = (@buffer[@tell..-1] || '') + (@raw_stream.read(@NUM_BYTES_BUFFER) || '')
    @tell = 0
  end

  c = @buffer[@tell]
  @tell += 1

  case c

  when String
    # partial Ruby 1.9 support: indexing the buffer yields a String
    case c
    when "\0"
      @errors.push("null-character")
      c = "\uFFFD" # null characters are invalid
    when "\r"
      # normalize newlines: swallow the LF of a CR LF pair
      @tell += 1 if @buffer[@tell] == "\n"
      c = "\n"
    when "\x80" .. "\x9F"
      c = ENTITIES_WINDOWS1252[c.ord-0x80].chr('utf-8')
    when "\xA0" .. "\xFF"
      if c.encoding == Encoding::ASCII_8BIT
        c = c.encode('utf-8','iso-8859-1')
      end
    end

    # update position in stream
    if c == "\x0a"
      @line_lengths << @col
      @line += 1
      @col = 0
    else
      @col += 1
    end

    c

  when 0x01..0x7F
    # Integer branches: indexing the buffer yields a byte value
    if c == 0x0D
      # normalize newlines
      @tell += 1 if @buffer[@tell] == 0x0A
      c = 0x0A
    end

    # update position in stream
    if c == 0x0a
      @line_lengths << @col
      @line += 1
      @col = 0
    else
      @col += 1
    end

    c.chr

  when 0x80..0xBF
    if !@win1252
      [0xFFFD].pack('U') # invalid utf-8
    elsif c <= 0x9f
      [ENTITIES_WINDOWS1252[c-0x80]].pack('U')
    else
      "\xC2" + c.chr # convert to utf-8
    end

  when 0xC0..0xFF
    if @win1252
      "\xC3" + (c - 64).chr # convert to utf-8
    elsif @buffer[@tell - 1..@tell + 3] =~ VALID_XML_CHARS
      # skip the remaining bytes of the matched sequence
      @tell += $1.length - 1
      $1
    else
      [0xFFFD].pack('U') # invalid utf-8
    end

  when 0x00
    @errors.push("null-character")
    [0xFFFD].pack('U') # null characters are invalid

  else
    # nil: nothing left in the buffer
    :EOF
  end
end

#chars_until(characters, opposite = false) ⇒ Object

Returns a string of characters from the stream up to but not including any character in characters or EOF. characters can be any container that responds to include?. When opposite is true the test is inverted: characters are consumed for as long as they are contained in characters.



390
391
392
393
394
395
396
397
398
399
400
401
402
403
# File 'lib/html5/inputstream.rb', line 390

# Collects characters from char until one is found whose membership in
# +characters+ differs from +opposite+, or until :EOF. The stopping
# character (unless it is :EOF) is put back at the front of @queue.
# Returns the collected characters joined into one String.
def chars_until(characters, opposite=false)
  collected = []
  current = char

  until current == :EOF || characters.include?(current) != opposite
    collected << current
    current = char
  end

  # Hand the terminating character back so the next read sees it again.
  @queue.unshift(current) unless current == :EOF
  collected.join('')
end

#detect_bomObject

Attempts to detect a BOM at the start of the stream. If an encoding can be determined from the BOM, returns the name of that encoding; otherwise returns nil.



182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
# File 'lib/html5/inputstream.rb', line 182

# Looks for a byte order mark in the first four bytes of @raw_stream.
#
# Returns the encoding name ('utf-8', 'utf-16le', 'utf-16be', 'utf-32le'
# or 'utf-32be') or nil when no BOM is present or nothing could be read.
# Afterwards the stream is repositioned through seek: just past the BOM
# when one was found, otherwise back to where the read started.
def detect_bom
  # Signatures are compared as byte values so the result does not depend
  # on the encoding tag of the string returned by read. Order matters:
  # UTF-8 first, then UTF-32 before UTF-16, because the UTF-32LE BOM
  # begins with the UTF-16LE BOM.
  signatures = [
    [[0xEF, 0xBB, 0xBF],       'utf-8'],
    [[0xFF, 0xFE, 0x00, 0x00], 'utf-32le'],
    [[0x00, 0x00, 0xFE, 0xFF], 'utf-32be'],
    [[0xFF, 0xFE],             'utf-16le'],
    [[0xFE, 0xFF],             'utf-16be']
  ]

  head = @raw_stream.read(4)
  return nil unless head

  bytes = head.unpack('C*')
  marker, encoding = signatures.find { |sig, _name| bytes[0, sig.length] == sig }

  # Skip exactly the bytes of the matched BOM, or nothing at all.
  seek(head, encoding ? marker.length : 0)

  encoding
end

#detect_encodingObject



129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
# File 'lib/html5/inputstream.rb', line 129

# Works out the character encoding of @raw_stream.
#
# Sources are tried in order: a BOM (detect_bom), a meta declaration
# (detect_encoding_meta, only when @parse_meta is set), the optional
# chardet gem (only when @chardet is set and the gem can be loaded) and
# finally @DEFAULT_ENCODING. An 'iso-8859-1' result, in any letter case,
# is replaced by 'windows-1252'.
#
# Returns the encoding name as a String.
def detect_encoding

  # First look for a BOM
  # This will also read past the BOM if present
  encoding = detect_bom

  # If there is no BOM need to look for meta elements with encoding
  # information
  if encoding.nil? && @parse_meta
    encoding = detect_encoding_meta
  end

  # Guess with chardet, if available
  if encoding.nil? && @chardet
    begin
      require 'rubygems'
      require 'UniversalDetector' # gem install chardet
      buffers = []
      detector = UniversalDetector::Detector.instance
      detector.reset
      until @raw_stream.eof?
        buffer = @raw_stream.read(@NUM_BYTES_CHARDET)
        break if !buffer || buffer.empty?
        buffers << buffer
        detector.feed(buffer)
        break if detector.instance_eval { @done }
        # The detector's last-char state is turned back into a String when
        # it holds a byte value. Integer is used for the check because the
        # Fixnum constant no longer exists on current Rubies.
        detector.instance_eval {
          @_mLastChar = @_mLastChar.chr if Integer === @_mLastChar
        }
      end
      detector.close
      encoding = detector.result['encoding']
      # Hand everything that was sniffed back to the stream.
      seek(buffers.join, 0)
    rescue LoadError
      # chardet is optional; carry on without it
    end
  end

  # If all else fails use the default encoding
  if encoding.nil?
    encoding = @DEFAULT_ENCODING
  end

  # Substitute for equivalent encoding
  if 'iso-8859-1' == encoding.downcase
    encoding = 'windows-1252'
  end

  encoding
end

#detect_encoding_metaObject

Report the encoding declared by the meta element



263
264
265
266
267
268
# File 'lib/html5/inputstream.rb', line 263

# Reads up to @NUM_BYTES_META bytes from the raw stream, hands them to an
# EncodingParser, puts the bytes back via seek and returns whatever the
# parser's get_encoding reports.
def detect_encoding_meta
  head = @raw_stream.read(@NUM_BYTES_META)
  meta_parser = EncodingParser.new(head)
  # Restore the stream before asking the parser for its verdict.
  seek(head, 0)
  meta_parser.get_encoding
end

#open_stream(source) ⇒ Object

Produces a file object from source.

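source can be either an IO-like object (anything that responds to read) or a string. A string is always wrapped in a StringIO and treated as content; a local filename is not opened.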



119
120
121
122
123
124
125
126
127
# File 'lib/html5/inputstream.rb', line 119

# Returns +source+ unchanged when it responds to +read+; anything else is
# treated as a string and wrapped in a StringIO.
def open_stream(source)
  # Already an IO like object
  return source if source.respond_to?(:read)

  StringIO.new(source)
end

#positionObject

Returns (line, col) of the current position in the stream.



271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
# File 'lib/html5/inputstream.rb', line 271

# Returns [line, col] for the current read position, with line counted
# from 1. Characters waiting in @queue have not been handed out yet, so
# they are walked backwards and subtracted again - unless the last queued
# item is :EOF.
# Raises RuntimeError when a queued newline is reached while the column
# is not 0.
def position
  row, column = @line, @col
  rewind_queue = @queue && @queue.last != :EOF

  if rewind_queue
    @queue.reverse_each do |pending|
      if pending == "\n"
        row -= 1
        raise RuntimeError, "col=#{column}" unless column == 0
        # Step back to the end of the previous line.
        column = @line_lengths[row]
      else
        column -= 1
      end
    end
  end

  [row + 1, column]
end

#seek(buffer, n) ⇒ Object



215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
# File 'lib/html5/inputstream.rb', line 215

# Repositions @raw_stream after +buffer+ has been read from it, so that
# the next read starts at offset +n+ of that buffer.
#
# buffer - the String that was just read from the stream
# n      - number of leading bytes of buffer to treat as consumed
#
# Three strategies are tried in order:
#   1. the stream responds to unget: push buffer[n..-1] back;
#   2. the stream responds to seek: call seek(n) on it;
#   3. otherwise (or when seek raises Errno::ESPIPE) wrap the stream in a
#      SimpleDelegator that keeps pushed-back data in front of the
#      underlying stream, then push buffer[n..-1] back.
def seek(buffer, n)
  if @raw_stream.respond_to?(:unget)
    @raw_stream.unget(buffer[n..-1])
    return
  end

  if @raw_stream.respond_to?(:seek)
    begin
      @raw_stream.seek(n)
      return
    rescue Errno::ESPIPE
      # not seekable after all; fall through to the wrapper
    end
  end

  require 'delegate'
  @raw_stream = SimpleDelegator.new(@raw_stream)

  class << @raw_stream
    # Serve pushed-back data first, then the wrapped stream.
    # chars of -1 (the default) or nil means "everything that is left".
    def read(chars=-1)
      read_all = chars.nil? || chars == -1
      if read_all || chars > @data.length
        result = @data
        @data = ''
        return result if __getobj__.eof?
        return result + __getobj__.read if read_all
        return result + __getobj__.read(chars - result.length)
      elsif @data.empty?
        return __getobj__.read(chars)
      else
        # Start at index 0: the first pushed-back byte belongs to the result.
        result = @data[0...chars]
        @data = @data[chars..-1]
        return result
      end
    end

    # Data handed back was read before whatever is still pending, so it
    # goes in front of it. nil on either side counts as empty.
    def unget(data)
      @data = data.to_s + @data.to_s
    end
  end

  @raw_stream.unget(buffer[n..-1])
end

#unget(characters) ⇒ Object



405
406
407
408
409
410
411
412
# File 'lib/html5/inputstream.rb', line 405

# Puts +characters+ back at the front of @queue so they are read again.
# :EOF is ignored. Anything responding to to_a is pushed element by
# element in its own order; otherwise the argument is treated as a string
# and pushed character by character, keeping the original order.
def unget(characters)
  if characters == :EOF
    nil
  elsif characters.respond_to?(:to_a)
    @queue.unshift(*characters.to_a)
  else
    # Walk the string backwards so the first character ends up in front.
    characters.reverse.each_char { |ch| @queue.insert(0, ch) }
  end
end