Class: Nokogiri::HTML::Document::EncodingReader

Inherits:
Object
  • Object
show all
Defined in:
lib/nokogiri/html/document.rb

Overview

:nodoc:

Defined Under Namespace

Classes: JumpSAXHandler, SAXHandler

Instance Attribute Summary collapse

Class Method Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(io) ⇒ EncodingReader

Returns a new instance of EncodingReader.



214
215
216
217
218
# File 'lib/nokogiri/html/document.rb', line 214

# Wraps +io+ so that the first chunk read from it can be inspected for an
# encoding declaration.
#
# io:: the object #read pulls data from; it must respond to read(len).
def initialize(io)
  @io = io
  # Nothing has been read and no encoding has been detected yet.
  @firstchunk = @encoding_found = nil
end

Instance Attribute Details

#encoding_found ⇒ Object (readonly)

This method is used by the C extension so that Nokogiri::HTML::Document#read_io() does not leak memory when EncodingFound is raised.



223
224
225
# File 'lib/nokogiri/html/document.rb', line 223

# Reader for @encoding_found.
#
# #read assigns the EncodingFound exception to this variable right before
# raising it, and sets it back to nil when it goes on to hand out data, so
# the value is either that exception object or nil.
def encoding_found
  @encoding_found
end

Class Method Details

.detect_encoding(chunk) ⇒ Object



172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
# File 'lib/nokogiri/html/document.rb', line 172

# Tries to work out which character encoding +chunk+ declares.
#
# chunk:: the leading part of a document, as a String.
#
# Returns the encoding name, or nil when none could be determined. A
# leading XML declaration decides the result on its own: if it carries no
# encoding, nil is returned without looking at any <meta> tag.
def self.detect_encoding(chunk)
  # JRuby releases flagged by is_jruby_without_fix? get their own routine.
  if Nokogiri.jruby? && EncodingReader.is_jruby_without_fix?
    return EncodingReader.detect_encoding_for_jruby_without_fix(chunk)
  end

  xml_decl = chunk.match(/\A(<\?xml[ \t\r\n]+[^>]*>)/)
  return Nokogiri.XML(xml_decl[1]).encoding if xml_decl

  unless Nokogiri.jruby?
    handler = SAXHandler.new
    parser = Nokogiri::HTML::SAX::PushParser.new(handler)
    begin
      parser << chunk
    rescue StandardError
      # Any StandardError raised while feeding the parser is ignored;
      # whatever the handler collected before the failure is still used.
    end
    return handler.encoding
  end

  # On JRuby: quick textual scan first, then a SAX parse that jumps out
  # through the :encoding_found tag.
  meta = chunk.match(/(<meta\s)(.*)(charset\s*=\s*([\w-]+))(.*)/i)
  return meta[4] if meta

  catch(:encoding_found) do
    jump_handler = JumpSAXHandler.new(:encoding_found)
    Nokogiri::HTML::SAX::Parser.new(jump_handler).parse(chunk)
    nil
  end
end

.detect_encoding_for_jruby_without_fix(chunk) ⇒ Object



198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
# File 'lib/nokogiri/html/document.rb', line 198

# Encoding detection used by detect_encoding when is_jruby_without_fix?
# is true.
#
# chunk:: the leading part of a document, as a String.
#
# Returns the encoding name or nil. Nokogiri::SyntaxError and RuntimeError
# raised anywhere in the method are turned into a nil result.
def self.detect_encoding_for_jruby_without_fix(chunk)
  xml_decl = chunk.match(/\A(<\?xml[ \t\r\n]+[^>]*>)/)
  return Nokogiri.XML(xml_decl[1]).encoding if xml_decl

  meta = chunk.match(/(<meta\s)(.*)(charset\s*=\s*([\w-]+))(.*)/i)
  return meta[4] if meta

  catch(:encoding_found) do
    # The handler is handed the tag as a String, not a Symbol.
    # NOTE(review): presumably a workaround for the old JRuby releases this
    # method targets - confirm before changing.
    jump_handler = JumpSAXHandler.new(:encoding_found.to_s)
    Nokogiri::HTML::SAX::Parser.new(jump_handler).parse(chunk)
    nil
  end
rescue Nokogiri::SyntaxError, RuntimeError
  # Parser failures just mean "no encoding detected".
  nil
end

.is_jruby_without_fix? ⇒ Boolean

Returns:

  • (Boolean)


194
195
196
# File 'lib/nokogiri/html/document.rb', line 194

# Reports whether the running JRuby is older than 1.6.5.
# NOTE(review): which fix first shipped in 1.6.5 is not visible from this
# method - confirm against the project history.
#
# The version is compared segment by segment as integers. Joining the
# digits into one number mis-orders versions such as "1.5.10" (1510) or
# "1.7" (17). A non-numeric segment such as "RC1" counts as 0.
#
# Returns true or false. Reads JRUBY_VERSION, so it must only be called
# when running on JRuby.
def self.is_jruby_without_fix?
  segments = JRUBY_VERSION.split('.').map { |part| part.to_i }
  (segments <=> [1, 6, 5]) < 0
end

Instance Method Details

#read(len) ⇒ Object



225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
# File 'lib/nokogiri/html/document.rb', line 225

# Reads up to +len+ units from the wrapped IO, checking the very first
# chunk for an encoding declaration.
#
# len:: how much to read; calling without a length is not supported.
#
# Returns the data read, or nil when nothing is available.
#
# Raises EncodingFound, also stored in @encoding_found, when
# EncodingReader.detect_encoding recognises an encoding in the first
# chunk. The chunk is kept, so the next call returns it from the start.
def read(len)
  unless @firstchunk
    head = @io.read(len)
    @firstchunk = head
    return nil unless head

    # This implementation expects that the first call from
    # htmlReadIO() is made with a length long enough (~1KB) to
    # achieve advanced encoding detection.
    detected = EncodingReader.detect_encoding(head)
    if detected
      # The first chunk stays buffered for the read issued on retry.
      @encoding_found = EncodingFound.new(detected)
      raise @encoding_found
    end
  end
  @encoding_found = nil

  # Hand out buffered data first, then top up from the IO if the buffer
  # could not satisfy the whole request.
  buffer = @firstchunk.slice!(0, len)
  missing = len - buffer.length
  if missing > 0
    tail = @io.read(missing)
    buffer << tail if tail
  end

  buffer.empty? ? nil : buffer
end