Class: CMess::GuessEncoding::Automatic

Inherits:
Object
  • Object
show all
Extended by:
Forwardable
Includes:
Encoding
Defined in:
lib/cmess/guess_encoding/automatic.rb

Overview

Tries to detect the encoding of a given input by applying several heuristics to determine the most likely candidate. If no heuristic catches on, resorts to Encoding::UNKNOWN.

If a BOM is found, it may determine the encoding directly.

Constant Summary collapse

ICONV_FOR =

Creates a converter for desired encoding (from UTF-8)

Hash.new { |h, k| h[k] = Iconv.new(k, UTF_8) }
TEST_ENCODINGS =

Single-byte encodings to test statistically by TEST_CHARS

[
  MACINTOSH,
  ISO_8859_1,
  ISO_8859_15,
  CP1252,
  CP850,
  MS_ANSI
]
CHARS_TO_TEST =

Certain (non-ASCII) chars to test for in TEST_ENCODINGS

(
  '€‚ƒ„…†‡ˆ‰Š‹ŒŽ‘’“”•–—˜™š›œžŸ¡¢£¤¥¦§¨©ª«¬­®¯°±²³´µ¶·¸¹º»¼½¾¿ÀÁÂ' <<
  'ÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖ×ØÙÚÛÜÝÞßàáâãäåæçèéêëìíîïðñòóôõö÷øùúûüýþÿ'
).split(//)
TEST_CHARS =

Map TEST_ENCODINGS to respectively encoded CHARS_TO_TEST

Hash.new { |hash, encoding|
  encoding = Encoding.get_or_set_encoding_const(encoding)
  encchars = CHARS_TO_TEST.map { |char|
    begin
      byte = *ICONV_FOR[encoding].iconv(char).unpack('C')
    rescue Iconv::IllegalSequence
    end
  }.compact

  TEST_ENCODINGS << encoding unless TEST_ENCODINGS.include?(encoding)
  hash[encoding] = encchars
}.update(YAML.load_file(
  File.join(File.dirname(__FILE__), *%w[.. .. .. data test_chars.yaml])
))
TEST_THRESHOLD_DIRECT =

Relative count of TEST_CHARS must exceed this threshold to yield a direct match

0.1
TEST_THRESHOLD_APPROX =

Relative count of TEST_CHARS must exceed this threshold to yield an approximate match

0.0004

Class Attribute Summary collapse

Instance Attribute Summary collapse

Class Method Summary collapse

Instance Method Summary collapse

Methods included from Encoding

#all_encodings

Constructor Details

#initialize(input, chunk_size = nil) ⇒ Automatic

Returns a new instance of Automatic.



154
155
156
157
158
159
160
161
162
163
164
165
# File 'lib/cmess/guess_encoding/automatic.rb', line 154

def initialize(input, chunk_size = nil)
  @input = case input
    when IO      # that's what we want
      input
    when String  # convert it to an IO
      StringIO.new(input)
    else         # um, what's that...?
      raise ArgumentError, "don't know how to handle input of type #{input.class}"
  end

  @chunk_size = chunk_size
end

Class Attribute Details

.bom_guessers ⇒ Object (readonly)

Returns the value of attribute bom_guessers.



107
108
109
# File 'lib/cmess/guess_encoding/automatic.rb', line 107

# Class-level reader: returns the current value of @bom_guessers.
# NOTE(review): where and how this variable is populated is not visible
# here -- confirm against the class body.
def bom_guessers
  @bom_guessers
end

.encoding_guessers ⇒ Object (readonly)

Returns the value of attribute encoding_guessers.



107
108
109
# File 'lib/cmess/guess_encoding/automatic.rb', line 107

# Class-level reader: returns the current value of @encoding_guessers.
# The instance-level #guess iterates a collection of this name and
# instance_evals each element as a block.
# NOTE(review): presumably that call is forwarded to this reader -- confirm.
def encoding_guessers
  @encoding_guessers
end

.supported_boms ⇒ Object (readonly)

Returns the value of attribute supported_boms.



107
108
109
# File 'lib/cmess/guess_encoding/automatic.rb', line 107

# Class-level reader: returns the current value of @supported_boms.
# NOTE(review): the contents are assigned outside the visible code -- confirm
# their shape before relying on it.
def supported_boms
  @supported_boms
end

.supported_encodings ⇒ Object (readonly)

Returns the value of attribute supported_encodings.



107
108
109
# File 'lib/cmess/guess_encoding/automatic.rb', line 107

# Class-level reader: returns the current value of @supported_encodings.
# NOTE(review): the contents are assigned outside the visible code -- confirm
# their shape before relying on it.
def supported_encodings
  @supported_encodings
end

Instance Attribute Details

#byte_count ⇒ Object (readonly)

Returns the value of attribute byte_count.



152
153
154
# File 'lib/cmess/guess_encoding/automatic.rb', line 152

# Read-only accessor for @byte_count.
# The constructor shown on this page does not assign it.
# NOTE(review): presumably filled in while input is read -- confirm.
def byte_count
  @byte_count
end

#byte_total ⇒ Object (readonly)

Returns the value of attribute byte_total.



152
153
154
# File 'lib/cmess/guess_encoding/automatic.rb', line 152

# Read-only accessor for @byte_total.
# The constructor shown on this page does not assign it.
# NOTE(review): presumably filled in while input is read -- confirm.
def byte_total
  @byte_total
end

#chunk_size ⇒ Object (readonly)

Returns the value of attribute chunk_size.



152
153
154
# File 'lib/cmess/guess_encoding/automatic.rb', line 152

# Read-only accessor for @chunk_size, i.e. the chunk_size argument given to
# the constructor (nil unless one was passed).
def chunk_size
  @chunk_size
end

#first_byte ⇒ Object (readonly)

Returns the value of attribute first_byte.



152
153
154
# File 'lib/cmess/guess_encoding/automatic.rb', line 152

# Read-only accessor for @first_byte.
# The constructor shown on this page does not assign it.
# NOTE(review): presumably filled in while input is read -- confirm.
def first_byte
  @first_byte
end

#input ⇒ Object (readonly)

Returns the value of attribute input.



152
153
154
# File 'lib/cmess/guess_encoding/automatic.rb', line 152

# Read-only accessor for @input, the stream object set up by the constructor
# (a String argument is stored wrapped in a StringIO).
def input
  @input
end

Class Method Details

.guess(input, chunk_size = nil, ignore_bom = false) ⇒ Object



110
111
112
# File 'lib/cmess/guess_encoding/automatic.rb', line 110

# Convenience entry point: builds an instance for +input+ (with the optional
# +chunk_size+) and returns whatever its #guess yields, passing +ignore_bom+
# through unchanged.
def guess(input, chunk_size = nil, ignore_bom = false)
  guesser = new(input, chunk_size)
  guesser.guess(ignore_bom)
end

Instance Method Details

#bom ⇒ Object



181
182
183
# File 'lib/cmess/guess_encoding/automatic.rb', line 181

# Returns the result of check_bom for this instance, i.e. the encoding
# derived from the input's BOM, or a falsy value when none was found.
#
# The result is cached after the first call, including a negative one, so
# check_bom runs at most once per instance. A plain `||=` would repeat the
# check on every call whenever no BOM is present.
def bom
  return @bom if defined?(@bom)

  @bom = check_bom
end

#guess(ignore_bom = false) ⇒ Object



167
168
169
170
171
172
173
174
175
176
177
178
179
# File 'lib/cmess/guess_encoding/automatic.rb', line 167

# Determines the encoding of the input.
#
# A detected BOM is returned right away unless +ignore_bom+ is set. Otherwise
# the input is consumed via +read+; after each successful read every
# encoding guesser is evaluated in the context of this instance, and the
# first result that is a supported encoding is returned. Falls back to
# UNKNOWN once +read+ yields nothing more.
def guess(ignore_bom = false)
  detected_bom = bom
  return detected_bom if detected_bom && !ignore_bom

  while read
    encoding_guessers.each do |guesser|
      candidate = instance_eval(&guesser)
      next unless candidate
      return candidate if supported_encoding?(candidate)
    end
  end

  # no guesser produced a supported encoding
  UNKNOWN
end