Method: EZML::Util#check_encoding

Defined in:
lib/ezml/util.rb

#check_encoding(str) ⇒ Object



21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
# File 'lib/ezml/util.rb', line 21

def check_encoding(str)
  if str.valid_encoding?
    # Get rid of the Unicode BOM if possible
    # Shortcut for UTF-8 which might be the majority case
    if str.encoding == Encoding::UTF_8
      return str.gsub(/\A\uFEFF/, '')
    elsif str.encoding.name =~ /^UTF-(16|32)(BE|LE)?$/
      return str.gsub(Regexp.new("\\A\uFEFF".encode(str.encoding)), '')
    else
      return str
    end
  end

  encoding = str.encoding
  newlines = Regexp.new("\r\n|\r|\n".encode(encoding).force_encoding(Encoding::ASCII_8BIT))
  str.force_encoding(Encoding::ASCII_8BIT).split(newlines).each_with_index do |line, i|
    begin
      line.encode(encoding)
    rescue Encoding::UndefinedConversionError => e
      yield <<MSG.rstrip, i + 1
Invalid #{encoding.name} character #{e.error_char.dump}
MSG
    end
  end
  return str
end