Module: Ensure::Encoding

Defined in:
lib/ensure/encoding.rb

Defined Under Namespace

Modules: String

Constant Summary collapse

# Byte order marks, keyed by the encoding each one announces. The values are
# the leading bytes of the mark as integers. The table and its byte lists are
# frozen so the shared lookup data cannot be altered at runtime.
BYTE_ORDER_MARKS =
{
  ::Encoding::UTF_16BE => [0xfe, 0xff].freeze,
  ::Encoding::UTF_16LE => [0xff, 0xfe].freeze,
  ::Encoding::UTF_8    => [0xef, 0xbb, 0xbf].freeze
}.freeze

Class Method Summary collapse

Class Method Details

.force_encoding(string, target_encoding, options = {}) ⇒ Object

Forces the encoding of string to target_encoding, using a number of smart tricks. See String#ensure_encoding for more details.



43
44
45
46
47
# File 'lib/ensure/encoding.rb', line 43

# Non-destructive variant of force_encoding!: the conversion is applied to a
# duplicate of +string+, and that duplicate is returned.
#
# @param string [String] the string to convert; a copy is made with #dup
# @param target_encoding the encoding the copy should end up in
# @param options [Hash] passed through unchanged to force_encoding!
# @return [String] the converted copy
def self.force_encoding(string, target_encoding, options={})
  string.dup.tap { |copy| force_encoding!(copy, target_encoding, options) }
end

.force_encoding!(string, target_encoding, options = {}) ⇒ Object

Performs just like force_encoding, only it changes the string in place instead of returning it.



51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
# File 'lib/ensure/encoding.rb', line 51

# Converts +string+ in place so that it ends up in +target_encoding+.
#
# The source (external) encoding is resolved from options[:external_encoding]:
# * :sniff                  - ask sniff_encoding for it
# * anything with #each     - treated as a list of guesses for guess_encoding;
#                             target_encoding is used when no guess fits
# * any other truthy value  - used as given
# * absent                  - guesses are [target_encoding, string.encoding]
#
# options[:invalid_characters] selects how the conversion is performed:
# * :raise         - the string is only re-tagged as target_encoding and an
#                    error is raised when its bytes are not valid in it
# * :drop          - transcode, replacing invalid and undefined characters
#                    with an empty string
# * anything else  - transcode without any replacement options
#
# @param string [String] the string to modify
# @param target_encoding the encoding to end up in
# @param options [Hash] see above
# @return the result of String#encode!, or nil on the :raise path
# @raise [::Encoding::InvalidByteSequenceError] on the :raise path when the
#   bytes are not valid in target_encoding
def self.force_encoding!(string, target_encoding, options={})
  requested = options[:external_encoding]
  source = requested == :sniff ? sniff_encoding(string) : (requested || [target_encoding, string.encoding])

  # A list of candidates is narrowed down to the first one that fits.
  source = guess_encoding(string, source) || target_encoding if source.respond_to?(:each)

  policy = options[:invalid_characters]
  if policy == :raise
    string.force_encoding(target_encoding)
    return if string.valid_encoding?

    raise ::Encoding::InvalidByteSequenceError, "String is not encoded as `#{target_encoding}'"
  end

  filters = policy == :drop ? { replace: '', undef: :replace, invalid: :replace } : {}
  # Double splat: since Ruby 2.7 a trailing hash is no longer implicitly
  # treated as keyword arguments.
  string.encode!(target_encoding, source, **filters)
end

.guess_encoding(string, guesses) ⇒ Object

Checks the encodings in guesses from front to back and returns the first encoding in which the character data is a valid sequence, or nil when none of the guesses fits.



25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
# File 'lib/ensure/encoding.rb', line 25

# Tries each encoding in +guesses+, in order, and returns the first one under
# which the bytes of +string+ form a valid sequence.
#
# @param string [String] the string whose bytes are inspected; it is never
#   modified, so frozen strings are accepted
# @param guesses [#each] candidate encodings (anything String#force_encoding
#   accepts), tried from front to back
# @return [Encoding, nil] the first fitting encoding, or nil when none fits
def self.guess_encoding(string, guesses)
  # Probe a private copy: re-tagging the caller's string would fail on frozen
  # input and would leave it with the wrong encoding if a guess raised.
  probe = string.dup

  guesses.each do |guess|
    probe.force_encoding(guess)
    return probe.encoding if probe.valid_encoding?
  end

  nil
end

.sniff_encoding(string) ⇒ Object

Tries to guess the encoding of the string from a leading byte order mark and returns the matching encoding, falling back to UTF-8 when no known byte order mark is found.



13
14
15
16
17
18
19
20
21
# File 'lib/ensure/encoding.rb', line 13

# Detects the encoding of +string+ from its leading byte order mark.
#
# The first three bytes are compared against every entry of BYTE_ORDER_MARKS,
# in the table's order; the first entry whose bytes match wins.
#
# @param string [String] the string whose leading bytes are inspected
# @return [Encoding] the encoding of the matching byte order mark, or UTF-8
#   when none matches
def self.sniff_encoding(string)
  leading = string.unpack('C3')
  hit = BYTE_ORDER_MARKS.find { |_encoding, mark| leading.first(mark.length) == mark }
  hit ? hit.first : ::Encoding::UTF_8
end