Method: UTF8Utils::StringExt#tidy_bytes

Defined in:: lib/utf8_utils.rb

#tidy_bytes ⇒ `Object`

Attempt to replace invalid UTF-8 bytes with valid ones. This method naively assumes that if you have invalid UTF-8 bytes, they are either Windows CP1251 or ISO-8859-1. In practice this isn’t a bad assumption, but it may not always work.

# File 'lib/utf8_utils.rb', line 48

# Attempt to replace invalid UTF-8 bytes with valid ones.
#
# The receiver is scanned byte by byte while tracking how many continuation
# bytes the most recent leading byte still requires. Any byte that cannot be
# part of a well-formed sequence is passed through +tidy_byte+:
#
# * a continuation byte (10xxxxxx) that no leading byte asked for;
# * every byte of a multi-byte sequence that is interrupted by an ASCII or
#   leading byte, or cut short by the end of the string;
# * bytes 0xF8..0xFF, which never start a sequence of at most four bytes.
#
# NOTE(review): +tidy_byte+ is defined elsewhere; it is assumed to return the
# replacement byte(s) for a single byte (possibly an Array or nil, which is
# why the result is flattened and compacted before packing) — confirm.
#
# @return [String] the cleaned string, rebuilt from its code points
def tidy_bytes
  bytes = unpack("C*")
  return "" if bytes.empty?

  continuation_bytes_expected = 0
  last_lead_index = 0

  # Replaces every byte in the given index range with its tidied form.
  tidy_range = lambda do |range|
    range.each { |i| bytes[i] = tidy_byte(bytes[i]) }
  end

  bytes.each_with_index do |byte, index|
    # Continuation byte: 10xxxxxx
    if byte.between?(0x80, 0xBF)
      if continuation_bytes_expected > 0
        continuation_bytes_expected -= 1
      else
        # Not expecting a continuation, so clean it
        bytes[index] = tidy_byte(byte)
      end
      next
    end

    # ASCII, leading or unusable byte while a sequence is still open: the
    # whole unfinished sequence (lead byte included) is invalid, not just the
    # byte immediately before this one.
    if continuation_bytes_expected > 0
      tidy_range.call(last_lead_index...index)
      continuation_bytes_expected = 0
    end

    if byte >= 0xF8
      # 11111xxx can never start a sequence of four bytes or fewer
      bytes[index] = tidy_byte(byte)
    elsif byte >= 0xC0
      # Leading byte: the position of the first zero bit gives the length
      continuation_bytes_expected =
        if    byte < 0xE0 then 1
        elsif byte < 0xF0 then 2
        else                   3
        end
      last_lead_index = index
    end
  end

  # Don't allow the string to terminate in the middle of a sequence; this
  # also covers a string whose final byte is a leading byte.
  tidy_range.call(last_lead_index...bytes.length) if continuation_bytes_expected > 0

  bytes.flatten.compact.pack("C*").unpack("U*").pack("U*")
end

Method: UTF8Utils::StringExt#tidy_bytes

#tidy_bytes ⇒ Object

#tidy_bytes ⇒ `Object`