Method: UTF8Utils::StringExt#tidy_bytes
- Defined in:
- lib/utf8_utils.rb
#tidy_bytes ⇒ Object
Attempt to replace invalid UTF-8 bytes with valid ones. This method naively assumes that if you have invalid UTF-8 bytes, they are either Windows CP1251 or ISO8859-1. In practice this isn’t a bad assumption, but it may not always work.
48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 |
# File 'lib/utf8_utils.rb', line 48

# Attempt to replace invalid UTF-8 bytes with valid ones.
#
# The receiver's bytes are scanned left to right while tracking how many
# continuation bytes the current multi-byte sequence still needs. Any byte
# that cannot be part of a well-formed sequence is handed to +tidy_byte+
# and replaced by whatever that helper returns:
#
# * a continuation byte (10xxxxxx) that no lead byte asked for;
# * every byte of a sequence that is cut short by an ASCII byte, by another
#   lead byte, or by the end of the string;
# * bytes 0xF8..0xFF, which never start a sequence this method tracks.
#
# +tidy_byte+ is defined elsewhere in this file; presumably it re-encodes the
# byte as Windows CP1251 / ISO8859-1 -- confirm there. Its result is flattened
# and compacted, so it may be an Integer, an Array of Integers, or nil.
#
# NOTE(review): overlong forms and out-of-range code points are not detected
# here; they reach the final unpack("U*") unchanged.
#
# @return [String] the cleaned string, or "" when the receiver is empty
def tidy_bytes
  bytes = unpack("C*")
  return "" if bytes.empty?

  continuation_bytes_expected = 0
  sequence_start = 0 # index of the lead byte of the sequence in progress

  # Clean the whole unfinished sequence (lead byte plus any continuation
  # bytes already consumed), not just the byte before the interruption:
  # a surviving lead byte would make the final unpack("U*") raise.
  clean_pending = lambda do |stop|
    (sequence_start...stop).each { |i| bytes[i] = tidy_byte(bytes[i]) }
    continuation_bytes_expected = 0
  end

  bytes.each_index do |index|
    byte = bytes[index]

    if byte < 0x80
      # ASCII byte: it interrupts any sequence still waiting for continuations.
      clean_pending.call(index) if continuation_bytes_expected > 0
    elsif byte < 0xC0
      # Continuation byte (10xxxxxx).
      if continuation_bytes_expected > 0
        continuation_bytes_expected -= 1
      else
        # Not expecting a continuation, so clean it.
        bytes[index] = tidy_byte(byte)
      end
    else
      # Lead byte (11xxxxxx): it also interrupts an unfinished sequence.
      clean_pending.call(index) if continuation_bytes_expected > 0

      if byte >= 0xF8
        # No zero bit among bits 5..3, so no continuation count can be
        # derived; clean right away instead of leaving the counter nil.
        bytes[index] = tidy_byte(byte)
      else
        # The position of the first zero bit gives the number of
        # continuation bytes that must follow.
        continuation_bytes_expected =
          if byte < 0xE0 then 1
          elsif byte < 0xF0 then 2
          else 3
          end
        sequence_start = index
      end
    end
  end

  # Don't allow the string to terminate in the middle of a sequence.
  clean_pending.call(bytes.length) if continuation_bytes_expected > 0

  bytes.flatten.compact.pack("C*").unpack("U*").pack("U*")
end