Module: UTF8Utils::StringExt

Included in:
String
Defined in:
lib/utf8_utils.rb

Overview

A mixin to Ruby’s String class to add the #tidy_bytes and #tidy_bytes! methods.

Instance Method Summary collapse

Instance Method Details

#tidy_bytes ⇒ Object

Attempt to replace invalid UTF-8 bytes with valid ones. This method naively assumes that if you have invalid UTF-8 bytes, they are either Windows CP1251 or ISO-8859-1. In practice this isn’t a bad assumption, but it may not always work.



48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
# File 'lib/utf8_utils.rb', line 48

# Returns a copy of the receiver in which bytes that do not form structurally
# valid UTF-8 sequences have been replaced via the private #tidy_byte helper.
#
# The receiver is scanned byte by byte and each byte is classified from its
# high bits: ASCII (0xxxxxxx), continuation (10xxxxxx) or leading (11xxxxxx).
# A byte is handed to #tidy_byte when it is:
#
# * a continuation byte that no leading byte announced;
# * part of a multi-byte sequence that is cut short by an ASCII byte, by
#   another leading byte, or by the end of the string (every byte of the
#   unfinished sequence is replaced, starting at its leading byte);
# * a leading byte that announces more than three continuation bytes
#   (0xF8..0xFF).
#
# NOTE(review): #tidy_byte is defined outside this method; its result is
# flattened and nils are dropped before packing, so it presumably returns a
# byte, an array of bytes, or nil -- confirm against the helper.
#
# Only the shape of each sequence is checked here. Sequences that are well
# shaped but still rejected by unpack("U*") are not repaired by this method.
#
# @return [String] a new string built with pack("U*"); "" for an empty
#   receiver
def tidy_bytes
  bytes = unpack("C*")
  continuation_bytes_expected = 0
  # Index of the leading byte of the sequence currently being read.
  sequence_start = 0

  # Replaces every byte of the unfinished sequence that ends just before
  # +stop+ and resets the expectation counter. The bytes in that range are
  # always still raw integers: the leading byte plus the continuation bytes
  # consumed so far.
  clean_unfinished = lambda do |stop|
    (sequence_start...stop).each do |position|
      bytes[position] = tidy_byte(bytes[position])
    end
    continuation_bytes_expected = 0
  end

  bytes.each_index do |index|
    byte = bytes[index]

    if byte[7] == 0
      # ASCII byte: it interrupts any sequence still waiting for continuations.
      clean_unfinished.call(index) if continuation_bytes_expected > 0
    elsif byte[6] == 0
      # Continuation byte: consume it if one was expected, otherwise clean it.
      if continuation_bytes_expected > 0
        continuation_bytes_expected -= 1
      else
        bytes[index] = tidy_byte(byte)
      end
    else
      # Leading byte: it also interrupts any sequence still in progress.
      clean_unfinished.call(index) if continuation_bytes_expected > 0

      # The position of the first zero bit gives the number of continuation
      # bytes to expect; nil means the byte is 0xF8..0xFF.
      announced =
        if    byte[5] == 0 then 1
        elsif byte[4] == 0 then 2
        elsif byte[3] == 0 then 3
        end

      if announced
        sequence_start = index
        continuation_bytes_expected = announced
      else
        # Not a usable leading byte; clean it right away so the counter never
        # becomes nil.
        bytes[index] = tidy_byte(byte)
      end
    end
  end

  # The string ended in the middle of a sequence (this includes a string whose
  # final byte is a leading byte).
  clean_unfinished.call(bytes.length) if continuation_bytes_expected > 0

  bytes.empty? ? "" : bytes.flatten.compact.pack("C*").unpack("U*").pack("U*")
end

#tidy_bytes! ⇒ Object

Tidy bytes in-place.



96
97
98
# File 'lib/utf8_utils.rb', line 96

# In-place variant of #tidy_bytes: overwrites the receiver's contents with
# the string that #tidy_bytes returns.
#
# @return [String] the receiver itself, as returned by String#replace
def tidy_bytes!
  cleaned = tidy_bytes
  replace(cleaned)
end