Module: UTF8Utils::StringExt

Included in:
String
Defined in:
lib/utf8_utils.rb

Overview

A mixin to Ruby’s String class to add the #tidy_bytes and #tidy_bytes! methods.

Instance Method Summary collapse

Instance Method Details

#tidy_bytes(force = false) ⇒ Object

Attempt to replace invalid UTF-8 bytes with valid ones. This method naively assumes that if you have invalid UTF-8 bytes, they are either Windows CP1252 or ISO-8859-1. In practice this isn’t a bad assumption, but it may not always work.

Passing true will forcibly tidy all bytes, assuming that the string’s encoding is CP1252 or ISO-8859-1.



51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
# File 'lib/utf8_utils.rb', line 51

# Attempt to replace invalid UTF-8 bytes with valid ones, on the naive
# assumption that any invalid byte is really a Windows CP1252 / ISO-8859-1
# character.
#
# The receiver is scanned byte by byte. Bytes that cannot be part of a
# well-formed sequence are handed to +tidy_byte+ (defined elsewhere in this
# module; presumably it returns the UTF-8 bytes of the replacement character
# as an array, hence the +flatten+ before re-packing):
#
# * bytes above 0xF0 (unused, restricted, or "highly unlikely" lead bytes);
# * continuation bytes (0x80..0xBF) that no lead byte announced;
# * a lead byte together with the continuations collected so far, when the
#   sequence is cut short by another byte or by the end of the string.
#
# NOTE(review): overlong forms (e.g. a 0xC0/0xC1 lead) are not detected here
# and may still be rejected by the final unpack("U*") - confirm.
#
# @param force [Boolean] when true, skip the scan and tidy every byte,
#   treating the whole string as CP1252 / ISO-8859-1
# @return [String] a new string built with pack("U*")
def tidy_bytes(force = false)
  bytes = unpack("C*")

  if force
    return bytes.map { |b| tidy_byte(b) }.flatten.compact.pack("C*").unpack("U*").pack("U*")
  end

  conts_expected = 0 # continuation bytes still owed to the current lead byte
  last_lead = 0      # index of the lead byte that opened the current sequence

  bytes.each_index do |i|
    byte = bytes[i]

    if byte > 127 && byte < 192
      # Continuation byte: consume it if one was expected, otherwise clean it.
      if conts_expected.zero?
        bytes[i] = tidy_byte(byte)
      else
        conts_expected -= 1
      end
      next
    end

    # Any other byte ends a pending sequence early, so clean everything from
    # its lead byte up to (not including) this byte. This has to happen before
    # the current byte is replaced, so tidy_byte only ever sees raw integers.
    if conts_expected > 0
      (last_lead...i).each { |j| bytes[j] = tidy_byte(bytes[j]) }
      conts_expected = 0
    end

    if byte > 240
      # Impossible or highly unlikely byte? Clean it.
      bytes[i] = tidy_byte(byte)
    elsif byte > 191
      # Lead byte: expect continuations determined by the position of the
      # first zero bit, with a max of 3. A lead byte at the very end of the
      # string is cleaned by the truncation check after the loop.
      conts_expected =
        if byte < 224
          1
        elsif byte < 240
          2
        else
          3
        end
      last_lead = i
    end
  end

  # String ended in the middle of a multi-byte sequence? Clean the lead byte
  # and whatever continuations followed it.
  if conts_expected > 0
    (last_lead...bytes.length).each { |j| bytes[j] = tidy_byte(bytes[j]) }
  end

  bytes.empty? ? "" : bytes.flatten.compact.pack("C*").unpack("U*").pack("U*")
end

#tidy_bytes! ⇒ Object

Tidy bytes in-place.



102
103
104
# File 'lib/utf8_utils.rb', line 102

# Tidy bytes in-place: replaces the receiver's contents with the result of
# #tidy_bytes.
#
# @param force [Boolean] passed through to #tidy_bytes; defaults to false so
#   existing zero-argument callers behave as before
# @return [String] the receiver, as returned by String#replace
def tidy_bytes!(force = false)
  replace(tidy_bytes(force))
end