Class: UTF8Utils::Codepoint

Inherits:
Array
  • Object
show all
Defined in:
lib/utf8_utils.rb

Instance Method Summary collapse

Instance Method Details

#invalid? ⇒ Boolean

Returns:

  • (Boolean)


139
140
141
# File 'lib/utf8_utils.rb', line 139

# Logical negation of #valid?.
#
# @return [Boolean] false when #valid? returns a truthy value, true when it
#   returns false or nil.
def invalid?
  return false if valid?
  true
end

#tidy ⇒ Object

Attempt to rescue a valid UTF-8 character from a malformed codepoint. It will first attempt to convert from CP1251, and if this isn’t possible, it prepends a valid leading byte, treating the character as the last byte in a two-byte codepoint. Note that much of the logic here is taken from ActiveSupport; the difference is that this works for Ruby 1.8.6 - 1.9.1.



128
129
130
131
132
133
134
135
136
137
# File 'lib/utf8_utils.rb', line 128

# Build a replacement for a codepoint that fails #valid?.
#
# A valid receiver is returned untouched. Otherwise only the first byte is
# consulted:
#
# * if Codepoints::CP1251 has an entry for it, the mapped value is wrapped in
#   a one-element array (presumably a nested byte list, which #to_char
#   flattens - confirm against the table definition);
# * a byte below 192 is prefixed with the lead byte 194;
# * any other byte is shifted down by 64 and prefixed with the lead byte 195.
#
# For bytes 128..255 the last two rules yield the two-byte UTF-8 encoding of
# the code point whose number equals the byte value.
#
# @return [Object] self when valid, otherwise a new instance of the
#   receiver's class holding the replacement bytes.
def tidy
  return self if valid?

  byte = self[0]
  replacement =
    if Codepoints::CP1251.key?(byte)
      [Codepoints::CP1251[byte]]
    elsif byte < 192
      [194, byte]
    else
      [195, byte - 64]
    end
  self.class.new(replacement)
end

#to_char ⇒ Object

Get a character from the bytes.



144
145
146
# File 'lib/utf8_utils.rb', line 144

# Turn the stored byte values into a UTF-8 string.
#
# Nested arrays are flattened first. The bytes are packed into a binary
# string, decoded as UTF-8 into code point numbers, and those numbers are
# packed back into a UTF-8 string.
#
# @return [String] the character(s) encoded by the receiver's bytes.
def to_char
  raw_bytes   = flatten.pack("C*")
  code_points = raw_bytes.unpack("U*")
  code_points.pack("U*")
end

#valid? ⇒ Boolean

Checks whether the bytes form a single well-formed UTF-8 character. The byte ranges are borrowed from the regexp in ActiveSupport, which in turn had been borrowed from the Kconv library by Shinji KONO (also as seen on the W3C site). See also en.wikipedia.org/wiki/UTF-8

Returns:

  • (Boolean)


108
109
110
111
112
113
114
115
116
117
118
119
120
121
# File 'lib/utf8_utils.rb', line 108

# Check whether the receiver's bytes are the well-formed UTF-8 encoding of
# exactly one code point.
#
# Accepted byte patterns (decimal):
#
#   0..127
#   194..223  128..191
#   224       160..191  128..191
#   225..239  128..191  128..191   (237 is limited to 128..159)
#   240       144..191  128..191  128..191
#   241..243  128..191  128..191  128..191
#   244       128..143  128..191  128..191
#
# Lead bytes 192 and 193 are rejected because they can only start overlong
# encodings of code points below 128. A lead byte of 237 followed by
# 160..191 is rejected because it would encode a UTF-16 surrogate, which is
# not allowed in UTF-8.
#
# @return [Boolean] true for a well-formed sequence; false otherwise,
#   including when the receiver is empty or longer than four bytes.
def valid?
  continuation = 128..191

  # One allowed range per byte position, chosen from the length and, where
  # the second byte is restricted, from the lead byte.
  expected =
    case length
    when 1 then [0..127]
    when 2 then [194..223, continuation]
    when 3
      case self[0]
      when 224 then [224..224, 160..191, continuation] # no overlong forms
      when 237 then [237..237, 128..159, continuation] # no surrogates
      else          [225..239, continuation, continuation]
      end
    when 4
      case self[0]
      when 240 then [240..240, 144..191, continuation, continuation] # no overlong forms
      when 244 then [244..244, 128..143, continuation, continuation] # nothing above U+10FFFF
      else          [241..243, continuation, continuation, continuation]
      end
    end

  # Lengths 0 and 5+ match no pattern.
  return false unless expected

  zip(expected).all? { |byte, range| range.include?(byte) }
end