Module: Rucc::UTF

Defined in:
lib/rucc/utf.rb

Class Method Summary collapse

Class Method Details

.count_leading_ones(c) ⇒ Integer

Parameters:

  • c (Integer)

Returns:

  • (Integer)


116
117
118
119
120
121
122
123
# File 'lib/rucc/utf.rb', line 116

# Counts how many consecutive 1 bits +c+ has, starting at bit 7 and moving
# toward bit 0. Only the low eight bits of +c+ are inspected.
#
# @param c [Integer] value whose low eight bits are examined
# @return [Integer] number of leading one bits, from 0 up to 8
def count_leading_ones(c)
  ones = 0
  mask = 0x80
  # Slide a single-bit mask from bit 7 down to bit 0 and stop at the first
  # cleared bit; if every bit is set the mask runs out and the count is 8.
  while mask > 0 && (c & mask) != 0
    ones += 1
    mask >>= 1
  end
  ones
end

.read_rune(s) ⇒ <Integer, <Integer>>

Parameters:

  • s (<Integer>)

Returns:

  • (<Integer, <Integer>>)


82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
# File 'lib/rucc/utf.rb', line 82

# Decodes a single UTF-8 encoded code point from the front of a byte array.
#
# The number of leading one bits in the first byte gives the length of the
# sequence: 0 means the byte stands for itself, 2 to 4 mean a multi-byte
# sequence. Every other length is rejected.
#
# @param s [<Integer>] byte values; s[0] is treated as the lead byte
# @return [<Integer, <Integer>>] the decoded code point followed by the
#   bytes that remain after the consumed sequence
# @raise [RuntimeError] "invalid UTF-8 sequence" when the sequence is longer
#   than the input or its length is not 0, 2, 3 or 4
# @raise [RuntimeError] "invalid UTF-8 continuation byte" when a trailing
#   byte does not have the bit pattern 10xxxxxx
def read_rune(s)
  lead = s[0]
  width = count_leading_ones(lead)

  # No leading one bit: the byte decodes to itself.
  return lead, s[1..-1] if width == 0
  raise "invalid UTF-8 sequence" if width > s.size

  # Trailing bytes are validated before the length itself is judged, so a
  # malformed trailing byte is reported first.
  (1...width).each do |pos|
    raise "invalid UTF-8 continuation byte" unless (s[pos] & 0xC0) == 0x80
  end

  # Payload bits carried by the lead byte for each supported length.
  lead_mask = { 2 => 0x1F, 3 => 0xF, 4 => 0x7 }[width]
  raise "invalid UTF-8 sequence" if lead_mask.nil?

  # Each trailing byte contributes its low six bits.
  rune = (1...width).reduce(lead & lead_mask) do |acc, pos|
    (acc << 6) | (s[pos] & 0x3F)
  end
  [rune, s[width..-1]]
end

.to_utf16(str) ⇒ String

Parameters:

  • str (String)

Returns:

  • (String)


39
40
41
42
43
44
45
46
47
48
49
50
51
52
# File 'lib/rucc/utf.rb', line 39

# Re-encodes the bytes of +str+ as a sequence of 16-bit units.
#
# Each code point is decoded with read_rune and emitted through write16.
# Code points of 0x10000 and above are split into a surrogate pair.
#
# @param str [String] input whose bytes are decoded as UTF-8
# @return [String] a new string holding everything write16 appended
def to_utf16(str)
  out = ""
  rest = str.bytes
  until rest.empty?
    code_point, rest = read_rune(rest)
    if code_point >= 0x10000
      # 0xD7C0 is 0xD800 - (0x10000 >> 10): it folds the 0x10000 offset into
      # the high-surrogate base.
      write16(out, (code_point >> 10) + 0xD7C0)
      write16(out, (code_point & 0x3FF) + 0xDC00)
    else
      write16(out, code_point)
    end
  end
  out
end

.to_utf32(str) ⇒ String

Parameters:

  • str (String)

Returns:

  • (String)


56
57
58
59
60
61
62
63
64
# File 'lib/rucc/utf.rb', line 56

# Re-encodes the bytes of +str+ as a sequence of 32-bit units.
#
# Each code point is decoded with read_rune and emitted through write32.
#
# @param str [String] input whose bytes are decoded as UTF-8
# @return [String] a new string holding everything write32 appended
def to_utf32(str)
  out = ""
  rest = str.bytes
  until rest.empty?
    code_point, rest = read_rune(rest)
    write32(out, code_point)
  end
  out
end

.write16(b, rune) ⇒ Object

b (String) is both an input and the output: the encoded units are appended to it in place.

Parameters:

  • rune (Integer)


68
69
70
71
# File 'lib/rucc/utf.rb', line 68

# Appends the low 16 bits of +rune+ to +b+ as two units, low 8 bits first.
#
# Both halves are masked to 8 bits, so a value wider than 16 bits cannot
# leak its upper bits into the second unit.
#
# NOTE(review): String#<< with an Integer appends the character with that
# code point in b's encoding rather than a raw byte. In a UTF-8 buffer a
# value above 0x7F is therefore stored as a multi-byte character. Confirm
# that consumers of the buffer read code points and not bytes.
#
# @param b [String] destination buffer, modified in place
# @param rune [Integer] value whose low 16 bits are written
# @return [String] +b+
def write16(b, rune)
  b << (rune & 0xFF)
  b << ((rune >> 8) & 0xFF)
end

.write32(b, rune) ⇒ Object

b (String) is both an input and the output: the encoded units are appended to it in place.

Parameters:

  • rune (Integer)


75
76
77
78
# File 'lib/rucc/utf.rb', line 75

# Appends +rune+ to +b+ as two 16-bit halves, low half first, delegating
# each half to write16.
#
# @param b [String] destination buffer, passed through to write16
# @param rune [Integer] value to write
# @return [Object] whatever the second write16 call returns
def write32(b, rune)
  low_half = rune & 0xFFFF
  high_half = rune >> 16
  write16(b, low_half)
  write16(b, high_half)
end

.write_utf8(b, rune) ⇒ Object

b (String) is both an input and the output: the character for the rune is appended to it in place.

Parameters:

  • rune (Integer)


6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
# File 'lib/rucc/utf.rb', line 6

# Appends the character whose code point is +rune+ to +b+.
#
# Given an Integer, String#concat converts it to a character in the
# receiver's encoding, so when +b+ is a UTF-8 string the rune is stored as
# its UTF-8 byte sequence and no manual byte-by-byte encoder is needed.
#
# NOTE(review): this relies on +b+ being UTF-8 encoded; a buffer in another
# encoding would store the rune differently or reject it. Confirm callers
# always pass a UTF-8 string.
#
# @param b [String] destination buffer, modified in place
# @param rune [Integer] code point to append
# @return [String] +b+
def write_utf8(b, rune)
  b.concat(rune)
end