Module: UnicodeUtils::Impl
- Defined in:
- lib/unicode_utils/nfc.rb,
lib/unicode_utils/each_word.rb,
lib/unicode_utils/read_cdata.rb,
lib/unicode_utils/conditional_casing.rb,
lib/unicode_utils/canonical_decomposition.rb,
lib/unicode_utils/compatibility_decomposition.rb,
lib/unicode_utils/hangul_syllable_decomposition.rb
Overview
Defined Under Namespace
Modules: NFC
Classes: AfterIConditionalCasing, AfterSoftDottedConditionalCasing, BeforeDotConditionalCasing, ConditionalCasing, FinalSigmaConditionalCasing, MoreAboveConditionalCasing, NotBeforeDotConditionalCasing
Constant Summary
collapse
- COMPOSITION_EXCLUSION_SET =
Impl.read_codepoint_set("composition_exclusion_set")
- CANONICAL_COMPOSITION_MAP =
Hash.new.tap do |m|
CANONICAL_DECOMPOSITION_MAP.each_pair { |comp, decomp|
if decomp.length == 2
(m[decomp[0]] ||= {})[decomp[1]] = comp
end
}
end
- LANGS_WITH_RULES =
{:tr => true, :lt => true, :az => true}
- CONDITIONAL_UPCASE_MAP =
read_conditional_casings("cond_uc_map")
- CONDITIONAL_DOWNCASE_MAP =
read_conditional_casings("cond_lc_map")
- CONDITIONAL_TITLECASE_MAP =
read_conditional_casings("cond_tc_map")
Class Method Summary
collapse
-
.append_hangul_syllable_decomposition(str, s) ⇒ Object
-
.append_recursive_canonical_decomposition_mapping(str, mapping) ⇒ Object
-
.append_recursive_compatibility_decomposition_mapping(str, cp) ⇒ Object
-
.composition(str) ⇒ Object
-
.conditional_downcase_mapping(cp, str, pos, language_id) ⇒ Object
-
.conditional_titlecase_mapping(cp, str, pos, language_id) ⇒ Object
-
.conditional_upcase_mapping(cp, str, pos, language_id) ⇒ Object
-
.open_cdata_file(filename, &block) ⇒ Object
-
.put_into_canonical_order(str) ⇒ Object
-
.read_codepoint_map(filename) ⇒ Object
-
.read_codepoint_set(filename) ⇒ Object
-
.read_combining_class_map ⇒ Object
-
.read_conditional_casings(filename) ⇒ Object
-
.read_hexdigit_map(filename) ⇒ Object
Read a map whose keys are codepoints (6 hexdigits, converted to integer) and whose values are single hexdigits (converted to integer).
-
.read_multivalued_map(filename) ⇒ Object
-
.read_names(filename) ⇒ Object
-
.word_break?(cs, i) ⇒ Boolean
Class Method Details
.append_hangul_syllable_decomposition(str, s) ⇒ Object
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
|
# File 'lib/unicode_utils/hangul_syllable_decomposition.rb', line 20
# Appends the arithmetic Hangul decomposition of the code point +s+ to +str+.
#
# When +s+ lies in the precomposed syllable range starting at 0xAC00
# (11172 code points), the leading and vowel jamo are appended, followed by
# the trailing jamo if the syllable has one. Any other code point is
# appended to +str+ unchanged.
#
# str:: string that is appended to in place
# s::   code point (Integer) to decompose
def self.append_hangul_syllable_decomposition(str, s)
  syllable_base = 0xAC00
  lead_base = 0x1100
  vowel_base = 0x1161
  trail_base = 0x11A7
  syllable_count = 11172
  trail_count = 28
  vowel_trail_count = 21 * trail_count

  offset = s - syllable_base
  # Not a precomposed syllable: copy the code point through untouched.
  return str << s unless 0 <= offset && offset < syllable_count

  lead = lead_base + offset / vowel_trail_count
  vowel = vowel_base + (offset % vowel_trail_count) / trail_count
  trail = trail_base + offset % trail_count
  str << lead << vowel
  # A trail value equal to the base means "no trailing consonant".
  str << trail if trail != trail_base
end
|
.append_recursive_canonical_decomposition_mapping(str, mapping) ⇒ Object
48
49
50
51
52
53
54
55
56
57
|
# File 'lib/unicode_utils/canonical_decomposition.rb', line 48
# Appends the fully expanded canonical decomposition of every code point in
# +mapping+ to +str+.
#
# Each code point is looked up in CANONICAL_DECOMPOSITION_MAP; if it has a
# decomposition of its own, that decomposition is expanded recursively,
# otherwise the code point itself is appended.
#
# str::     string that is appended to in place
# mapping:: enumerable of code points (Integers)
def self.append_recursive_canonical_decomposition_mapping(str, mapping)
  mapping.each do |code|
    expansion = CANONICAL_DECOMPOSITION_MAP[code]
    # No further decomposition: this code point is final.
    next str << code unless expansion

    append_recursive_canonical_decomposition_mapping(str, expansion)
  end
end
|
.append_recursive_compatibility_decomposition_mapping(str, cp) ⇒ Object
41
42
43
44
45
46
47
48
49
50
51
|
# File 'lib/unicode_utils/compatibility_decomposition.rb', line 41
# Appends the fully expanded compatibility decomposition of the code point
# +cp+ to +str+.
#
# COMPATIBILITY_DECOMPOSITION_MAP is consulted first and
# CANONICAL_DECOMPOSITION_MAP second; every code point of the mapping found
# is expanded recursively. A code point with no mapping in either table is
# appended as is.
#
# str:: string that is appended to in place
# cp::  code point (Integer) to decompose
def self.append_recursive_compatibility_decomposition_mapping(str, cp)
  expansion = COMPATIBILITY_DECOMPOSITION_MAP[cp] || CANONICAL_DECOMPOSITION_MAP[cp]
  return str << cp unless expansion

  expansion.each do |part|
    append_recursive_compatibility_decomposition_mapping(str, part)
  end
end
|
.composition(str) ⇒ Object
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
|
# File 'lib/unicode_utils/nfc.rb', line 46
# Composes the code points of +str+ and returns the result as a new string
# with the same encoding as +str+.
#
# The string is scanned once. The most recent starter (a code point whose
# entry in COMBINING_CLASS_MAP is 0) is held back in +last_starter+ and is
# replaced in place whenever a following code point can be combined with it:
#
# * algorithmically for Hangul (leading + vowel jamo, then LV syllable +
#   trailing jamo),
# * otherwise through Impl::CANONICAL_COMPOSITION_MAP, accepted only when
#   Impl::NFC.primary_composite? approves the composite.
#
# Non-starters that could not be combined are collected and written out
# after the starter they follow.
#
# NOTE(review): presumably +str+ is expected to be canonically decomposed
# and ordered already - confirm against the caller.
#
# str:: source string
def self.composition(str)
  # Constants for algorithmic Hangul syllable composition.
  sbase = 0xAC00
  lbase = 0x1100
  vbase = 0x1161
  tbase = 0x11A7
  lcount = 19
  vcount = 21
  tcount = 28
  ncount = vcount * tcount
  scount = lcount * ncount
  String.new.force_encoding(str.encoding).tap do |res|
    last_starter = nil
    uncomposable_non_starters = []
    str.each_codepoint { |cp|
      if COMBINING_CLASS_MAP[cp] == 0
        # cp is a starter: it can only combine with the previous starter when
        # no uncomposed non-starter stands between the two.
        combined = false
        if last_starter && uncomposable_non_starters.empty?
          # Hangul: leading jamo + vowel jamo -> LV syllable.
          lindex = last_starter - lbase
          if 0 <= lindex && lindex < lcount
            vindex = cp - vbase
            # Valid vowel indexes are 0...vcount; accepting vcount itself
            # would turn an unrelated code point into a wrong syllable.
            if 0 <= vindex && vindex < vcount
              last_starter =
                sbase + (lindex * vcount + vindex) * tcount
              combined = true
            end
          end
          unless combined
            # Hangul: LV syllable + trailing jamo -> LVT syllable.
            sindex = last_starter - sbase
            if 0 <= sindex && sindex < scount && (sindex % tcount) == 0
              tindex = cp - tbase
              # Index 0 (tbase itself) is not a trailing consonant; treating
              # it as one would silently drop the code point.
              if 0 < tindex && tindex < tcount
                last_starter += tindex
                combined = true
              end
            end
          end
          unless combined
            # Table driven starter + starter composition.
            map = Impl::CANONICAL_COMPOSITION_MAP[last_starter]
            composition = map && map[cp]
            if composition && Impl::NFC.primary_composite?(composition)
              last_starter = composition
              combined = true
            end
          end
        end
        unless combined
          # Flush the finished starter and its pending marks, then hold cp
          # back as the new starter.
          res << last_starter if last_starter
          uncomposable_non_starters.each { |nc| res << nc }
          uncomposable_non_starters.clear
          last_starter = cp
        end
      else
        # cp is a non-starter.
        last_non_starter = uncomposable_non_starters.last
        if last_non_starter && Impl::NFC.blocked?(last_non_starter, cp)
          uncomposable_non_starters << cp
        else
          map = Impl::CANONICAL_COMPOSITION_MAP[last_starter]
          composition = map && map[cp]
          if composition && Impl::NFC.primary_composite?(composition)
            last_starter = composition
          else
            uncomposable_non_starters << cp
          end
        end
      end
    }
    # Flush whatever is still held back at the end of the input.
    res << last_starter if last_starter
    uncomposable_non_starters.each { |nc| res << nc }
  end
end
|
.conditional_downcase_mapping(cp, str, pos, language_id) ⇒ Object
140
141
142
143
144
145
146
147
148
|
# File 'lib/unicode_utils/conditional_casing.rb', line 140
# Looks up the conditional lowercase mapping for +cp+.
#
# The entry for +language_id+ in CONDITIONAL_DOWNCASE_MAP[cp] is preferred;
# the entry stored under the +nil+ key is used as fallback. The mapping of
# the selected casing is returned only if its context matches +str+ at
# +pos+; in every other case the result is +nil+.
#
# cp::          code point being mapped
# str, pos::    passed through to the casing's context_match?
# language_id:: key selecting a language specific rule
def self.conditional_downcase_mapping(cp, str, pos, language_id)
  rules = CONDITIONAL_DOWNCASE_MAP[cp]
  return nil unless rules

  rule = rules[language_id] || rules[nil]
  return nil unless rule

  rule.mapping if rule.context_match?(str, pos)
end
|
.conditional_titlecase_mapping(cp, str, pos, language_id) ⇒ Object
150
151
152
153
154
155
156
157
158
|
# File 'lib/unicode_utils/conditional_casing.rb', line 150
# Looks up the conditional titlecase mapping for +cp+.
#
# The entry for +language_id+ in CONDITIONAL_TITLECASE_MAP[cp] is preferred;
# the entry stored under the +nil+ key is used as fallback. The mapping of
# the selected casing is returned only if its context matches +str+ at
# +pos+; in every other case the result is +nil+.
#
# cp::          code point being mapped
# str, pos::    passed through to the casing's context_match?
# language_id:: key selecting a language specific rule
def self.conditional_titlecase_mapping(cp, str, pos, language_id)
  rules = CONDITIONAL_TITLECASE_MAP[cp]
  return nil unless rules

  rule = rules[language_id] || rules[nil]
  return nil unless rule

  rule.mapping if rule.context_match?(str, pos)
end
|
.conditional_upcase_mapping(cp, str, pos, language_id) ⇒ Object
130
131
132
133
134
135
136
137
138
|
# File 'lib/unicode_utils/conditional_casing.rb', line 130
# Looks up the conditional uppercase mapping for +cp+.
#
# The entry for +language_id+ in CONDITIONAL_UPCASE_MAP[cp] is preferred;
# the entry stored under the +nil+ key is used as fallback. The mapping of
# the selected casing is returned only if its context matches +str+ at
# +pos+; in every other case the result is +nil+.
#
# cp::          code point being mapped
# str, pos::    passed through to the casing's context_match?
# language_id:: key selecting a language specific rule
def self.conditional_upcase_mapping(cp, str, pos, language_id)
  rules = CONDITIONAL_UPCASE_MAP[cp]
  return nil unless rules

  rule = rules[language_id] || rules[nil]
  return nil unless rule

  rule.mapping if rule.context_match?(str, pos)
end
|
.open_cdata_file(filename, &block) ⇒ Object
11
12
13
|
# File 'lib/unicode_utils/read_cdata.rb', line 11
# Opens the file +filename+ inside CDATA_DIR for reading with US-ASCII as
# external encoding and hands +block+ on to File.open.
#
# filename:: name of the file, relative to CDATA_DIR
def self.open_cdata_file(filename, &block)
  path = File.join(CDATA_DIR, filename)
  File.open(path, "r:US-ASCII:-", &block)
end
|
.put_into_canonical_order(str) ⇒ Object
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
|
# File 'lib/unicode_utils/canonical_decomposition.rb', line 59
# Returns +str+ with adjacent code points reordered by combining class.
#
# A code point with a non-zero entry in COMBINING_CLASS_MAP that directly
# follows a code point with a greater combining class is out of order. If no
# such pair exists, +str+ itself is returned. Otherwise a new string (same
# encoding) is built in which each out-of-order pair found in a
# left-to-right scan is swapped, and the method is applied again to that
# string until nothing is left to swap.
#
# str:: string to reorder; it is not modified
def self.put_into_canonical_order(str)
  # Pass 1: find out whether any adjacent pair is out of order.
  seen_first = false
  prev_class = nil
  out_of_order = str.each_codepoint.any? do |code|
    klass = COMBINING_CLASS_MAP[code]
    inverted = seen_first && klass != 0 && prev_class > klass
    seen_first = true
    prev_class = klass
    inverted
  end
  return str unless out_of_order

  # Pass 2: copy the string, swapping each out-of-order pair. One code point
  # is always held back so that it can be written after its successor.
  reordered = String.new.force_encoding(str.encoding)
  held = nil
  held_class = nil
  str.each_codepoint do |code|
    klass = COMBINING_CLASS_MAP[code]
    if !held
      held = code
      held_class = klass
    elsif klass != 0 && held_class > klass
      # Swap: current code point first, then the held one. Nothing is held
      # afterwards, so the next code point starts a fresh comparison.
      reordered << code << held
      held = nil
      held_class = nil
    else
      reordered << held
      held = code
      held_class = klass
    end
  end
  reordered << held if held

  # A single pass only swaps neighbours; repeat until stable.
  put_into_canonical_order(reordered)
end
|
.read_codepoint_map(filename) ⇒ Object
27
28
29
30
31
32
33
34
35
36
37
|
# File 'lib/unicode_utils/read_cdata.rb', line 27
# Reads a code point to code point map from the data file +filename+.
#
# The file is consumed as a sequence of record pairs of 6 characters each,
# both parsed as hexadecimal integers: the first is the key, the second the
# value.
#
# Returns a Hash from Integer to Integer.
def self.read_codepoint_map(filename)
  result = {}
  open_cdata_file(filename) do |io|
    field = ("x" * 6).force_encoding(Encoding::US_ASCII)
    while io.read(6, field)
      key = field.to_i(16)
      # The value is read into the same buffer, so the key has to be parsed
      # before this second read overwrites it.
      result[key] = io.read(6, field).to_i(16)
    end
  end
  result
end
|
.read_codepoint_set(filename) ⇒ Object
15
16
17
18
19
20
21
22
23
24
25
|
# File 'lib/unicode_utils/read_cdata.rb', line 15
# Reads a set of code points from the data file +filename+.
#
# The file is consumed as a sequence of 6 character records, each parsed as
# a hexadecimal integer.
#
# Returns a Hash that maps every code point read to +true+.
def self.read_codepoint_set(filename)
  members = {}
  open_cdata_file(filename) do |io|
    field = ("x" * 6).force_encoding(Encoding::US_ASCII)
    members[field.to_i(16)] = true while io.read(6, field)
  end
  members
end
|
.read_combining_class_map ⇒ Object
85
86
87
88
89
90
91
92
93
94
95
96
97
|
# File 'lib/unicode_utils/read_cdata.rb', line 85
# Reads the map from code point to combining class from the data file
# "combining_class_map".
#
# The file is consumed as a sequence of records consisting of a 6 character
# code point followed by a 2 character combining class, both parsed as
# hexadecimal integers.
#
# Returns a Hash from Integer to Integer whose default value is 0, so that a
# code point without an entry in the file reads as combining class 0. The
# users of the map in this module compare the looked-up value with 0 and
# with other combining classes, which requires an Integer for every code
# point.
def self.read_combining_class_map
  Hash.new(0).tap { |map|
    open_cdata_file("combining_class_map") do |input|
      buffer = "x" * 6
      buffer.force_encoding(Encoding::US_ASCII)
      cc_buffer = "x" * 2
      cc_buffer.force_encoding(Encoding::US_ASCII)
      while input.read(6, buffer)
        map[buffer.to_i(16)] = input.read(2, cc_buffer).to_i(16)
      end
    end
  }
end
|
.read_conditional_casings(filename) ⇒ Object
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
|
# File 'lib/unicode_utils/read_cdata.rb', line 68
# Reads conditional casing rules from the data file +filename+.
#
# Every line holds up to four ";" separated fields:
#
# 1. the code point in hexadecimal,
# 2. the mapping as a "," separated list of hexadecimal code points,
# 3. a language id (empty for none),
# 4. optionally a context name; underscores are removed from it.
#
# For each line an instance of the class Impl::<context>ConditionalCasing is
# created with the mapping (plain Impl::ConditionalCasing when the line has
# no context field).
#
# Returns a Hash of the form { code point => { language id => casing } },
# where the language id is a Symbol, or +nil+ for an empty field.
def self.read_conditional_casings(filename)
  casings = {}
  open_cdata_file(filename) do |input|
    input.each_line do |raw|
      raw.chomp!
      fields = raw.split(";")
      code = fields[0].to_i(16)
      targets = fields[1].split(",").map { |hex| hex.to_i(16) }
      lang = fields[2].empty? ? nil : fields[2].to_sym
      context_name = fields[3] && fields[3].gsub('_', '')
      casing_class = Impl.const_get("#{context_name}ConditionalCasing")
      per_language = (casings[code] ||= {})
      per_language[lang] = casing_class.new(targets)
    end
  end
  casings
end
|
.read_hexdigit_map(filename) ⇒ Object
Read a map whose keys are codepoints (6 hexdigits, converted to integer) and whose values are single hexdigits (converted to integer).
102
103
104
105
106
107
108
109
110
111
112
113
114
|
# File 'lib/unicode_utils/read_cdata.rb', line 102
# Reads a map whose keys are code points (6 hexdigits, converted to
# integer) and whose values are single hexdigits (converted to integer)
# from the data file +filename+.
#
# Returns a Hash from Integer to Integer.
def self.read_hexdigit_map(filename)
  result = {}
  open_cdata_file(filename) do |io|
    key_field = ("x" * 6).force_encoding(Encoding::US_ASCII)
    digit_field = "x".force_encoding(Encoding::US_ASCII)
    while io.read(6, key_field)
      key = key_field.to_i(16)
      result[key] = io.read(1, digit_field).to_i(16)
    end
  end
  result
end
|
.read_multivalued_map(filename) ⇒ Object
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
|
# File 'lib/unicode_utils/read_cdata.rb', line 39
# Reads a map from code point to a list of code points from the data file
# +filename+.
#
# The file is consumed in 6 character records. Each entry starts with the
# key, followed by any number of value records, and is closed by a record
# whose first byte is "x" (120). Keys and values are parsed as hexadecimal
# integers.
#
# Returns a Hash from Integer to Array of Integers.
def self.read_multivalued_map(filename)
  result = {}
  open_cdata_file(filename) do |io|
    field = ("x" * 6).force_encoding(Encoding::US_ASCII)
    while io.read(6, field)
      key = field.to_i(16)
      values = []
      # 120 is the byte value of "x", which marks the end of the value list.
      until io.read(6, field).getbyte(0) == 120
        values << field.to_i(16)
      end
      result[key] = values
    end
  end
  result
end
|
.read_names(filename) ⇒ Object
56
57
58
59
60
61
62
63
64
65
66
|
# File 'lib/unicode_utils/read_cdata.rb', line 56
# Reads a map from code point to name from the data file +filename+.
#
# Each entry consists of a 6 character code point (parsed as a hexadecimal
# integer) followed by the name, which extends to the end of the line. The
# line terminator is not part of the name.
#
# Returns a Hash from Integer to String.
def self.read_names(filename)
  names = {}
  open_cdata_file(filename) do |io|
    field = ("x" * 6).force_encoding(Encoding::US_ASCII)
    while io.read(6, field)
      key = field.to_i(16)
      name = io.gets
      name.chomp!
      names[key] = name
    end
  end
  names
end
|
.word_break?(cs, i) ⇒ Boolean
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
|
# File 'lib/unicode_utils/each_word.rb', line 41
# Decides whether there is a word break between position +i+ and position
# +i+ + 1 of +cs+.
#
# +cs+ is indexed with integers and yields one small integer class code (or
# +nil+) per position. The rules are evaluated in order and the first one
# that applies decides; if none applies, the result is +true+ unless the
# code at +i+ + 1 is 0x3 or 0x4.
#
# Codes 0x3 and 0x4 are skipped when looking for the significant neighbours
# to the left of the boundary and to the right of position +i+ + 1.
#
# NOTE(review): the numeric codes presumably encode Unicode Word_Break
# property values - confirm against the code that builds +cs+.
# NOTE(review): the backward scans can reach negative indexes, which an
# Array resolves from its end; presumably the caller pads +cs+ so that these
# reads yield +nil+ - confirm.
#
# cs:: per-position class codes
# i::  index of the position in front of the candidate boundary
def self.word_break?(cs, i)
  skippable = lambda { |code| code == 0x3 || code == 0x4 }
  forces_break = lambda { |code| code == 0x2 || code == 0x0 || code == 0x1 }

  here = cs[i]
  next_index = i + 1
  following = cs[next_index]

  # The pair 0x0, 0x1 is never split; apart from that, any of 0x0, 0x1 and
  # 0x2 on either side always breaks.
  return false if here == 0x0 && following == 0x1
  return true if forces_break.call(here)
  return true if forces_break.call(following)

  # Significant code at or before i.
  left = i
  left -= 1 while skippable.call(cs[left])
  before = cs[left]
  return false if before == 0x6 && following == 0x6

  # Significant code after i + 1.
  right = next_index + 1
  right += 1 while skippable.call(cs[right])
  after_following = cs[right]
  if before == 0x6 && (following == 0x7 || following == 0x9) && after_following == 0x6
    return false
  end

  # Significant code before +before+.
  far_left = left - 1
  far_left -= 1 while skippable.call(cs[far_left])
  before_before = cs[far_left]
  if before_before == 0x6 && (before == 0x7 || before == 0x9) && following == 0x6
    return false
  end

  return false if before == 0xA && following == 0xA
  return false if before == 0x6 && following == 0xA
  return false if before == 0xA && following == 0x6
  if before_before == 0xA && (before == 0x8 || before == 0x9) && following == 0xA
    return false
  end
  if before == 0xA && (following == 0x8 || following == 0x9) && after_following == 0xA
    return false
  end
  return false if before == 0x5 && following == 0x5
  if (before == 0x6 || before == 0xA || before == 0x5 || before == 0xB) && following == 0xB
    return false
  end
  return false if before == 0xB && (following == 0x6 || following == 0xA || following == 0x5)

  # No rule applied: break, except in front of a skippable code.
  !skippable.call(following)
end
|