Module: UnicodeUtils::Impl

Defined in:
lib/unicode_utils/nfc.rb,
lib/unicode_utils/each_word.rb,
lib/unicode_utils/read_cdata.rb,
lib/unicode_utils/conditional_casing.rb,
lib/unicode_utils/canonical_decomposition.rb,
lib/unicode_utils/compatibility_decomposition.rb,
lib/unicode_utils/hangul_syllable_decomposition.rb

Overview

:nodoc:

Defined Under Namespace

Modules: NFC Classes: AfterIConditionalCasing, AfterSoftDottedConditionalCasing, BeforeDotConditionalCasing, ConditionalCasing, FinalSigmaConditionalCasing, MoreAboveConditionalCasing, NotBeforeDotConditionalCasing

Constant Summary collapse

# Codepoint set loaded from the "composition_exclusion_set" cdata file.
# read_codepoint_set returns a Hash that maps each listed codepoint to true.
COMPOSITION_EXCLUSION_SET =
Impl.read_codepoint_set("composition_exclusion_set")
# Reverse index over CANONICAL_DECOMPOSITION_MAP:
#   first codepoint => { second codepoint => composite codepoint }
# Only decompositions that are exactly two codepoints long are indexed.
CANONICAL_COMPOSITION_MAP =
Hash.new.tap do |m|
  CANONICAL_DECOMPOSITION_MAP.each_pair { |comp, decomp|
    if decomp.length == 2
      (m[decomp[0]] ||= {})[decomp[1]] = comp
    end
  }
end
# Language ids (as symbols) flagged with true.
# NOTE(review): presumably the languages that have language-specific
# conditional casing rules -- confirm against the callers.
LANGS_WITH_RULES =
{:tr => true, :lt => true, :az => true}
# Conditional casing tables, one per case operation, each loaded with
# read_conditional_casings: codepoint => { language id or nil => casing }.
CONDITIONAL_UPCASE_MAP =
read_conditional_casings("cond_uc_map")
CONDITIONAL_DOWNCASE_MAP =
read_conditional_casings("cond_lc_map")
CONDITIONAL_TITLECASE_MAP =
read_conditional_casings("cond_tc_map")

Class Method Summary collapse

Class Method Details

.append_hangul_syllable_decomposition(str, s) ⇒ Object



20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
# File 'lib/unicode_utils/hangul_syllable_decomposition.rb', line 20

# Appends the arithmetic decomposition of the codepoint +s+ to +str+.
#
# When +s+ lies in the precomposed syllable block (0xAC00, 11172 entries) it
# is split into a leading and a vowel jamo, plus a trailing jamo when the
# syllable has one. Any other codepoint is appended to +str+ unchanged.
#
# str - String that receives the codepoints (mutated in place).
# s   - Integer codepoint.
#
# Returns +str+, except for a syllable without trailing jamo, where the
# result is nil.
def self.append_hangul_syllable_decomposition(str, s)
  syllable_base = 0xAC00
  leading_base = 0x1100
  vowel_base = 0x1161
  trailing_base = 0x11A7
  syllable_count = 11172
  trailing_count = 28
  per_leading = 21 * trailing_count # syllables sharing one leading jamo

  offset = s - syllable_base
  return str << s unless offset >= 0 && offset < syllable_count

  leading_index, within_leading = offset.divmod(per_leading)
  vowel_index = within_leading / trailing_count
  trailing_index = offset % trailing_count

  str << (leading_base + leading_index) << (vowel_base + vowel_index)
  # trailing index 0 means the syllable has no trailing jamo
  str << (trailing_base + trailing_index) if trailing_index != 0
end

.append_recursive_canonical_decomposition_mapping(str, mapping) ⇒ Object



48
49
50
51
52
53
54
55
56
57
# File 'lib/unicode_utils/canonical_decomposition.rb', line 48

# Appends the full canonical decomposition of every codepoint in +mapping+
# to +str+.
#
# A codepoint that has its own entry in CANONICAL_DECOMPOSITION_MAP is
# expanded recursively; a codepoint without an entry is appended as is.
#
# str     - String that receives the codepoints (mutated in place).
# mapping - Enumerable of Integer codepoints.
#
# Returns +mapping+ (the result of #each).
def self.append_recursive_canonical_decomposition_mapping(str, mapping)
  mapping.each do |code|
    expansion = CANONICAL_DECOMPOSITION_MAP[code]
    if expansion
      append_recursive_canonical_decomposition_mapping(str, expansion)
    else
      str << code
    end
  end
end

.append_recursive_compatibility_decomposition_mapping(str, cp) ⇒ Object



41
42
43
44
45
46
47
48
49
50
51
# File 'lib/unicode_utils/compatibility_decomposition.rb', line 41

# Appends the full compatibility decomposition of the codepoint +cp+ to
# +str+.
#
# The compatibility mapping is preferred; the canonical mapping is used when
# no compatibility mapping exists. Every codepoint of the chosen mapping is
# decomposed recursively. A codepoint with neither mapping is appended as is.
#
# str - String that receives the codepoints (mutated in place).
# cp  - Integer codepoint.
#
# Returns the mapping Array when one was found, otherwise +str+.
def self.append_recursive_compatibility_decomposition_mapping(str, cp)
  expansion = COMPATIBILITY_DECOMPOSITION_MAP[cp] || CANONICAL_DECOMPOSITION_MAP[cp]
  return str << cp unless expansion

  expansion.each do |part|
    append_recursive_compatibility_decomposition_mapping(str, part)
  end
end

.composition(str) ⇒ Object



46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
# File 'lib/unicode_utils/nfc.rb', line 46

# Composes the codepoints of +str+ and returns the result as a new string
# with the same encoding. +str+ itself is not modified.
#
# A codepoint whose COMBINING_CLASS_MAP entry is 0 is treated as a starter.
# The method keeps the most recent starter pending and tries to merge each
# following codepoint into it:
#
# * Hangul leading jamo + vowel jamo and LV syllable + trailing jamo are
#   composed arithmetically.
# * Any other pair is looked up in Impl::CANONICAL_COMPOSITION_MAP and is
#   accepted only if Impl::NFC.primary_composite? approves the composite.
# * Non-starters that cannot be merged (or are blocked according to
#   Impl::NFC.blocked?) are buffered and written out after the pending
#   starter.
#
# NOTE(review): presumably +str+ is expected to be canonically decomposed
# and ordered already -- confirm against the callers.
#
# str - String to compose.
#
# Returns a new String.
def self.composition(str)
  ### constants for hangul composition ###
  sbase = 0xAC00
  lbase = 0x1100
  vbase = 0x1161
  tbase = 0x11A7
  lcount = 19
  vcount = 21
  tcount = 28
  ncount = vcount * tcount
  scount = lcount * ncount
  ########################################

  String.new.force_encoding(str.encoding).tap do |res|
    last_starter = nil
    uncomposable_non_starters = []
    str.each_codepoint { |cp|
      if COMBINING_CLASS_MAP[cp] == 0 # starter?
        combined = false
        # Two starters can only combine when nothing uncomposable sits
        # between them.
        if last_starter && uncomposable_non_starters.empty?
          ### hangul ###
          lindex = last_starter - lbase
          if 0 <= lindex && lindex < lcount
            vindex = cp - vbase
            # The upper bound is exclusive: vbase + vcount is the first
            # codepoint past the composable vowel jamo. Accepting it would
            # yield a syllable belonging to the next leading consonant.
            if 0 <= vindex && vindex < vcount
              last_starter =
                sbase + (lindex * vcount + vindex) * tcount
              combined = true
            end
          end
          unless combined
            sindex = last_starter - sbase
            # Only an LV syllable (no trailing jamo yet) can take a trailing
            # jamo.
            if 0 <= sindex && sindex < scount && (sindex % tcount) == 0
              tindex = cp - tbase
              # The lower bound is exclusive: tindex 0 is tbase itself, which
              # stands for "no trailing jamo". Accepting it would swallow
              # that codepoint without changing the syllable.
              if 0 < tindex && tindex < tcount
                last_starter += tindex
                combined = true
              end
            end
          end
          ##############
          unless combined
            map = Impl::CANONICAL_COMPOSITION_MAP[last_starter]
            composition = map && map[cp]
            if composition && Impl::NFC.primary_composite?(composition)
              last_starter = composition
              combined = true
            end
          end
        end
        unless combined
          # Flush the pending starter and its buffered marks, then make cp
          # the new pending starter.
          res << last_starter if last_starter
          uncomposable_non_starters.each { |nc| res << nc }
          uncomposable_non_starters.clear
          last_starter = cp
        end
      else
        last_non_starter = uncomposable_non_starters.last
        if last_non_starter && Impl::NFC.blocked?(last_non_starter, cp)
          uncomposable_non_starters << cp
        else
          # When there is no pending starter yet the lookup key is nil and
          # simply finds nothing.
          map = Impl::CANONICAL_COMPOSITION_MAP[last_starter]
          composition = map && map[cp]
          if composition && Impl::NFC.primary_composite?(composition)
            last_starter = composition
          else
            uncomposable_non_starters << cp
          end
        end
      end
    }
    res << last_starter if last_starter
    uncomposable_non_starters.each { |nc| res << nc }
  end
end

.conditional_downcase_mapping(cp, str, pos, language_id) ⇒ Object



140
141
142
143
144
145
146
147
148
# File 'lib/unicode_utils/conditional_casing.rb', line 140

# Looks up the conditional lowercase mapping for the codepoint +cp+.
#
# The casing registered for +language_id+ is preferred; the entry stored
# under the nil key is the fallback. The casing only applies when its
# context_match? accepts +str+ at +pos+.
#
# Returns the casing's mapping, or nil when no casing exists or the context
# does not match.
def self.conditional_downcase_mapping(cp, str, pos, language_id)
  casings_by_language = CONDITIONAL_DOWNCASE_MAP[cp]
  return nil unless casings_by_language

  rule = casings_by_language[language_id] || casings_by_language[nil]
  rule.mapping if rule && rule.context_match?(str, pos)
end

.conditional_titlecase_mapping(cp, str, pos, language_id) ⇒ Object



150
151
152
153
154
155
156
157
158
# File 'lib/unicode_utils/conditional_casing.rb', line 150

# Looks up the conditional titlecase mapping for the codepoint +cp+.
#
# The casing registered for +language_id+ is preferred; the entry stored
# under the nil key is the fallback. The casing only applies when its
# context_match? accepts +str+ at +pos+.
#
# Returns the casing's mapping, or nil when no casing exists or the context
# does not match.
def self.conditional_titlecase_mapping(cp, str, pos, language_id)
  casings_by_language = CONDITIONAL_TITLECASE_MAP[cp]
  return nil unless casings_by_language

  rule = casings_by_language[language_id] || casings_by_language[nil]
  rule.mapping if rule && rule.context_match?(str, pos)
end

.conditional_upcase_mapping(cp, str, pos, language_id) ⇒ Object



130
131
132
133
134
135
136
137
138
# File 'lib/unicode_utils/conditional_casing.rb', line 130

# Looks up the conditional uppercase mapping for the codepoint +cp+.
#
# The casing registered for +language_id+ is preferred; the entry stored
# under the nil key is the fallback. The casing only applies when its
# context_match? accepts +str+ at +pos+.
#
# Returns the casing's mapping, or nil when no casing exists or the context
# does not match.
def self.conditional_upcase_mapping(cp, str, pos, language_id)
  casings_by_language = CONDITIONAL_UPCASE_MAP[cp]
  return nil unless casings_by_language

  rule = casings_by_language[language_id] || casings_by_language[nil]
  rule.mapping if rule && rule.context_match?(str, pos)
end

.open_cdata_file(filename, &block) ⇒ Object



11
12
13
# File 'lib/unicode_utils/read_cdata.rb', line 11

# Opens the data file +filename+ inside CDATA_DIR for reading with US-ASCII
# as external encoding and hands it to +block+.
#
# Returns whatever File.open returns for the given block.
def self.open_cdata_file(filename, &block)
  path = File.join(CDATA_DIR, filename)
  File.open(path, "r:US-ASCII:-", &block)
end

.put_into_canonical_order(str) ⇒ Object



59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
# File 'lib/unicode_utils/canonical_decomposition.rb', line 59

# Reorders the codepoints of +str+ so that no codepoint with a non-zero
# combining class directly follows a codepoint with a higher combining
# class (classes are taken from COMBINING_CLASS_MAP).
#
# The first pass only checks whether such a pair exists. If none does,
# +str+ itself is returned. Otherwise one pass builds a copy in which
# offending neighbours are swapped, and the method calls itself on that
# copy until nothing is left to swap.
#
# Returns +str+ when it is already ordered, otherwise a new String with the
# same encoding.
def self.put_into_canonical_order(str)
  prev_code = nil
  prev_class = nil
  misordered = str.each_codepoint.any? { |code|
    klass = COMBINING_CLASS_MAP[code]
    inverted = prev_code && klass != 0 && prev_class > klass
    prev_code = code
    prev_class = klass
    inverted
  }
  return str unless misordered

  swapped = String.new.force_encoding(str.encoding)
  held = nil # codepoint read but not yet written
  held_class = nil
  str.each_codepoint { |code|
    klass = COMBINING_CLASS_MAP[code]
    if held.nil?
      held = code
      held_class = klass
    elsif klass != 0 && held_class > klass
      # write the pair in swapped order and start afresh with the next one
      swapped << code << held
      held = nil
      held_class = nil
    else
      swapped << held
      held = code
      held_class = klass
    end
  }
  swapped << held if held
  put_into_canonical_order(swapped)
end

.read_codepoint_map(filename) ⇒ Object



27
28
29
30
31
32
33
34
35
36
37
# File 'lib/unicode_utils/read_cdata.rb', line 27

# Reads a codepoint-to-codepoint map from the cdata file +filename+.
#
# The file is consumed as consecutive pairs of 6-character hexadecimal
# fields: first the key codepoint, then the value codepoint.
#
# Returns a Hash of Integer => Integer.
def self.read_codepoint_map(filename)
  table = {}
  open_cdata_file(filename) do |io|
    chunk = ("x" * 6).force_encoding(Encoding::US_ASCII)
    while io.read(6, chunk)
      # parse the key before the buffer is reused for the value field
      key = chunk.to_i(16)
      table[key] = io.read(6, chunk).to_i(16)
    end
  end
  table
end

.read_codepoint_set(filename) ⇒ Object



15
16
17
18
19
20
21
22
23
24
25
# File 'lib/unicode_utils/read_cdata.rb', line 15

# Reads a set of codepoints from the cdata file +filename+.
#
# The file is consumed as consecutive 6-character hexadecimal fields, one
# per codepoint.
#
# Returns a Hash that maps every codepoint read to true.
def self.read_codepoint_set(filename)
  members = {}
  open_cdata_file(filename) do |io|
    chunk = ("x" * 6).force_encoding(Encoding::US_ASCII)
    members[chunk.to_i(16)] = true while io.read(6, chunk)
  end
  members
end

.read_combining_class_map ⇒ Object



85
86
87
88
89
90
91
92
93
94
95
96
97
# File 'lib/unicode_utils/read_cdata.rb', line 85

# Reads the cdata file "combining_class_map".
#
# The file is consumed as consecutive records made of a 6-character
# hexadecimal codepoint followed by a 2-character hexadecimal value.
#
# Returns a Hash of Integer codepoint => Integer value.
def self.read_combining_class_map
  classes = {}
  open_cdata_file("combining_class_map") do |io|
    cp_chunk = ("x" * 6).force_encoding(Encoding::US_ASCII)
    class_chunk = ("x" * 2).force_encoding(Encoding::US_ASCII)
    while io.read(6, cp_chunk)
      codepoint = cp_chunk.to_i(16)
      classes[codepoint] = io.read(2, class_chunk).to_i(16)
    end
  end
  classes
end

.read_conditional_casings(filename) ⇒ Object



68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
# File 'lib/unicode_utils/read_cdata.rb', line 68

# Reads conditional casing records from the cdata file +filename+.
#
# Every line holds semicolon-separated fields:
#   0 - codepoint (hexadecimal)
#   1 - comma-separated hexadecimal codepoints of the mapping (may be empty)
#   2 - language id, empty when the record is not language specific
#   3 - optional context name; underscores are removed and the result is
#       used as prefix of the "...ConditionalCasing" class looked up in Impl
#
# Returns a Hash: codepoint => { language id Symbol or nil => casing object }.
def self.read_conditional_casings(filename)
  casings_by_cp = {}
  open_cdata_file(filename) do |input|
    input.each_line do |raw|
      fields = raw.chomp.split(";")
      codepoint = fields[0].to_i(16)
      targets = fields[1].split(",").map { |hex| hex.to_i(16) }
      lang = fields[2].empty? ? nil : fields[2].to_sym
      # without a context field the plain ConditionalCasing class is used
      context_name = fields[3] && fields[3].gsub('_', '')
      casing_class = Impl.const_get("#{context_name}ConditionalCasing")
      by_lang = (casings_by_cp[codepoint] ||= {})
      by_lang[lang] = casing_class.new(targets)
    end
  end
  casings_by_cp
end

.read_hexdigit_map(filename) ⇒ Object

Read a map whose keys are codepoints (6 hexdigits, converted to integer) and whose values are single hexdigits (converted to integer).



102
103
104
105
106
107
108
109
110
111
112
113
114
# File 'lib/unicode_utils/read_cdata.rb', line 102

# Reads a map from the cdata file +filename+ whose keys are codepoints
# (6 hexadecimal digits) and whose values are single hexadecimal digits.
# Keys and values are converted to Integer.
#
# Returns a Hash of Integer => Integer.
def self.read_hexdigit_map(filename)
  digits = {}
  open_cdata_file(filename) do |io|
    cp_chunk = ("x" * 6).force_encoding(Encoding::US_ASCII)
    digit_chunk = "x".force_encoding(Encoding::US_ASCII)
    while io.read(6, cp_chunk)
      codepoint = cp_chunk.to_i(16)
      digits[codepoint] = io.read(1, digit_chunk).to_i(16)
    end
  end
  digits
end

.read_multivalued_map(filename) ⇒ Object



39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
# File 'lib/unicode_utils/read_cdata.rb', line 39

# Reads a map from codepoints to lists of codepoints from the cdata file
# +filename+.
#
# The file is consumed in 6-character fields. A record starts with the key
# codepoint (hexadecimal) and continues with value codepoints until a field
# whose first byte is 120 (ASCII "x") ends the record.
#
# Returns a Hash of Integer => Array of Integer.
def self.read_multivalued_map(filename)
  table = {}
  open_cdata_file(filename) do |io|
    chunk = ("x" * 6).force_encoding(Encoding::US_ASCII)
    while io.read(6, chunk)
      key = chunk.to_i(16)
      values = []
      values << chunk.to_i(16) until io.read(6, chunk).getbyte(0) == 120
      table[key] = values
    end
  end
  table
end

.read_names(filename) ⇒ Object



56
57
58
59
60
61
62
63
64
65
66
# File 'lib/unicode_utils/read_cdata.rb', line 56

# Reads a codepoint-to-name map from the cdata file +filename+.
#
# Every record is a 6-character hexadecimal codepoint followed by the rest
# of the line, which becomes the value with its line ending removed.
#
# Returns a Hash of Integer => String.
def self.read_names(filename)
  names = {}
  open_cdata_file(filename) do |io|
    chunk = ("x" * 6).force_encoding(Encoding::US_ASCII)
    while io.read(6, chunk)
      codepoint = chunk.to_i(16)
      names[codepoint] = io.gets.chomp
    end
  end
  names
end

.word_break?(cs, i) ⇒ Boolean

Returns:

  • (Boolean)


41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
# File 'lib/unicode_utils/each_word.rb', line 41

# Decides whether there is a word boundary between position +i+ and
# position +i+ + 1.
#
# cs - Array of small integer category codes, one per codepoint. The codes
#      0x3 and 0x4 are the Extend/Format categories that the rules skip
#      over. NOTE(review): the remaining codes presumably are the other
#      Word_Break property values referred to by the rule numbers below --
#      confirm against the code that builds +cs+.
# i  - Integer index into +cs+.
#
# The scans to the left may reach negative indices, which Ruby resolves
# from the end of the array. NOTE(review): presumably callers pad +cs+ so
# that such lookups hit a neutral value -- confirm.
#
# Returns true when a break is allowed after position +i+, false otherwise.
def self.word_break?(cs, i)
  current = cs[i]
  next_index = i + 1
  following = cs[next_index]

  # wb3
  return false if current == 0x0 && following == 0x1
  # wb3a
  return true if current == 0x2 || current == 0x0 || current == 0x1
  # wb3b
  return true if following == 0x2 || following == 0x0 || following == 0x1

  # wb5 -- step left over Extend/Format to find the effective left side
  left_index = i
  left_index -= 1 while cs[left_index] == 0x3 || cs[left_index] == 0x4
  left = cs[left_index]
  return false if left == 0x6 && following == 0x6

  # wb6 -- step right over Extend/Format behind the following position
  after_index = next_index + 1
  after_index += 1 while cs[after_index] == 0x3 || cs[after_index] == 0x4
  after = cs[after_index]
  return false if left == 0x6 && (following == 0x7 || following == 0x9) && after == 0x6

  # wb7 -- step left once more to find what precedes the left side
  before_index = left_index - 1
  before_index -= 1 while cs[before_index] == 0x3 || cs[before_index] == 0x4
  before = cs[before_index]
  return false if before == 0x6 && (left == 0x7 || left == 0x9) && following == 0x6

  # wb8
  return false if left == 0xA && following == 0xA
  # wb9
  return false if left == 0x6 && following == 0xA
  # wb10
  return false if left == 0xA && following == 0x6
  # wb11
  return false if before == 0xA && (left == 0x8 || left == 0x9) && following == 0xA
  # wb12
  return false if left == 0xA && (following == 0x8 || following == 0x9) && after == 0xA
  # wb13
  return false if left == 0x5 && following == 0x5
  # wb13a
  if (left == 0x6 || left == 0xA || left == 0x5 || left == 0xB) && following == 0xB
    return false
  end
  # wb13b
  if left == 0xB && (following == 0x6 || following == 0xA || following == 0x5)
    return false
  end

  # break unless next char is Extend/Format
  following != 0x3 && following != 0x4
end