Module: Neologdish::Normalizer
- Defined in:
- lib/neologdish/normalizer.rb,
lib/neologdish/normalizer/version.rb
Overview
A Japanese text normalizer module that follows the neologd normalization convention.
Constant Summary collapse
- VERSION =
'0.2.0'
Class Method Summary collapse
-
.normalize(str, override_conversion_map = {}) ⇒ Object
Normalize the given text.
Class Method Details
.normalize(str, override_conversion_map = {}) ⇒ Object
Normalize the given text.
118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 |
# File 'lib/neologdish/normalizer.rb', line 118

# Normalize the given text.
#
# The text is processed one character at a time:
#
# 1. each character is replaced through CONVERSION_MAP (with
#    +override_conversion_map+ merged on top of it);
# 2. characters found in HALF_WIDTH_KANA_MAP are replaced by the mapped value,
#    and a kana listed in DAKUON_HANDAKUON_POSSIBLES is held back for one step
#    so that a following dakuten / handakuten mark can be merged into it via
#    DAKUON_KANA_MAP / HANDAKUON_KANA_MAP;
# 3. runs of the same squeezable character (space or the long-vowel mark) are
#    collapsed to one;
# 4. a space is kept only when the characters on both sides of it are found in
#    LATIN_MAP; every other space is removed.
#
# The result is finally stripped of leading and trailing whitespace.
#
# @param str [String] the text to normalize
# @param override_conversion_map [Hash{String => String}] extra
#   character replacements that take precedence over CONVERSION_MAP
# @return [String] the normalized text
def normalize(str, override_conversion_map = {})
  conversion_map = CONVERSION_MAP.merge(override_conversion_map)

  # Hoisted out of the per-character loop so they are allocated once per call.
  dakuten_marks = ["\u309b", "\u3099", "\uff9e"].freeze
  handakuten_marks = ["\u309c", "\u309a", "\uff9f"].freeze
  squeezables = [' ', 'ー'].freeze

  squeezee = ''
  prev_latin = false
  whitespace_encountered = false
  dakuon_handakuon_possible = nil

  normalized = str.chars.map do |c|
    prefix = ''
    c = conversion_map[c] || c

    # normalize the Half-width kana to full-width
    if dakuon_handakuon_possible
      if (dakuten_marks.include?(c) && (k = DAKUON_KANA_MAP[dakuon_handakuon_possible])) ||
         (handakuten_marks.include?(c) && (k = HANDAKUON_KANA_MAP[dakuon_handakuon_possible]))
        # the mark is merged into the held kana
        c = ''
        prefix = k
      else
        # no mark followed: emit the held kana unchanged in front of this character
        prefix = dakuon_handakuon_possible
      end

      # The held kana is emitted now, as non-latin text. Any space seen before
      # it is therefore no longer "between two latin characters": forget it, so
      # that the latin branch below neither overwrites +prefix+ (which would
      # drop the kana) nor inserts a stray space after the kana.
      whitespace_encountered = false
      prev_latin = false
    end

    if (encountered_half_width_kana = HALF_WIDTH_KANA_MAP[c])
      c = encountered_half_width_kana
    end

    dakuon_handakuon_possible = nil
    if DAKUON_HANDAKUON_POSSIBLES[c]
      # hold the kana back until the next character shows whether a mark follows
      dakuon_handakuon_possible = c
      c = ''
    end

    # squash consecutive special characters (space or long-vowel)
    if squeezables.include?(c)
      if squeezee == c
        c = ''
      else
        squeezee = c
      end
    else
      squeezee = ''
    end

    # remove the white space character in the middle of non-latin characters
    is_latin = LATIN_MAP[c] || false
    if c == ' '
      # remember the space only when it follows a latin character
      whitespace_encountered = prev_latin
      c = ''
    else
      prefix = ' ' if is_latin && whitespace_encountered
      whitespace_encountered &&= c == '' # take care for consecutive spaces on the right side
    end

    prev_latin = is_latin
    prefix + c
  end.join + (dakuon_handakuon_possible || '') # flush a kana still held at the end of input

  normalized.strip
end