Module: Neologdish::Normalizer

Defined in:
lib/neologdish/normalizer.rb,
lib/neologdish/normalizer/version.rb

Overview

A Japanese text normalizer module that follows the neologd normalization convention.

Constant Summary collapse

VERSION =
'0.2.0'

Class Method Summary collapse

Class Method Details

.normalize(str, override_conversion_map = {}) ⇒ Object

Normalize the given text.



118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
# File 'lib/neologdish/normalizer.rb', line 118

# Normalize the given text.
#
# Each character of +str+ is processed in order:
#
# 1. it is replaced through CONVERSION_MAP (with +override_conversion_map+
#    merged on top);
# 2. half-width kana are replaced through HALF_WIDTH_KANA_MAP, and a kana
#    followed by a (han)dakuten mark is merged into a single character via
#    DAKUON_KANA_MAP / HANDAKUON_KANA_MAP;
# 3. runs of spaces and runs of the long-vowel mark (U+30FC) are squeezed
#    to a single occurrence;
# 4. a space is kept only when the characters on both sides of it are
#    Latin according to LATIN_MAP.
#
# Finally, leading and trailing whitespace is stripped.
#
# @param str [String] the text to normalize
# @param override_conversion_map [Hash] entries merged over CONVERSION_MAP
#   for this call only (presumably String => String; confirm against
#   CONVERSION_MAP)
# @return [String] the normalized text
def normalize(str, override_conversion_map = {})
  conversion_map = CONVERSION_MAP.merge(override_conversion_map)

  # Loop-invariant lookup tables, built once instead of once per character.
  dakuten_marks = ["\u309b", "\u3099", "\uff9e"].freeze
  handakuten_marks = ["\u309c", "\u309a", "\uff9f"].freeze
  squeezables = [' ', "\u30fc"].freeze # space and long-vowel mark

  squeezee = ''
  prev_latin = false
  whitespace_encountered = false
  # A kana that may still be combined with a following (han)dakuten mark.
  # Its output is deferred by one iteration.
  dakuon_handakuon_possible = nil
  normalized = str.chars.map do |c|
    prefix = ''
    c = conversion_map[c] || c

    # Resolve the kana deferred by the previous iteration: either merge it
    # with the current voiced-sound mark, or flush it unchanged.
    if dakuon_handakuon_possible
      if (dakuten_marks.include?(c) && (k = DAKUON_KANA_MAP[dakuon_handakuon_possible])) ||
         (handakuten_marks.include?(c) && (k = HANDAKUON_KANA_MAP[dakuon_handakuon_possible]))
        c = ''
        prefix = k
      else
        prefix = dakuon_handakuon_possible
      end
    end

    # normalize the half-width kana to full-width
    if (encountered_half_width_kana = HALF_WIDTH_KANA_MAP[c])
      c = encountered_half_width_kana
    end

    # Defer the current kana if it can take a (han)dakuten mark.
    dakuon_handakuon_possible = nil
    if DAKUON_HANDAKUON_POSSIBLES[c]
      dakuon_handakuon_possible = c
      c = ''
    end

    # squash consecutive special characters (space or long-vowel)
    if squeezables.include?(c)
      if squeezee == c
        c = ''
      else
        squeezee = c
      end
    else
      squeezee = ''
    end

    # remove the white space character in the middle of non-latin characters
    is_latin = LATIN_MAP[c] || false
    if c == ' '
      whitespace_encountered = prev_latin
      c = ''
    else
      # Append (never overwrite) so a flushed kana in +prefix+ cannot be lost.
      prefix += ' ' if is_latin && whitespace_encountered
      # Keep the pending space only across characters that produced nothing
      # (e.g. squeezed consecutive spaces). A deferred kana also leaves +c+
      # empty but is real non-latin text, so it must cancel the pending space.
      whitespace_encountered &&= c.empty? && dakuon_handakuon_possible.nil?
    end
    prev_latin = is_latin

    prefix + c
  end.join + (dakuon_handakuon_possible || '') # flush a kana deferred at the very end

  normalized.strip
end