Module: Neologdish::Normalizer

Defined in: lib/neologdish/normalizer.rb,
lib/neologdish/normalizer/version.rb

Overview

A module that normalizes Japanese text according to the NEologd convention.

Constant Summary collapse

VERSION =

'0.2.0'

Class Method Summary collapse

.normalize(str, override_conversion_map = {}) ⇒ Object

Normalize the given text.

Class Method Details

.normalize(str, override_conversion_map = {}) ⇒ `Object`

Normalize the given text.

# File 'lib/neologdish/normalizer.rb', line 118

# Normalize the given text.
#
# Walks +str+ one character at a time and applies, in this order:
#
# 1. per-character replacement through CONVERSION_MAP (entries of
#    +override_conversion_map+ win over the built-in ones),
# 2. half-width kana to full-width conversion, merging a following
#    dakuten/handakuten mark into the preceding kana when a voiced form
#    is listed in DAKUON_KANA_MAP / HANDAKUON_KANA_MAP,
# 3. squeezing runs of spaces or of the long-vowel mark into one,
# 4. dropping spaces unless the nearest characters on both sides are
#    LATIN_MAP members.
#
# Leading and trailing whitespace is stripped from the result.
#
# @param str [String] the text to normalize
# @param override_conversion_map [Hash{String => String}] extra or replacement
#   single-character mappings merged over CONVERSION_MAP
# @return [String] the normalized text
def normalize(str, override_conversion_map = {})
  conversion_map = CONVERSION_MAP.merge(override_conversion_map)

  # Loop-invariant lookup tables, built once instead of once per character.
  dakuon_marks = ["\u309b", "\u3099", "\uff9e"].freeze
  handakuon_marks = ["\u309c", "\u309a", "\uff9f"].freeze
  squeezables = [' ', 'ー'].freeze

  squeezee = ''
  prev_latin = false
  whitespace_encountered = false
  # A kana held back for one iteration because the next character may be a
  # (han)dakuten mark that has to be merged into it.
  dakuon_handakuon_possible = nil
  normalized = str.chars.map do |c|
    prefix = ''
    c = conversion_map[c] || c

    # Resolve the kana deferred by the previous iteration: merge it with the
    # current mark if a voiced form exists, otherwise emit it unchanged.
    if dakuon_handakuon_possible
      if (dakuon_marks.include?(c) && (k = DAKUON_KANA_MAP[dakuon_handakuon_possible])) ||
         (handakuon_marks.include?(c) && (k = HANDAKUON_KANA_MAP[dakuon_handakuon_possible]))
        c = ''
        prefix = k
      else
        prefix = dakuon_handakuon_possible
      end
    end

    # normalize the half-width kana to full-width
    if (encountered_half_width_kana = HALF_WIDTH_KANA_MAP[c])
      c = encountered_half_width_kana
    end

    dakuon_handakuon_possible = nil
    if DAKUON_HANDAKUON_POSSIBLES[c]
      dakuon_handakuon_possible = c
      c = ''
      # The deferred kana is a real (non-latin, non-space) character even
      # though nothing is emitted for it yet, so a pending "latin + space"
      # state ends here. Without this reset the next latin character would
      # overwrite +prefix+ with ' ' and silently drop the deferred kana.
      whitespace_encountered = false
    end

    # squash consecutive special characters (space or long-vowel)
    if squeezables.include?(c)
      if squeezee == c
        c = ''
      else
        squeezee = c
      end
    else
      squeezee = ''
    end

    # remove the white space character in the middle of non-latin characters
    is_latin = LATIN_MAP[c] || false
    if c == ' '
      # Hold the space back; it is only re-emitted if a latin character follows.
      whitespace_encountered = prev_latin
      c = ''
    else
      prefix = ' ' if is_latin && whitespace_encountered
      whitespace_encountered &&= c == '' # take care for consecutive spaces on the right side
    end
    prev_latin = is_latin

    prefix + c
  end.join + (dakuon_handakuon_possible || '') # flush a kana deferred at end of input

  normalized.strip
end

Module: Neologdish::Normalizer

Overview

Constant Summary collapse

Class Method Summary collapse

Class Method Details

.normalize(str, override_conversion_map = {}) ⇒ Object

.normalize(str, override_conversion_map = {}) ⇒ `Object`