Module: Mojinizer

Included in:
String
Defined in:
lib/mojinizer/version.rb,
lib/mojinizer/detection.rb,
lib/mojinizer/conversion.rb,
lib/mojinizer/romaji_tables.rb

Constant Summary collapse

# Version string of the Mojinizer module.
VERSION =
"0.2.2"
# Single-character lookup used by #romaji: kana and Japanese punctuation => romaji.
# Small kana map to an "x"-prefixed spelling, the long-vowel mark to "-", and the
# standalone voicing marks to " and '; #romaji folds those intermediate spellings
# into their final form afterwards.
# NOTE(review): the kana keys were reconstructed from the romaji values (each row
# appears once for katakana and once for hiragana) -- verify against
# lib/mojinizer/romaji_tables.rb.
KANA_TO_ROM =
{
  "ア"=>"a", "イ"=>"i", "ウ"=>"u", "エ"=>"e","オ"=>"o",
  "あ"=>"a", "い"=>"i", "う"=>"u", "え"=>"e","お"=>"o",
  "カ"=>"ka", "キ"=>"ki", "ク"=>"ku", "ケ"=>"ke", "コ"=>"ko",
  "か"=>"ka", "き"=>"ki", "く"=>"ku", "け"=>"ke", "こ"=>"ko",
  "ガ"=>"ga", "ギ"=>"gi", "グ"=>"gu", "ゲ"=>"ge", "ゴ"=>"go",
  "が"=>"ga", "ぎ"=>"gi", "ぐ"=>"gu", "げ"=>"ge", "ご"=>"go",
  "サ"=>"sa", "シ"=>"shi","ス"=>"su", "セ"=>"se", "ソ"=>"so",
  "さ"=>"sa", "し"=>"shi","す"=>"su", "せ"=>"se", "そ"=>"so",
  "ザ"=>"za", "ジ"=>"ji", "ズ"=>"zu", "ゼ"=>"ze", "ゾ"=>"zo",
  "ざ"=>"za", "じ"=>"ji", "ず"=>"zu", "ぜ"=>"ze", "ぞ"=>"zo",
  "タ"=>"ta", "チ"=>"chi","ツ"=>"tsu","テ"=>"te", "ト"=>"to",
  "た"=>"ta", "ち"=>"chi","つ"=>"tsu","て"=>"te", "と"=>"to",
  "ダ"=>"da", "ヂ"=>"dji","ヅ"=>"dzu","デ"=>"de", "ド"=>"do",
  "だ"=>"da", "ぢ"=>"dji","づ"=>"dzu","で"=>"de", "ど"=>"do",
  "ナ"=>"na", "ニ"=>"ni", "ヌ"=>"nu", "ネ"=>"ne", "ノ"=>"no",
  "な"=>"na", "に"=>"ni", "ぬ"=>"nu", "ね"=>"ne", "の"=>"no",
  "ハ"=>"ha", "ヒ"=>"hi", "フ"=>"fu", "ヘ"=>"he", "ホ"=>"ho",
  "は"=>"ha", "ひ"=>"hi", "ふ"=>"fu", "へ"=>"he", "ほ"=>"ho",
  "バ"=>"ba", "ビ"=>"bi", "ブ"=>"bu", "ベ"=>"be", "ボ"=>"bo",
  "ば"=>"ba", "び"=>"bi", "ぶ"=>"bu", "べ"=>"be", "ぼ"=>"bo",
  "パ"=>"pa", "ピ"=>"pi", "プ"=>"pu", "ペ"=>"pe", "ポ"=>"po",
  "ぱ"=>"pa", "ぴ"=>"pi", "ぷ"=>"pu", "ぺ"=>"pe", "ぽ"=>"po",
  "マ"=>"ma", "ミ"=>"mi", "ム"=>"mu", "メ"=>"me", "モ"=>"mo",
  "ま"=>"ma", "み"=>"mi", "む"=>"mu", "め"=>"me", "も"=>"mo",
  "ヤ"=>"ya", "ユ"=>"yu", "ヨ"=>"yo",
  "や"=>"ya", "ゆ"=>"yu", "よ"=>"yo",
  "ラ"=>"ra", "リ"=>"ri", "ル"=>"ru","レ"=>"re","ロ"=>"ro",
  "ら"=>"ra", "り"=>"ri", "る"=>"ru","れ"=>"re","ろ"=>"ro",
  "ワ"=>"wa", "ヰ"=>"wi", "ヱ"=>"we", "ヲ"=>"wo", "ン"=>"nn",
  "わ"=>"wa", "ゐ"=>"wi", "ゑ"=>"we", "を"=>"wo", "ん"=>"nn",
  "ァ"=>"xa", "ィ"=>"xi", "ゥ"=>"xu", "ェ"=>"xe", "ォ"=>"xo",
  "ぁ"=>"xa", "ぃ"=>"xi", "ぅ"=>"xu", "ぇ"=>"xe", "ぉ"=>"xo",
  "ッ"=>"xtsu","ャ"=>"xya", "ュ"=>"xyu", "ョ"=>"xyo",
  "っ"=>"xtsu","ゃ"=>"xya", "ゅ"=>"xyu", "ょ"=>"xyo",
  "ヴ"=>"vu", "ヵ"=>"xka","ヶ"=>"ga","ヮ"=>"xwa",
  "ゎ"=>"xwa",
  # NOTE(review): the key of the second "-" entry could not be determined (some
  # dash-like character); it is left as "" until checked against the source file.
  "ー"=>"-", ""=>"-", "゛"=>'"', "゜"=>"'", "、"=>",", "。"=>".",
  # NOTE(review): the first space key below is assumed to be the ideographic
  # space U+3000 (the next line already maps the ASCII space) -- confirm.
  "："=>":", "　" => " ", "＠" => "@", "（" => "(", "）" => ")",
  " " => " "
}
# Two-character hiragana sequences mapped to romaji.
# NOTE(review): no method shown in this listing references this table -- confirm
# where (or whether) it is used.
KANA_TO_ROM2 =
{
  "てぃ" => "ti", "でぃ" => "di"
}
# 1 character romaji patterns
# Looked up by #roma_to_kata when its buffer holds a single vowel or "-".
# NOTE(review): the katakana values were reconstructed from the romaji keys --
# verify against lib/mojinizer/romaji_tables.rb.
ROM_TO_KATA1 =
{
  "a"=>"ア", "i"=>"イ", "u"=>"ウ", "e"=>"エ", "o"=>"オ", "-"=>"ー"
}
# 2 character romaji patterns
# Looked up by #roma_to_kata when its buffer holds two characters.
# NOTE(review): the single-kana values were reconstructed from the romaji keys --
# verify against lib/mojinizer/romaji_tables.rb. The values chosen for "yi",
# "wi", "wu" and "we" are the least certain.
ROM_TO_KATA2 =
{
  "xa"=>"ァ", "xi"=>"ィ", "xu"=>"ゥ", "xe"=>"ェ", "xo"=>"ォ",
  "ka"=>"カ", "ki"=>"キ", "ku"=>"ク", "ke"=>"ケ", "ko"=>"コ",
  "ca"=>"カ", "cu"=>"ク", "co"=>"コ",
  "ga"=>"ガ", "gi"=>"ギ", "gu"=>"グ", "ge"=>"ゲ", "go"=>"ゴ",
  "sa"=>"サ", "si"=>"シ", "su"=>"ス", "se"=>"セ", "so"=>"ソ",
  "za"=>"ザ", "zi"=>"ジ", "zu"=>"ズ", "ze"=>"ゼ", "zo"=>"ゾ",
  "ja"=>"ジャ","ji"=>"ジ", "ju"=>"ジュ","je"=>"ジェ","jo"=>"ジョ",
  "ta"=>"タ", "ti"=>"チ", "tu"=>"ツ", "te"=>"テ", "to"=>"ト",
  "da"=>"ダ", "di"=>"ヂ", "du"=>"ヅ", "de"=>"デ", "do"=>"ド",
  "na"=>"ナ", "ni"=>"ニ", "nu"=>"ヌ", "ne"=>"ネ", "no"=>"ノ",
  "ha"=>"ハ", "hi"=>"ヒ", "hu"=>"フ", "he"=>"ヘ", "ho"=>"ホ",
  "ba"=>"バ", "bi"=>"ビ", "bu"=>"ブ", "be"=>"ベ", "bo"=>"ボ",
  "pa"=>"パ", "pi"=>"ピ", "pu"=>"プ", "pe"=>"ペ", "po"=>"ポ",
  "va"=>"ヴァ","vi"=>"ヴィ","vu"=>"ヴ", "ve"=>"ヴェ","vo"=>"ヴォ",
  "fa"=>"ファ","fi"=>"フィ","fu"=>"フ", "fe"=>"フェ","fo"=>"フォ",
  "ma"=>"マ", "mi"=>"ミ", "mu"=>"ム", "me"=>"メ", "mo"=>"モ",
  "ya"=>"ヤ", "yi"=>"イ", "yu"=>"ユ", "ye"=>"イェ", "yo"=>"ヨ",
  "ra"=>"ラ", "ri"=>"リ", "ru"=>"ル", "re"=>"レ", "ro"=>"ロ",
  "la"=>"ラ", "li"=>"リ", "lu"=>"ル", "le"=>"レ", "lo"=>"ロ",
  "wa"=>"ワ", "wi"=>"ヰ", "wu"=>"ウ", "we"=>"ヱ", "wo"=>"ヲ",
  "nn"=>"ン"
}
# 3 character romaji patterns
# Looked up by #roma_to_kata when its buffer holds three characters.
# NOTE(review): the single-kana values were reconstructed from the romaji keys --
# verify against lib/mojinizer/romaji_tables.rb.
# NOTE(review): "xtsu" is four characters long, while #roma_to_kata only looks up
# three-character buffers in this table, so that entry looks unreachable -- confirm.
ROM_TO_KATA3 =
{
  "tsu"=>"ツ",
  "xka"=>"ヵ", "xke"=>"ヶ",
  "xwa"=>"ヮ", "xtsu"=>"ッ",   "xya"=>"ャ",  "xyu"=>"ュ",  "xyo"=>"ョ",
  "kya"=>"キャ", "kyi"=>"キィ", "kyu"=>"キュ", "kye"=>"キェ", "kyo"=>"キョ",
  "gya"=>"ギャ", "gyi"=>"ギィ", "gyu"=>"ギュ", "gye"=>"ギェ", "gyo"=>"ギョ",
  "sya"=>"シャ", "syi"=>"シィ", "syu"=>"シュ", "sye"=>"シェ", "syo"=>"ショ",
  "sha"=>"シャ", "shi"=>"シ",  "shu"=>"シュ", "she"=>"シェ", "sho"=>"ショ",
  "zya"=>"ジャ", "zyi"=>"ジィ", "zyu"=>"ジュ", "zye"=>"ジェ", "zyo"=>"ジョ",
  "jya"=>"ジャ", "jyi"=>"ジィ", "jyu"=>"ジュ", "jye"=>"ジェ", "jyo"=>"ジョ",
  "tya"=>"チャ", "tyi"=>"チィ", "tyu"=>"チュ", "tye"=>"チェ", "tyo"=>"チョ",
  "cya"=>"チャ", "cyi"=>"チィ", "cyu"=>"チュ", "cye"=>"チェ", "cyo"=>"チョ",
  "cha"=>"チャ", "chi"=>"チ",  "chu"=>"チュ", "che"=>"チェ", "cho"=>"チョ",
  "tha"=>"テャ", "thi"=>"ティ", "thu"=>"テュ", "the"=>"テェ", "tho"=>"テョ",
  "dya"=>"ヂャ", "dyi"=>"ヂィ", "dyu"=>"ヂュ", "dye"=>"ヂェ", "dyo"=>"ヂョ",
  "dha"=>"デャ", "dhi"=>"ディ", "dhu"=>"デュ", "dhe"=>"デェ", "dho"=>"デョ",
  "nya"=>"ニャ", "nyi"=>"ニィ", "nyu"=>"ニュ", "nye"=>"ニェ", "nyo"=>"ニョ",
  "hya"=>"ヒャ", "hyi"=>"ヒィ", "hyu"=>"ヒュ", "hye"=>"ヒェ", "hyo"=>"ヒョ",
  "bya"=>"ビャ", "byi"=>"ビィ", "byu"=>"ビュ", "bye"=>"ビェ", "byo"=>"ビョ",
  "pya"=>"ピャ", "pyi"=>"ピィ", "pyu"=>"ピュ", "pye"=>"ピェ", "pyo"=>"ピョ",
  "mya"=>"ミャ", "myi"=>"ミィ", "myu"=>"ミュ", "mye"=>"ミェ", "myo"=>"ミョ",
  "rya"=>"リャ", "ryi"=>"リィ", "ryu"=>"リュ", "rye"=>"リェ", "ryo"=>"リョ",
  "lya"=>"リャ", "lyi"=>"リィ", "lyu"=>"リュ", "lye"=>"リェ", "lyo"=>"リョ"
}

Instance Method Summary collapse

Instance Method Details

#ascii_zenkaku? ⇒ Boolean

Returns:

  • (Boolean)


27
28
29
# File 'lib/mojinizer/detection.rb', line 27

# Reports whether every character of the receiver matches the Moji type mask
# ZEN_ALNUM | ZEN_ASYMBOL (checked through #moji_type?).
#
# @return [Boolean]
def ascii_zenkaku?
  zen_ascii_mask = Moji::ZEN_ALNUM | Moji::ZEN_ASYMBOL
  moji_type?(zen_ascii_mask)
end

#contains_ascii_zenkaku? ⇒ Boolean

Returns:

  • (Boolean)


64
65
66
# File 'lib/mojinizer/detection.rb', line 64

# Reports whether at least one character of the receiver matches the Moji type
# mask ZEN_ALNUM | ZEN_ASYMBOL (checked through #contains_moji_type?).
#
# @return [Boolean]
def contains_ascii_zenkaku?
  zen_ascii_mask = Moji::ZEN_ALNUM | Moji::ZEN_ASYMBOL
  contains_moji_type?(zen_ascii_mask)
end

#contains_hankaku? ⇒ Boolean

Returns:

  • (Boolean)


56
57
58
# File 'lib/mojinizer/detection.rb', line 56

# Reports whether at least one character of the receiver matches the Moji type
# mask HAN_KATA | HAN_JSYMBOL (checked through #contains_moji_type?).
#
# @return [Boolean]
def contains_hankaku?
  hankaku_mask = Moji::HAN_KATA | Moji::HAN_JSYMBOL
  contains_moji_type?(hankaku_mask)
end

#contains_hiragana? ⇒ Boolean

Returns:

  • (Boolean)


40
41
42
# File 'lib/mojinizer/detection.rb', line 40

# Reports whether at least one character of the receiver matches Moji::HIRA
# (checked through #contains_moji_type?).
#
# @return [Boolean]
def contains_hiragana?
  hiragana_mask = Moji::HIRA
  contains_moji_type?(hiragana_mask)
end

#contains_japanese? ⇒ Boolean

Returns:

  • (Boolean)


68
69
70
# File 'lib/mojinizer/detection.rb', line 68

# Reports whether at least one character of the receiver matches the Moji type
# mask ZEN | JSYMBOL | HAN_KATA (checked through #contains_moji_type?).
#
# @return [Boolean]
def contains_japanese?
  japanese_mask = Moji::ZEN | Moji::JSYMBOL | Moji::HAN_KATA
  contains_moji_type?(japanese_mask)
end

#contains_kana? ⇒ Boolean

Returns:

  • (Boolean)


44
45
46
# File 'lib/mojinizer/detection.rb', line 44

# Reports whether at least one character of the receiver matches Moji::KANA
# (checked through #contains_moji_type?).
#
# @return [Boolean]
def contains_kana?
  kana_mask = Moji::KANA
  contains_moji_type?(kana_mask)
end

#contains_kanji? ⇒ Boolean

Returns:

  • (Boolean)


52
53
54
# File 'lib/mojinizer/detection.rb', line 52

# Reports whether at least one character of the receiver matches Moji::KANJI
# (checked through #contains_moji_type?).
#
# @return [Boolean]
def contains_kanji?
  kanji_mask = Moji::KANJI
  contains_moji_type?(kanji_mask)
end

#contains_katakana? ⇒ Boolean

Returns:

  • (Boolean)


48
49
50
# File 'lib/mojinizer/detection.rb', line 48

# Reports whether at least one character of the receiver matches Moji::KATA
# (checked through #contains_moji_type?).
#
# @return [Boolean]
def contains_katakana?
  katakana_mask = Moji::KATA
  contains_moji_type?(katakana_mask)
end

#contains_moji_type?(type) ⇒ Boolean

Returns:

  • (Boolean)


72
73
74
75
# File 'lib/mojinizer/detection.rb', line 72

# Reports whether at least one character of the receiver satisfies
# Moji.type? for the given type. An empty receiver yields false.
#
# @param type the Moji type (or combination of types) handed to Moji.type?
# @return [Boolean]
def contains_moji_type?(type)
  each_char.any? { |char| Moji.type?(char, type) }
end

#contains_zenkaku? ⇒ Boolean

Returns:

  • (Boolean)


60
61
62
# File 'lib/mojinizer/detection.rb', line 60

# Reports whether at least one character of the receiver matches Moji::ZEN
# (checked through #contains_moji_type?).
#
# @return [Boolean]
def contains_zenkaku?
  zenkaku_mask = Moji::ZEN
  contains_moji_type?(zenkaku_mask)
end

#han_to_zen ⇒ Object



76
77
78
# File 'lib/mojinizer/conversion.rb', line 76

# Hands the receiver to Moji.han_to_zen and returns whatever that call returns.
def han_to_zen
  source_text = self
  Moji.han_to_zen(source_text)
end

#hankaku? ⇒ Boolean

Returns:

  • (Boolean)


19
20
21
# File 'lib/mojinizer/detection.rb', line 19

# Reports whether every character of the receiver matches the Moji type mask
# HAN_KATA | HAN_JSYMBOL (checked through #moji_type?).
#
# @return [Boolean]
def hankaku?
  hankaku_mask = Moji::HAN_KATA | Moji::HAN_JSYMBOL
  moji_type?(hankaku_mask)
end

#hira_to_kata ⇒ Object



68
69
70
# File 'lib/mojinizer/conversion.rb', line 68

# Hands the receiver to Moji.hira_to_kata and returns whatever that call returns.
def hira_to_kata
  source_text = self
  Moji.hira_to_kata(source_text)
end

#hiragana ⇒ Object



60
61
62
# File 'lib/mojinizer/conversion.rb', line 60

# Runs the receiver through #roma_to_kata and then calls #kata_to_hira on the
# result.
def hiragana
  as_katakana = roma_to_kata
  as_katakana.kata_to_hira
end

#hiragana? ⇒ Boolean

Returns:

  • (Boolean)


3
4
5
# File 'lib/mojinizer/detection.rb', line 3

# Reports whether every character of the receiver matches Moji::HIRA
# (checked through #moji_type?).
#
# @return [Boolean]
def hiragana?
  hiragana_mask = Moji::HIRA
  moji_type?(hiragana_mask)
end

#japanese? ⇒ Boolean

Returns:

  • (Boolean)


31
32
33
# File 'lib/mojinizer/detection.rb', line 31

# Reports whether every character of the receiver matches the Moji type mask
# ZEN | JSYMBOL | HAN_KATA (checked through #moji_type?).
#
# @return [Boolean]
def japanese?
  japanese_mask = Moji::ZEN | Moji::JSYMBOL | Moji::HAN_KATA
  moji_type?(japanese_mask)
end

#kana? ⇒ Boolean

Returns:

  • (Boolean)


11
12
13
# File 'lib/mojinizer/detection.rb', line 11

# Reports whether the receiver passes #hiragana? or, failing that, #katakana?.
# Each check is applied to the whole string on its own.
#
# @return [Boolean]
def kana?
  hiragana? || katakana?
end

#kanji? ⇒ Boolean

Returns:

  • (Boolean)


15
16
17
# File 'lib/mojinizer/detection.rb', line 15

# Reports whether every character of the receiver matches Moji::KANJI
# (checked through #moji_type?).
#
# @return [Boolean]
def kanji?
  kanji_mask = Moji::KANJI
  moji_type?(kanji_mask)
end

#kata_to_hira ⇒ Object



72
73
74
# File 'lib/mojinizer/conversion.rb', line 72

# Hands the receiver to Moji.kata_to_hira and returns whatever that call returns.
def kata_to_hira
  source_text = self
  Moji.kata_to_hira(source_text)
end

#katakana ⇒ Object



64
65
66
# File 'lib/mojinizer/conversion.rb', line 64

# Runs the receiver through #hira_to_kata and then calls #roma_to_kata on the
# result.
def katakana
  hiragana_converted = hira_to_kata
  hiragana_converted.roma_to_kata
end

#katakana? ⇒ Boolean

Returns:

  • (Boolean)


7
8
9
# File 'lib/mojinizer/detection.rb', line 7

# Reports whether every character of the receiver matches Moji::KATA
# (checked through #moji_type?).
#
# @return [Boolean]
def katakana?
  katakana_mask = Moji::KATA
  moji_type?(katakana_mask)
end

#moji_type?(type) ⇒ Boolean

Returns:

  • (Boolean)


35
36
37
38
# File 'lib/mojinizer/detection.rb', line 35

# Reports whether every character of the receiver satisfies Moji.type? for the
# given type. An empty receiver yields true, since no character fails the check.
#
# @param type the Moji type (or combination of types) handed to Moji.type?
# @return [Boolean]
def moji_type?(type)
  each_char.all? { |char| Moji.type?(char, type) }
end

#normalize_zen_han ⇒ Object



84
85
86
# File 'lib/mojinizer/conversion.rb', line 84

# Hands the receiver to Moji.normalize_zen_han and returns whatever that call
# returns.
def normalize_zen_han
  source_text = self
  Moji.normalize_zen_han(source_text)
end

#roma_to_kata ⇒ Object



88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
# File 'lib/mojinizer/conversion.rb', line 88

# Converts lowercase romaji in the receiver to katakana.
#
# The input is consumed one character at a time into a buffer of at most three
# characters. The buffer is matched against ROM_TO_KATA1 / ROM_TO_KATA2 /
# ROM_TO_KATA3 depending on its length; when nothing matches, the first buffered
# character is emitted (as "ン" for a leading "n", as "ッ" for a doubled
# consonant, otherwise unchanged) and matching continues with the rest.
# Characters outside the lowercase romaji letters, "-" and "'" are copied
# through unchanged.
#
# @return [String] the converted text
def roma_to_kata
  result = ""
  word_buffer = []
  chars = each_char.to_a
  loop do
    case word_buffer.size
    ##### When 0 characters in the buffer
    when 0
      return result if chars.empty?

      word_buffer.push(chars.shift)
    ##### Patterns with 1 roman character
    when 1
      if word_buffer[0] =~ /[aiueo-]/
        result += ROM_TO_KATA1[word_buffer[0]]
        word_buffer = [] # a-->ア
      elsif word_buffer[0] =~ /[xkcgszjtdnhbpvfmyrlw']/
        # A consonant needs at least one more character before it can be resolved.
        if chars.size > 0
          word_buffer.push(chars.shift)
        else
          # End of input: a trailing "n" is the moraic n, anything else stays as-is.
          return result + word_buffer[0].gsub(/n/, "ン")
        end
      else
        result += word_buffer.shift
      end
    ##### Patterns with 2 roman characters
    when 2
      if ROM_TO_KATA2.key?(word_buffer.join)
        result += ROM_TO_KATA2[word_buffer.join]
        word_buffer = []
      elsif word_buffer.join =~ /([kgszjtcdnhbpmrl]y)|([stcd]h)|ts|(x[wytk])/ # goto 3
        if chars.size > 0
          # Consume next letter from source array
          word_buffer.push(chars.shift)
        else
          return result + word_buffer.join.gsub(/n/, "ン")
        end
      elsif word_buffer.join == "n'"
        result += "ン"
        word_buffer.shift(2) # n'--> ン
      elsif word_buffer[0] == "n"
        result += "ン"
        word_buffer.shift # nk-->ンk
      elsif word_buffer[0] == word_buffer[1]
        result += "ッ"
        word_buffer.shift # kk-->ッk
      else
        result += word_buffer.shift
      end
    ##### Patterns with 3 roman characters
    when 3
      if ROM_TO_KATA3.key?(word_buffer.join)
        result += ROM_TO_KATA3[word_buffer.join]
        word_buffer = []
      elsif word_buffer[0] == "n"
        result += "ン"
        word_buffer.shift
      else
        result += word_buffer.shift
      end
    end
  end
end

#romaji ⇒ Object



4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
# File 'lib/mojinizer/conversion.rb', line 4

# Transliterates the kana in the receiver into romaji.
#
# Every character is first replaced by its KANA_TO_ROM entry (characters without
# an entry are copied unchanged). An ordered series of regex rewrites then folds
# the intermediate spellings -- small kana as "x..", sokuon as "xtsu", the
# long-vowel mark as "-", standalone voicing marks as " and ' -- into their final
# form. The order of the rewrites matters: later rules work on the output of
# earlier ones.
#
# @return [String] the romanized text
def romaji
  s = ""
  each_char { |c| s += KANA_TO_ROM.fetch(c, c) }

  s = s.gsub(/(k)([aiueo])(")/, 'g\2').gsub(/(s)([aiueo])(")/, 'z\2').gsub(/(t)([aiueo])(")/, 'd\2')
  s = s.gsub(/(h)([aiueo])(")/, 'b\2').gsub(/([fh])([aiueo])(')/, 'p\2').gsub(/u"/, 'vu') # [han]dakuten marks ゛゜
  #---------------------------------------------------------
  # Remove spaces before/after hanging 'っ'. The group is required: with an
  # optional group, any two adjacent whitespace characters were rewritten to
  # "xtsu" and then doubled the following consonant ("a  b" became "abb").
  s = s.gsub(/\s(xtsu)\s/, 'xtsu')
  #---------------------------------------------------------
  # ッカ-->xtsuka-->kka (repeat until no sokuon + consonant pair is left)
  loop { break unless s.gsub!(/(xtsu)([ckgszjtdhfbpmyrwnv])/, '\2\2') }
  #---------------------------------------------------------
  # Compound Phoneme Pattern Rollbacks
  # NB: Uses regex backrefs like "\1y\3" where \1 = 1st capture grp, y='y' and \3 = 3rd capture grp
  #---------------------------------------------------------
  s = s.gsub(/( +x)(.*)/, 'x\2')                             # Avoid hanging small kana due to leading spaces
  s = s.gsub(/(ch)(ixy)([aueo])/, '\1\3')                    # チョ-->chixyo-->cho
  s = s.gsub(/([kgszjtdnhfbpmr])(ixy)([auo])/, '\1y\3')      # キャ-->kixya-->kya
  s = s.gsub(/([kgszjtdnhfbpmr])(ix)([ie])/, '\1y\3')        # キィ-->kixi-->kyi
  #---------------------------------------------------------
  s = s.gsub(/(sh)(y)([aueo])/, '\1\3')                      # シュ-->shyu-->shu
  s = s.gsub(/(j)(y)([aueo])/, '\1\3')                       # ジュ-->jyu-->ju
  #---------------------------------------------------------
  s = s.gsub(/([td])(exy)([aueo])/, '\1h\3')                 # テャ-->texya-->tha
  s = s.gsub(/([td])(ex)([ie])/, '\1\3')                     # ティ-->texi-->ti
  s = s.gsub(/([td])(oxu)/, '\1oo')                          # ドゥ-->toxu-->too
  s = s.gsub(/(tsu)(x)([aiueo])/, 'ts\3')                    # ツァ-->tsuxa-->tsa
  s = s.gsub(/([d])(oxy)/, '\1o\'y')                         # ドュ-->doxyu-->doyu
  #---------------------------------------------------------
  s = s.gsub(/(vux)([aieo])/, 'v\2')                         # ヴァヴィヴェヴォ, ヴァ-->vuxa-->va
  s = s.gsub(/(vuxy)([aueo])/, 'vy\2')                       # ヴュ-->vuxyu-->vyu
  s = s.gsub(/(ixe)/, 'iye')                                 # イェ-->ixe-->iye
  s = s.gsub(/(hoxe)/, 'howe')                               # ホェ-->hoxe-->howe
  s = s.gsub(/(fux)([aieo])/, 'f\2')                         # ファフィフェフォ, ファ-->fuxa-->fa
  s = s.gsub(/(fuxy)([aueo])/, 'fy\2')                       # フュ-->fuxyu-->fyu
  s = s.gsub(/(ux)([ieo])/, 'w\2')                           # ウァウィウェ, ウァ-->uxa-->wa
  #---------------------------------------------------------
  s = s.strip.gsub(/(xtsu)$/, 'h!')                          # Recombine hanging 'っ' followed by EOL
  s = s.gsub(/([aiueo]?)(\-)/, '\1\1')                       # Replace long-vowel marks and double preceding vowel
  #---------------------------------------------------------
  # Cleanup specifically for source strings that contain spaces!
  # The character classes below are plain [^a-zA-Z]: the former "A-z" range also
  # covered [ \ ] ^ _ ` and the "|" inside the class was a literal pipe.
  s = s.gsub(/( +)([^a-zA-Z])/, '\2')                        # Remove spaces before any non-alphabetical char
  s = s.gsub(/(n')/, 'n')                                    # ン-->nn-->n
  s = s.gsub(/(nn)/, 'n')                                    # ン-->nn-->n
  s = s.gsub(/( n)[^a-zA-Z]?$/, 'n')                         # Fix "n" appearing as separate word
  s = s.gsub(/\s{2,}/, ' ')                                  # Remove duplicate spaces!
  #---------------------------------------------------------
  s
end

#zen_to_han ⇒ Object



80
81
82
# File 'lib/mojinizer/conversion.rb', line 80

# Hands the receiver to Moji.zen_to_han and returns whatever that call returns.
def zen_to_han
  source_text = self
  Moji.zen_to_han(source_text)
end

#zenkaku? ⇒ Boolean

Returns:

  • (Boolean)


23
24
25
# File 'lib/mojinizer/detection.rb', line 23

# Reports whether every character of the receiver matches Moji::ZEN
# (checked through #moji_type?).
#
# @return [Boolean]
def zenkaku?
  zenkaku_mask = Moji::ZEN
  moji_type?(zenkaku_mask)
end