Module: Mojinizer
- Included in:
- String
- Defined in:
- lib/mojinizer/version.rb,
lib/mojinizer/detection.rb,
lib/mojinizer/conversion.rb,
lib/mojinizer/romaji_tables.rb
Constant Summary collapse
- VERSION =
"0.2.2"
- KANA_TO_ROM =
{ "ア"=>"a", "イ"=>"i", "ウ"=>"u", "エ"=>"e","オ"=>"o", "あ"=>"a", "い"=>"i", "う"=>"u", "え"=>"e","お"=>"o", "カ"=>"ka", "キ"=>"ki", "ク"=>"ku", "ケ"=>"ke", "コ"=>"ko", "か"=>"ka", "き"=>"ki", "く"=>"ku", "け"=>"ke", "こ"=>"ko", "ガ"=>"ga", "ギ"=>"gi", "グ"=>"gu", "ゲ"=>"ge", "ゴ"=>"go", "が"=>"ga", "ぎ"=>"gi", "ぐ"=>"gu", "げ"=>"ge", "ご"=>"go", "サ"=>"sa", "シ"=>"shi","ス"=>"su", "セ"=>"se", "ソ"=>"so", "さ"=>"sa", "し"=>"shi","す"=>"su", "せ"=>"se", "そ"=>"so", "ザ"=>"za", "ジ"=>"ji", "ズ"=>"zu", "ゼ"=>"ze", "ゾ"=>"zo", "ざ"=>"za", "じ"=>"ji", "ず"=>"zu", "ぜ"=>"ze", "ぞ"=>"zo", "タ"=>"ta", "チ"=>"chi","ツ"=>"tsu","テ"=>"te", "ト"=>"to", "た"=>"ta", "ち"=>"chi","つ"=>"tsu","て"=>"te", "と"=>"to", "ダ"=>"da", "ヂ"=>"dji","ヅ"=>"dzu","デ"=>"de", "ド"=>"do", "だ"=>"da", "ぢ"=>"dji","づ"=>"dzu","で"=>"de", "ど"=>"do", "ナ"=>"na", "ニ"=>"ni", "ヌ"=>"nu", "ネ"=>"ne", "ノ"=>"no", "な"=>"na", "に"=>"ni", "ぬ"=>"nu", "ね"=>"ne", "の"=>"no", "ハ"=>"ha", "ヒ"=>"hi", "フ"=>"fu", "ヘ"=>"he", "ホ"=>"ho", "は"=>"ha", "ひ"=>"hi", "ふ"=>"fu", "へ"=>"he", "ほ"=>"ho", "バ"=>"ba", "ビ"=>"bi", "ブ"=>"bu", "ベ"=>"be", "ボ"=>"bo", "ば"=>"ba", "び"=>"bi", "ぶ"=>"bu", "べ"=>"be", "ぼ"=>"bo", "パ"=>"pa", "ピ"=>"pi", "プ"=>"pu", "ペ"=>"pe", "ポ"=>"po", "ぱ"=>"pa", "ぴ"=>"pi", "ぷ"=>"pu", "ぺ"=>"pe", "ぽ"=>"po", "マ"=>"ma", "ミ"=>"mi", "ム"=>"mu", "メ"=>"me", "モ"=>"mo", "ま"=>"ma", "み"=>"mi", "む"=>"mu", "め"=>"me", "も"=>"mo", "ヤ"=>"ya", "ユ"=>"yu", "ヨ"=>"yo", "や"=>"ya", "ゆ"=>"yu", "よ"=>"yo", "ラ"=>"ra", "リ"=>"ri", "ル"=>"ru","レ"=>"re","ロ"=>"ro", "ら"=>"ra", "り"=>"ri", "る"=>"ru","れ"=>"re","ろ"=>"ro", "ワ"=>"wa", "ヰ"=>"wi", "ヱ"=>"we", "ヲ"=>"wo", "ン"=>"nn", "わ"=>"wa", "ゐ"=>"wi", "ゑ"=>"we", "を"=>"wo", "ん"=>"nn", "ァ"=>"xa", "ィ"=>"xi", "ゥ"=>"xu", "ェ"=>"xe", "ォ"=>"xo", "ぁ"=>"xa", "ぃ"=>"xi", "ぅ"=>"xu", "ぇ"=>"xe", "ぉ"=>"xo", "ッ"=>"xtsu","ャ"=>"xya", "ュ"=>"xyu", "ョ"=>"xyo", "っ"=>"xtsu","ゃ"=>"xya", "ゅ"=>"xyu", "ょ"=>"xyo", "ヴ"=>"vu", "ヵ"=>"xka","ヶ"=>"ga","ヮ"=>"xwa", "ゎ"=>"xwa", "ー"=>"-", "−"=>"-", "゛"=>'"', "゜"=>"'", "、"=>",", "。"=>".", ":"=>":", " " => " ", "@" => "@", "(" => "(", ")" => ")", " " => " " }
- KANA_TO_ROM2 =
{ "てぃ" => "ti", "でぃ" => "di" }
- ROM_TO_KATA1 =
1 character romaji patterns
{ "a"=>"ア", "i"=>"イ", "u"=>"ウ", "e"=>"エ", "o"=>"オ", "-"=>"ー" }
- ROM_TO_KATA2 =
2 character romaji patterns
{ "xa"=>"ァ", "xi"=>"ィ", "xu"=>"ゥ", "xe"=>"ェ", "xo"=>"ォ", "ka"=>"カ", "ki"=>"キ", "ku"=>"ク", "ke"=>"ケ", "ko"=>"コ", "ca"=>"カ", "cu"=>"ク", "co"=>"コ", "ga"=>"ガ", "gi"=>"ギ", "gu"=>"グ", "ge"=>"ゲ", "go"=>"ゴ", "sa"=>"サ", "si"=>"シ", "su"=>"ス", "se"=>"セ", "so"=>"ソ", "za"=>"ザ", "zi"=>"ジ", "zu"=>"ズ", "ze"=>"ゼ", "zo"=>"ゾ", "ja"=>"ジャ","ji"=>"ジ", "ju"=>"ジュ","je"=>"ジェ","jo"=>"ジョ", "ta"=>"タ", "ti"=>"チ", "tu"=>"ツ", "te"=>"テ", "to"=>"ト", "da"=>"ダ", "di"=>"ヂ", "du"=>"ヅ", "de"=>"デ", "do"=>"ド", "na"=>"ナ", "ni"=>"ニ", "nu"=>"ヌ", "ne"=>"ネ", "no"=>"ノ", "ha"=>"ハ", "hi"=>"ヒ", "hu"=>"フ", "he"=>"ヘ", "ho"=>"ホ", "ba"=>"バ", "bi"=>"ビ", "bu"=>"ブ", "be"=>"ベ", "bo"=>"ボ", "pa"=>"パ", "pi"=>"ピ", "pu"=>"プ", "pe"=>"ペ", "po"=>"ポ", "va"=>"ヴァ","vi"=>"ヴィ","vu"=>"ヴ", "ve"=>"ヴェ","vo"=>"ヴォ", "fa"=>"ファ","fi"=>"フィ","fu"=>"フ", "fe"=>"フェ","fo"=>"フォ", "ma"=>"マ", "mi"=>"ミ", "mu"=>"ム", "me"=>"メ", "mo"=>"モ", "ya"=>"ヤ", "yi"=>"イ", "yu"=>"ユ", "ye"=>"イェ", "yo"=>"ヨ", "ra"=>"ラ", "ri"=>"リ", "ru"=>"ル", "re"=>"レ", "ro"=>"ロ", "la"=>"ラ", "li"=>"リ", "lu"=>"ル", "le"=>"レ", "lo"=>"ロ", "wa"=>"ワ", "wi"=>"ヰ", "wu"=>"ウ", "we"=>"ヱ", "wo"=>"ヲ", "nn"=>"ン" }
- ROM_TO_KATA3 =
3 character romaji patterns (plus the 4 character pattern "xtsu")
{ "tsu"=>"ツ", "xka"=>"ヵ", "xke"=>"ヶ", "xwa"=>"ヮ", "xtsu"=>"ッ", "xya"=>"ャ", "xyu"=>"ュ", "xyo"=>"ョ", "kya"=>"キャ", "kyi"=>"キィ", "kyu"=>"キュ", "kye"=>"キェ", "kyo"=>"キョ", "gya"=>"ギャ", "gyi"=>"ギィ", "gyu"=>"ギュ", "gye"=>"ギェ", "gyo"=>"ギョ", "sya"=>"シャ", "syi"=>"シィ", "syu"=>"シュ", "sye"=>"シェ", "syo"=>"ショ", "sha"=>"シャ", "shi"=>"シ", "shu"=>"シュ", "she"=>"シェ", "sho"=>"ショ", "zya"=>"ジャ", "zyi"=>"ジィ", "zyu"=>"ジュ", "zye"=>"ジェ", "zyo"=>"ジョ", "jya"=>"ジャ", "jyi"=>"ジィ", "jyu"=>"ジュ", "jye"=>"ジェ", "jyo"=>"ジョ", "tya"=>"チャ", "tyi"=>"チィ", "tyu"=>"チュ", "tye"=>"チェ", "tyo"=>"チョ", "cya"=>"チャ", "cyi"=>"チィ", "cyu"=>"チュ", "cye"=>"チェ", "cyo"=>"チョ", "cha"=>"チャ", "chi"=>"チ", "chu"=>"チュ", "che"=>"チェ", "cho"=>"チョ", "tha"=>"テャ", "thi"=>"ティ", "thu"=>"テュ", "the"=>"テェ", "tho"=>"テョ", "dya"=>"ヂャ", "dyi"=>"ヂィ", "dyu"=>"ヂュ", "dye"=>"ヂェ", "dyo"=>"ヂョ", "dha"=>"デャ", "dhi"=>"ディ", "dhu"=>"デュ", "dhe"=>"デェ", "dho"=>"デョ", "nya"=>"ニャ", "nyi"=>"ニィ", "nyu"=>"ニュ", "nye"=>"ニェ", "nyo"=>"ニョ", "hya"=>"ヒャ", "hyi"=>"ヒィ", "hyu"=>"ヒュ", "hye"=>"ヒェ", "hyo"=>"ヒョ", "bya"=>"ビャ", "byi"=>"ビィ", "byu"=>"ビュ", "bye"=>"ビェ", "byo"=>"ビョ", "pya"=>"ピャ", "pyi"=>"ピィ", "pyu"=>"ピュ", "pye"=>"ピェ", "pyo"=>"ピョ", "mya"=>"ミャ", "myi"=>"ミィ", "myu"=>"ミュ", "mye"=>"ミェ", "myo"=>"ミョ", "rya"=>"リャ", "ryi"=>"リィ", "ryu"=>"リュ", "rye"=>"リェ", "ryo"=>"リョ", "lya"=>"リャ", "lyi"=>"リィ", "lyu"=>"リュ", "lye"=>"リェ", "lyo"=>"リョ" }
Instance Method Summary collapse
- #ascii_zenkaku? ⇒ Boolean
- #contains_ascii_zenkaku? ⇒ Boolean
- #contains_hankaku? ⇒ Boolean
- #contains_hiragana? ⇒ Boolean
- #contains_japanese? ⇒ Boolean
- #contains_kana? ⇒ Boolean
- #contains_kanji? ⇒ Boolean
- #contains_katakana? ⇒ Boolean
- #contains_moji_type?(type) ⇒ Boolean
- #contains_zenkaku? ⇒ Boolean
- #han_to_zen ⇒ Object
- #hankaku? ⇒ Boolean
- #hira_to_kata ⇒ Object
- #hiragana ⇒ Object
- #hiragana? ⇒ Boolean
- #japanese? ⇒ Boolean
- #kana? ⇒ Boolean
- #kanji? ⇒ Boolean
- #kata_to_hira ⇒ Object
- #katakana ⇒ Object
- #katakana? ⇒ Boolean
- #moji_type?(type) ⇒ Boolean
- #normalize_zen_han ⇒ Object
- #roma_to_kata ⇒ Object
- #romaji ⇒ Object
- #zen_to_han ⇒ Object
- #zenkaku? ⇒ Boolean
Instance Method Details
#ascii_zenkaku? ⇒ Boolean
27 28 29 |
# File 'lib/mojinizer/detection.rb', line 27
#
# Tests the whole string against the combination of Moji::ZEN_ALNUM and
# Moji::ZEN_ASYMBOL (presumably full-width alphanumerics and full-width
# ASCII symbols -- confirm against the Moji gem).
#
# @return [Boolean] whatever moji_type? reports for the combined flags
def ascii_zenkaku?
  flags = Moji::ZEN_ALNUM | Moji::ZEN_ASYMBOL
  moji_type?(flags)
end
#contains_ascii_zenkaku? ⇒ Boolean
64 65 66 |
# File 'lib/mojinizer/detection.rb', line 64
#
# Looks for at least one character matching Moji::ZEN_ALNUM or
# Moji::ZEN_ASYMBOL (presumably full-width alphanumerics / ASCII symbols --
# confirm against the Moji gem).
#
# @return [Boolean] whatever contains_moji_type? reports for the combined flags
def contains_ascii_zenkaku?
  flags = Moji::ZEN_ALNUM | Moji::ZEN_ASYMBOL
  contains_moji_type?(flags)
end
#contains_hankaku? ⇒ Boolean
56 57 58 |
# File 'lib/mojinizer/detection.rb', line 56
#
# Looks for at least one character matching Moji::HAN_KATA or
# Moji::HAN_JSYMBOL (presumably half-width katakana and half-width Japanese
# symbols -- confirm against the Moji gem).
#
# @return [Boolean] whatever contains_moji_type? reports for the combined flags
def contains_hankaku?
  flags = Moji::HAN_KATA | Moji::HAN_JSYMBOL
  contains_moji_type?(flags)
end
#contains_hiragana? ⇒ Boolean
40 41 42 |
# File 'lib/mojinizer/detection.rb', line 40
#
# Looks for at least one character of type Moji::HIRA in the string.
#
# @return [Boolean] whatever contains_moji_type? reports for Moji::HIRA
def contains_hiragana?
  wanted = Moji::HIRA
  contains_moji_type?(wanted)
end
#contains_japanese? ⇒ Boolean
68 69 70 |
# File 'lib/mojinizer/detection.rb', line 68
#
# Looks for at least one character matching any of Moji::ZEN, Moji::JSYMBOL
# or Moji::HAN_KATA. These are the same flags japanese? uses for its
# whole-string check.
#
# @return [Boolean] whatever contains_moji_type? reports for the combined flags
def contains_japanese?
  flags = Moji::ZEN | Moji::JSYMBOL | Moji::HAN_KATA
  contains_moji_type?(flags)
end
#contains_kana? ⇒ Boolean
44 45 46 |
# File 'lib/mojinizer/detection.rb', line 44
#
# Looks for at least one character of type Moji::KANA in the string.
#
# @return [Boolean] whatever contains_moji_type? reports for Moji::KANA
def contains_kana?
  wanted = Moji::KANA
  contains_moji_type?(wanted)
end
#contains_kanji? ⇒ Boolean
52 53 54 |
# File 'lib/mojinizer/detection.rb', line 52
#
# Looks for at least one character of type Moji::KANJI in the string.
#
# @return [Boolean] whatever contains_moji_type? reports for Moji::KANJI
def contains_kanji?
  wanted = Moji::KANJI
  contains_moji_type?(wanted)
end
#contains_katakana? ⇒ Boolean
48 49 50 |
# File 'lib/mojinizer/detection.rb', line 48
#
# Looks for at least one character of type Moji::KATA in the string.
#
# @return [Boolean] whatever contains_moji_type? reports for Moji::KATA
def contains_katakana?
  wanted = Moji::KATA
  contains_moji_type?(wanted)
end
#contains_moji_type?(type) ⇒ Boolean
72 73 74 75 |
# File 'lib/mojinizer/detection.rb', line 72
#
# Tells whether any single character of the string is accepted by
# Moji.type? for the given type. Scanning stops at the first hit; an empty
# string yields false because there is nothing to match.
#
# @param type [Object] a Moji type flag (or OR-ed combination of flags)
#   passed straight through to Moji.type?
# @return [Boolean] true if at least one character matches, false otherwise
def contains_moji_type?(type)
  each_char.any? { |char| Moji.type?(char, type) }
end
#contains_zenkaku? ⇒ Boolean
60 61 62 |
# File 'lib/mojinizer/detection.rb', line 60
#
# Looks for at least one character of type Moji::ZEN in the string.
#
# @return [Boolean] whatever contains_moji_type? reports for Moji::ZEN
def contains_zenkaku?
  wanted = Moji::ZEN
  contains_moji_type?(wanted)
end
#han_to_zen ⇒ Object
76 77 78 |
# File 'lib/mojinizer/conversion.rb', line 76
#
# Hands the receiver to Moji.han_to_zen and returns its result unchanged
# (by its name a half-width to full-width conversion -- see the Moji gem).
#
# @return [Object] the value returned by Moji.han_to_zen
def han_to_zen
  source = self
  Moji.han_to_zen(source)
end
#hankaku? ⇒ Boolean
19 20 21 |
# File 'lib/mojinizer/detection.rb', line 19
#
# Tests the whole string against the combination of Moji::HAN_KATA and
# Moji::HAN_JSYMBOL (presumably half-width katakana and half-width Japanese
# symbols -- confirm against the Moji gem).
#
# @return [Boolean] whatever moji_type? reports for the combined flags
def hankaku?
  flags = Moji::HAN_KATA | Moji::HAN_JSYMBOL
  moji_type?(flags)
end
#hira_to_kata ⇒ Object
68 69 70 |
# File 'lib/mojinizer/conversion.rb', line 68
#
# Hands the receiver to Moji.hira_to_kata and returns its result unchanged
# (by its name a hiragana to katakana conversion -- see the Moji gem).
#
# @return [Object] the value returned by Moji.hira_to_kata
def hira_to_kata
  source = self
  Moji.hira_to_kata(source)
end
#hiragana ⇒ Object
60 61 62 |
# File 'lib/mojinizer/conversion.rb', line 60
#
# Two-step conversion: the receiver is first run through roma_to_kata and
# the result of that is then run through kata_to_hira.
#
# @return [Object] the value returned by kata_to_hira
def hiragana
  as_katakana = roma_to_kata
  as_katakana.kata_to_hira
end
#hiragana? ⇒ Boolean
3 4 5 |
# File 'lib/mojinizer/detection.rb', line 3
#
# Tests the whole string against Moji::HIRA.
#
# @return [Boolean] whatever moji_type? reports for Moji::HIRA
def hiragana?
  wanted = Moji::HIRA
  moji_type?(wanted)
end
#japanese? ⇒ Boolean
31 32 33 |
# File 'lib/mojinizer/detection.rb', line 31
#
# Tests the whole string against the combination of Moji::ZEN,
# Moji::JSYMBOL and Moji::HAN_KATA.
#
# @return [Boolean] whatever moji_type? reports for the combined flags
def japanese?
  flags = Moji::ZEN | Moji::JSYMBOL | Moji::HAN_KATA
  moji_type?(flags)
end
#kana? ⇒ Boolean
11 12 13 |
# File 'lib/mojinizer/detection.rb', line 11
#
# Tests whether every character of the string is kana, i.e. matches
# Moji::HIRA or Moji::KATA (the flags used by hiragana? and katakana?).
#
# The previous implementation evaluated `hiragana? || katakana?`, which
# rejected strings that mix both scripts even though each character is
# kana. Checking the OR-ed flags per character accepts those strings while
# still returning true for everything the old form accepted.
#
# @return [Boolean] whatever moji_type? reports for Moji::HIRA | Moji::KATA
def kana?
  moji_type?(Moji::HIRA | Moji::KATA)
end
#kanji? ⇒ Boolean
15 16 17 |
# File 'lib/mojinizer/detection.rb', line 15
#
# Tests the whole string against Moji::KANJI.
#
# @return [Boolean] whatever moji_type? reports for Moji::KANJI
def kanji?
  wanted = Moji::KANJI
  moji_type?(wanted)
end
#kata_to_hira ⇒ Object
72 73 74 |
# File 'lib/mojinizer/conversion.rb', line 72
#
# Hands the receiver to Moji.kata_to_hira and returns its result unchanged
# (by its name a katakana to hiragana conversion -- see the Moji gem).
#
# @return [Object] the value returned by Moji.kata_to_hira
def kata_to_hira
  source = self
  Moji.kata_to_hira(source)
end
#katakana ⇒ Object
64 65 66 |
# File 'lib/mojinizer/conversion.rb', line 64
#
# Two-step conversion: the receiver is first run through hira_to_kata and
# the result of that is then run through roma_to_kata.
#
# @return [Object] the value returned by roma_to_kata
def katakana
  without_hiragana = hira_to_kata
  without_hiragana.roma_to_kata
end
#katakana? ⇒ Boolean
7 8 9 |
# File 'lib/mojinizer/detection.rb', line 7
#
# Tests the whole string against Moji::KATA.
#
# @return [Boolean] whatever moji_type? reports for Moji::KATA
def katakana?
  wanted = Moji::KATA
  moji_type?(wanted)
end
#moji_type?(type) ⇒ Boolean
35 36 37 38 |
# File 'lib/mojinizer/detection.rb', line 35
#
# Tells whether every character of the string is accepted by Moji.type?
# for the given type. Scanning stops at the first character that fails; an
# empty string yields true because no character fails the check.
#
# @param type [Object] a Moji type flag (or OR-ed combination of flags)
#   passed straight through to Moji.type?
# @return [Boolean] true if all characters match, false otherwise
def moji_type?(type)
  each_char.all? { |char| Moji.type?(char, type) }
end
#normalize_zen_han ⇒ Object
84 85 86 |
# File 'lib/mojinizer/conversion.rb', line 84
#
# Hands the receiver to Moji.normalize_zen_han and returns its result
# unchanged (see the Moji gem for the exact normalization rules).
#
# @return [Object] the value returned by Moji.normalize_zen_han
def normalize_zen_han
  source = self
  Moji.normalize_zen_han(source)
end
#roma_to_kata ⇒ Object
88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 |
# File 'lib/mojinizer/conversion.rb', line 88
#
# Converts romaji in the receiver to katakana using the ROM_TO_KATA1,
# ROM_TO_KATA2 and ROM_TO_KATA3 lookup tables. Characters that are not part
# of a recognised romaji pattern are copied to the output unchanged.
#
# The string is consumed one character at a time into a small buffer. The
# buffer grows while it could still become a longer table key and is
# flushed either as a table hit or one leading character at a time.
#
# Fix over the previous version: the buffer was capped at three characters,
# so the four-character key "xtsu" in ROM_TO_KATA3 could never match and
# "xtsu" came out as "xツ". A buffer holding "xts" now pulls in one more
# character and is looked up as a four-character pattern. When that lookup
# misses, the leading "x" is emitted and the rest is processed exactly as
# before, so all other inputs convert as they used to.
#
# @return [String] a new string with romaji replaced by katakana
def roma_to_kata
  result = ''.dup
  buffer = []
  chars = each_char.to_a

  loop do
    joined = buffer.join

    case buffer.size
    when 0
      # Nothing pending: finish, or start a new pattern.
      return result if chars.empty?

      buffer.push(chars.shift)
    when 1
      if joined =~ /[aiueo-]/
        # a-->ア
        result << ROM_TO_KATA1[joined]
        buffer.clear
      elsif joined =~ /[xkcgszjtdnhbpvfmyrlw']/
        # Possible start of a longer pattern. At end of input a lone "n"
        # becomes ン; any other pending letter is emitted as is.
        return result + joined.gsub(/n/, "ン") if chars.empty?

        buffer.push(chars.shift)
      else
        # Not romaji: pass through untouched.
        result << buffer.shift
      end
    when 2
      if ROM_TO_KATA2.key?(joined)
        result << ROM_TO_KATA2[joined]
        buffer.clear
      elsif joined =~ /([kgszjtcdnhbpmrl]y)|([stcd]h)|ts|(x[wytk])/
        # Prefix of a three-character pattern: consume the next letter.
        return result + joined.gsub(/n/, "ン") if chars.empty?

        buffer.push(chars.shift)
      elsif joined == "n'"
        # n'-->ン
        result << "ン"
        buffer.shift(2)
      elsif buffer[0] == "n"
        # nk-->ンk
        result << "ン"
        buffer.shift
      elsif buffer[0] == buffer[1]
        # kk-->ッk
        result << "ッ"
        buffer.shift
      else
        result << buffer.shift
      end
    when 3
      if ROM_TO_KATA3.key?(joined)
        result << ROM_TO_KATA3[joined]
        buffer.clear
      elsif joined == "xts" && !chars.empty?
        # Prefix of the four-character pattern "xtsu": look one letter ahead.
        buffer.push(chars.shift)
      elsif buffer[0] == "n"
        result << "ン"
        buffer.shift
      else
        result << buffer.shift
      end
    when 4
      if ROM_TO_KATA3.key?(joined)
        # xtsu-->ッ
        result << ROM_TO_KATA3[joined]
        buffer.clear
      else
        # Not "xtsu" after all: emit the leading letter and let the
        # three-character branch handle what is left.
        result << buffer.shift
      end
    end
  end
end
#romaji ⇒ Object
4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 |
# File 'lib/mojinizer/conversion.rb', line 4
#
# Converts kana in the receiver to romaji.
#
# Every character is first replaced through the KANA_TO_ROM table
# (characters without an entry are kept as is). That yields a raw form in
# which small kana appear as "x..." sequences (e.g. キャ --> "kixya"). A
# fixed, order-dependent series of substitutions then rewrites those raw
# sequences into conventional romaji.
#
# Fixes over the previous version:
# * The pattern that joins a free-standing "xtsu" to its neighbours had an
#   optional group, /\s(xtsu)?\s/, so ANY two adjacent whitespace characters
#   were replaced by "xtsu" ("a  b" --> "abb"). The group is now mandatory.
# * The "remove spaces before non-alphabetical chars" pattern used the range
#   A-z, which also covers [ \ ] ^ _ and backtick, so spaces before those
#   characters were not removed. The range is now A-Z. The literal "|" in
#   the character class is kept so that spaces before "|" are still
#   preserved as before.
#
# @return [String] a new string with kana replaced by romaji
def romaji
  s = ''.dup
  each_char { |c| s << KANA_TO_ROM.fetch(c, c) }

  # Stand-alone (han)dakuten marks ゛゜ following a syllable
  s = s.gsub(/(k)([aiueo])(")/, 'g\2').gsub(/(s)([aiueo])(")/, 'z\2').gsub(/(t)([aiueo])(")/, 'd\2')
  s = s.gsub(/(h)([aiueo])(")/, 'b\2').gsub(/([fh])([aiueo])(')/, 'p\2').gsub(/u"/, 'vu')
  #---------------------------------------------------------
  # Remove spaces before/after hanging 'っ'
  s = s.gsub(/\s(xtsu)\s/, 'xtsu')
  #---------------------------------------------------------
  # ッカ-->xtsuka-->kka. Repeated until stable so that runs of several っ
  # collapse as well.
  loop { break unless s.gsub!(/(xtsu)([ckgszjtdhfbpmyrwnv])/, '\2\2') }
  #---------------------------------------------------------
  # Compound Phoneme Pattern Rollbacks
  # NB: Uses regex backrefs like "\1y\3" where \1 = 1st capture grp, y='y' and \3 = 3rd capture grp
  #---------------------------------------------------------
  s = s.gsub(/( +x)(.*)/, 'x\2')                            # Avoid hanging small kana due to leading spaces
  s = s.gsub(/(ch)(ixy)([aueo])/, '\1\3')                   # チョ-->chixyo-->cho
  s = s.gsub(/([kgszjtdnhfbpmr])(ixy)([auo])/, '\1y\3')     # キャ-->kixya-->kya
  s = s.gsub(/([kgszjtdnhfbpmr])(ix)([ie])/, '\1y\3')       # キィ-->kixi-->kyi
  #---------------------------------------------------------
  s = s.gsub(/(sh)(y)([aueo])/, '\1\3')                     # シュ-->shyu-->shu
  s = s.gsub(/(j)(y)([aueo])/, '\1\3')                      # ジュ-->jyu-->ju
  #---------------------------------------------------------
  s = s.gsub(/([td])(exy)([aueo])/, '\1h\3')                # テャ-->texya-->tha
  s = s.gsub(/([td])(ex)([ie])/, '\1\3')                    # ティ-->texi-->ti
  s = s.gsub(/([td])(oxu)/, '\1oo')                         # ドゥ-->toxu-->too
  s = s.gsub(/(tsu)(x)([aiueo])/, 'ts\3')                   # ツァ-->tsuxa-->tsa
  s = s.gsub(/([d])(oxy)/, '\1o\'y')                        # ドュ-->doxyu-->doyu
  #---------------------------------------------------------
  s = s.gsub(/(vux)([aieo])/, 'v\2')                        # ヴァヴィヴェヴォ, ヴァ-->vuxa-->va
  s = s.gsub(/(vuxy)([aueo])/, 'vy\2')                      # ヴュ-->vuxyu-->vyu
  s = s.gsub(/(ixe)/, 'iye')                                # イェ-->ixe-->iye
  s = s.gsub(/(hoxe)/, 'howe')                              # ホェ-->hoxe-->howe
  s = s.gsub(/(fux)([aieo])/, 'f\2')                        # ファフィフェフォ, ファ-->fuxa-->fa
  s = s.gsub(/(fuxy)([aueo])/, 'fy\2')                      # フュ-->fuxyu-->fyu
  s = s.gsub(/(ux)([ieo])/, 'w\2')                          # ウァウィウェ, ウァ-->uxa-->wa
  #---------------------------------------------------------
  s = s.strip.gsub(/(xtsu)$/, 'h!')                         # Recombine hanging 'っ' followed by EOL
  s = s.gsub(/([aiueo]?)(\-)/, '\1\1')                      # Replace long-vowel marks by doubling the preceding vowel
  #---------------------------------------------------------
  # Cleanup specifically for source strings that contain spaces!
  s = s.gsub(/( +)([^a-z|A-Z])/, '\2')                      # Remove spaces before any non-alphabetical char
  s = s.gsub(/(n')/, 'n')                                   # ン-->nn-->n
  s = s.gsub(/(nn)/, 'n')                                   # ン-->nn-->n
  s = s.gsub(/( n)[^a-z|A-Z]?$/, 'n')                       # Fix "n" appearing as separate word
  s.gsub(/\s{2,}/, ' ')                                     # Remove duplicate spaces!
end
#zen_to_han ⇒ Object
80 81 82 |
# File 'lib/mojinizer/conversion.rb', line 80
#
# Hands the receiver to Moji.zen_to_han and returns its result unchanged
# (by its name a full-width to half-width conversion -- see the Moji gem).
#
# @return [Object] the value returned by Moji.zen_to_han
def zen_to_han
  source = self
  Moji.zen_to_han(source)
end
#zenkaku? ⇒ Boolean
23 24 25 |
# File 'lib/mojinizer/detection.rb', line 23
#
# Tests the whole string against Moji::ZEN.
#
# @return [Boolean] whatever moji_type? reports for Moji::ZEN
def zenkaku?
  wanted = Moji::ZEN
  moji_type?(wanted)
end