Class: JapaneseString

Inherits:
UnicodeString show all
Defined in:
lib/unicode_madness/japanese_string.rb

Constant Summary collapse

KANJI_CLASS =

A string that can be used in a regular expression character class to match any kanji character. (Example: /[#{KANJI_CLASS}]/)

"#{UCSCodepoint.new(0x4e00)}-#{UCSCodepoint.new(0x9fbf)}" +
"#{UCSCodepoint.new(0x3400)}-#{UCSCodepoint.new(0x4dbf)}" +
"#{UCSCodepoint.new(0x20000)}-#{UCSCodepoint.new(0x2a6df)}"
KATAKANA_CLASS =

A string that can be used in a regular expression character class to match any katakana character. (Example: /[#{KATAKANA_CLASS}]/)

"#{UCSCodepoint.new(0x30a2)}-#{UCSCodepoint.new(0x30ff)}"
KANA_CLASS =

A string that can be used in a regular expression character class to match any hiragana or katakana character. (Example: /[#{KANA_CLASS}]/)

"#{UCSCodepoint.new(0x3040)}-#{UCSCodepoint.new(0x30ff)}" +
"#{UCSCodepoint.new(0x31f0)}-#{UCSCodepoint.new(0x31ff)}"
KATAKANA_TO_HIRAGANA =

Table for converting katakana to their equivalent hiragana.

{
  '' => '', '' => '', '' => '', '' => '', '' => '', '' => '',
  '' => '', '' => '', '' => '', '' => '', '' => '', '' => '',
  '' => '', '' => '', '' => '', '' => '', '' => '', '' => '',
  '' => '', '' => '', '' => '', '' => '', '' => '', '' => '',
  '' => '', '' => '', '' => '', '' => '', '' => '', '' => '',
  '' => '', '' => '', '' => '', '' => '', '' => '', '' => '',
  '' => '', '' => '', '' => '', '' => '', '' => '', '' => '',
  '' => '', '' => '', '' => '', '' => '', '' => '', '' => '',
  '' => '', '' => '', '' => '', '' => '', '' => '', '' => '',
  '' => '', '' => '', '' => '', '' => '', '' => '', '' => '',
  '' => '', '' => '', '' => '', '' => '', '' => '', '' => '',
  '' => '', '' => '', '' => '', '' => '', '' => '', '' => '',
  '' => '', '' => '', '' => '', '' => '', '' => '', '' => '',
  '' => '', '' => '', '' => '', '' => ''
}
UNVOICED_KANA =

Table for converting voiced hiragana and katakana to their unvoiced forms.

{
  '' => '', '' => '', '' => '', '' => '', '' => '', '' => '',
  '' => '', '' => '', '' => '', '' => '', '' => '', '' => '',
  '' => '', '' => '', '' => '', '' => '', '' => '', '' => '',
  '' => '', '' => '', '' => '', '' => '', '' => '', '' => '',
  '' => '', '' => '', '' => '', '' => '', '' => '', '' => '',
  '' => '', '' => '', '' => '', '' => '', '' => '', '' => '',
  '' => '', '' => '', '' => '', '' => '', '' => '', '' => '',
  '' => '', '' => '', '' => '', '' => '', '' => '', '' => '',
  '' => '', '' => '', '' => ''
}
VOICED_KANA =

Table for converting unvoiced hiragana and katakana to their voiced forms.

{
  '' => '', '' => '', '' => '', '' => '', '' => '', '' => '',
  '' => '', '' => '', '' => '', '' => '', '' => '', '' => '',
  '' => '', '' => '', '' => '', '' => '', '' => '', '' => '',
  '' => '', '' => '', '' => '', '' => '', '' => '', '' => '',
  '' => '', '' => '', '' => '', '' => '', '' => '', '' => '',
  '' => '', '' => '', '' => '', '' => '', '' => '', '' => '',
  '' => '', '' => '', '' => '', '' => '', '' => '', '' => '',
  '' => '', '' => '', '' => '', '' => '', '' => '', '' => '',
  '' => '', '' => '', '' => ''
}
KANA_ROMAJI_MAP =

Maps kana to their romanized equivalents. Also maps full-width Latin characters to their ASCII equivalents.

{
	"" => "a", "" => "i", "" => "u", "" => "e", "" => "o", "" => "ka",
	"" => "ki", "" => "ku", "" => "ke", "" => "ko", "" => "sa",
	"" => "shi", "" => "su", "" => "se", "" => "so", "" => "ta",
	"" => "chi", "" => "tsu", "" => "te", "" => "to", "" => "na",
	"" => "ni", "" => "nu", "" => "ne", "" => "no", "" => "ha",
	"" => "hi", "" => "fu", "" => "he", "" => "ho", "" => "ma",
	"" => "mi", "" => "mu", "" => "me", "" => "mo", "" => "ya",
	"" => "yu", "" => "yo", "" => "ra", "" => "ri", "" => "ru",
	"" => "re", "" => "ro", "" => "wa", "" => "wi", "" => "we",
	"" => "wo", "" => "n", "" => "ga", "" => "gi", "" => "gu",
	"" => "ge", "" => "go", "" => "za", "" => "ji", "" => "zu",
	"" => "ze", "" => "zo", "" => "da", "" => "ji", "" => "zu",
	"" => "de", "" => "do", "" => "ba", "" => "bi", "" => "bu",
	"" => "be", "" => "bo", "" => "pa", "" => "pi", "" => "pu",
	"" => "pe", "" => "po", "" => "a", "" => "i", "" => "u", "" => "e",
	"" => "o", "" => "ka", "" => "ki", "" => "ku", "" => "ke",
	"" => "ko", "" => "sa", "" => "shi", "" => "su", "" => "se",
	"" => "so", "" => "ta", "" => "chi", "" => "tsu", "" => "te",
	"" => "to", "" => "na", "" => "ni", "" => "nu", "" => "ne",
	"" => "no", "" => "ha", "" => "hi", "" => "fu", "" => "he",
	"" => "ho", "" => "ma", "" => "mi", "" => "mu", "" => "me",
	"" => "mo", "" => "ya", "" => "yu", "" => "yo", "" => "ra",
	"" => "ri", "" => "ru", "" => "re", "" => "ro", "" => "wa",
	"" => "wi", "" => "we", "" => "wo", "" => "n", "" => "ga",
	"" => "gi", "" => "gu", "" => "ge", "" => "go", "" => "za",
	"" => "ji", "" => "zu", "" => "ze", "" => "zo", "" => "da",
	"" => "ji", "" => "zu", "" => "de", "" => "do", "" => "ba",
	"" => "bi", "" => "bu", "" => "be", "" => "bo", "" => "pa",
	"" => "pi", "" => "pu", "" => "pe", "" => "po", "" => "vu",
	"" => " ", "" => "0", "" => "1", "" => "2", "" => "3", "" => "4",
	"" => "5", "" => "6", "" => "7", "" => "8", "" => "9", "" => "!",
	"" => "\"", "" => "#", "" => "\$", "" => "%", "" => "&", "" => "'",
	"" => "(", "" => ")", "" => "*", "" => "+", "" => ".", "" => "-",
	"" => ".", "" => "/", "" => ":", "" => ";", "" => "<", "" => "=",
	"" => ">", "" => "?", "" => "\@", "" => "A", "" => "B", "" => "C",
	"" => "D", "" => "E", "" => "F", "" => "G", "" => "H", "" => "I",
	"" => "J", "" => "K", "" => "L", "" => "M", "" => "N", "" => "O",
	"" => "P", "" => "Q", "" => "R", "" => "S", "" => "T", "" => "U",
	"" => "V", "" => "W", "" => "X", "" => "Y", "" => "Z", "" => "[",
	"" => "\\", "" => "]", "" => "^", "_" => "_", "" => "`", "" => "a",
	"" => "b", "" => "c", "" => "d", "" => "e", "" => "f", "" => "g",
	"" => "h", "" => "i", "" => "j", "" => "k", "" => "l", "" => "m",
	"" => "n", "" => "o", "" => "p", "" => "q", "" => "r", "" => "s",
	"" => "t", "" => "u", "" => "v", "" => "w", "" => "x", "" => "y",
	"" => "z", "" => "{", "" => "|", "" => "}", "" => "-"
}

Instance Method Summary collapse

Methods inherited from UnicodeString

#codepoint, #index_to_uindex, #kana?, #kanji?, #uindex, #uindex_to_index, #uslice, #wide_latin?

Instance Method Details

#kana_sort_key ⇒ Object

Creates a 7-bit-safe string that can be used to sort strings containing kana and/or English text.



244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
# File 'lib/unicode_madness/japanese_string.rb', line 244

# Creates a 7-bit-safe string that can be used to sort strings containing
# kana and/or English text.
#
# Each character of the downcased receiver is encoded as follows:
# * a digit has its first byte lowered by 15 ('0'..'9' become '!'..'*')
# * a lowercase ASCII letter has its first byte lowered by 54
#   ('a'..'z' become '+'..'D')
# * a character whose KANA_SORT_MAP value is numeric becomes the single
#   character with code (value + 70)
# * a character whose KANA_SORT_MAP value is not numeric is replaced by that
#   value, which is then encoded again by these same rules
# * anything else is dropped
#
# @return [String] the sort key
def kana_sort_key
  # Returns a copy of str with its first byte lowered by offset. Working on a
  # copy matters: after the redo below, ch may be the very object stored in
  # KANA_SORT_MAP, and editing it in place would corrupt the table.
  # Byte-level unpack/pack is used instead of ch[0] arithmetic because
  # String#[] returns a String (not an Integer) on Ruby 1.9 and later.
  shift_first_byte = lambda do |str, offset|
    bytes = str.unpack('C*')
    bytes[0] -= offset
    bytes.pack('C*')
  end

  pieces = []
  downcase.split('').each do |ch|
    if ch =~ /[0-9]/
      ch = shift_first_byte.call(ch, 15) # '0'..'9' => '!'..'*'
    elsif ch =~ /[a-z]/
      ch = shift_first_byte.call(ch, 54) # 'a'..'z' => '+'..'D'
    elsif KANA_SORT_MAP.has_key?(ch)
      ch = KANA_SORT_MAP[ch]
      if ch.kind_of?(Numeric)
        ch = (ch + 70).chr
      else
        # Re-run this iteration on the mapped value; redo does not rebind the
        # block parameter, so ch keeps the value assigned just above.
        redo
      end
    else
      next
    end
    pieces << ch
  end
  pieces.join
end

#romanize(warnings = true) ⇒ Object

Creates a new string by romanizing the kana in this string. Full-width Latin characters are also converted to their ASCII equivalents. If warnings is true (the default), a message is printed on STDERR if an un-romanizable character is encountered.



159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
# File 'lib/unicode_madness/japanese_string.rb', line 159

# Creates a new string by romanizing the kana in this string. Characters are
# looked up in KANA_ROMAJI_MAP, which also maps full-width Latin characters
# to their ASCII equivalents.
#
# @param warnings [Boolean] when true (the default), a message is printed on
#   STDERR for every character that is still not romanized after lookup
# @return [String] the romanized copy; the receiver is left untouched
def romanize(warnings = true)
  romanized = String.new(self)

  # Two-kana sequences (palatalized sounds and extended kana) have to be
  # replaced before the per-character pass below, which would otherwise
  # romanize each half on its own. Pairs are applied in the order listed.
  # NOTE(review): 'ツィ' yields 'tsui' here; Hepburn usually writes 'tsi' --
  # confirm which spelling callers expect before changing it.
  [
    ['きゃ', 'kya'], ['キャ', 'kya'], ['きゅ', 'kyu'], ['キュ', 'kyu'],
    ['きょ', 'kyo'], ['キョ', 'kyo'],
    ['しゃ', 'sha'], ['シャ', 'sha'], ['しゅ', 'shu'], ['シュ', 'shu'],
    ['しぇ', 'she'], ['シェ', 'she'], ['しょ', 'sho'], ['ショ', 'sho'],
    ['ちゃ', 'cha'], ['チャ', 'cha'], ['ちゅ', 'chu'], ['チュ', 'chu'],
    ['ちょ', 'cho'], ['チョ', 'cho'],
    ['にゃ', 'nya'], ['ニャ', 'nya'], ['にゅ', 'nyu'], ['ニュ', 'nyu'],
    ['にょ', 'nyo'], ['ニョ', 'nyo'],
    ['ひゃ', 'hya'], ['ヒャ', 'hya'], ['ひゅ', 'hyu'], ['ヒュ', 'hyu'],
    ['ひょ', 'hyo'], ['ヒョ', 'hyo'],
    ['みゃ', 'mya'], ['ミャ', 'mya'], ['みゅ', 'myu'], ['ミュ', 'myu'],
    ['みょ', 'myo'], ['ミョ', 'myo'],
    ['りゃ', 'rya'], ['リャ', 'rya'], ['りゅ', 'ryu'], ['リュ', 'ryu'],
    ['りょ', 'ryo'], ['リョ', 'ryo'],
    ['ぎゃ', 'gya'], ['ギャ', 'gya'], ['ぎゅ', 'gyu'], ['ギュ', 'gyu'],
    ['ぎょ', 'gyo'], ['ギョ', 'gyo'],
    ['じゃ', 'ja'], ['ジャ', 'ja'], ['じゅ', 'ju'], ['ジュ', 'ju'],
    ['じょ', 'jo'], ['ジョ', 'jo'],
    ['ぢゃ', 'ja'], ['ヂャ', 'ja'], ['ぢゅ', 'ju'], ['ヂュ', 'ju'],
    ['ぢょ', 'jo'], ['ヂョ', 'jo'],
    ['びゃ', 'bya'], ['ビャ', 'bya'], ['びゅ', 'byu'], ['ビュ', 'byu'],
    ['びょ', 'byo'], ['ビョ', 'byo'],
    ['ぴゃ', 'pya'], ['ピャ', 'pya'], ['ぴゅ', 'pyu'], ['ピュ', 'pyu'],
    ['ぴょ', 'pyo'], ['ピョ', 'pyo'],
    # Extended kana. Hiragana 'でぃ' is de + small i, so it romanizes as
    # 'di', matching its katakana counterpart 'ディ'.
    ['ふぁ', 'fa'], ['でぃ', 'di'],
    ['イェ', 'ye'], ['ウィ', 'wi'], ['ウェ', 'we'], ['ウォ', 'wo'],
    ['ヴァ', 'va'], ['ヴィ', 'vi'], ['ヴゥ', 'vu'], ['ヴェ', 've'],
    ['ヴォ', 'vo'],
    ['ジェ', 'je'], ['チェ', 'che'], ['ティ', 'ti'], ['トゥ', 'tu'],
    ['ディ', 'di'], ['ドゥ', 'du'], ['デュ', 'dyu'],
    ['ツァ', 'tsa'], ['ツェ', 'tse'], ['ツォ', 'tso'],
    ['ファ', 'fa'], ['フィ', 'fi'], ['フェ', 'fe'], ['フォ', 'fo'],
    ['フュ', 'fyu'],
    ['スィ', 'si'], ['ゲィ', 'gei'], ['ワァ', 'waa'], ['ツィ', 'tsui'],
    ['シィ', 'shii'], ['ウァ', 'ua'], ['ヴュ', 'vyu'], ['クォ', 'quo'],
    ['テュ', 'tu'], ['グィ', 'gui'], ['クェ', 'que'], ['ビィ', 'bii'],
    ['ズィ', 'zi'], ['リィ', 'rii']
  ].each { |kana, romaji| romanized.gsub!(kana, romaji) }

  # Do simple one-character conversions. Letter-doublers (small tsu and the
  # katakana dash) are allowed through here and resolved afterwards.
  chars = romanized.split('').map do |ch|
    converted = KANA_ROMAJI_MAP.has_key?(ch) ? KANA_ROMAJI_MAP[ch] : ch
    if warnings && converted !~ /\A[ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz&\d\.\-ッっー ]+\Z/
      STDERR.puts "Couldn't romanize #{ch} in #{self}"
    end
    converted
  end
  romanized = chars.join('')

  # Convert letter-doublers: small tsu doubles the character that follows it,
  # the katakana dash doubles the character that precedes it.
  romanized.gsub!(/[ッっ](.)/, '\1\1')
  romanized.gsub!(/(.)ー/, '\1\1')

  romanized
end

#to_hiragana ⇒ Object

Returns a new string with this string’s katakana replaced with equivalent hiragana.



115
116
117
118
119
120
121
122
123
124
125
# File 'lib/unicode_madness/japanese_string.rb', line 115

# Builds a copy of this string in which every character that appears as a key
# in KATAKANA_TO_HIRAGANA is swapped for its table value; all other
# characters are carried over unchanged. The result is an instance of the
# receiver's class.
def to_hiragana
  converted = split('').inject('') do |result, char|
    result + KATAKANA_TO_HIRAGANA.fetch(char, char)
  end
  self.class.new(converted)
end

#unvoice_kana ⇒ Object

Returns a new string with this string’s voiced hiragana and katakana replaced with their unvoiced forms.



129
130
131
132
133
134
135
136
137
138
139
# File 'lib/unicode_madness/japanese_string.rb', line 129

# Builds a copy of this string in which every character that appears as a key
# in UNVOICED_KANA is swapped for its table value; all other characters are
# carried over unchanged. The result is an instance of the receiver's class.
def unvoice_kana
  converted = split('').inject('') do |result, char|
    result + UNVOICED_KANA.fetch(char, char)
  end
  self.class.new(converted)
end

#voice_kana ⇒ Object

Returns a new string with this string’s unvoiced hiragana and katakana replaced with their voiced forms.



143
144
145
146
147
148
149
150
151
152
153
# File 'lib/unicode_madness/japanese_string.rb', line 143

# Builds a copy of this string in which every character that appears as a key
# in VOICED_KANA is swapped for its table value; all other characters are
# carried over unchanged. The result is an instance of the receiver's class.
def voice_kana
  converted = split('').inject('') do |result, char|
    result + VOICED_KANA.fetch(char, char)
  end
  self.class.new(converted)
end