Module: ZhongwenTools::Romanization::Pinyin

Defined in:
lib/zhongwen_tools/romanization/pinyin.rb

Overview

Public: methods to convert, detect and split pinyin or pyn (pinyin with numbers, e.g. hao3).

Class Method Summary collapse

Class Method Details

.add_hyphens_to_pyn(str) ⇒ Object



103
104
105
106
107
108
109
# File 'lib/zhongwen_tools/romanization/pinyin.rb', line 103

# Public: hyphenates the pieces that split_pyn finds in each word.
#
# str - a String; it is split on whitespace and every resulting word is
#       handed to split_pyn.
#
# Returns a String: the pieces of each word joined with '-', and the words
# joined back together with single spaces.
def self.add_hyphens_to_pyn(str)
  hyphenated_words = str.split(' ').map { |word| split_pyn(word).join('-') }
  hyphenated_words.join(' ')
end

.py?(str) ⇒ Boolean

Public: checks if a string is pinyin.

http://en.wikipedia.org/wiki/Pinyin

Examples

py?('nǐ hǎo')
# => true

Returns Boolean.

Returns:

  • (Boolean)


73
74
75
76
77
78
79
80
81
82
83
84
# File 'lib/zhongwen_tools/romanization/pinyin.rb', line 73

# Public: checks if a string is pinyin.
#
# http://en.wikipedia.org/wiki/Pinyin
#
# str - the String to check.
#
# Examples
#
#   py?('nǐ hǎo')
#   # => true
#
# Returns Boolean.
def self.py?(str)
  # Nothing matching Regex.only_tones and no digit 1-5: defer to pyn?.
  return pyn?(str) if str[Regex.only_tones].nil? && str[/[1-5]/].nil?

  # TODO: py regex does not include capitals with tones.
  # NOTE: Special Case "fǎnguāng" should be "fǎn" + "guāng"

  allowed = /(#{ Regex.punc }|#{ Regex.py }|#{ Regex.py_syllabic_nasals }|[\s\-])/
  hyphenated = str.gsub('ngu', 'n-gu')

  # The string passes when removing every allowed piece leaves nothing behind.
  leftover = Caps.downcase(hyphenated).gsub(allowed, '').strip
  leftover.empty?
end

.pyn?(str) ⇒ Boolean

Public: checks if a string is pyn (pinyin with tone numbers, e.g. hao3).

Examples

pyn?('pin1-yin1')
# => true

Returns Boolean.

Returns:

  • (Boolean)


93
94
95
96
97
98
99
100
101
# File 'lib/zhongwen_tools/romanization/pinyin.rb', line 93

# Public: checks if a string is pyn (pinyin with tone numbers).
#
# str - the String to check. Punctuation (Regex.punc), whitespace and hyphens
#       are removed and the result is passed through Caps.downcase before
#       the check.
#
# Examples
#
#   pyn?('pin1-yin1')
#   # => true
#
# Returns Boolean.
def self.pyn?(str)
  # FIXME: use strip_punctuation method
  normalized_str = Caps.downcase(str.gsub(Regex.punc, '').gsub(/[\s\-]/, ''))
  pyn_arr = split_pyn(normalized_str)

  # split_pyn found nothing: treat the whole string as one syllable when,
  # with the digits 1-5 removed, it is listed in PYN_SYLLABIC_NASALS.
  if pyn_arr.empty? && PYN_SYLLABIC_NASALS.include?(normalized_str.gsub(/[1-5]/, ''))
    pyn_arr << normalized_str
  end

  pyn_matches_properly?(pyn_arr, normalized_str) &&
    are_all_pyn_syllables_complete?(pyn_arr)
end

.split_py(str) ⇒ Object



47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
# File 'lib/zhongwen_tools/romanization/pinyin.rb', line 47

# Public: splits a pinyin string into its pieces.
#
# str - a String; it is split on whitespace and each word is processed
#       separately.
#
# Each word goes through normalize_pinyin, normalize_n_g and normalize_n, is
# split on apostrophes and hyphens, and every chunk is resolved with find_py.
# The flattened pieces are then handed to recapitalize together with the
# capitalization flag reported by normalize_pinyin.
#
# Examples
#
#   split_py('wányìr')
#   # => ['wán', 'yì', 'r']
#
# Returns the per-word results of recapitalize, concatenated by flat_map.
def self.split_py(str)
  str.split(' ').flat_map do |raw_word|
    normalized, is_capitalized = normalize_pinyin(raw_word)
    normalized = normalize_n(normalize_n_g(normalized))

    pieces = normalized.split(/['\-]/).map { |chunk| find_py(chunk) }

    # NOTE: Special Case split_py('wányìr')   # => ['wán', 'yì', 'r']
    trailing_r = normalized[/(.*[^#{ Regex.py_tones['e'] }.])(r)$/]
    pieces << 'r' unless trailing_r.nil?

    recapitalize(pieces.flatten, is_capitalized)
  end
end

.split_pyn(str) ⇒ Object



40
41
42
43
44
45
# File 'lib/zhongwen_tools/romanization/pinyin.rb', line 40

# Public: scans a string for pyn pieces.
#
# str - the String to scan.
#
# When str contains no digit 1-5 only Regex.pinyin_toneless is used; otherwise
# Regex.pyn is tried first, with Regex.pinyin_toneless as the alternative.
#
# Returns an Array of Strings, one per match: the match is stripped and cut
# at its first hyphen.
def self.split_pyn(str)
  # FIXME: ignore punctuation
  syllable_regex =
    if str[/[1-5]/].nil?
      /(#{ Regex.pinyin_toneless })/
    else
      /(#{ Regex.pyn }|#{ Regex.pinyin_toneless })/
    end

  # NOTE: p[/[^\-]*/].to_s is 25% faster than gsub('-', '')
  str.scan(syllable_regex).map { |captures| captures.first.strip[/[^\-]*/].to_s }
end