Class: Pinyin

Inherits:
Object
  • Object
show all
Defined in:
lib/chinese_pinyin.rb

Constant Summary collapse

# Tone-marked spellings for each pinyin vowel, keyed by the plain ASCII vowel.
# Each list is indexed by tone number minus one; the fifth entry is the
# unmarked (neutral tone) form. The key :v is the ASCII stand-in for "ü".
# Frozen so that the shared lookup table cannot be modified by accident.
TONE_MARK = {
  a: %w[ā á ǎ à a].freeze,
  o: %w[ō ó ǒ ò o].freeze,
  e: %w[ē é ě è e].freeze,
  i: %w[ī í ǐ ì i].freeze,
  u: %w[ū ú ǔ ù u].freeze,
  v: %w[ǖ ǘ ǚ ǜ ü].freeze
}.freeze

Class Attribute Summary collapse

Class Method Summary collapse

Class Attribute Details

.ruby2Object

Returns the value of attribute ruby2.



19
20
21
# File 'lib/chinese_pinyin.rb', line 19

# Reader for the @ruby2 flag. In the code shown here the flag is assigned by
# init_table from a RUBY_VERSION check and read by translate to choose the
# lookup-key format; it is nil until something has assigned it.
def ruby2
  @ruby2
end

.tableObject

Returns the value of attribute table.



18
19
20
# File 'lib/chinese_pinyin.rb', line 18

# Reader for @table, the character lookup hash that init_table fills from the
# data file: each key is the first space-delimited token of a line and each
# value is the rest of that line. It is nil until something has assigned it.
def table
  @table
end

Class Method Details

.init_tableObject



21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
# File 'lib/chinese_pinyin.rb', line 21

# Loads the character lookup table into @table the first time it is called;
# later calls return immediately.
#
# Also sets @ruby2, which translate uses to decide whether table keys are the
# characters themselves (pinyin-utf8.dat) or upper-case hexadecimal code
# points (Mandarin.dat).
#
# @return [void]
# @raise [SystemCallError] if the data file cannot be opened; nothing is
#   cached in that case, so the next call tries again
def init_table
  return if @table

  # From Ruby 2.0 on, UTF-8 is the default encoding, so the newer table keyed
  # directly by character is used for faster lookups. The check compares the
  # major version so that Ruby 3 and later take the same path as Ruby 2.
  @ruby2  = RUBY_VERSION.to_i >= 2
  datfile = @ruby2 ? 'pinyin-utf8.dat' : 'Mandarin.dat'
  path    = File.join(File.dirname(__FILE__), '..', 'data', datfile)

  # Build into a local and publish only after a complete read, so a failed
  # load is never mistaken for an (empty) initialized table.
  table = {}
  # Read as UTF-8 regardless of the process locale so that keys compare equal
  # to the UTF-8 characters translate looks up.
  File.open(path, 'r:UTF-8') do |file|
    file.each_line do |line|
      key, value = line.split(' ', 2)
      table[key] = value
    end
  end
  @table = table
end

.init_word_tableObject



37
38
39
40
41
42
43
44
45
46
47
48
49
50
# File 'lib/chinese_pinyin.rb', line 37

# Loads the optional user word table into @words_table the first time it is
# called; later calls return immediately.
#
# The file named by the WORDS_FILE environment variable is read line by line;
# each line is split at the first "|" into a phrase (the key) and its pinyin
# (the value). Without WORDS_FILE the table is left empty.
#
# @return [void]
# @raise [SystemCallError] if WORDS_FILE is set but cannot be opened; nothing
#   is cached in that case, so the next call tries again instead of silently
#   continuing with an empty table
def init_word_table
  return if @words_table

  words_file = ENV["WORDS_FILE"]
  table      = {}

  if words_file
    File.open(words_file) do |file|
      file.each_line do |line|
        # chomp strips "\n" as well as "\r\n", so files with Windows line
        # endings do not leave a stray "\r" on the entry.
        key, value = line.chomp.split('|', 2)
        table[key] = value
      end
    end
  end

  # Publish only after a complete read (see @raise above).
  @words_table = table
end

.translate(chars, options = {}) ⇒ Object Also known as: t



52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
# File 'lib/chinese_pinyin.rb', line 52

# Converts text containing Chinese characters into pinyin.
#
# If the whole string is a key of the user word table (see init_word_table),
# that entry's syllables are used. Otherwise the text is processed character
# by character: a character found in the lookup table is replaced by the
# first reading listed for it, ASCII letters and digits are copied through
# unchanged, and any other character is replaced by a splitter unless one was
# just emitted. Characters not matched by /./ (newlines) are skipped.
#
# @param chars [String] the text to convert
# @param options [Hash]
# @option options [String]  :splitter  (' ') text placed between syllables
# @option options [Boolean] :tonemarks (false) write tones as diacritics
#   ("zhōng"); the tone digit is then always consumed, whatever :tone says
# @option options [Boolean] :tone (value of :tonemarks) keep the trailing
#   tone digit ("zhong1")
# @option options [Boolean] :camelcase (false) capitalize every syllable
# @yield [pinyin, size] for each table hit in the per-character path; the
#   block's result is appended instead of the syllable and no splitter is
#   added after it. size is the number of pieces collected so far.
# @return [String] the converted text
def translate(chars, options={})
  splitter  = options.fetch(:splitter, ' ')
  tonemarks = options.fetch(:tonemarks, false)
  tone      = options.fetch(:tone, tonemarks)
  camel     = options.fetch(:camelcase, false)

  init_word_table
  phrase = @words_table[chars]
  if phrase
    syllables = phrase.split.map(&:downcase)
    if tonemarks
      syllables.map! { |syllable| apply_tone_mark(syllable) }
    elsif !tone
      # Word-table entries are free-form, so only a real trailing digit is cut.
      syllables.map! { |syllable| syllable =~ /\d\z/ ? syllable.chop : syllable }
    end
    syllables.map!(&:capitalize) if camel

    return syllables.join(splitter)
  end

  init_table
  results    = []
  is_english = false

  chars.scan(/./).each do |char|
    # The legacy table is keyed by upper-case hexadecimal code point.
    key   = @ruby2 ? char : sprintf("%X", char.unpack("U").first)
    entry = @table[key]

    if entry
      results << splitter if is_english
      is_english = false

      pinyin = entry.chomp.split(' ', 2)[0]
      pinyin = pinyin.downcase unless @ruby2
      if tonemarks
        pinyin = apply_tone_mark(pinyin)
      elsif !tone
        pinyin = pinyin.chop
      end
      # Capitalize last: the tone-mark lookup matches lowercase vowels only,
      # so capitalizing first would hide a leading vowel from it.
      pinyin = pinyin.capitalize if camel

      if block_given?
        results << (yield pinyin, results.size)
      else
        results << pinyin
        results << splitter
      end
    else
      if char =~ /[a-zA-Z0-9]/
        results << char
      elsif results.last != splitter
        results << splitter
      end
      is_english = true
    end
  end
  results.join('').chomp(splitter)
end

# Internal helper for translate: turns a lowercase numbered syllable such as
# "zhong1" into its tone-marked spelling "zhōng". A syllable that does not
# end in a digit from 0 to 5 is returned unchanged.
def apply_tone_mark(syllable)
  return syllable unless syllable =~ /[0-5]\z/

  tone_number = syllable[-1].to_i
  base        = syllable.chop

  # a, o and e carry the mark whenever present; otherwise it goes on the last
  # of i/u/v, which puts it on the u of "iu" and on the i of "ui".
  vowel = %w(a o e).find { |v| base.include?(v) } || base.scan(/[iuv]/).last
  if vowel
    # Index -1 (tone 0) wraps around to the final, unmarked entry.
    base = base.sub(vowel, TONE_MARK[vowel.to_sym][tone_number - 1])
  end

  # A "v" that did not receive the mark still stands for the plain u-umlaut.
  base.tr('v', TONE_MARK[:v].last)
end
private :apply_tone_mark