Class: HanziConverter

Inherits:
Object
  • Object
show all
Defined in:
lib/hanzi-converter.rb

Class Attribute Summary collapse

Class Method Summary collapse

Class Attribute Details

.data ⇒ Object

Returns the value of attribute data.



5
6
7
# File 'lib/hanzi-converter.rb', line 5

# Reader for the cached dictionary data.
#
# @return [Object, nil] whatever is currently stored in @data. load_data
#   assigns an Array with one Hash per dictionary entry; the value is nil if
#   nothing has been assigned yet.
def data
  @data
end

Class Method Details

.load_data ⇒ Object



7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
# File 'lib/hanzi-converter.rb', line 7

# Loads the bundled CC-CEDICT dictionary into @data, at most once.
#
# Each entry line becomes a Hash with the keys :traditional, :simplified,
# :pinyin (lower-cased) and :english (everything between the first and the
# last slash of the definition part, still separated by "/"). Comment lines
# starting with "#" and lines that do not follow the CC-CEDICT layout (blank
# lines, truncated entries) are skipped instead of raising.
#
# @return [nil]
# @raise [SystemCallError] if the dictionary file cannot be opened or read;
#   @data is left untouched in that case, so a later call can retry
def load_data
  return if @data

  file_path = File.expand_path('../../lib/data/cedict_ts.u8', __FILE__)
  entries = []

  # File.foreach closes the file once iteration finishes (or raises), and the
  # encoding option tags every line as UTF-8.
  File.foreach(file_path, encoding: 'utf-8') do |line|
    next if line.start_with?('#')

    # CC-CEDICT format:
    # Traditional Simplified [pin1 yin1] /English equivalent 1/equivalent 2/
    fields = line.match(%r{\A(\S+) (\S+) \[([^\]]*)\] /(.*)/})
    next unless fields

    entries << {
      traditional: fields[1],
      simplified: fields[2],
      pinyin: fields[3].downcase,
      english: fields[4]
    }
  end

  # Publish only after a complete read, so a failure never leaves an empty or
  # half-filled cache that the guard at the top would treat as loaded.
  @data = entries
  nil
end

.to_pinyin(text, options = {}) ⇒ Object



35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
# File 'lib/hanzi-converter.rb', line 35

# Converts the Chinese characters in +text+ to pinyin, leaving everything else
# untouched.
#
# Characters outside U+4E00..U+9FFF are copied through unchanged. At each
# Chinese character the longest dictionary word of up to four characters
# starting there is looked up via find_match; its pinyin, with the spaces
# between syllables removed, replaces the matched characters. A character with
# no dictionary match is copied through as-is.
#
# @param text [String] the text to convert
# @param options [Hash] accepted for API compatibility; not read by this method
# @return [String] a new string with the matched words replaced by pinyin
def to_pinyin(text, options = {})
  load_data if @data.nil?

  # Split once up front: indexing a multibyte string by character position on
  # every iteration would rescan it from the start each time.
  chars = text.chars
  result = ''.dup
  pos = 0

  while pos < chars.length
    char = chars[pos]

    unless char.ord.between?(0x4E00, 0x9FFF)
      # it's not a chinese character.
      result << char
      pos += 1
      next
    end

    # it's a chinese character. start by trying to find a long word match,
    # and if it fails, all the way down to a single hanzi. The window is
    # clamped to the remaining text so the same truncated candidate is never
    # looked up more than once and match_length always equals what matched.
    longest = [4, chars.length - pos].min
    match = nil
    match_length = 0
    longest.downto(1) do |length|
      match = find_match(chars[pos, length].join)
      match_length = length
      break if match
    end

    if match
      result << match[:pinyin].delete(' ')
      pos += match_length
    else
      # no dictionary entry at all: keep the character as it is.
      result << char
      pos += 1
    end
  end

  result
end