Class: HanziToPinyin

Inherits:
Object
  • Object
show all
Defined in:
lib/hanzi_to_pinyin.rb

Constant Summary collapse

VERSION =
IO.read File.expand_path("../../VERSION",__FILE__)
@@hanzi_unicode_start =

Unicode中汉字开始点(10进制,即 0x4E00)

19968
@@hanzi_unicode_end =

Unicode中汉字的结束点

40869
@@letter_upcase_start =

字母(10进制)

65
@@letter_upcase_end =
90
@@letter_downcase_start =
97
@@letter_downcase_end =
122
@@number_unicode_start =

数字(10进制)

48
@@number_unicode_end =
57
@@underline =

下划线(10进制)

95
@@dash =

横线(10进制)

45
@@unicode =

汉字 unicode 编码(16进制)

YAML.load(IO.read File.expand_path("../data/unicode_to_pinyin.yml",__FILE__))
@@py =
::JSON.parse(IO.read File.expand_path("../data/hz2py.json",__FILE__))

Class Method Summary collapse

Class Method Details

.append(values) ⇒ Object



114
115
116
117
118
119
120
121
122
123
124
# File 'lib/hanzi_to_pinyin.rb', line 114

# Appends one character's readings to the shared @str buffer.
#
# The readings are joined with "," and a ";" is inserted in front of them
# unless the buffer is still empty or already ends with ";".
#
# @param values [Array<String>] readings to append
# @return [String] the @str buffer itself
def self.append(values)
  readings = values.join(',')
  separator = @str.empty? || @str.end_with?(';') ? '' : ';'
  @str << separator << readings
end

.hanzi_2_pinyin(hanzi) ⇒ Object Also known as: hanzi_to_pinyin

只取首字母



35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
# File 'lib/hanzi_to_pinyin.rb', line 35

# Converts a string to pinyin codepoint by codepoint.
#
# * Codepoints accepted by is_hanzi? are replaced by the @@unicode entry
#   keyed by their upper-case hexadecimal codepoint.
# * A dash becomes "_".
# * Every other character is copied through downcased.
#
# The input is coerced with to_s and copied before being re-tagged as UTF-8,
# so frozen strings and nil are accepted and the caller's string is never
# modified.
#
# @param hanzi [#to_s] text to convert
# @return [String] the converted text
def self.hanzi_2_pinyin(hanzi)
  text = hanzi.to_s.dup.force_encoding('utf-8')
  result = +''
  text.each_codepoint do |c|
    result <<
      if is_hanzi?(c)
        @@unicode[c.to_s(16).upcase]
      elsif is_dash?(c)
        '_'
      else
        # Integer#chr without an encoding raises RangeError above 255 and
        # returns a binary byte for 128..255, so build a UTF-8 character.
        c.chr(Encoding::UTF_8).downcase
      end
  end
  result
end

.hanzi_2_py(hanzi) ⇒ Object Also known as: hanzi_to_py

多音字的各个读音之间用“,”分隔,字与字之间用“;”分隔,字母原样保留

查理Smith => "cha,zha;li;Smith"
郭轶 => "guo;yi,die"
我们 => "wo;men"
宗志强 => "zong;zhi;qiang,jiang"


62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
# File 'lib/hanzi_to_pinyin.rb', line 62

# Converts a string to a ";"-separated list of pinyin readings.
#
# Each character accepted by is_hanzi? is looked up in @@py and its readings
# are added through append (joined with ","). Other characters are copied
# unchanged. A ";" is inserted before a copied character unless:
# * the buffer is empty or already ends with ";", or
# * the buffer ends with a letter and the previous input character was not
#   a hanzi (so a run of plain letters stays together).
#
# The input is coerced with to_s and copied before being re-tagged as UTF-8,
# so frozen strings and nil are accepted and the caller's string is never
# modified.
#
# NOTE: the result is accumulated in the class-level @str buffer shared with
# append, so concurrent calls would interfere with each other.
#
# @param hanzi [#to_s] text to convert
# @return [String] the converted text
def self.hanzi_2_py(hanzi)
  text = hanzi.to_s.dup.force_encoding('utf-8')
  @str = +''
  # Tracks whether the previous input character was a hanzi; this replaces
  # an indexed lookup of the preceding character.
  previous_was_hanzi = false
  text.each_char do |hz|
    if is_hanzi?(hz.ord)
      append(@@py[hz])
      previous_was_hanzi = true
    else
      needs_separator =
        if @str.empty? || @str.end_with?(';')
          false
        elsif @str[-1] =~ /[a-z]/i
          # The trailing letter belongs to a pinyin reading only when the
          # previous input character was a hanzi.
          previous_was_hanzi
        else
          true
        end
      @str << ';' if needs_separator
      @str << hz
      previous_was_hanzi = false
    end
  end
  @str
end

.hanzi_2_url(hanzi) ⇒ Object Also known as: hanzi_to_url

汉字转化为安全的 url



97
98
99
100
101
102
103
104
105
106
107
108
109
# File 'lib/hanzi_to_pinyin.rb', line 97

# Builds a URL-safe string from the input.
#
# Each character accepted by is_hanzi? is replaced by the first entry of its
# @@py value; every other character is kept as is. The pieces are joined
# with "-" and the result is passed through CGI.escape.
#
# The input is coerced with to_s and copied before being re-tagged as UTF-8,
# so frozen strings and nil are accepted and the caller's string is never
# modified.
#
# @param hanzi [#to_s] text to convert
# @return [String] the escaped, dash-joined text
def self.hanzi_2_url(hanzi)
  text = hanzi.to_s.dup.force_encoding('utf-8')
  parts = text.each_char.map do |hz|
    is_hanzi?(hz.ord) ? @@py[hz].first : hz
  end
  CGI.escape(parts.join('-'))
end

.is_dash?(codepoint) ⇒ Boolean

Returns:

  • (Boolean)


138
139
140
# File 'lib/hanzi_to_pinyin.rb', line 138

# Tells whether the codepoint equals the class-level @@dash value.
#
# @param codepoint [Integer] codepoint to test
# @return [Boolean]
def self.is_dash?(codepoint)
  @@dash == codepoint
end

.is_hanzi?(hanzi_codepoint) ⇒ Boolean

Returns:

  • (Boolean)


126
127
128
# File 'lib/hanzi_to_pinyin.rb', line 126

# Tells whether the codepoint lies in the inclusive range bounded by
# @@hanzi_unicode_start and @@hanzi_unicode_end.
#
# @param hanzi_codepoint [Integer] codepoint to test
# @return [Boolean]
def self.is_hanzi?(hanzi_codepoint)
  hanzi_codepoint.between?(@@hanzi_unicode_start, @@hanzi_unicode_end)
end

.is_letter?(codepoint) ⇒ Boolean

Returns:

  • (Boolean)


142
143
144
# File 'lib/hanzi_to_pinyin.rb', line 142

# Tells whether the codepoint lies in either of two inclusive ranges: the
# upper-case one (@@letter_upcase_start..@@letter_upcase_end) or the
# lower-case one (@@letter_downcase_start..@@letter_downcase_end).
#
# @param codepoint [Integer] codepoint to test
# @return [Boolean]
def self.is_letter?(codepoint)
  codepoint.between?(@@letter_upcase_start, @@letter_upcase_end) ||
    codepoint.between?(@@letter_downcase_start, @@letter_downcase_end)
end

.is_number?(number_codepoint) ⇒ Boolean

Returns:

  • (Boolean)


130
131
132
# File 'lib/hanzi_to_pinyin.rb', line 130

# Tells whether the codepoint lies in the inclusive range bounded by
# @@number_unicode_start and @@number_unicode_end.
#
# @param number_codepoint [Integer] codepoint to test
# @return [Boolean]
def self.is_number?(number_codepoint)
  number_codepoint.between?(@@number_unicode_start, @@number_unicode_end)
end

.is_underline?(underline_codepoint) ⇒ Boolean

Returns:

  • (Boolean)


134
135
136
# File 'lib/hanzi_to_pinyin.rb', line 134

# Tells whether the codepoint equals the class-level @@underline value.
#
# @param underline_codepoint [Integer] codepoint to test
# @return [Boolean]
def self.is_underline?(underline_codepoint)
  @@underline == underline_codepoint
end