Class: PinYin::Backend::MMSeg

Inherits:
Object
  • Object
show all
Defined in:
lib/ruby-pinyin/backend/mmseg.rb

Instance Method Summary collapse

Constructor Details

#initialize(override_files = []) ⇒ MMSeg

Returns a new instance of MMSeg.



9
10
11
12
13
14
15
# File 'lib/ruby-pinyin/backend/mmseg.rb', line 9

# Sets up the MMSeg backend.
#
# Creates the Simple backend this object delegates to (handing it
# +override_files+), then swaps every +:words+ entry registered in
# RMMSeg::Dictionary.dictionaries for the +words.dic+ file resolved
# relative to this source file, and finally asks RMMSeg to load its
# dictionaries.
#
# @param override_files [Array] forwarded unchanged to Simple.new
def initialize(override_files=[])
  @simple = Simple.new(override_files)

  words_dic = File.expand_path('../../data/words.dic', __FILE__)
  registry = RMMSeg::Dictionary.dictionaries

  # Drop whatever word dictionary is currently registered before adding ours.
  registry.delete_if { |(kind, _path)| kind == :words }
  registry.push([:words, words_dic])

  RMMSeg::Dictionary.load_dictionaries
end

Instance Method Details

#romanize(str, tone = nil, include_punctuations = false) ⇒ Object



17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
# File 'lib/ruby-pinyin/backend/mmseg.rb', line 17

# Romanizes +str+ by combining the Simple backend's output with a
# per-word result derived from segmentation.
#
# The string is segmented into words, the Simple backend produces the
# baseline result, and each segmented word is run through +format+ to
# build a flattened "patch" list. When the two lists differ in size,
# nil entries are stripped from both (in place) before they are handed
# to +apply+, whose return value is returned as-is.
#
# @param str [String, nil] text to romanize
# @param tone passed through to the Simple backend and to +format+
# @param include_punctuations passed through to the Simple backend
# @return [Array] empty array for nil/empty input, otherwise the result of +apply+
def romanize(str, tone=nil, include_punctuations=false)
  return [] if !str || str.length.zero?

  tokens = segment(str)

  simple_result = @simple.romanize(str, tone, include_punctuations)
  segmented_result = tokens.map { |token| format(token, tone) }.flatten

  # Only strip nils when the two lists cannot be lined up one-to-one as they are.
  unless simple_result.size == segmented_result.size
    [simple_result, segmented_result].each(&:compact!)
  end

  apply(simple_result, segmented_result)
end

#segment(str) ⇒ Object



33
34
35
36
37
38
39
40
41
42
# File 'lib/ruby-pinyin/backend/mmseg.rb', line 33

# Splits +str+ into words using RMMSeg.
#
# Tokens are pulled from an RMMSeg::Algorithm until it returns a falsy
# value. Each token's text is re-tagged as UTF-8, and any text matching
# Punctuation.chinese_regexp is left out of the result.
#
# @param str [String] text to segment
# @return [Array<String>] the token texts, in order, that did not match the punctuation regexp
def segment(str)
  tokenizer = RMMSeg::Algorithm.new(str)
  collected = []

  while (token = tokenizer.next_token)
    text = token.text.force_encoding("UTF-8")
    next if text =~ Punctuation.chinese_regexp

    collected << text
  end

  collected
end