Class: RMMSeg::ComplexAlgorithm

Inherits:
Object
  • Object
show all
Includes:
Algorithm
Defined in:
lib/rmmseg/complex_algorithm.rb

Constant Summary collapse

MATCH_CACHE_MAX_LENGTH =
3

Constants included from Algorithm

Algorithm::NONWORD_CHAR_RE

Instance Method Summary collapse

Methods included from Algorithm

#basic_latin?, #get_basic_latin_word, #next_token, #nonword_char?, #segment

Constructor Details

#initialize(text, token = Token) ⇒ ComplexAlgorithm

Create a new ComplexAlgorithm. The rules used by this algorithm include MMRule, LAWLRule, SVWLRule and LSDMFOCWRule.



15
16
17
18
19
20
21
22
23
24
25
# File 'lib/rmmseg/complex_algorithm.rb', line 15

# Set up the algorithm state on top of whatever the parent initializer does
# with +text+ and +token+ (both are forwarded unchanged by the bare +super+).
#
# text::  forwarded to the parent initializer.
# token:: forwarded to the parent initializer; defaults to Token.
def initialize(text, token=Token)
  super

  # Disambiguation rules; get_cjk_word walks this list from first to last.
  @rules = [MMRule, LAWLRule, SVWLRule, LSDMFOCWRule]

  # Fixed-size cache used by find_match_words, plus the slot that the next
  # cache entry will be written to.
  @match_cache_idx = 0
  @match_cache = Array.new(MATCH_CACHE_MAX_LENGTH)
end

Instance Method Details

#create_chunks ⇒ Object

Create all possible chunks of three words (or fewer) starting from @index.



54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
# File 'lib/rmmseg/complex_algorithm.rb', line 54

# Build every candidate chunk (a list of one to three words) that starts at
# @index.
#
# A chunk is emitted when:
# * its words end exactly at the end of @chars (one- or two-word chunks), or
# * a third word can be looked up after the second one. If that third word
#   is of the unrecognized type, only the first two words are kept.
#
# Word combinations that would run past the end of @chars are dropped.
#
# Returns an Array of chunks, each chunk being an Array of words.
def create_chunks
  total = @chars.length
  result = []

  find_match_words(@index).each do |first|
    after_first = @index + first.length
    if after_first == total
      # The first word alone consumes the rest of the text.
      result << [first]
      next
    end
    next if after_first > total

    find_match_words(after_first).each do |second|
      after_second = after_first + second.length
      if after_second == total
        # Two words consume the rest of the text.
        result << [first, second]
        next
      end
      next if after_second > total

      find_match_words(after_second).each do |third|
        unrecognized = third.type == Word::TYPES[:unrecognized]
        result << (unrecognized ? [first, second] : [first, second, third])
      end
    end
  end

  result
end

#find_match_words(index) ⇒ Object

Find all words occurring in the dictionary starting from index. The maximum word length is determined by Config.max_word_length.



84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
# File 'lib/rmmseg/complex_algorithm.rb', line 84

# Collect the dictionary words that start at position +index+ of @chars.
#
# Characters are appended one at a time to a growing candidate string; every
# candidate the dictionary knows is added to the result. Scanning stops at
# the end of @chars, at the first character for which basic_latin? is true,
# or once Config.max_word_length characters have been consumed.
#
# When nothing matches, the result holds a single word of the unrecognized
# type built from the character at +index+.
#
# Results are remembered in @match_cache, which is overwritten slot by slot
# in round-robin order.
#
# index:: position in @chars to start matching from.
#
# Returns an Array of words (never empty).
def find_match_words(index)
  # Reuse a previous result for this position if it is still cached.
  @match_cache.each do |cached_index, cached_words|
    return cached_words if cached_index == index
  end

  dictionary = Dictionary.instance
  candidate = String.new
  matches = []
  cursor = index

  # (cursor - index) is the number of characters consumed so far.
  until cursor >= @chars.length ||
        basic_latin?(@chars[cursor]) ||
        (cursor - index) >= Config.max_word_length
    candidate << @chars[cursor]
    matches << dictionary.get_word(candidate) if dictionary.has_word?(candidate)
    cursor += 1
  end

  # Fall back to a one-character unrecognized word.
  matches << Word.new(@chars[index], Word::TYPES[:unrecognized]) if matches.empty?

  # Store the result and advance the write slot, wrapping to the first slot.
  @match_cache[@match_cache_idx] = [index, matches]
  @match_cache_idx = (@match_cache_idx + 1) % MATCH_CACHE_MAX_LENGTH

  matches
end

#get_cjk_word ⇒ Object

Get the most appropriate CJK word.



28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
# File 'lib/rmmseg/complex_algorithm.rb', line 28

# Pick the next word starting at @index and return it as a token.
#
# The candidate chunks from create_chunks are passed through each entry of
# @rules in turn (via its +filter+ method) until fewer than two chunks
# remain or the rules run out. The first word of the first remaining chunk
# is the result.
#
# Side effects: advances @index by the word's length and @byte_index by the
# word's byte size.
#
# Raises Ambiguity when more than one chunk remains and
# Config.on_ambiguity is :raise_exception. With any other setting the first
# remaining chunk is used.
#
# Returns the object built by @token.new(text, start_byte, end_byte).
def get_cjk_word
  candidates = create_chunks

  @rules.each do |rule|
    break if candidates.length < 2
    candidates = rule.filter(candidates)
  end

  if candidates.length > 1 && Config.on_ambiguity == :raise_exception
    raise Ambiguity, "Can't solve ambiguity on #{candidates}"
  end

  best = candidates[0][0]
  token = @token.new(best.text, @byte_index, @byte_index + best.byte_size)

  # Move both cursors past the chosen word.
  @index += best.length
  @byte_index += best.byte_size

  token
end