Class: TwitterCldr::Segmentation::CjBreakEngine

Inherits:
DictionaryBreakEngine show all
Includes:
Singleton
Defined in:
lib/twitter_cldr/segmentation/cj_break_engine.rb

Direct Known Subclasses

KoreanBreakEngine

Constant Summary collapse

MAX_WORD_SIZE =

magic number pulled from ICU’s source code, presumably slightly longer than the longest Chinese/Japanese/Korean word

20
MAX_SNLP =

magic number pulled from ICU’s source code

255
LARGE_NUMBER =

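intended as the equivalent of Java’s Integer.MAX_VALUE (note: 0xFFFFFFFF is the maximum unsigned 32-bit value, whereas Java’s Integer.MAX_VALUE is 0x7FFFFFFF)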

0xFFFFFFFF
MAX_KATAKANA_LENGTH =
8
MAX_KATAKANA_GROUP_LENGTH =
20
KATAKANA_COSTS =
[8192, 984, 408, 240, 204, 252, 300, 372, 480].freeze
MAX_KATAKANA_COST =
8192

Class Method Summary collapse

Methods inherited from DictionaryBreakEngine

#each_boundary

Class Method Details

.word_set ⇒ Object



29
30
31
32
33
34
35
36
37
38
39
# File 'lib/twitter_cldr/segmentation/cj_break_engine.rb', line 29

# Returns the memoized result of converting a UnicodeSet to a set, building
# it on first use. The UnicodeSet has the Han, Katakana (plus \uff9e and
# \uff9f) and Hiragana patterns applied to it, in that order, and then the
# two prolonged sound mark code points added.
#
# @return [Object] whatever UnicodeSet#to_set returns; cached in @word_set
def self.word_set
  return @word_set if @word_set

  scripts = TwitterCldr::Shared::UnicodeSet.new

  # Single-quoted on purpose: the \uXXXX sequences reach apply_pattern as
  # literal backslash escapes rather than being expanded by Ruby.
  ['[:Han:]', '[[:Katakana:]\uff9e\uff9f]', '[:Hiragana:]'].each do |pattern|
    scripts.apply_pattern(pattern)
  end

  [
    0xFF70, # HALFWIDTH KATAKANA-HIRAGANA PROLONGED SOUND MARK
    0x30FC  # KATAKANA-HIRAGANA PROLONGED SOUND MARK
  ].each { |code_point| scripts.add(code_point) }

  @word_set = scripts.to_set
end