Class: TinySegmenter
- Inherits: Object
  - Object
  - TinySegmenter
- Defined in: lib/tiny_segmenter.rb, lib/tiny_segmenter/version.rb
Constant Summary collapse
- WhitespaceOnlyRegex =
Regexp.compile("^[ ]+$")
- PunctuationRegex =
Regexp.compile("^[-–—―.。・()()[]{}{}【】⟨⟩、、,,،…‥〽「」『』〜~!!::??\"'|__“”‘’;/⁄/«»]+$")
- VERSION =
"0.0.6"
Instance Method Summary collapse
-
#initialize ⇒ TinySegmenter
constructor
A new instance of TinySegmenter.
- #segment(text, options = {}) ⇒ Object
Constructor Details
#initialize ⇒ TinySegmenter
Returns a new instance of TinySegmenter.
9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 |
# File 'lib/tiny_segmenter.rb', line 9
#
# Sets up the state used by #segment:
#
# * @model    - a SegmentationModel instance (defined elsewhere in the project).
# * @BIAS     - constant offset that #segment uses as the starting score of
#               every boundary decision.
# * @chartype - ordered list of [Regexp, code] pairs mapping a single
#               character to a one-letter character-class code.
#               NOTE(review): presumably consumed by the ctype helper — confirm.
#
# Order matters: a kanji numeral such as "一" matches both the "M" and the
# "H" pattern, and the numeral pattern is listed first.
def initialize
  @model = SegmentationModel.new
  @BIAS = -332

  # Each Latin/digit/katakana class covers both the regular and the
  # full-width (or half-width) code points; listing the ASCII range twice
  # would leave full-width romaji/digits and half-width katakana unclassified.
  patterns = {
    "[一二三四五六七八九十百千万億兆]" => "M", # numbers (japanese)
    "[一-龠々〆ヵヶ]" => "H",                  # kanji & misc characters
    "[ぁ-ん]" => "I",                          # hiragana
    "[ァ-ヴーｱ-ﾝﾞｰ]" => "K",                   # katakana (full- and half-width)
    "[a-zA-Zａ-ｚＡ-Ｚ]" => "A",               # ascii / full-width romaji letters
    "[0-9０-９]" => "N",                       # ascii / full-width digits
  }

  # Compile once so per-character classification only has to run matches.
  @chartype = patterns.map { |pattern, code| [Regexp.compile(pattern), code] }
end
Instance Method Details
#segment(text, options = {}) ⇒ Object
26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 |
# File 'lib/tiny_segmenter.rb', line 26
#
# Splits +text+ into an array of word strings.
#
# Whitespace characters are always dropped before segmentation. Each
# remaining character is then scored as a possible word start, using a
# window of three characters before and three after it plus the three
# previous boundary decisions.
#
# @param text [String, nil] the text to segment
# @param options [Hash] optional settings
# @option options [Boolean] :ignore_punctuation when truthy, characters
#   matching PunctuationRegex are removed before segmentation
# @return [Array<String>] the segments in order; empty when +text+ is nil,
#   blank, or contains nothing but ignored characters
def segment(text, options = {})
  return [] if text.nil? || text.strip.empty?

  chars = []
  text.strip.each_char do |raw|
    char = raw.strip
    next if char.empty? || char.match(WhitespaceOnlyRegex)
    next if options[:ignore_punctuation] && char.match(PunctuationRegex)

    chars << char
  end
  # Without this guard the "E1" padding marker would be emitted as a word
  # when every character has been filtered out.
  return [] if chars.empty?

  # Pad both ends with sentinel markers so the six-wide window is always full.
  segments = %w[B3 B2 B1] + chars + %w[E1 E2 E3]
  ctypes = %w[O O O] + chars.map { |char| ctype(char) } + %w[O O O]

  result = []
  word = segments[3]
  # Previous three boundary decisions: "U" until decided, then "B" when a
  # boundary was placed and "O" otherwise.
  p1, p2, p3 = %w[U U U]

  (4..segments.size - 4).each do |i|
    w1, w2, w3, w4, w5, w6 = segments[i - 3, 6]
    c1, c2, c3, c4, c5, c6 = ctypes[i - 3, 6]
    score = @BIAS + sum_scores(p1, p2, p3, w1, w2, w3, w4, w5, w6, c1, c2, c3, c4, c5, c6)

    if score > 0
      # Positive score: a new word starts at character i.
      result << word
      word = ""
      p_new = "B"
    else
      p_new = "O"
    end

    p1, p2, p3 = p2, p3, p_new
    word = "#{word}#{w4}"
  end

  result << word unless word.empty?
  result
end