Class: PragmaticTokenizer::EndingPunctuationSeparator

Inherits:
Object
  • Object
show all
Defined in:
lib/pragmatic_tokenizer/ending_punctuation_separator.rb

Overview

This class separates trailing sentence-ending punctuation from each token in a list of tokens.

Instance Attribute Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(tokens:) ⇒ EndingPunctuationSeparator

Returns a new instance of EndingPunctuationSeparator.



7
8
9
# File 'lib/pragmatic_tokenizer/ending_punctuation_separator.rb', line 7

# Creates a separator that will operate on the given tokens.
#
# @param tokens [#each] the collection of tokens to process; the reference is
#   stored as-is (no copy is made). Elements are presumably Strings, since
#   #separate calls String methods on each one.
def initialize(tokens:)
  @tokens = tokens
end

Instance Attribute Details

#tokens ⇒ Object (readonly)

Returns the value of attribute tokens.



6
7
8
# File 'lib/pragmatic_tokenizer/ending_punctuation_separator.rb', line 6

# Returns the token collection that was passed to the constructor.
#
# @return [Object] the same object stored in @tokens (not a copy)
def tokens
  @tokens
end

Instance Method Details

#separate ⇒ Object



11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
# File 'lib/pragmatic_tokenizer/ending_punctuation_separator.rb', line 11

# Splits trailing sentence-ending punctuation off every token.
#
# A token that ends in a run of one or more ending-punctuation characters
# (preceded by a non-whitespace character) is replaced by its stem followed
# by each punctuation character as a separate token. Tokens without such a
# trailing run are passed through unchanged.
#
# Only the trailing run is removed: the same characters occurring earlier in
# the token are preserved, so "e.g." yields "e.g" and ".".
#
# @return [Array<String>] a new array of tokens; the collection returned by
#   #tokens is not modified
def separate
  ending_punctuation = /(?<=\S)[。.!!??]+$/
  tokens.flat_map do |token|
    trailing = token[ending_punctuation]
    next [token] unless trailing

    # Strip only the matched trailing run. String#tr would delete these
    # characters everywhere in the token, mangling tokens such as "U.S.A."
    # and producing an empty stem for tokens made only of punctuation.
    [token.sub(ending_punctuation, ''), *trailing.chars]
  end
end