Class: PragmaticTokenizer::PostProcessor

Inherits:
Object
  • Object
show all
Defined in:
lib/pragmatic_tokenizer/post_processor.rb

Constant Summary collapse

# Character class of miscellaneous pictographic symbols (recycling signs,
# dice faces, astrological/hazard/religious signs, weather, chess pieces…).
# NOTE(review): the entries are separated by spaces inside the brackets, so
# the class also matches a literal space character — confirm this is intended.
REGEX_SYMBOL =
/[♳ ♴ ♵ ♶ ♷ ♸ ♹ ♺ ⚀ ⚁ ⚂ ⚃ ⚄ ⚅ ☇ ☈ ☉ ☊ ☋ ☌ ☍ ☠ ☢ ☣ ☤ ☥ ☦ ☧ ☀ ☁ ☂ ☃ ☄ ☮ ♔ ♕ ♖ ♗ ♘ ♙ ♚ ⚘ ⚭]/
# One or more leading commas: the ASCII comma or the comma-like "‚"
# (looks like U+201A, single low-9 quotation mark — confirm).
# `^` anchors at the start of a line, not only the start of the string.
REGEXP_COMMAS =
/^(,|‚)+/
# At least one character followed by a trailing single-quote-like character
# (’, ', ‘ or a backtick) at the end of a line. Group 1 is the preceding
# text, group 2 the quote character.
REGEXP_SINGLE_QUOTES =
/(.+)(’|'|‘|`)$/
# Text containing a "/" that does not start with "http:", "https:" or "www.".
# Group 2 is greedy, so the split happens at the last slash; group 3 is the
# text after it.
REGEXP_SLASH =
/^(?!(https?:|www\.))(.*)\/(.*)/
# Text containing a "?" that does not start with "http:", "https:" or "www.".
# Group 2 is the text before the last "?", group 3 the "?" itself and
# group 4 the text after it.
REGEXP_QUESTION_MARK =
/^(?!(https?:|www\.))(.*)(\?)(.*)/
# A "+" with at least one character on each side; group 1 is the text before
# the last "+", group 2 the text after it.
REGEXP_PLUS_SIGN =
/(.+)\+(.+)/
# A colon at the start of a line followed by two or more non-whitespace
# characters. Group 1 is the colon, group 2 the rest.
REGEXP_COLON =
/^(\:)(\S{2,})/
# Snowflake (U+2744) immediately followed by a variation selector:
# U+FE0E (text presentation) or U+FE0F (emoji presentation). Group 1 is the
# two-codepoint sequence.
# The selectors are listed in the character class without a "|": inside
# [...] a pipe is a literal character, not alternation, so including it would
# also match a snowflake followed by "|".
REGEXP_EMOJI =
/(\u{2744}[\u{FE0E}\u{FE0F}])/
# Single alternation built with Regexp.union: matches wherever any of the
# slash, question-mark, plus-sign, colon or snowflake patterns matches, or
# either of the prefix/postfix emoji patterns taken from
# PragmaticTokenizer::Languages::Common (defined outside this file).
REGEX_UNIFIED1 =
Regexp.union(REGEXP_SLASH,
REGEXP_QUESTION_MARK,
REGEXP_PLUS_SIGN,
REGEXP_COLON,
REGEXP_EMOJI,
PragmaticTokenizer::Languages::Common::PREFIX_EMOJI_REGEX,
PragmaticTokenizer::Languages::Common::POSTFIX_EMOJI_REGEX)
# Alternation of the trailing-single-quote and leading-comma patterns.
REGEX_UNIFIED2 =
Regexp.union(REGEXP_SINGLE_QUOTES,
REGEXP_COMMAS)

Instance Attribute Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(text:, abbreviations:, downcase:) ⇒ PostProcessor

Returns a new instance of PostProcessor.



26
27
28
29
30
# File 'lib/pragmatic_tokenizer/post_processor.rb', line 26

# Stores the three required keyword arguments on the instance; no validation
# or copying is performed.
#
# @param text          the value later exposed by #text
#   (presumably the text to post-process — confirm against callers)
# @param abbreviations the value later exposed by #abbreviations
#   (presumably a collection of known abbreviations — confirm)
# @param downcase      the value later exposed by #downcase
#   (presumably a boolean flag — confirm)
def initialize(text:, abbreviations:, downcase:)
  @text = text
  @abbreviations = abbreviations
  @downcase = downcase
end

Instance Attribute Details

#abbreviations ⇒ Object (readonly)

Returns the value of attribute abbreviations.



24
25
26
# File 'lib/pragmatic_tokenizer/post_processor.rb', line 24

# Read-only accessor: returns the @abbreviations instance variable unchanged.
def abbreviations
  @abbreviations
end

#downcase ⇒ Object (readonly)

Returns the value of attribute downcase.



24
25
26
# File 'lib/pragmatic_tokenizer/post_processor.rb', line 24

# Read-only accessor: returns the @downcase instance variable unchanged.
def downcase
  @downcase
end

#text ⇒ Object (readonly)

Returns the value of attribute text.



24
25
26
# File 'lib/pragmatic_tokenizer/post_processor.rb', line 24

# Read-only accessor: returns the @text instance variable unchanged.
def text
  @text
end

Instance Method Details

#post_process ⇒ Object



32
33
34
# File 'lib/pragmatic_tokenizer/post_processor.rb', line 32

# Entry point of the post-processor: hands the result of `method_name3`
# (defined elsewhere in this class) to EndingPunctuationSeparator as its
# `tokens:` and returns whatever that separator's #separate returns.
def post_process
  tokens = method_name3
  separator = EndingPunctuationSeparator.new(tokens: tokens)
  separator.separate
end