Class: PragmaticTokenizer::PostProcessor

Inherits:
Object
  • Object
show all
Defined in:
lib/pragmatic_tokenizer/post_processor.rb

Constant Summary collapse

# Character class matching a single one of the listed symbols.
# NOTE(review): the spaces separating the symbols sit inside the brackets and no /x flag
# is set, so a plain space is also a member of this class — confirm that is intended.
REGEX_SYMBOL =
/[♳ ♴ ♵ ♶ ♷ ♸ ♹ ♺ ⚀ ⚁ ⚂ ⚃ ⚄ ⚅ ☇ ☈ ☉ ☊ ☋ ☌ ☍ ☠ ☢ ☣ ☤ ☥ ☦ ☧ ☀ ☁ ☂ ☃ ☄ ☮ ♔ ♕ ♖ ♗ ♘ ♙ ♚ ⚘ ⚭]/
# One or more `,` or `‚` characters at the start of a line (`^` anchors per line).
# Only the last repetition is kept in capture group 1.
REGEXP_COMMAS =
/^(,|‚)+/
# At least one character (group 1) followed by one of the four quote-like characters
# in group 2 at the end of a line (`$` anchors per line).
REGEXP_SINGLE_QUOTES =
/(.+)(’|'|‘|`)$/
# A line containing `/`, unless the line starts with `http:`, `https:` or `www.`
# (negative lookahead). The text before the last `/` and the text after it are captured.
REGEXP_SLASH =
/^(?!(https?:|www\.))(.*)\/(.*)/
# Same shape as the slash pattern, but splitting on `?`, which is itself captured.
# Lines starting with `http:`, `https:` or `www.` are excluded.
REGEXP_QUESTION_MARK =
/^(?!(https?:|www\.))(.*)(\?)(.*)/
# A `+` with at least one character on each side; both sides are captured.
REGEXP_PLUS_SIGN =
/(.+)\+(.+)/
# A `:` at the start of a line followed by two or more non-whitespace characters.
REGEXP_COLON =
/^(\:)(\S{2,})/
# Matches U+2744 immediately followed by one of the variation selectors U+FE0E or
# U+FE0F; the two-character sequence is captured as group 1.
# The selectors are listed without a `|` between them: inside a character class `|` is
# a literal pipe rather than alternation, so including it would also match U+2744
# followed by "|".
REGEXP_EMOJI =
/(\u{2744}[\u{FE0E}\u{FE0F}])/
# Single pattern that matches when any of the listed patterns matches
# (slash, question mark, plus sign, leading colon, emoji, and the two emoji patterns
# defined in PragmaticTokenizer::Languages::Common, which are not shown here).
REGEX_UNIFIED1 =
Regexp.union(REGEXP_SLASH,
REGEXP_QUESTION_MARK,
REGEXP_PLUS_SIGN,
REGEXP_COLON,
REGEXP_EMOJI,
PragmaticTokenizer::Languages::Common::PREFIX_EMOJI_REGEX,
PragmaticTokenizer::Languages::Common::POSTFIX_EMOJI_REGEX)
# Single pattern that matches when either the trailing-quote pattern or the
# leading-comma pattern matches.
REGEX_UNIFIED2 =
Regexp.union(REGEXP_SINGLE_QUOTES,
REGEXP_COMMAS)
# One or more of the punctuation characters listed in the brackets at the end of a line,
# captured as group 1, and only when directly preceded by a non-whitespace character
# (the lookbehind keeps that character out of the match).
REGEXP_UNKNOWN1 =
/(?<=\S)([。.!!??]+)$/

Instance Attribute Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(text:, abbreviations:, downcase:) ⇒ PostProcessor

Returns a new instance of PostProcessor.



27
28
29
30
31
# File 'lib/pragmatic_tokenizer/post_processor.rb', line 27

# Stores the three required keyword arguments on the instance without validating or
# copying them.
#
# @param text [Object] value kept in @text
# @param abbreviations [Object] value kept in @abbreviations
# @param downcase [Object] value kept in @downcase
def initialize(text:, abbreviations:, downcase:)
  @abbreviations = abbreviations
  @text = text
  @downcase = downcase
end

Instance Attribute Details

#abbreviations ⇒ Object (readonly)

Returns the value of attribute abbreviations.



25
26
27
# File 'lib/pragmatic_tokenizer/post_processor.rb', line 25

# Reader for the value stored in @abbreviations by #initialize.
#
# @return [Object] whatever was passed as the +abbreviations:+ keyword; its type is not
#   constrained here
def abbreviations
  @abbreviations
end

#downcase ⇒ Object (readonly)

Returns the value of attribute downcase.



25
26
27
# File 'lib/pragmatic_tokenizer/post_processor.rb', line 25

# Reader for the value stored in @downcase by #initialize.
#
# @return [Object] whatever was passed as the +downcase:+ keyword
#   (presumably a boolean flag — confirm against callers)
def downcase
  @downcase
end

#text ⇒ Object (readonly)

Returns the value of attribute text.



25
26
27
# File 'lib/pragmatic_tokenizer/post_processor.rb', line 25

# Reader for the value stored in @text by #initialize.
#
# @return [Object] whatever was passed as the +text:+ keyword; its type is not
#   constrained here
def text
  @text
end

Instance Method Details

#post_process ⇒ Object



33
34
35
# File 'lib/pragmatic_tokenizer/post_processor.rb', line 33

# Runs the two post-processing stages in order: the result of
# #post_process_punctuation is handed to #separate_ending_punctuation, whose return
# value is returned unchanged.
#
# @return [Object] whatever #separate_ending_punctuation returns
def post_process
  punctuated = post_process_punctuation
  separate_ending_punctuation(punctuated)
end