Class: PragmaticTokenizer::PostProcessor

Inherits:
Object
  • Object
show all
Defined in:
lib/pragmatic_tokenizer/post_processor.rb

Constant Summary collapse

DOT =
'.'.freeze
RANGE_DINGBATS =

e.g. ✁✎✳❄➾

'[\u2701-\u27BE]'.freeze
RANGE_VARIATION_SELECTORS =

alter the previous character

'[\uFE00-\uFE0F]'.freeze
RANGE_FULLWIDTH =

e.g. ！＂＃＇？

'[\uFF01-\ufF1F]'.freeze
REGEXP_COMMAS =
/^([,‚])+/
REGEXP_SINGLE_QUOTES =
/(.+)([’'‘`])$/
REGEXP_SLASH =
/^(?!(https?:|www\.))(.*)\//
REGEXP_QUESTION_MARK =
/^(?!(https?:|www\.))(.*)(\?)/
REGEXP_PLUS_SIGN =
/(.+)\+(.+)/
REGEXP_COLON =
/^(:)(\S{2,})/
REGEXP_DINGBATS =
/(#{RANGE_DINGBATS}#{RANGE_VARIATION_SELECTORS}*)/
REGEXP_ENDING_PUNCT =
/(?<=\S)([#{RANGE_FULLWIDTH}!?]+)$/
REGEXP_DOMAIN =
/^((https?:\/\/|)?[a-z0-9]+([\-\.][a-z0-9]+)*\.[a-z]{2,6}(:[0-9]{1,5})?(\/.*)?)$/ix
REGEXP_EMAIL =
/\S+[＠@]\S+/
REGEXP_DOMAIN_START =
/^(https?:|www\.|[[:alpha:]]\.)/
REGEXP_DOMAIN_END =
/\.(com|net|org|edu|gov|mil|int|[[:alpha:]]{2})$/
REGEXP_DIGIT =
/[[:digit:]]+/
REGEXP_PERIOD1 =
/(.*\.)/
REGEXP_PERIOD2 =
/(\.)/
REGEX_UNIFIED1 =
Regexp.union(REGEXP_SLASH,
REGEXP_QUESTION_MARK,
REGEXP_PLUS_SIGN,
REGEXP_COLON,
REGEXP_DINGBATS,
PragmaticTokenizer::Languages::Common::PREFIX_EMOJI_REGEX,
PragmaticTokenizer::Languages::Common::POSTFIX_EMOJI_REGEX)
REGEX_UNIFIED2 =
Regexp.union(REGEXP_SINGLE_QUOTES,
REGEXP_COMMAS)
REGEX_DOMAIN_EMAIL =
Regexp.union(REGEXP_DOMAIN,
REGEXP_EMAIL)
REGEX_DOMAIN =
Regexp.union(REGEXP_DOMAIN_START,
REGEXP_DOMAIN_END)

Instance Attribute Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(text:, abbreviations:, downcase:) ⇒ PostProcessor

Returns a new instance of PostProcessor.



44
45
46
47
48
# File 'lib/pragmatic_tokenizer/post_processor.rb', line 44

# Builds a post-processor by storing the three required keyword
# arguments in instance variables of the same name.
#
# @param text [Object] value kept in @text
#   (presumably the text to be post-processed; confirm against the caller)
# @param abbreviations [Object] value kept in @abbreviations
#   (presumably a collection of known abbreviations; confirm against the caller)
# @param downcase [Object] value kept in @downcase
#   (presumably a flag controlling case folding; confirm against the caller)
def initialize(text:, abbreviations:, downcase:)
  @text = text
  @abbreviations = abbreviations
  @downcase = downcase
end

Instance Attribute Details

#abbreviations ⇒ Object (readonly)

Returns the value of attribute abbreviations.



42
43
44
# File 'lib/pragmatic_tokenizer/post_processor.rb', line 42

# Read-only accessor for the value passed to the constructor as
# `abbreviations:`.
#
# @return [Object] the current value of @abbreviations
def abbreviations
  @abbreviations
end

#downcase ⇒ Object (readonly)

Returns the value of attribute downcase.



42
43
44
# File 'lib/pragmatic_tokenizer/post_processor.rb', line 42

# Read-only accessor for the value passed to the constructor as
# `downcase:`.
#
# @return [Object] the current value of @downcase
def downcase
  @downcase
end

#text ⇒ Object (readonly)

Returns the value of attribute text.



42
43
44
# File 'lib/pragmatic_tokenizer/post_processor.rb', line 42

# Read-only accessor for the value passed to the constructor as `text:`.
#
# @return [Object] the current value of @text
def text
  @text
end

Instance Method Details

#post_process ⇒ Object



50
51
52
# File 'lib/pragmatic_tokenizer/post_processor.rb', line 50

# Runs the token list through every step returned by `procs`, in order.
#
# The starting list comes from `full_stop_separated_tokens`. Each step is
# applied to every token via `flat_map`, so a step may return an array for
# a token and the results are flattened one level before the next step.
#
# @return [Object] the result of the last `flat_map`, or the value of
#   `full_stop_separated_tokens` unchanged when `procs` is empty
def post_process
  # Evaluate `procs` before `full_stop_separated_tokens`, matching the
  # receiver-then-argument order of a `procs.reduce(...)` call.
  steps  = procs
  tokens = full_stop_separated_tokens
  steps.each do |step|
    tokens = tokens.flat_map(&step)
  end
  tokens
end