Class: PragmaticTokenizer::Tokenizer

Inherits:
Object
  • Object
show all
Defined in:
lib/pragmatic_tokenizer/tokenizer.rb

Constant Summary collapse

PUNCTIATION_OPTIONS =
Set.new([:all, :semi, :none, :only]).freeze
NUMBERS_OPTIONS =
Set.new([:all, :semi, :none, :only]).freeze
MENTIONS_OPTIONS =
Set.new([:keep_original, :keep_and_clean, :remove]).freeze
MAX_TOKEN_LENGTH =
50
EMPTY_STRING =
''.freeze
DOT_STRING =
'.'.freeze
SPACE_STRING =
' '.freeze
REGEX_DOMAIN =
/(\s+|\A)[a-z0-9]{2,}([\-\.]{1}[a-z0-9]+)*\.[a-z]{2,6}(:[0-9]{1,5})?(\/.*)?/ix
REGEX_URL =
/(http|https)(\.|:)/
REGEX_HYPHEN =
/\-/
REGEX_LONG_WORD =
/\-|\_/
REGEXP_SPLIT_CHECK =
/@|@|(http)/
REGEX_CONTRACTIONS =
/[‘’‚‛‹›'´`]/
REGEX_APOSTROPHE_S =
/['’`́]s$/
REGEX_EMAIL =
/\S+(@|@)\S+\.\S+/
REGEX_HASHTAG_OR_MENTION =
/[@@#|#]/
REGEX_UNDERSCORE_AT_START =
/(?<=\A)\_+/
REGEX_UNDERSCORE_AT_END =
/\_+(?=\z)/
REGEX_ASTERISK =
/\*+/
REGEX_UNIFIED1 =
Regexp.union(REGEX_UNDERSCORE_AT_START,
REGEX_UNDERSCORE_AT_END,
REGEX_ASTERISK)
REGEXP_CONTROL =

See en.wikipedia.org/wiki/Control_character — matches any character with hexadecimal value 00 through 1F or 7F. Rubular: rubular.com/r/E83fpBoDjI

/[[:cntrl:]]/
REGEXP_ENDING_COLON =
/\:(?=\z)/
REGEXP_EXCLAMATION_AT_START =
/(?<=\A)!+(?=.+)/
REGEXP_EXCLAMATION_AT_END =
/!+(1*!*)*(?=\z)/
REGEXP_HYPHEN_AT_START =
/\A(-|–|\u{00AD})/
REGEXP_SPECIAL_SYMBOL =
/[®©]/
REGEXP_PERCENT_AT_START =
/\A\%/
REGEXP_ALPHANUMERIC_SUPPLEMENT =
/[\u{1F100}-\u{1F1FF}]/
REGEX_UNIFIED2 =
Regexp.union(REGEXP_CONTROL,
REGEXP_ENDING_COLON,
REGEXP_EXCLAMATION_AT_START,
REGEXP_EXCLAMATION_AT_END,
REGEXP_HYPHEN_AT_START,
REGEXP_SPECIAL_SYMBOL,
REGEXP_PERCENT_AT_START,
REGEXP_ALPHANUMERIC_SUPPLEMENT)
REGEXP_ONE_AS_EXCLAMATION =
/(?<=\D)1+(?=\z)/
REGEXP_HASHTAG_AT_START =
/(?<=\A)(#|#)/
REGEXP_AT_SIGN_AT_START =
/(?<=\A)(@|@)/
REGEXP_HYPHEN_HASTAG =
/\A(#|#)\S+-/
REGEXP_EMOJI_SNOWFLAKE =
/\u{2744}[\u{FE0F}|\u{FE0E}]?/
REGEX_EMOJI_UNIFIED =
Regexp.union(REGEXP_EMOJI_SNOWFLAKE,
PragmaticTokenizer::Languages::Common::EMOJI_REGEX)
REGEXP_PUNCTUATION_ONLY =
/\A[[:punct:]]+\z/
REGEXP_NUMBER_ONLY =
/\A\d+\z/
REGEXP_NO_NUMBERS =
/\A\D+\z/
REGEXP_NUMBER =
/\D*\d+\d*/
REGEXP_CONSECUTIVE_DOTS =
/\A\.{2,}\z/
REGEXP_CHUNK_STRING =
/\S.{1,10000}(?!\S)/m

Instance Method Summary collapse

Constructor Details

#initialize(opts = {}) ⇒ Tokenizer

Returns a new instance of Tokenizer.

Parameters:

  • opts (Hash) (defaults to: {})

    a customizable set of options

Options Hash (opts):

  • :minimum_length (Integer)
    • minimum length of the token in characters

  • :long_word_split (Integer)
    • tokens longer than the specified length are split at any hyphen or underscore.

  • :mentions (String)
    • :remove (removes the mention entirely), :keep_and_clean (keeps the mention but strips the leading @) and :keep_original (doesn't alter the token at all) — can be a String or Symbol (i.e. :keep_and_clean or 'keep_and_clean')

  • :hashtags (String)
    • :remove (removes the hashtag entirely), :keep_and_clean (keeps the hashtag but strips the leading #) and :keep_original (doesn't alter the token at all) — can be a String or Symbol (i.e. :keep_and_clean or 'keep_and_clean')

  • :downcase (Boolean)
    • (default: true)

  • :clean (Boolean)
    • (default: false)

  • :classic_filter (Boolean)
    • removes dots from acronyms and 's from the end of tokens - (default: false)

  • :remove_emoji (Boolean)
    • (default: false)

  • :remove_emails (Boolean)
    • (default: false)

  • :remove_urls (Boolean)
    • (default: false)

  • :remove_domains (Boolean)
    • (default: false)



105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
# File 'lib/pragmatic_tokenizer/tokenizer.rb', line 105

# Builds a tokenizer configured via +opts+. See the class documentation
# for the full list of supported options.
#
# @param opts [Hash] a customizable set of options (see class docs)
# @raise [RuntimeError] if :punctuation, :numbers, :mentions or :hashtags
#   is given an unsupported value, or if :minimum_length / :long_word_split
#   is neither nil nor an Integer
def initialize(opts={})
  @filter_languages    = opts[:filter_languages] || []
  @language_module     = Languages.get_language_by_code(opts[:language])
  @expand_contractions = opts[:expand_contractions]
  @remove_stop_words   = opts[:remove_stop_words]
  @punctuation         = opts[:punctuation] ? opts[:punctuation].to_sym : :all
  @numbers             = opts[:numbers] ? opts[:numbers].to_sym : :all
  @minimum_length      = opts[:minimum_length] || 0
  @long_word_split     = opts[:long_word_split]
  @mentions            = opts[:mentions] ? opts[:mentions].to_sym : :keep_original
  @hashtags            = opts[:hashtags] ? opts[:hashtags].to_sym : :keep_original
  @downcase            = opts[:downcase].nil? ? true : opts[:downcase]
  @clean               = opts[:clean]
  @classic_filter      = opts[:classic_filter]
  @remove_emoji        = opts[:remove_emoji]
  @remove_emails       = opts[:remove_emails]
  @remove_urls         = opts[:remove_urls]
  @remove_domains      = opts[:remove_domains]
  @contractions        = opts[:contractions] || {}
  @abbreviations       = Set.new(opts[:abbreviations])
  @stop_words          = Set.new(opts[:stop_words])

  # Fall back to the primary language's data only when the caller supplied none.
  # TODO: why do we treat stop words differently than abbreviations and contractions? (we don't use @language_module::STOP_WORDS when passing @filter_languages)
  @contractions.merge!(@language_module::CONTRACTIONS) if @contractions.empty?
  @abbreviations       += @language_module::ABBREVIATIONS if @abbreviations.empty?
  @stop_words          += @language_module::STOP_WORDS if @stop_words.empty?

  # Merge in the data of every additionally requested filter language.
  @filter_languages.each do |lang|
    language = Languages.get_language_by_code(lang)
    @contractions.merge!(language::CONTRACTIONS)
    @abbreviations += language::ABBREVIATIONS
    @stop_words    += language::STOP_WORDS
  end

  # Validate option values. Fixed the doubled "be" in the messages, and
  # validate :hashtags the same way :mentions is validated (both share
  # MENTIONS_OPTIONS; previously an invalid :hashtags value slipped through).
  raise "Punctuation argument can only be nil, :all, :semi, :none, or :only" unless PUNCTIATION_OPTIONS.include?(@punctuation)
  raise "Numbers argument can only be nil, :all, :semi, :none, or :only" unless NUMBERS_OPTIONS.include?(@numbers)
  raise "Mentions argument can only be nil, :keep_original, :keep_and_clean, or :remove" unless MENTIONS_OPTIONS.include?(@mentions)
  raise "Hashtags argument can only be nil, :keep_original, :keep_and_clean, or :remove" unless MENTIONS_OPTIONS.include?(@hashtags)

  # Ruby < 2.4 spells the integer class Fixnum; 2.4+ unified it into Integer.
  integer_class = Gem::Version.new(RUBY_VERSION) < Gem::Version.new('2.4.0') ? Fixnum : Integer

  raise "In Pragmatic Tokenizer minimum_length must be an Integer" unless @minimum_length.is_a?(integer_class) || @minimum_length.nil?
  raise "In Pragmatic Tokenizer long_word_split must be an Integer" unless @long_word_split.is_a?(integer_class) || @long_word_split.nil?
end

Instance Method Details

#tokenize(text) ⇒ Object

Parameters:

  • text (String)

    to be tokenized



151
152
153
154
155
156
157
# File 'lib/pragmatic_tokenizer/tokenizer.rb', line 151

# Tokenizes +text+ into an array of string tokens.
#
# HTML entities are unescaped first, then the input is cut into chunks of
# at most ~10,000 characters (REGEXP_CHUNK_STRING) so very long inputs are
# processed piecewise; each chunk is pre-processed and post-processed.
#
# @param text [String] to be tokenized
# @return [Array<String>] the tokens; [] when +text+ is nil/false
# @raise [RuntimeError] when +text+ is not a String (or subclass)
def tokenize(text)
  return [] unless text
  raise "In PragmaticTokenizer text must be a String or subclass of String" unless text.class <= String
  chunks = CGI.unescapeHTML(text).scan(REGEXP_CHUNK_STRING)
  chunks.flat_map { |chunk| post_process(pre_process(chunk)) }
end