Class: PragmaticTokenizer::Tokenizer

Inherits:
Object
Defined in:
lib/pragmatic_tokenizer/tokenizer.rb

Constant Summary

PUNCTUATION_OPTIONS = Set.new(%i[all semi none only]).freeze
NUMBERS_OPTIONS     = Set.new(%i[all semi none only]).freeze
MENTIONS_OPTIONS    = Set.new(%i[keep_original keep_and_clean remove]).freeze
MAX_TOKEN_LENGTH    = 50
NOTHING             = ''.freeze
DOT                 = '.'.freeze
SPACE               = ' '.freeze
SINGLE_QUOTE        = "'".freeze

Instance Method Summary

Constructor Details

#initialize(opts = {}) ⇒ Tokenizer

Returns a new instance of Tokenizer.

Parameters:

  • opts (Hash) (defaults to: {})

    a customizable set of options (a usage example follows the options list below)

Options Hash (opts):

  • :minimum_length (Integer)
    • minimum length of the token in characters

  • :long_word_split (Integer)
    • the length in characters beyond which long words are split at any hyphen or underscore

  • :mentions (String)
    • :remove (removes the token entirely), :keep_and_clean (strips the leading '@' but keeps the rest of the token), or :keep_original (leaves the token unchanged). Can be a String or Symbol (e.g. :keep_and_clean or 'keep_and_clean').

  • :hashtags (String)
    • :remove (removes the token entirely), :keep_and_clean (strips the leading '#' but keeps the rest of the token), or :keep_original (leaves the token unchanged). Can be a String or Symbol (e.g. :keep_and_clean or 'keep_and_clean').

  • :downcase (Boolean)
    • (default: true)

  • :clean (Boolean)
    • (default: false)

  • :classic_filter (Boolean)
    • removes dots from acronyms and 's from the end of tokens (default: false)

  • :remove_emoji (Boolean)
    • (default: false)

  • :remove_emails (Boolean)
    • (default: false)

  • :remove_urls (Boolean)
    • (default: false)

  • :remove_domains (Boolean)
    • (default: false)
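
A minimal construction sketch (the option values below are illustrative combinations of the options listed above, not defaults):

require 'pragmatic_tokenizer'

# Build a tokenizer that lowercases tokens (the default), drops punctuation,
# removes URLs and e-mail addresses, strips the '@' from mentions,
# and discards tokens shorter than three characters.
tokenizer = PragmaticTokenizer::Tokenizer.new(
  language:       :en,
  punctuation:    :none,
  remove_urls:    true,
  remove_emails:  true,
  mentions:       :keep_and_clean,
  minimum_length: 3
)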



# File 'lib/pragmatic_tokenizer/tokenizer.rb', line 57

def initialize(opts={})
  @filter_languages    = opts[:filter_languages] || []
  @language_module     = Languages.get_language_by_code(opts[:language])
  @expand_contractions = opts[:expand_contractions]
  @remove_stop_words   = opts[:remove_stop_words]
  @punctuation         = opts[:punctuation] ? opts[:punctuation].to_sym : :all
  @numbers             = opts[:numbers] ? opts[:numbers].to_sym : :all
  @minimum_length      = opts[:minimum_length] || 0
  @long_word_split     = opts[:long_word_split]
  @mentions            = opts[:mentions] ? opts[:mentions].to_sym : :keep_original
  @hashtags            = opts[:hashtags] ? opts[:hashtags].to_sym : :keep_original
  @downcase            = opts[:downcase].nil? ? true : opts[:downcase]
  @clean               = opts[:clean]
  @classic_filter      = opts[:classic_filter]
  @remove_emoji        = opts[:remove_emoji]
  @remove_emails       = opts[:remove_emails]
  @remove_urls         = opts[:remove_urls]
  @remove_domains      = opts[:remove_domains]
  @contractions        = opts[:contractions] || {}
  @abbreviations       = Set.new(opts[:abbreviations])
  @stop_words          = Set.new(opts[:stop_words])

  # Why do we treat stop words differently than abbreviations and contractions? (we don't use @language_module::STOP_WORDS when passing @filter_languages)
  @contractions.merge!(@language_module::CONTRACTIONS) if @contractions.empty?
  @abbreviations       += @language_module::ABBREVIATIONS if @abbreviations.empty?
  @stop_words          += @language_module::STOP_WORDS if @stop_words.empty?

  @filter_languages.each do |lang|
    language = Languages.get_language_by_code(lang)
    @contractions.merge!(language::CONTRACTIONS)
    @abbreviations += language::ABBREVIATIONS
    @stop_words    += language::STOP_WORDS
  end

  raise "Punctuation argument can be only be nil, :all, :semi, :none, or :only" unless PUNCTUATION_OPTIONS.include?(@punctuation)
  raise "Numbers argument can be only be nil, :all, :semi, :none, or :only" unless NUMBERS_OPTIONS.include?(@numbers)
  raise "Mentions argument can be only be nil, :keep_original, :keep_and_clean, or :remove" unless MENTIONS_OPTIONS.include?(@mentions)

  integer_class = Gem::Version.new(RUBY_VERSION) < Gem::Version.new('2.4.0') ? Fixnum : Integer

  raise "In Pragmatic Tokenizer minimum_length must be an Integer"  unless @minimum_length.class  == integer_class || @minimum_length.nil?
  raise "In Pragmatic Tokenizer long_word_split must be an Integer" unless @long_word_split.class == integer_class || @long_word_split.nil?
end
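
A rough sketch of how the validations and :filter_languages merging above behave (error wording paraphrased from the raise statements):

# An unsupported option value is rejected at construction time.
PragmaticTokenizer::Tokenizer.new(punctuation: :everything)
# => RuntimeError (punctuation must be nil, :all, :semi, :none, or :only)

# :filter_languages merges each listed language's contractions,
# abbreviations, and stop words into the tokenizer's own sets.
PragmaticTokenizer::Tokenizer.new(language: :en, filter_languages: [:de])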

Instance Method Details

#tokenize(text) ⇒ Object

Parameters:

  • text (String)

    to be tokenized



# File 'lib/pragmatic_tokenizer/tokenizer.rb', line 103

def tokenize(text)
  return [] unless text
  raise "In PragmaticTokenizer text must be a String or subclass of String" unless text.class <= String
  CGI.unescapeHTML(text)
      .scan(Regex::CHUNK_LONG_INPUT_TEXT)
      .flat_map { |segment| process_segment(segment) }
end
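
A minimal usage sketch (the exact tokens returned depend on the options passed to the constructor; the output shown is illustrative):

tokenizer = PragmaticTokenizer::Tokenizer.new(punctuation: :none)
tokenizer.tokenize("Hello, world. It's a test.")
# => e.g. ["hello", "world", "it's", "a", "test"]

# nil returns an empty array; non-String input raises an error.
tokenizer.tokenize(nil) # => []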