Class: PragmaticTokenizer::Tokenizer

Inherits:
Object
Defined in:
lib/pragmatic_tokenizer/tokenizer.rb

Constant Summary

PUNCTUATION_OPTIONS = Set.new(%i[all semi none only]).freeze
NUMBERS_OPTIONS     = Set.new(%i[all semi none only]).freeze
MENTIONS_OPTIONS    = Set.new(%i[keep_original keep_and_clean remove]).freeze
MAX_TOKEN_LENGTH    = 50
NOTHING             = ''.freeze
DOT                 = '.'.freeze
SPACE               = ' '.freeze
SINGLE_QUOTE        = "'".freeze

Instance Method Summary

Constructor Details

#initialize(opts = {}) ⇒ Tokenizer

Returns a new instance of Tokenizer.

Parameters:

  • opts (Hash) (defaults to: {})

    a customizable set of options (a usage example follows the options list below)

Options Hash (opts):

  • :minimum_length (Integer)
    • minimum length of the token in characters

  • :long_word_split (Integer)
    • the length in characters beyond which long words are split at any hyphen or underscore

  • :mentions (String)
    • :remove (removes the token entirely), :keep_and_clean (strips the leading '@' but keeps the rest of the token), or :keep_original (leaves the token unchanged). Can be a String or Symbol (e.g. :keep_and_clean or 'keep_and_clean').

  • :hashtags (String)
    • :remove (removes the token entirely), :keep_and_clean (strips the leading '#' but keeps the rest of the token), or :keep_original (leaves the token unchanged). Can be a String or Symbol (e.g. :keep_and_clean or 'keep_and_clean').

  • :downcase (Boolean)
    • (default: true)

  • :clean (Boolean)
    • (default: false)

  • :classic_filter (Boolean)
    • removes dots from acronyms and 's from the end of tokens (default: false)

  • :remove_emoji (Boolean)
    • (default: false)

  • :remove_emails (Boolean)
    • (default: false)

  • :remove_urls (Boolean)
    • (default: false)

  • :remove_domains (Boolean)
    • (default: false)
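
A minimal construction sketch (the option values below are illustrative combinations of the options listed above, not defaults):

require 'pragmatic_tokenizer'

# Build a tokenizer that lowercases tokens (the default), drops punctuation,
# removes URLs and e-mail addresses, strips the '@' from mentions,
# and discards tokens shorter than three characters.
tokenizer = PragmaticTokenizer::Tokenizer.new(
  language:       :en,
  punctuation:    :none,
  remove_urls:    true,
  remove_emails:  true,
  mentions:       :keep_and_clean,
  minimum_length: 3
)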



# File 'lib/pragmatic_tokenizer/tokenizer.rb', line 57

def initialize(opts={})
  @filter_languages    = opts[:filter_languages] || []
  @language_module     = Languages.get_language_by_code(opts[:language])
  @expand_contractions = opts[:expand_contractions]
  @remove_stop_words   = opts[:remove_stop_words]
  @punctuation         = opts[:punctuation] ? opts[:punctuation].to_sym : :all
  @numbers             = opts[:numbers] ? opts[:numbers].to_sym : :all
  @minimum_length      = opts[:minimum_length] || 0
  @long_word_split     = opts[:long_word_split]
  @mentions            = opts[:mentions] ? opts[:mentions].to_sym : :keep_original
  @hashtags            = opts[:hashtags] ? opts[:hashtags].to_sym : :keep_original
  @downcase            = opts[:downcase].nil? ? true : opts[:downcase]
  @clean               = opts[:clean]
  @classic_filter      = opts[:classic_filter]
  @remove_emoji        = opts[:remove_emoji]
  @remove_emails       = opts[:remove_emails]
  @remove_urls         = opts[:remove_urls]
  @remove_domains      = opts[:remove_domains]
  @contractions        = opts[:contractions] || {}
  @abbreviations       = Set.new(opts[:abbreviations])
  @stop_words          = Set.new(opts[:stop_words])

  # Why do we treat stop words differently than abbreviations and contractions? (we don't use @language_module::STOP_WORDS when passing @filter_languages)
  @contractions.merge!(@language_module::CONTRACTIONS) if @contractions.empty?
  @abbreviations       += @language_module::ABBREVIATIONS if @abbreviations.empty?
  @stop_words          += @language_module::STOP_WORDS if @stop_words.empty?

  @filter_languages.each do |lang|
    language = Languages.get_language_by_code(lang)
    @contractions.merge!(language::CONTRACTIONS)
    @abbreviations += language::ABBREVIATIONS
    @stop_words    += language::STOP_WORDS
  end

  raise "Punctuation argument can be only be nil, :all, :semi, :none, or :only" unless PUNCTUATION_OPTIONS.include?(@punctuation)
  raise "Numbers argument can be only be nil, :all, :semi, :none, or :only" unless NUMBERS_OPTIONS.include?(@numbers)
  raise "Mentions argument can be only be nil, :keep_original, :keep_and_clean, or :remove" unless MENTIONS_OPTIONS.include?(@mentions)

  integer_class = Gem::Version.new(RUBY_VERSION) < Gem::Version.new('2.4.0') ? Fixnum : Integer

  raise "In Pragmatic Tokenizer minimum_length must be an Integer"  unless @minimum_length.class  == integer_class || @minimum_length.nil?
  raise "In Pragmatic Tokenizer long_word_split must be an Integer" unless @long_word_split.class == integer_class || @long_word_split.nil?
end
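
A rough sketch of how the validations and :filter_languages merging above behave (error wording paraphrased from the raise statements):

# An unsupported option value is rejected at construction time.
PragmaticTokenizer::Tokenizer.new(punctuation: :everything)
# => RuntimeError (punctuation must be nil, :all, :semi, :none, or :only)

# :filter_languages merges each listed language's contractions,
# abbreviations, and stop words into the tokenizer's own sets.
PragmaticTokenizer::Tokenizer.new(language: :en, filter_languages: [:de])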

Instance Method Details

#tokenize(text) ⇒ Object

Parameters:

  • text (String)

    to be tokenized



# File 'lib/pragmatic_tokenizer/tokenizer.rb', line 103

def tokenize(text)
  return [] unless text
  raise "In PragmaticTokenizer text must be a String or subclass of String" unless text.class <= String
  CGI.unescapeHTML(text)
      .scan(Regex::CHUNK_LONG_INPUT_TEXT)
      .flat_map { |segment| process_segment(segment) }
end
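
A minimal usage sketch (the exact tokens returned depend on the options passed to the constructor; the output shown is illustrative):

tokenizer = PragmaticTokenizer::Tokenizer.new(punctuation: :none)
tokenizer.tokenize("Hello, world. It's a test.")
# => e.g. ["hello", "world", "it's", "a", "test"]

# nil returns an empty array; non-String input raises an error.
tokenizer.tokenize(nil) # => []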