Class: PragmaticTokenizer::Tokenizer
- Inherits:
-
Object
- Object
- PragmaticTokenizer::Tokenizer
- Defined in:
- lib/pragmatic_tokenizer/tokenizer.rb
Constant Summary collapse
- PUNCTUATION_OPTIONS =
Set.new(%i[all semi none only]).freeze
- NUMBERS_OPTIONS =
Set.new(%i[all semi none only]).freeze
- MENTIONS_OPTIONS =
Set.new(%i[keep_original keep_and_clean remove]).freeze
- MAX_TOKEN_LENGTH =
50
- NOTHING =
''.freeze
- DOT =
'.'.freeze
- SPACE =
' '.freeze
- SINGLE_QUOTE =
"'".freeze
Instance Method Summary collapse
-
#initialize(opts = {}) ⇒ Tokenizer
constructor
A new instance of Tokenizer.
- #tokenize(text) ⇒ Object
Constructor Details
#initialize(opts = {}) ⇒ Tokenizer
Returns a new instance of Tokenizer.
57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 |
# File 'lib/pragmatic_tokenizer/tokenizer.rb', line 57

# Builds a tokenizer configured from an options hash.
#
# Options read from +opts+ (all optional):
# * +:filter_languages+ - list of language codes whose contractions,
#   abbreviations and stop words are merged in (default: +[]+).
# * +:language+ - code passed to +Languages.get_language_by_code+.
# * +:punctuation+ / +:numbers+ - converted with +to_sym+, default +:all+;
#   must be members of PUNCTUATION_OPTIONS / NUMBERS_OPTIONS.
# * +:mentions+ / +:hashtags+ - converted with +to_sym+, default
#   +:keep_original+; +:mentions+ must be a member of MENTIONS_OPTIONS.
# * +:minimum_length+ - Integer, default +0+.
# * +:long_word_split+ - Integer or +nil+.
# * +:downcase+ - defaults to +true+ when not given (+nil+).
# * +:contractions+ (Hash), +:abbreviations+, +:stop_words+ (enumerables) -
#   when empty/absent, the values of the selected language module are used.
# * +:expand_contractions+, +:remove_stop_words+, +:clean+, +:classic_filter+,
#   +:remove_emoji+, +:remove_emails+, +:remove_urls+, +:remove_domains+ -
#   stored as given.
#
# @param opts [Hash] tokenizer options (see above)
# @raise [RuntimeError] if +:punctuation+, +:numbers+ or +:mentions+ is not
#   an allowed value, or +:minimum_length+ / +:long_word_split+ is not an
#   Integer
def initialize(opts={})
  @filter_languages    = opts[:filter_languages] || []
  @language_module     = Languages.get_language_by_code(opts[:language])
  @expand_contractions = opts[:expand_contractions]
  @remove_stop_words   = opts[:remove_stop_words]
  @punctuation         = opts[:punctuation] ? opts[:punctuation].to_sym : :all
  @numbers             = opts[:numbers] ? opts[:numbers].to_sym : :all
  @minimum_length      = opts[:minimum_length] || 0
  @long_word_split     = opts[:long_word_split]
  @mentions            = opts[:mentions] ? opts[:mentions].to_sym : :keep_original
  # NOTE(review): unlike :mentions, :hashtags is not validated in this method.
  @hashtags            = opts[:hashtags] ? opts[:hashtags].to_sym : :keep_original
  @downcase            = opts[:downcase].nil? ? true : opts[:downcase]
  @clean               = opts[:clean]
  @classic_filter      = opts[:classic_filter]
  @remove_emoji        = opts[:remove_emoji]
  @remove_emails       = opts[:remove_emails]
  @remove_urls         = opts[:remove_urls]
  @remove_domains      = opts[:remove_domains]
  # Work on a private copy: the merge! calls below must never modify (or
  # fail on a frozen) hash that belongs to the caller.
  @contractions        = (opts[:contractions] || {}).dup
  @abbreviations       = Set.new(opts[:abbreviations])
  @stop_words          = Set.new(opts[:stop_words])

  # Why do we treat stop words differently than abbreviations and contractions? (we don't use @language_module::STOP_WORDS when passing @filter_languages)
  # NOTE(review): as written here, the stop-word fallback is gated only on
  # @stop_words being empty, same as the other two.
  @contractions.merge!(@language_module::CONTRACTIONS) if @contractions.empty?
  @abbreviations       += @language_module::ABBREVIATIONS if @abbreviations.empty?
  @stop_words          += @language_module::STOP_WORDS if @stop_words.empty?

  @filter_languages.each do |lang|
    language = Languages.get_language_by_code(lang)
    @contractions.merge!(language::CONTRACTIONS)
    @abbreviations += language::ABBREVIATIONS
    @stop_words    += language::STOP_WORDS
  end

  raise "Punctuation argument can be only be nil, :all, :semi, :none, or :only" unless PUNCTUATION_OPTIONS.include?(@punctuation)
  raise "Numbers argument can be only be nil, :all, :semi, :none, or :only" unless NUMBERS_OPTIONS.include?(@numbers)
  raise "Mentions argument can be only be nil, :keep_original, :keep_and_clean, or :remove" unless MENTIONS_OPTIONS.include?(@mentions)

  # is_a?(Integer) holds for Fixnum/Bignum on Ruby < 2.4 and for the unified
  # Integer on newer Rubies, so no RUBY_VERSION switch is needed.
  # @minimum_length can never be nil at this point (it falls back to 0).
  raise "In Pragmatic Tokenizer minimum_length must be an Integer" unless @minimum_length.is_a?(Integer)
  raise "In Pragmatic Tokenizer long_word_split must be an Integer" unless @long_word_split.nil? || @long_word_split.is_a?(Integer)
end
Instance Method Details
#tokenize(text) ⇒ Object
103 104 105 106 107 108 109 |
# File 'lib/pragmatic_tokenizer/tokenizer.rb', line 103

# Tokenizes +text+.
#
# HTML entities are unescaped first, the result is cut into segments with
# Regex::CHUNK_LONG_INPUT_TEXT, and every segment is handed to
# +process_segment+; the per-segment results are flattened one level into a
# single array.
#
# @param text [String, nil] input to tokenize
# @return [Array] +[]+ when +text+ is +nil+/+false+, otherwise the
#   concatenated results of +process_segment+
# @raise [RuntimeError] if +text+ is truthy but not a String (or subclass)
def tokenize(text)
  return [] unless text

  unless text.is_a?(String)
    raise "In PragmaticTokenizer text must be a String or subclass of String"
  end

  unescaped = CGI.unescapeHTML(text)
  segments  = unescaped.scan(Regex::CHUNK_LONG_INPUT_TEXT)
  segments.flat_map { |segment| process_segment(segment) }
end