Class: BowTfidf::Tokenizer

Inherits:
Object
  • Object
show all
Defined in:
lib/bow_tfidf/tokenizer.rb

Constant Summary collapse

# Character class that matches a single whitespace, punctuation, or symbol
# character (ASCII and typographic, e.g. curly quotes, dashes, currency signs).
# NOTE(review): presumably the delimiter used to split text into raw tokens —
# the use site is not visible here; confirm.
SPLIT_REGEX =
/[\s\n\t\.,\-\!:()\/%\\+\|@^<«>*'~;=»\?—•$”\"’\[£“■‘\{#®♦°™€¥\]©§\}–]/
# NOTE(review): presumably the shortest token length that is kept — confirm
# against the token-filtering code, which is not visible here.
TOKEN_MIN_LENGTH =
3
# NOTE(review): presumably the longest token length that is kept — confirm
# against the token-filtering code, which is not visible here.
TOKEN_MAX_LENGTH =
15

Instance Attribute Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize ⇒ Tokenizer

Returns a new instance of Tokenizer.



9
10
11
# File 'lib/bow_tfidf/tokenizer.rb', line 9

# Sets up a new tokenizer whose token collection starts out as an empty Set.
def initialize
  @tokens = Set.new
end

Instance Attribute Details

#tokens ⇒ Object (readonly)

Returns the value of attribute tokens.



7
8
9
# File 'lib/bow_tfidf/tokenizer.rb', line 7

# Read-only accessor for the tokenizer's token collection.
#
# @return [Object] the current value of the @tokens instance variable
#   (assigned an empty Set by #initialize)
def tokens
  @tokens
end

Instance Method Details

#call(text) ⇒ Object

Raises:

  • (ArgumentError)


13
14
15
16
17
18
19
20
21
22
23
# File 'lib/bow_tfidf/tokenizer.rb', line 13

# Splits +text+ into raw tokens and hands each one to #process_token.
#
# @param text [String] the text to tokenize
# @return [Object] the value of #tokens once every raw token has been processed
# @raise [ArgumentError] if +text+ is not a String
def call(text)
  unless text.is_a?(String)
    raise ArgumentError, 'String instance expected'
  end

  split(text).each { |piece| process_token(piece) }

  tokens
end