Class: Lurn::Text::WordCountVectorizer

Inherits:

Object

Object
Lurn::Text::WordCountVectorizer

show all

Defined in: lib/lurn/text/word_count_vectorizer.rb

Instance Attribute Summary collapse

#tokenizer ⇒ Object

Returns the value of attribute tokenizer.
#vocabulary ⇒ Object

Returns the value of attribute vocabulary.

Instance Method Summary collapse

#fit(documents) ⇒ Object
#initialize(options = {}) ⇒ WordCountVectorizer constructor

A new instance of WordCountVectorizer.
#to_h ⇒ Object
#transform(documents) ⇒ Object

Constructor Details

#initialize(options = {}) ⇒ `WordCountVectorizer`

Returns a new instance of WordCountVectorizer.

# File 'lib/lurn/text/word_count_vectorizer.rb', line 8

# Builds a vectorizer with an empty vocabulary.
#
# The given +options+ hash is copied before defaults are filled in, so the
# caller's hash is never modified and may be frozen.
#
# @param options [Hash] configuration; stored (with defaults applied) in @options
# @option options [Object] :tokenizer tokenizer to use; a new WordTokenizer
#   is created when this is missing, nil or false
# @option options [Object] :max_df set to 50 when missing, nil or false
# @option options [Object] :min_df set to 0 when missing, nil or false
def initialize(options = {})
  # Work on a shallow copy: `||=` below would otherwise write into the
  # caller's hash (and raise FrozenError if that hash is frozen).
  options = options.dup
  options[:max_df] ||= 50
  options[:min_df] ||= 0

  @tokenizer = options[:tokenizer] || WordTokenizer.new
  @vocabulary = []
  @options = options
end

Instance Attribute Details

#tokenizer ⇒ `Object`

Returns the value of attribute tokenizer.



5
6
7

# File 'lib/lurn/text/word_count_vectorizer.rb', line 5

# Reader for the tokenizer held in @tokenizer.
#
# @return [Object] the current value of @tokenizer
def tokenizer
  @tokenizer
end

#vocabulary ⇒ `Object`

Returns the value of attribute vocabulary.



6
7
8

# File 'lib/lurn/text/word_count_vectorizer.rb', line 6

# Reader for the vocabulary held in @vocabulary.
#
# @return [Object] the current value of @vocabulary
def vocabulary
  @vocabulary
end

Instance Method Details

#fit(documents) ⇒ `Object`

# File 'lib/lurn/text/word_count_vectorizer.rb', line 17

# Rebuilds the vocabulary from +documents+.
#
# The documents are passed to tokenize_documents; @vocabulary becomes the
# sorted list of distinct entries found one level deep in that result.
# reduce_features is then called with the tokenized documents.
#
# @param documents [Object] passed unchanged to tokenize_documents
# @return [Object] whatever reduce_features returns
def fit(documents)
  # Start from a clean slate before the helper runs.
  @vocabulary = []

  token_lists = tokenize_documents(documents)
  distinct_tokens = token_lists.flatten(1).uniq
  @vocabulary = distinct_tokens.sort

  reduce_features(token_lists)
end

#to_h ⇒ `Object`

# File 'lib/lurn/text/word_count_vectorizer.rb', line 24

# Hash representation of this vectorizer.
#
# @return [Hash] with :tokenizer_options set to @tokenizer.to_h and
#   :vocabulary set to @vocabulary
def to_h
  tokenizer_options = @tokenizer.to_h
  { tokenizer_options: tokenizer_options, vocabulary: @vocabulary }
end

#transform(documents) ⇒ `Object`

# File 'lib/lurn/text/word_count_vectorizer.rb', line 31

# Converts each document into a row of per-word occurrence counts.
#
# Each document is tokenized with @tokenizer.tokenize. The resulting row
# has one entry per element of @vocabulary, in the same order, giving how
# many times that element occurs among the document's tokens.
#
# @param documents [Enumerable] documents to vectorize
# @return [Array<Array<Integer>>] one row of counts per document
def transform(documents)
  documents.map do |document|
    # Tally the tokens once so each vocabulary lookup is a hash access
    # instead of a full rescan of the token list.
    counts = Hash.new(0)
    @tokenizer.tokenize(document).each { |token| counts[token] += 1 }

    @vocabulary.map { |word| counts[word] }
  end
end

Class: Lurn::Text::WordCountVectorizer

Instance Attribute Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(options = {}) ⇒ WordCountVectorizer

Instance Attribute Details

#tokenizer ⇒ Object

#vocabulary ⇒ Object

Instance Method Details

#fit(documents) ⇒ Object

#to_h ⇒ Object

#transform(documents) ⇒ Object

#initialize(options = {}) ⇒ `WordCountVectorizer`

#tokenizer ⇒ `Object`

#vocabulary ⇒ `Object`

#fit(documents) ⇒ `Object`

#to_h ⇒ `Object`

#transform(documents) ⇒ `Object`