Class: Lurn::Text::BernoulliVectorizer

Inherits:
Object
  • Object
show all
Defined in:
lib/lurn/text/bernoulli_vectorizer.rb

Instance Attribute Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(options = {}) ⇒ BernoulliVectorizer

Returns a new instance of BernoulliVectorizer.



8
9
10
11
12
13
14
15
# File 'lib/lurn/text/bernoulli_vectorizer.rb', line 8

# Sets up the vectorizer with a tokenizer, an empty vocabulary and its options.
#
# The options hash is shallow-copied before defaults are filled in, so the
# caller's hash is never modified and a frozen hash is accepted.
#
# @param options [Hash] configuration. Keys read here:
#   - :tokenizer — stored in @tokenizer; when absent or falsy a new
#     WordTokenizer is created instead
#   - :max_df — defaults to 50 when absent or falsy
#   - :min_df — defaults to 0 when absent or falsy
#   NOTE(review): :max_df / :min_df are presumably document-frequency bounds
#   consumed by feature reduction elsewhere in the class — confirm.
# @return [BernoulliVectorizer] a new instance of BernoulliVectorizer
def initialize(options = {})
  # Copy first: the ||= defaults below must not leak into the caller's hash.
  settings = options.dup

  @tokenizer = settings[:tokenizer] || WordTokenizer.new
  @vocabulary = []

  settings[:max_df] ||= 50
  settings[:min_df] ||= 0
  @options = settings
end

Instance Attribute Details

#tokenizer ⇒ Object

Returns the value of attribute tokenizer.



5
6
7
# File 'lib/lurn/text/bernoulli_vectorizer.rb', line 5

# Reader for the tokenizer this vectorizer uses to split documents.
#
# @return [Object] the current value of @tokenizer (set in #initialize from
#   options[:tokenizer], or a new WordTokenizer when none was supplied)
def tokenizer
  @tokenizer
end

#vocabulary ⇒ Object

Returns the value of attribute vocabulary.



6
7
8
# File 'lib/lurn/text/bernoulli_vectorizer.rb', line 6

# Reader for the vocabulary held by this vectorizer.
#
# @return [Array] the current value of @vocabulary; it starts as an empty
#   array in #initialize and is reassigned by #fit to the sorted, de-duplicated
#   tokens of the fitted documents
#   NOTE(review): #fit also calls reduce_features afterwards, which may change
#   it further — confirm against that helper.
def vocabulary
  @vocabulary
end

Instance Method Details

#fit(documents) ⇒ Object



17
18
19
20
21
22
# File 'lib/lurn/text/bernoulli_vectorizer.rb', line 17

# Learns the vocabulary from a collection of documents.
#
# The vocabulary is cleared, every document is tokenized via
# tokenize_documents, and the distinct tokens across all documents are stored
# in sorted order. The per-document token lists are then handed to
# reduce_features.
#
# @param documents [Enumerable] the documents to learn from, passed unchanged
#   to tokenize_documents
# @return [Object] whatever reduce_features returns
def fit(documents)
  # Reset before tokenizing so no stale vocabulary is visible to the helpers.
  @vocabulary = []

  token_lists = tokenize_documents(documents)
  distinct_tokens = token_lists.flatten(1).uniq
  @vocabulary = distinct_tokens.sort

  reduce_features(token_lists)
end

#to_h ⇒ Object



24
25
26
27
28
29
# File 'lib/lurn/text/bernoulli_vectorizer.rb', line 24

# Describes this vectorizer as a plain hash.
#
# @return [Hash] a hash with two keys:
#   - :tokenizer_options — the result of calling to_h on the tokenizer
#   - :vocabulary — the current @vocabulary (the same object, not a copy)
def to_h
  snapshot = {}
  snapshot[:tokenizer_options] = @tokenizer.to_h
  snapshot[:vocabulary] = @vocabulary
  snapshot
end

#transform(documents) ⇒ Object



31
32
33
34
35
36
37
38
# File 'lib/lurn/text/bernoulli_vectorizer.rb', line 31

# Converts documents into rows of presence flags over the vocabulary.
#
# Each document is tokenized with @tokenizer; its row holds one entry per
# vocabulary word, in vocabulary order, that is true when the document's
# tokens include that word and false otherwise.
#
# @param documents [Enumerable] the documents to vectorize
# @return [Array<Array<Boolean>>] one row per document, one flag per
#   vocabulary word
def transform(documents)
  documents.map do |text|
    found = @tokenizer.tokenize(text)
    @vocabulary.map { |term| found.include?(term) }
  end
end