Class: Lda::Document

Inherits:
Object
  • Object
show all
Defined in:
lib/lda-ruby/document/document.rb,
ext/lda-ruby/lda-inference.c

Direct Known Subclasses

DataDocument, TextDocument

Instance Attribute Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(corpus) ⇒ Document

Returns a new instance of Document.



8
9
10
11
12
13
14
15
16
# File 'lib/lda-ruby/document/document.rb', line 8

# Creates an empty document bound to +corpus+.
#
# Every collection starts out as its own empty Array and both numeric
# fields start at zero; nothing is read from +corpus+ here.
#
# @param corpus [Object] stored as-is in @corpus and exposed via #corpus
def initialize(corpus)
  @corpus = corpus

  # Three separate literals so the arrays never share state.
  @words  = []
  @counts = []
  @tokens = []

  @total  = 0
  @length = 0
end

Instance Attribute Details

#corpus ⇒ Object (readonly)

Returns the value of attribute corpus.



6
7
8
# File 'lib/lda-ruby/document/document.rb', line 6

# Reader for the @corpus instance variable, which the constructor sets
# from its +corpus+ argument.
#
# @return [Object] the current value of @corpus
def corpus
  @corpus
end

#counts ⇒ Object (readonly)

Returns the value of attribute counts.



6
7
8
# File 'lib/lda-ruby/document/document.rb', line 6

# Reader for the @counts instance variable. The constructor starts it as an
# empty Array, and #recompute sums its elements to produce @total.
#
# NOTE(review): presumably one occurrence count per entry in #words —
# confirm against the code that fills it.
#
# @return [Array] the current value of @counts
def counts
  @counts
end

#length ⇒ Object (readonly)

Returns the value of attribute length.



6
7
8
# File 'lib/lda-ruby/document/document.rb', line 6

# Reader for the @length instance variable. It is 0 after construction and
# is set to @words.size each time #recompute runs.
#
# @return [Integer] the current value of @length
def length
  @length
end

#tokens ⇒ Object (readonly)

Returns the value of attribute tokens.



6
7
8
# File 'lib/lda-ruby/document/document.rb', line 6

# Reader for the @tokens instance variable. It is an empty Array after
# construction; #tokenize replaces it with the result of passing the split
# text through #handle.
#
# @return [Array] the current value of @tokens
def tokens
  @tokens
end

#total ⇒ Object (readonly)

Returns the value of attribute total.



6
7
8
# File 'lib/lda-ruby/document/document.rb', line 6

# Reader for the @total instance variable. It is 0 after construction and
# is set to the sum of @counts each time #recompute runs.
#
# @return [Numeric] the current value of @total
def total
  @total
end

#words ⇒ Object (readonly)

Returns the value of attribute words.



6
7
8
# File 'lib/lda-ruby/document/document.rb', line 6

# Reader for the @words instance variable. The constructor starts it as an
# empty Array, and #recompute uses its size as the document length.
#
# NOTE(review): the element type is not visible here — presumably word
# identifiers within the corpus vocabulary; confirm against the code that
# fills it.
#
# @return [Array] the current value of @words
def words
  @words
end

Instance Method Details

#handle(tokens) ⇒ Object



30
31
32
# File 'lib/lda-ruby/document/document.rb', line 30

# Returns +tokens+ untouched. #tokenize routes its freshly split word list
# through this method before storing it in @tokens.
#
# NOTE(review): presumably an override point for subclasses that want to
# filter or transform tokens — confirm.
#
# @param tokens [Array<String>] the word list produced by #tokenize
# @return [Array<String>] the same object that was passed in
def handle(tokens)
  tokens
end

#has_text? ⇒ Boolean

Returns:

  • (Boolean)


26
27
28
# File 'lib/lda-ruby/document/document.rb', line 26

# Always answers false in this class.
#
# NOTE(review): presumably overridden by subclasses that carry raw text —
# confirm.
#
# @return [Boolean] always false here
def has_text?
  false
end

#recompute ⇒ Object

Recompute the total and length values.



21
22
23
24
# File 'lib/lda-ruby/document/document.rb', line 21

# Refreshes the two derived fields from the current collections:
# @total becomes the sum of @counts (0 when empty) and @length becomes
# the number of entries in @words.
#
# @return [Integer] the new @length (value of the final assignment)
def recompute
  @total  = @counts.reduce(0, :+)
  @length = @words.size
end

#tokenize(text) ⇒ Object



34
35
36
37
38
# File 'lib/lda-ruby/document/document.rb', line 34

# Splits +text+ into lower-cased word tokens and stores them in @tokens.
#
# The split list is passed through #handle before being stored, so the
# stored value is whatever #handle returns.
#
# @param text [String] raw text to tokenize
# @return [nil] always; the result is available through #tokens
def tokenize(text)
  # Keep only a-z, ä, ö, ü, ß, apostrophes and hyphens (case-insensitive);
  # every run of anything else becomes a single space.
  stripped = text.gsub(/[^a-zäöüß'-]+/i, ' ')
  # Collapse whitespace runs to one space, then lower-case.
  normalized = stripped.gsub(/\s+/, ' ').downcase
  words = normalized.split(' ')
  @tokens = handle(words)
  nil
end