Class: Lda::Document

Inherits:
Object
  • Object
show all
Defined in:
lib/lda-ruby/document/document.rb,
ext/lda-ruby/lda-inference.c

Direct Known Subclasses

DataDocument, TextDocument

Instance Attribute Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(corpus) ⇒ Document

Returns a new instance of Document.



7
8
9
10
11
12
13
14
15
# File 'lib/lda-ruby/document/document.rb', line 7

# Builds an empty document attached to the given corpus.
#
# The three collections start out as separate empty arrays and both
# counters start at zero.
#
# @param corpus [Object] stored as-is in @corpus and exposed through #corpus
def initialize(corpus)
  @corpus = corpus
  @words  = []
  @counts = []
  @tokens = []
  @length = 0
  @total  = 0
end

Instance Attribute Details

#corpus ⇒ Object (readonly)

Returns the value of attribute corpus.



5
6
7
# File 'lib/lda-ruby/document/document.rb', line 5

# Reader for @corpus, which #initialize sets from its +corpus+ argument.
#
# @return [Object] the stored corpus reference
def corpus
  @corpus
end

#counts ⇒ Object (readonly)

Returns the value of attribute counts.



5
6
7
# File 'lib/lda-ruby/document/document.rb', line 5

# Reader for @counts. It is an empty Array after #initialize, and
# #recompute sums its elements into @total.
# NOTE(review): presumably counts[i] is the occurrence count of words[i] — confirm against subclasses.
#
# @return [Array] the stored counts
def counts
  @counts
end

#length ⇒ Object (readonly)

Returns the value of attribute length.



5
6
7
# File 'lib/lda-ruby/document/document.rb', line 5

# Reader for @length. It is 0 after #initialize, and #recompute sets it
# to the size of @words.
#
# @return [Integer] the cached length (only as fresh as the last #recompute)
def length
  @length
end

#tokens ⇒ Object (readonly)

Returns the value of attribute tokens.



5
6
7
# File 'lib/lda-ruby/document/document.rb', line 5

# Reader for @tokens. It is an empty Array after #initialize, and
# #tokenize replaces it with whatever #handle returns for the split text.
#
# @return [Array] the stored tokens (an overriding #handle decides the actual value)
def tokens
  @tokens
end

#total ⇒ Object (readonly)

Returns the value of attribute total.



5
6
7
# File 'lib/lda-ruby/document/document.rb', line 5

# Reader for @total. It is 0 after #initialize, and #recompute sets it
# to the sum of the entries in @counts.
#
# @return [Numeric] the cached total (only as fresh as the last #recompute)
def total
  @total
end

#words ⇒ Object (readonly)

Returns the value of attribute words.



5
6
7
# File 'lib/lda-ruby/document/document.rb', line 5

# Reader for @words. It is an empty Array after #initialize, and
# #recompute uses its size as @length.
# NOTE(review): element type is not visible here (possibly vocabulary indices) — confirm against subclasses.
#
# @return [Array] the stored words
def words
  @words
end

Instance Method Details

#handle(tokens) ⇒ Object



29
30
31
# File 'lib/lda-ruby/document/document.rb', line 29

# Hook that #tokenize passes the freshly split token list through before
# storing it in @tokens. This implementation returns its argument unchanged.
# NOTE(review): presumably meant to be overridden by subclasses — confirm.
#
# @param tokens [Array] token list (#tokenize passes the result of String#split)
# @return [Object] the very object that was passed in
def handle(tokens)
  tokens
end

#has_text? ⇒ Boolean

Returns:

  • (Boolean)


25
26
27
# File 'lib/lda-ruby/document/document.rb', line 25

# Always answers false in this class.
# NOTE(review): the name suggests it signals whether the document is backed
# by raw text, and subclasses presumably override it — confirm.
#
# @return [Boolean] false
def has_text?
  false
end

#recompute ⇒ Object

Recompute the total and length values.



20
21
22
23
# File 'lib/lda-ruby/document/document.rb', line 20

# Refreshes the cached aggregates from the current @counts and @words.
#
# @total becomes the sum of every entry in @counts (0 when it is empty)
# and @length becomes the number of entries in @words.
#
# @return [Integer] the new @length (value of the final assignment)
def recompute
  @total  = @counts.reduce(0, :+)
  @length = @words.size
end

#tokenize(text) ⇒ Object



33
34
35
36
37
# File 'lib/lda-ruby/document/document.rb', line 33

# Splits raw text into lowercase word tokens and stores them in @tokens.
#
# Every run of characters other than ASCII letters, apostrophes and
# whitespace is replaced by a single space. The result is lowercased and
# split on whitespace, and the token list is passed through #handle
# before being stored.
#
# @param text [String] raw text to tokenize
# @return [nil] always; the tokens are kept in @tokens
def tokenize(text)
  # split(' ') ignores leading/trailing whitespace and treats any run of
  # whitespace as one separator, so no separate whitespace-collapsing pass
  # over the string is needed.
  clean_text = text.gsub(/[^A-Za-z'\s]+/, ' ').downcase
  @tokens = handle(clean_text.split(' '))
  nil
end