Class: Eluka::Document

Inherits:
Object
  • Object
show all
Defined in:
lib/eluka/document.rb

Instance Method Summary collapse

Constructor Details

#initialize(field, text, analyzer) ⇒ Document

Returns a new instance of Document.



5
6
7
8
9
10
11
# File 'lib/eluka/document.rb', line 5

# Creates a document for one field of text and indexes it immediately.
#
# The three arguments are stored as-is; bag_of_words is then invoked so
# that @bag_of_words is populated before the constructor returns.
#
# @param field    name of the field this text belongs to (stored in @field)
# @param text     the raw text to index (stored in @text)
# @param analyzer object responding to token_stream(field, text)
def initialize(field, text, analyzer)
  @field, @text, @analyzer = field, text, analyzer

  # Start from a known-empty state, then build the index eagerly.
  @bag_of_words = nil
  bag_of_words
end

Instance Method Details

#bag_of_words ⇒ Object



13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
# File 'lib/eluka/document.rb', line 13

# Tokenizes @text with @analyzer and rebuilds the positional index stored
# in @bag_of_words: each token text maps to the Array of positions at
# which it occurs, in stream order. A position is the running sum of the
# pos_inc values of all tokens seen so far.
#
# The index is rebuilt from scratch on every call.
#
# @return [Hash] the rebuilt index, token text => Array of positions
def bag_of_words
  # Position counter for the document
  pos = 0

  @bag_of_words = {}

  # Token stream
  # NOTE(review): the literal symbol :field is passed as the field name,
  # not @field -- confirm this is intended.
  token_stream = @analyzer.token_stream(:field, @text)
  while (token = token_stream.next)
    pos += token.pos_inc

    (@bag_of_words[token.text] ||= []) << pos
  end

  # Hand back the index itself; a trailing `while` would make the method
  # evaluate to nil.
  @bag_of_words
end

#vector ⇒ Object



30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
# File 'lib/eluka/document.rb', line 30

# Converts the positional index into a length-normalized term vector.
#
# Each term's weight is its occurrence count (the size of its position
# list) divided by the Euclidean norm of all the counts. Keys are @field
# and the term joined with "||".
#
# @return [Hash] "field||term" => Float weight; empty when no terms are indexed
def vector
  # Euclidean norm of the raw term counts
  sum_of_squares = @bag_of_words.inject(0) do |acc, (_term, positions)|
    acc + positions.size**2
  end
  norm = sum_of_squares.to_f**0.5

  @bag_of_words.each_with_object({}) do |(term, positions), weights|
    weights[[@field, term].join("||")] = positions.size.to_f / norm
  end
end