Method: Cass::Context#initialize

Defined in:
lib/cass/context.rb

#initialize(doc, opts) ⇒ Context

Returns a new instance of Context.



8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
# File 'lib/cass/context.rb', line 8

# Builds the list of context words from a document and an options hash.
#
# The document text is split on whitespace into tokens. The distinct tokens
# are then optionally filtered by token frequency, stripped of stopwords and
# randomly subsampled. The result is stored in @words, after which
# index_words (defined elsewhere in this class) is called.
#
# Progress messages are printed only when a truthy VERBOSE constant is defined.
#
# @param doc [#lines] source document. Only +doc.lines+ is read here and its
#   elements are joined with single spaces (presumably an array of strings --
#   confirm against the document class).
# @param opts [Hash] string-keyed options:
#   - 'min_prop' (default 0): lower bound on a word's token count, expressed
#     as a proportion of the total token count.
#   - 'max_prop' (default 1): upper bound, expressed the same way.
#   - 'stop_file': path to a whitespace-separated stopword file; nil or an
#     empty string means no stopword filtering.
#   - 'context_size': number of words to keep, chosen at random; nil or
#     absent keeps every word.
# @return [Context]
# @raise [SystemExit] via +abort+ when the stopword file cannot be read.
def initialize(doc, opts)
  verbose = defined?(VERBOSE) && VERBOSE
  min_prop = opts['min_prop'] || 0
  max_prop = opts['max_prop'] || 1
  if verbose
    puts "Creating new context..."
    puts "Using all words with token frequency in range of #{min_prop} and #{max_prop}."
  end

  # String#split without a pattern skips leading whitespace, so text that
  # starts with blanks no longer contributes an empty-string token.
  words = doc.lines.join(' ').split
  nwords = words.size
  puts "Found #{nwords} words." if verbose

  if min_prop > 0 || max_prop < 1
    counts = Hash.new(0)
    words.each { |word| counts[word] += 1 }
    # Convert the proportions into absolute token-count thresholds.
    min_t = (min_prop * nwords).round
    max_t = (max_prop * nwords).round
    # Hash keys keep insertion order, so first-occurrence order is preserved.
    words = counts.keys.select { |word| counts[word].between?(min_t, max_t) }
  else
    words = words.uniq
  end

  # words = words - doc.targets
  stop_file = opts['stop_file']
  if stop_file && !stop_file.empty?
    begin
      # File.read closes the handle once the contents have been read.
      stopwords = File.read(stop_file).split
    rescue StandardError
      abort("Error: could not open stopword file #{stop_file}!")
    end
    puts "Removing #{stopwords.size} stopwords from context." if verbose
    words -= stopwords
  end

  # sort_by { rand } is kept (instead of shuffle/sample) so that selections
  # made under a seeded RNG stay the same as before.
  context_size = opts['context_size']
  @words = context_size ? words.sort_by { rand }[0, context_size] : words
  index_words
  puts "Using #{@words.size} words as context." if verbose
end