Method: Cass::Context#initialize
- Defined in:
- lib/cass/context.rb
#initialize(doc, opts) ⇒ Context
Returns a new instance of Context.
8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 |
# File 'lib/cass/context.rb', line 8
#
# Builds the list of context words for a document and stores it in @words,
# then calls index_words (defined elsewhere in this class).
#
# Steps, in order:
# 1. Tokenize the document text on whitespace.
# 2. Either keep only words whose token count lies within the
#    [min_prop, max_prop] proportion of all tokens, or (when no bounds are
#    given) simply de-duplicate the tokens.
# 3. Remove any words listed in the optional stopword file.
# 4. Optionally draw a random subset of 'context_size' words.
#
# Progress messages are printed only when a VERBOSE constant is defined and
# truthy.
#
# @param doc [#lines] object whose +lines+ can be joined into one string
#   (presumably an array of text lines -- confirm against the Document class)
# @param opts [Hash] string-keyed options:
#   'min_prop'     -- lower bound on token proportion (default 0)
#   'max_prop'     -- upper bound on token proportion (default 1)
#   'stop_file'    -- path to a whitespace-separated stopword file; ignored
#                     when missing, nil or empty
#   'context_size' -- number of words to sample at random; when the key is
#                     absent all remaining words are used
# @return [Context] a new instance of Context
# @raise [SystemExit] via abort when the stopword file cannot be read
def initialize(doc, opts)
  min_prop = opts['min_prop'] || 0
  max_prop = opts['max_prop'] || 1
  verbose = defined?(VERBOSE) && VERBOSE

  if verbose
    puts "Creating new context..."
    puts "Using all words with token frequency in range of #{min_prop} and #{max_prop}."
  end

  # split(' ') is the awk-style split: it ignores leading whitespace, so no
  # empty-string token is produced (split(/\s+/) would emit one, which would
  # inflate the word count and leak "" into the context).
  words = doc.lines.join(' ').split(' ')
  nwords = words.size
  puts "Found #{nwords} words." if verbose

  if min_prop > 0 || max_prop < 1
    counts = Hash.new(0)
    words.each { |word| counts[word] += 1 }
    # Convert the proportions into absolute token-count thresholds.
    min_t = (min_prop * nwords).round
    max_t = (max_prop * nwords).round
    words = counts.reject { |_word, count| count < min_t || count > max_t }.keys
  else
    words = words.uniq
  end

  # words = words - doc.targets

  stop_file = opts['stop_file']
  if stop_file && !stop_file.empty?
    begin
      # File.read opens and closes the file itself, so no handle is leaked.
      stopwords = File.read(stop_file).split(' ')
    rescue StandardError
      abort("Error: could not open stopword file #{opts['stop_file']}!")
    end
    puts "Removing #{stopwords.size} stopwords from context." if verbose
    words -= stopwords
  end

  # Random subset: order the words by a random key and keep the first
  # 'context_size' of them.
  @words = opts.key?('context_size') ? words.sort_by { rand }[0, opts['context_size']] : words
  index_words
  puts "Using #{@words.size} words as context." if verbose
end