Class: FeatureSet::FeatureBuilders::WordVector
- Defined in:
- lib/feature_set/feature_builders/word_vector.rb
Instance Attribute Summary collapse
-
#idfs ⇒ Object
Returns the value of attribute idfs.
Attributes inherited from Base
Instance Method Summary collapse
- #before_build_features(dataset) ⇒ Object
- #build_features(datum, key, row) ⇒ Object
-
#initialize(options = {}) ⇒ WordVector
constructor
Options: :tf_only => true|false, default is false; :idf_cutoff => <cutoff>, default is 10; :word_limit => <word limit>, default is 2000.
Constructor Details
#initialize(options = {}) ⇒ WordVector
Options:
:tf_only => true|false, default is false
:idf_cutoff => <cutoff>, default is 10
:word_limit => <word limit>, default is 2000
12 13 14 15 |
# File 'lib/feature_set/feature_builders/word_vector.rb', line 12
#
# Creates a word-vector feature builder with an empty weight table.
#
# The options hash is forwarded unchanged to the superclass constructor via
# bare +super+. The keys this class reads later are:
#
#   :tf_only    => true|false, default is false
#   :idf_cutoff => <cutoff>, default is 10
#   :word_limit => <word limit>, documented default is 2000
#
# @param options [Hash] builder configuration (see keys above)
def initialize(options = {})
  super
  # Filled in by #before_build_features; empty until the builder is trained.
  @idfs = {}
end
Instance Attribute Details
#idfs ⇒ Object
Returns the value of attribute idfs.
6 7 8 |
# File 'lib/feature_set/feature_builders/word_vector.rb', line 6
#
# Returns the learned weight table.
#
# The table maps each feature key to a hash of token => weight. It is empty
# after construction and is rebuilt by #before_build_features.
#
# @return [Hash] feature key => { token => weight }
def idfs
  @idfs
end
Instance Method Details
#before_build_features(dataset) ⇒ Object
17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 |
# File 'lib/feature_set/feature_builders/word_vector.rb', line 17
#
# Learns, for every String-valued feature in +dataset+, which tokens to keep
# and what weight each one gets, and stores the result in +idfs+.
#
# Steps:
# 1. Count, per feature key, how many rows contain each token (document
#    frequency). The :class key and non-String values are skipped.
# 2. Unless options[:tf_only] is set, replace each count with
#    log(num_docs / count) and drop tokens whose value is not below
#    options[:idf_cutoff] (default 10). With :tf_only the raw counts are kept.
# 3. Keep at most options[:word_limit] tokens (default 2000): the highest
#    counts under :tf_only, the lowest idf values otherwise.
#
# Progress is reported on STDERR.
#
# @param dataset [#each, #length] rows; each row yields (key, datum) pairs
#   where datum responds to +value+ and +token_counts+
# @return [Hash] the rebuilt +idfs+ table
def before_build_features(dataset)
  @idfs = {}

  dataset.each do |row|
    row.each do |key, datum|
      next if key == :class
      next unless datum.value.is_a?(String)

      doc_counts = (idfs[key] ||= {})
      # Each token counts once per row, however often it occurs in that row.
      datum.token_counts.each_key do |token|
        doc_counts[token] = (doc_counts[token] || 0) + 1
      end
    end
  end

  num_docs = dataset.length
  idf_cutoff = (options[:idf_cutoff] || 10).to_f
  word_limit = options[:word_limit] || 2000
  tf_only = options[:tf_only]
  STDERR.puts "Done building df counts. The dataset has #{num_docs} documents."

  idfs.each do |feature, freqs|
    pruned = 0
    weights =
      if tf_only
        freqs
      else
        freqs.each_with_object({}) do |(token, doc_count), kept|
          idf = Math.log(num_docs / doc_count.to_f)
          if idf < idf_cutoff
            kept[token] = idf
          else
            pruned += 1
          end
        end
      end

    # The limit is always applied so that the documented default of 2000
    # takes effect even when :word_limit is not passed explicitly.
    # Most frequent first for raw counts, lowest idf first otherwise.
    ranked = weights.sort_by { |_token, weight| tf_only ? -weight : weight }
    limited = ranked[0...word_limit].each_with_object({}) do |(token, weight), kept|
      kept[token] = weight
    end

    # Reassigning an existing key while iterating the hash is safe.
    idfs[feature] = limited
    STDERR.puts "Done calculating idfs for #{feature}. Pruned #{pruned} rare values, leaving #{idfs[feature].length} values."
  end
end
#build_features(datum, key, row) ⇒ Object
65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 |
# File 'lib/feature_set/feature_builders/word_vector.rb', line 65
#
# Builds the word-vector features for a single datum.
#
# For every word stored in idfs[key], the feature "wv_<word>" is the word's
# term frequency in +datum+ (occurrences / number of tokens). Unless
# options[:tf_only] is set, it is multiplied by the stored weight for that
# word. A datum with no tokens gets 0.0 for every feature, not NaN from 0 / 0.0.
#
# If idfs has no entry for +key+, a warning is written to STDERR and an
# empty hash is returned.
#
# @param datum [#value, #tokens, #token_counts] the value to featurize
# @param key [Object] feature key used to look up the learned words
# @param row [Object] the enclosing row (unused here)
# @return [Hash{String => Float}] feature name => value; {} for non-String data
def build_features(datum, key, row)
  return {} unless datum.value.is_a?(String)

  num_words = datum.tokens.length.to_f
  unless idfs[key]
    STDERR.puts "WARNING: build_features called on untrained data in WordVector. Are you calling 'data_set.build_features_for' without calling 'data_set.build_features_from_data!' first?"
  end

  tf_only = options[:tf_only]
  (idfs[key] || {}).each_with_object({}) do |(word, idf), features|
    occurrences = datum.token_counts[word] || 0
    # Guard the division: an empty document has no term frequencies.
    tf = num_words.zero? ? 0.0 : occurrences / num_words
    features["wv_#{word}"] = tf_only ? tf : tf * idf
  end
end