Class: FeatureSet::DataSet
- Inherits:
-
Object
- Object
- FeatureSet::DataSet
- Defined in:
- lib/feature_set/data_set.rb
Constant Summary collapse
- BUILTIN_FEATURE_BUILDERS =
%w[FeatureSet::FeatureBuilders::Cuss FeatureSet::FeatureBuilders::Emoticon FeatureSet::FeatureBuilders::WordVector].map(&:constantize)
Instance Attribute Summary collapse
-
#data ⇒ Object
Returns the value of attribute data.
-
#feature_builders ⇒ Object
Returns the value of attribute feature_builders.
-
#features ⇒ Object
Returns the value of attribute features.
-
#name ⇒ Object
Returns the value of attribute name.
-
#options ⇒ Object
Returns the value of attribute options.
Class Method Summary collapse
Instance Method Summary collapse
- #add_data(data) ⇒ Object
- #add_feature_builders(*builders) ⇒ Object (also: #add_feature_builder)
- #build_features_for(data, opts = {}) ⇒ Object
- #build_features_from_data!(opts = {}) ⇒ Object
- #clear_data ⇒ Object
- #clear_features ⇒ Object
- #dump_feature_builders ⇒ Object
-
#initialize(options = {}) ⇒ DataSet
constructor
A new instance of DataSet.
- #load_feature_builders(serialized_builders) ⇒ Object
-
#output_numeric_arff(io) ⇒ Object
This only knows how to output ARFF files with true/false classes and all-numeric attributes.
- #to_rarff ⇒ Object
Constructor Details
#initialize(options = {}) ⇒ DataSet
Returns a new instance of DataSet.
18 19 20 21 22 23 24 |
# File 'lib/feature_set/data_set.rb', line 18
#
# Builds an empty data set: no feature builders, no features and no data.
#
# @param options [Hash] kept as-is in @options; options[:name] is copied to @name
def initialize(options = {})
  @options = options
  @name = options[:name]
  @feature_builders = []
  @features = []
  @data = []
end
Instance Attribute Details
#data ⇒ Object
Returns the value of attribute data.
16 17 18 |
# File 'lib/feature_set/data_set.rb', line 16
#
# @return [Object] the current value of @data
def data
  @data
end
#feature_builders ⇒ Object
Returns the value of attribute feature_builders.
16 17 18 |
# File 'lib/feature_set/data_set.rb', line 16
#
# @return [Object] the current value of @feature_builders
def feature_builders
  @feature_builders
end
#features ⇒ Object
Returns the value of attribute features.
16 17 18 |
# File 'lib/feature_set/data_set.rb', line 16
#
# @return [Object] the current value of @features
def features
  @features
end
#name ⇒ Object
Returns the value of attribute name.
16 17 18 |
# File 'lib/feature_set/data_set.rb', line 16
#
# @return [Object] the current value of @name
def name
  @name
end
#options ⇒ Object
Returns the value of attribute options.
16 17 18 |
# File 'lib/feature_set/data_set.rb', line 16
#
# @return [Object] the current value of @options
def options
  @options
end
Class Method Details
Instance Method Details
#add_data(data) ⇒ Object
26 27 28 |
# File 'lib/feature_set/data_set.rb', line 26
#
# Appends +data+ to @data and flattens @data in place, so either a single row
# or an array of rows may be passed.
#
# @param data [Object, Array] one row, or a (possibly nested) array of rows
# @return [Array] @data after the append
def add_data(data)
  @data << data
  # Array#flatten! returns nil when there was nothing to flatten, so its result
  # is not used as the return value.
  @data.flatten!
  @data
end
#add_feature_builders(*builders) ⇒ Object Also known as: add_feature_builder
114 115 116 117 |
# File 'lib/feature_set/data_set.rb', line 114
#
# Appends feature builders to @feature_builders, flattening nested arrays.
# When the first argument is :all or "all", the arguments are replaced by a
# fresh instance of every class in BUILTIN_FEATURE_BUILDERS.
#
# @param builders [Array] builder instances (arrays of them are accepted), or :all / "all"
# @return [Array] @feature_builders after the append
def add_feature_builders(*builders)
  builders = BUILTIN_FEATURE_BUILDERS.map(&:new) if [:all, "all"].include?(builders.first)
  (@feature_builders << builders).flatten!
end
#build_features_for(data, opts = {}) ⇒ Object
81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 |
# File 'lib/feature_set/data_set.rb', line 81
#
# Builds one output hash per input row by running every feature builder over
# every non-class datum of that row.
#
# @param data [Array] rows to process; passed through self.class.wrap_dataset
#   unless opts[:already_wrapped] is truthy
# @param opts [Hash]
#   :already_wrapped  - truthy to use +data+ as-is
#   :include_original - true to copy every datum's #value into the output under
#                       its original key, or a hash whose :except entry (one key
#                       or an array of keys) lists the keys that are not copied
# @return [Array<Hash>] one hash per row; generated features are stored under
#   :"<key>_<feature>" and the :class entry is copied through unchanged
def build_features_for(data, opts = {})
  # FYI, we explicitly do not call before_build_features because this can be used on unknown rows for classification, and
  # we want our feature builders to keep any cached data from the previous 'build_features_from_data!' call. This is important for
  # Wordvector, for example, since it needs to build the idf mappings beforehand and needs to re-use them on any new data.
  wrapped_data = opts[:already_wrapped] ? data : self.class.wrap_dataset(data)
  wrapped_data.map.with_index do |row, index|
    output_row = {}

    row.each do |key, datum|
      # The class label is carried over verbatim; no features are built for it.
      if key == :class
        output_row[:class] = datum
        next
      end

      if opts[:include_original] && (opts[:include_original].is_a?(TrueClass) || ![opts[:include_original][:except]].flatten.include?(key))
        output_row[key] = datum.value
      end

      feature_builders.each do |builder|
        builder.build_features(datum, key, row).each do |feature, value|
          output_row["#{key}_#{feature}".to_sym] = value
        end
      end
    end

    # Progress indicator: one dot on STDERR for every tenth row.
    if index % 10 == 0
      STDERR.print "."
      STDERR.flush
    end
    output_row
  end
end
#build_features_from_data!(opts = {}) ⇒ Object
75 76 77 78 79 |
# File 'lib/feature_set/data_set.rb', line 75
#
# Wraps the stored data, gives every feature builder a before_build_features
# call with the full wrapped data set, then builds the features and stores
# them in @features.
#
# @param opts [Hash] forwarded to build_features_for with :already_wrapped => true merged in
# @return [Array] the freshly built features (also assigned to @features)
def build_features_from_data!(opts = {})
  wrapped_data = self.class.wrap_dataset(data)
  feature_builders.each { |fb| fb.before_build_features(wrapped_data) }
  @features = build_features_for(wrapped_data, opts.merge(:already_wrapped => true))
end
#clear_data ⇒ Object
30 31 32 |
# File 'lib/feature_set/data_set.rb', line 30
#
# Replaces @data with a new empty array.
#
# @return [Array] the new, empty @data
def clear_data
  @data = []
end
#clear_features ⇒ Object
34 35 36 |
# File 'lib/feature_set/data_set.rb', line 34
#
# Replaces @features with a new empty array.
#
# @return [Array] the new, empty @features
def clear_features
  @features = []
end
#dump_feature_builders ⇒ Object
121 122 123 |
# File 'lib/feature_set/data_set.rb', line 121
#
# Serializes the current feature builders with Marshal.
#
# @return [String] Marshal dump of feature_builders
def dump_feature_builders
  Marshal.dump(feature_builders)
end
#load_feature_builders(serialized_builders) ⇒ Object
125 126 127 128 |
# File 'lib/feature_set/data_set.rb', line 125
#
# Clears the built features, then replaces the feature builders with the ones
# deserialized from +serialized_builders+.
#
# @param serialized_builders [String] a Marshal dump, e.g. from dump_feature_builders
# @return [Object] the deserialized feature builders
def load_feature_builders(serialized_builders)
  clear_features
  # NOTE(review): Marshal.load can instantiate arbitrary objects and is unsafe on
  # untrusted input; only feed it dumps this application produced itself.
  self.feature_builders = Marshal.load(serialized_builders)
end
#output_numeric_arff(io) ⇒ Object
This only knows how to output ARFF files with true/false classes and all-numeric attributes. Additionally, every row must have the same attributes.
62 63 64 65 66 67 68 69 70 71 72 73 |
# File 'lib/feature_set/data_set.rb', line 62
#
# Writes the built features to +io+ as an ARFF document in which every
# attribute is NUMERIC except the nominal {false,true} class attribute.
# The attribute list is taken from the first feature row, so every row must
# have the same attributes.
#
# The class attribute is always declared last, so the class value is also
# written last in every data row, whatever position :class has in the row hash.
# With no features, only the header is written.
#
# @param io [#puts] destination stream
# @return [Array] the features that were written
def output_numeric_arff(io)
  first_row = features.first
  keys = first_row ? first_row.keys : []
  numeric_keys = keys.reject { |key| key == :class }
  has_class = keys.include?(:class)

  io.puts "@RELATION Data"
  numeric_keys.each do |key|
    io.puts "@ATTRIBUTE #{key} NUMERIC"
  end
  io.puts "@ATTRIBUTE class {false,true}"
  io.puts "@DATA"
  features.each do |feature|
    values = numeric_keys.map { |key| feature[key].to_f }
    # Keep the data columns in the same order as the declared attributes.
    values << feature[:class].to_s if has_class
    io.puts values.join(",")
  end
end
#to_rarff ⇒ Object
38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 |
# File 'lib/feature_set/data_set.rb', line 38
#
# Converts the built features into a Rarff::Relation named after this data
# set ('Data' when no name is set). Column order comes from the keys of the
# first feature row. Strings get their backslashes, double quotes and single
# quotes backslash-escaped, symbols are turned into strings, and every other
# value is passed through untouched.
#
# With no features, the relation is returned without any instances.
#
# @return [Rarff::Relation]
def to_rarff
  relation = Rarff::Relation.new(name || 'Data')
  # Nothing to derive columns from; avoids calling #keys on nil.
  return relation if features.empty?

  keys = features.first.keys
  instances = features.map do |row|
    keys.map do |key|
      value = row[key]
      case value
      when String
        value.gsub(/\\/, "\\\\\\\\").gsub(/"/, "\\\\\"").gsub(/'/, '\\\\\'')
      when Symbol
        value.to_s
      else
        value
      end
    end
  end
  relation.instances = instances
  # The attributes are renamed only after the instances have been assigned.
  keys.each_with_index do |key, index|
    relation.attributes[index].name = key.to_s
  end
  relation
end