Class: Treat::Learning::DataSet

Inherits:
Object
  • Object
show all
Defined in:
lib/treat/learning/data_set.rb

Overview

A DataSet contains an entity classification problem as well as data for entities that have already been classified, complete with references to these entities.

Instance Attribute Summary collapse

Class Method Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(problem) ⇒ DataSet

Initialize the DataSet.



15
16
17
18
19
20
21
22
# File 'lib/treat/learning/data_set.rb', line 15

# Create an empty data set bound to a classification problem.
#
# @param problem [Treat::Learning::Problem] the problem this data set
#   will hold data for.
# @raise [Treat::Exception] if +problem+ is not a Treat::Learning::Problem.
def initialize(problem)
  message = 'The first argument to initialize should be ' \
            'an instance of Treat::Learning::Problem.'
  raise Treat::Exception, message unless problem.is_a?(Treat::Learning::Problem)

  @problem = problem
  # Starts out with no classified items.
  @items = []
end

Instance Attribute Details

#items ⇒ Object

Items that have been already classified (training data).



12
13
14
# File 'lib/treat/learning/data_set.rb', line 12

# Reader for the items held by this data set (the already-classified
# training data).
#
# @return [Object] the current value of @items.
def items
  @items
end

#problem ⇒ Object

The classification problem this data set holds data for.



9
10
11
# File 'lib/treat/learning/data_set.rb', line 9

# Reader for the classification problem this data set holds data for.
#
# @return [Object] the current value of @problem.
def problem
  @problem
end

Class Method Details

.build(from) ⇒ Object



24
25
26
27
28
29
30
31
32
33
34
35
36
37
# File 'lib/treat/learning/data_set.rb', line 24

# Build a data set from a query hash or from a serialized file.
#
# @param from [Hash, String] a Hash is passed to the handler named by
#   Treat.databases.default.adapter; a String is treated as a file path
#   whose extension (without the leading dot) names the handler.
# @return [Object, nil] whatever the selected handler returns, or nil
#   when +from+ is neither a Hash nor a String.
# @raise [Treat::Exception] if +from+ is a path that is not readable.
def self.build(from)
  case from
  when Hash
    adapter = Treat.databases.default.adapter
    Treat::Learning::DataSet.unserialize(adapter, from)
  when String
    unless File.readable?(from)
      raise Treat::Exception,
            "Attempting to initialize data set from file '#{from}', " \
            'but it is not readable.'
    end
    # "data.marshal" -> "marshal"; a path without an extension yields nil.
    handler = File.extname(from)[1..-1]
    Treat::Learning::DataSet.unserialize(handler, file: from)
  end
end

.from_marshal(options) ⇒ Object

Unserialize the data using Marshal.



83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
# File 'lib/treat/learning/data_set.rb', line 83

# Rebuild a data set from a file containing a Marshal dump.
#
# The payload is destructured into a problem and a list of items. Every
# feature and tag that carries a +proc_string+ gets its +proc+ rebuilt by
# evaluating that string.
#
# SECURITY NOTE(review): both Marshal.load and eval execute whatever the
# file contains. Only load files from a trusted source.
#
# @param options [Hash] must contain :file, the path of the dump to read.
# @return [Treat::Learning::DataSet] a data set holding the loaded
#   problem and items.
def self.from_marshal(options)
  file = options[:file]
  # Read in binary mode: a Marshal dump is raw bytes, not text.
  data = Marshal.load(File.binread(file))
  problem, items = *data
  problem.features.each do |feature|
    # Entries without a proc source string are left untouched.
    next unless feature.proc_string
    # eval runs in this method's binding; keep local names stable.
    feature.proc = eval(feature.proc_string)
  end
  problem.tags.each do |tag|
    next unless tag.proc_string
    tag.proc = eval(tag.proc_string)
  end
  data_set = Treat::Learning::DataSet.new(problem)
  data_set.items = items
  data_set
end

.from_mongo(options) ⇒ Object



123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
# File 'lib/treat/learning/data_set.rb', line 123

# Load a data set from MongoDB.
#
# The problem record is looked up in the 'problems' collection by
# options[:problem]. Every document in the 'data' collection matching the
# remaining options becomes one item.
#
# The caller's +options+ hash is left untouched. Connection keys are
# stripped from a private copy, so the same hash can be reused for
# several calls.
#
# @param options [Hash] query for the 'data' collection. :host and :db
#   select the server and database (defaulting to Treat.databases.mongo)
#   and are not part of the query; :problem is the problem ID.
# @return [Treat::Learning::DataSet] data set holding the stored problem
#   and the retrieved items.
# @raise [Treat::Exception] if no problem record has the requested ID.
def self.from_mongo(options)
  require 'mongo'
  # Work on a copy: deleting from the caller's hash would silently drop
  # their host/db settings on any later call with the same hash.
  query = options.dup
  host = query.delete(:host) || Treat.databases.mongo.host
  db = query.delete(:db) || Treat.databases.mongo.db
  database = Mongo::Connection.new(host).db(db)

  problem_id = query[:problem]
  p_record = database.collection('problems').find_one(id: problem_id)
  unless p_record
    raise Treat::Exception,
          "Couldn't retrieve problem ID #{problem_id}."
  end
  problem = Treat::Learning::Problem.from_hash(p_record)

  # Stored features/tags are label => value hashes; items keep only the
  # values, in stored order.
  items = database.collection('data').find(query).to_a.map do |datum|
    {
      features: datum['features'].values,
      tags: datum['tags'].values,
      id: datum['id']
    }
  end

  data_set = Treat::Learning::DataSet.new(problem)
  data_set.items = items
  data_set
end

.unserialize(handler, options) ⇒ Object

Unserialize a data set file created by using the #serialize method.



62
63
64
# File 'lib/treat/learning/data_set.rb', line 62

# Dispatch to the class-level loader named after +handler+.
#
# @param handler [#to_s] name interpolated into "from_<handler>".
# @param options [Hash] passed unchanged to the loader.
# @return [Object] whatever the selected loader returns.
def self.unserialize(handler, options)
  loader = "from_#{handler}"
  send(loader, options)
end

Instance Method Details

#<<(entity) ⇒ Object

Add an entity to the data set. The entity’s relevant features are calculated based on the classification problem, and a line with the results of the calculation is added to the data set, along with the ID of the entity.



45
46
47
48
49
50
51
52
# File 'lib/treat/learning/data_set.rb', line 45

# Append one entity to the data set.
#
# Tags are exported only when the problem defines some; otherwise the
# item gets an empty tag list. Features are always exported.
#
# @param entity [#id] the entity to export through the problem.
# @return [Object] the result of appending to @items.
def <<(entity)
  tags = @problem.tags.empty? ? [] : @problem.export_tags(entity)
  features = @problem.export_features(entity)
  @items << { tags: tags, features: features, id: entity.id }
end

#==(data_set) ⇒ Object

Compare with other data set.



162
163
164
165
# File 'lib/treat/learning/data_set.rb', line 162

# Compare with another data set.
#
# Two data sets are equal when both their problems and their items are
# equal. Anything that does not expose +problem+ and +items+ (nil, a
# String, ...) is simply unequal rather than raising NoMethodError.
#
# @param data_set [Object] the object to compare against.
# @return [Boolean] true when problem and items both match.
def ==(data_set)
  return false unless data_set.respond_to?(:problem) &&
                      data_set.respond_to?(:items)

  @problem == data_set.problem && @items == data_set.items
end

#merge(data_set) ⇒ Object

Merge another data set into this one.



151
152
153
154
155
156
157
158
159
# File 'lib/treat/learning/data_set.rb', line 151

# Append the items of another data set to this one.
#
# @items is reassigned to a new concatenated collection; the other data
# set is not modified.
#
# @param data_set [#problem, #items] the data set to merge in.
# @return [Object] the new value of @items.
# @raise [Treat::Exception] if the two data sets reference different
#   problems.
def merge(data_set)
  mismatch = data_set.problem != @problem
  if mismatch
    raise Treat::Exception,
          "Cannot merge two data sets that don't reference the same problem."
  end

  @items += data_set.items
end

#serialize(handler, options = {}) ⇒ Object

Serialize the data set to a file, or store it inside the database.



56
57
58
# File 'lib/treat/learning/data_set.rb', line 56

# Dispatch to the instance-level writer named after +handler+.
#
# @param handler [#to_s] name interpolated into "to_<handler>".
# @param options [Hash] passed unchanged to the writer.
# @return [Object] whatever the selected writer returns.
def serialize(handler, options = {})
  writer = "to_#{handler}"
  send(writer, options)
end

#to_marshal(options) ⇒ Object

Serialize the data set using Marshal.



67
68
69
70
71
72
73
74
75
76
77
78
79
80
# File 'lib/treat/learning/data_set.rb', line 67

# Serialize the data set to a file using Marshal.
#
# Proc objects cannot be marshalled, so the +proc+ of every feature and
# tag is cleared while the dump is produced. The loader can rebuild them
# from +proc_string+.
#
# Object#dup is a shallow copy, so (unless Problem customizes copying,
# which is not visible here) the copy shares its feature and tag objects
# with the live @problem. The procs are therefore restored once the dump
# is written, keeping this data set usable after serialization.
#
# @param options [Hash] must contain :file, the path to write to.
# @return [Integer] number of bytes written.
def to_marshal(options)
  file = options[:file]
  problem = @problem.dup
  exportable = problem.features.to_a + problem.tags.to_a
  saved_procs = exportable.map { |entry| entry.proc }
  begin
    exportable.each { |entry| entry.proc = nil }
    # Binary mode: the payload is raw bytes and is read back with binread.
    File.binwrite(file, Marshal.dump([problem, @items]))
  ensure
    exportable.zip(saved_procs) { |entry, saved| entry.proc = saved }
  end
end

#to_mongo(options) ⇒ Object

Serialize the data set to a MongoDB record.



101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
# File 'lib/treat/learning/data_set.rb', line 101

# Store the data set in MongoDB.
#
# The problem is upserted into the 'problems' collection, then one
# document per item is inserted into the 'data' collection. In each
# document the feature and tag values are keyed by their labels and the
# problem ID is added under :problem.
#
# The in-memory items are not modified: every document is a separate
# hash, so the data set stays usable (and can be stored again) after
# this call.
#
# @param options [Hash] optional :host and :db overriding the defaults
#   from Treat.databases.mongo.
# @return [Object] @items, unchanged.
def to_mongo(options)
  require 'mongo'
  host = options[:host] || Treat.databases.mongo.host
  db = options[:db] || Treat.databases.mongo.db
  database = Mongo::Connection.new(host).db(db)
  database.collection('problems').update(
    { id: @problem.id }, @problem.to_hash, { upsert: true }
  )

  # The question's name labels the last feature column. Build a new
  # array rather than appending to the one the problem returned.
  feature_labels = @problem.feature_labels + [@problem.question.name]
  tag_labels = @problem.tag_labels
  data = database.collection('data')
  pid = @problem.id

  @items.each do |item|
    record = item.merge(
      features: Hash[feature_labels.zip(item[:features])],
      tags: Hash[tag_labels.zip(item[:tags])],
      problem: pid
    )
    data.insert(record)
  end
end