Class: Rover::DataFrame

Inherits:
Object
  • Object
show all
Defined in:
lib/rover/data_frame.rb

Instance Method Summary collapse

Constructor Details

#initialize(*args) ⇒ DataFrame

Returns a new instance of DataFrame.



3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
# File 'lib/rover/data_frame.rb', line 3

# Builds a data frame from whatever process_args(args) returns as data.
#
# Accepted data, checked in this order:
# - another DataFrame: its vectors are reused as-is
# - a Hash of column name => values (anything responding to to_a) or scalar
# - an Array of hashes, one hash per row
# - an ActiveRecord relation or model class (only when ActiveRecord is defined)
#
# options[:types] may map a column name to a type that is handed to
# Vector.new / to_vector for that column.
#
# Raises ArgumentError when the data cannot be cast, when an Array contains
# non-hash elements, or when the resulting columns differ in size.
# check_key is called for every column name.
def initialize(*args)
  data, options = process_args(args)

  @vectors = {}
  types = options[:types] || {}

  if data.is_a?(DataFrame)
    data.vectors.each { |name, vector| @vectors[name] = vector }
  elsif data.is_a?(Hash)
    data.to_h.each do |name, value|
      @vectors[name] = value.respond_to?(:to_a) ? Vector.new(value, type: types[name]) : value
    end

    # handle scalars: they are expanded to the size of the first real vector
    # (or to 1 when every value is a scalar)
    reference = @vectors.values.find { |candidate| candidate.is_a?(Vector) }
    size = reference&.size || 1
    @vectors.each_key do |name|
      @vectors[name] = to_vector(@vectors[name], size: size, type: types[name])
    end
  elsif data.is_a?(Array)
    columns = {}
    raise ArgumentError, "Array elements must be hashes" unless data.all? { |row| row.is_a?(Hash) }

    # union of all keys, in first-seen order; rows missing a key contribute nil
    names = data.flat_map(&:keys).uniq
    names.each { |name| columns[name] = [] }
    data.each do |row|
      names.each { |name| columns[name] << row[name] }
    end
    columns.each do |name, values|
      @vectors[name] = to_vector(values, type: types[name])
    end
  elsif defined?(ActiveRecord) && (data.is_a?(ActiveRecord::Relation) || (data.is_a?(Class) && data < ActiveRecord::Base))
    result = data.connection.select_all(data.all.to_sql)
    result.columns.each_with_index do |name, index|
      @vectors[name] = to_vector(result.rows.map { |row| row[index] }, type: types[name])
    end
  else
    raise ArgumentError, "Cannot cast to data frame: #{data.class.name}"
  end

  # check keys
  @vectors.each_key { |name| check_key(name) }

  # check sizes
  sizes = @vectors.values.map(&:size).uniq
  raise ArgumentError, "Different sizes: #{sizes}" if sizes.size > 1
end

Instance Method Details

#+(other) ⇒ Object



312
313
314
# File 'lib/rover/data_frame.rb', line 312

# Concatenates other onto a copy (dup) of this data frame and returns the
# result of that concat call; concat is invoked on the copy, not on self.
def +(other)
  copy = dup
  copy.concat(other)
end

#==(other) ⇒ Object

don’t check types



353
354
355
356
357
# File 'lib/rover/data_frame.rb', line 353

# Equality: true when other is a DataFrame with the same size, the same keys
# in the same order, and equal values (compared with ==) under every key.
# NOTE(review): the original note says types are not checked - presumably
# column types; confirm against Vector#==.
#
# Returns false, rather than raising NoMethodError, when other is not a
# DataFrame (for example `df == nil`).
def ==(other)
  return false unless other.is_a?(DataFrame)

  size == other.size &&
    keys == other.keys &&
    keys.all? { |k| self[k] == other[k] }
end

#[](where) ⇒ Object



64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
# File 'lib/rover/data_frame.rb', line 64

# Selects rows or columns depending on where:
# - a Vector whose to_numo is a Numo::Bit, a Numeric, a Range, or an Array
#   containing only Integers (an empty Array qualifies): every column is
#   indexed with where and the results are wrapped in a new DataFrame
# - any other Array: treated as a list of column names; returns a new
#   DataFrame holding those columns
# - anything else: treated as a single column name; returns the stored
#   vector, or nil when there is no such column
def [](where)
  bit_mask = where.is_a?(Vector) && where.to_numo.is_a?(Numo::Bit)
  row_selector =
    bit_mask ||
    where.is_a?(Numeric) ||
    where.is_a?(Range) ||
    (where.is_a?(Array) && where.all? { |item| item.is_a?(Integer) })

  if row_selector
    selected = @vectors.map { |name, vector| [name, vector[where]] }.to_h
    return DataFrame.new(selected)
  end

  # single column
  return @vectors[where] unless where.is_a?(Array)

  # multiple columns
  where.each_with_object(DataFrame.new) do |name, frame|
    frame[name] = @vectors[name]
  end
end

#[]=(k, v) ⇒ Object

Raises:

  • (ArgumentError)


100
101
102
103
104
105
# File 'lib/rover/data_frame.rb', line 100

# Adds or replaces the column stored under k.
#
# The key is validated with check_key and the value is converted with
# to_vector(v, size: size). When the frame already has columns, the converted
# value must have the frame's size.
#
# Raises ArgumentError on a size mismatch.
def []=(k, v)
  check_key(k)
  vector = to_vector(v, size: size)
  if @vectors.any? && vector.size != size
    raise ArgumentError, "Size mismatch: expected #{size}, got #{vector.size}"
  end
  @vectors[k] = vector
end

#any? ⇒ Boolean

should this check for columns as well?

Returns:

  • (Boolean)


114
115
116
# File 'lib/rover/data_frame.rb', line 114

# True when size is greater than zero. Only size is consulted, so a frame
# whose columns are all empty counts as having nothing.
def any?
  size.positive?
end

#clear ⇒ Object



123
124
125
# File 'lib/rover/data_frame.rb', line 123

# Removes every column. Returns the (now empty) internal vectors hash, which
# is what Hash#clear returns.
def clear
  emptied = @vectors.clear
  emptied
end

#concat(other) ⇒ Object

In-place, like Array#concat. TODO: make more performant.

Raises:

  • (ArgumentError)


318
319
320
321
322
323
324
325
326
327
328
329
# File 'lib/rover/data_frame.rb', line 318

# Appends the rows of other to this data frame in place and returns self.
#
# Columns present on only one side are padded with nil: existing columns
# missing from other get other.size trailing nils, and columns that exist
# only in other get as many leading nils as this frame had rows before the
# call. Every affected column is rebuilt with Vector.new from plain arrays.
#
# Raises ArgumentError unless other is a DataFrame.
def concat(other)
  raise ArgumentError, "Must be a data frame" unless other.is_a?(DataFrame)

  # remember the row count before any column grows
  rows_before = size

  vectors.each do |name, vector|
    incoming = other[name]
    tail = incoming ? incoming.to_a : [nil] * other.size
    @vectors[name] = Vector.new(vector.to_a + tail)
  end

  added = other.vector_names - vector_names
  added.each do |name|
    @vectors[name] = Vector.new([nil] * rows_before + other[name].to_a)
  end

  self
end

#delete(key) ⇒ Object



137
138
139
# File 'lib/rover/data_frame.rb', line 137

# Removes the column stored under key and returns what Hash#delete returns:
# the removed vector, or nil when there was no such column.
def delete(key)
  removed = @vectors.delete(key)
  removed
end

#dup ⇒ Object



304
305
306
307
308
309
310
# File 'lib/rover/data_frame.rb', line 304

# Returns a new DataFrame with the same columns, assigned one by one
# through []= in the current column order.
def dup
  @vectors.each_with_object(DataFrame.new) do |(name, vector), copy|
    copy[name] = vector
  end
end

#each_row ⇒ Object

return each row as a hash



85
86
87
88
89
# File 'lib/rover/data_frame.rb', line 85

# Yields each row as a Hash of column name => value, in row order.
#
# Without a block, returns an Enumerator over the rows (whose size is the
# frame's size) instead of raising LocalJumpError. With a block, returns
# the result of size.times.
def each_row
  return enum_for(:each_row) { size } unless block_given?

  size.times do |i|
    yield @vectors.map { |k, v| [k, v[i]] }.to_h
  end
end

#empty? ⇒ Boolean

should this check for columns as well?

Returns:

  • (Boolean)


119
120
121
# File 'lib/rover/data_frame.rb', line 119

# True when size is zero. Only size is consulted; the number of columns is
# not checked separately.
def empty?
  size.zero?
end

#except(*keys) ⇒ Object



141
142
143
# File 'lib/rover/data_frame.rb', line 141

# Non-destructive counterpart of except!: removes the given keys from a copy
# (dup) and returns the result of except! on that copy.
def except(*keys)
  copy = dup
  copy.except!(*keys)
end

#except!(*keys) ⇒ Object



145
146
147
148
149
150
# File 'lib/rover/data_frame.rb', line 145

# Deletes each of the given keys from this data frame, in the order given,
# and returns self.
def except!(*keys)
  keys.each { |key| delete(key) }
  self
end

#first(n = nil) ⇒ Object



164
165
166
167
168
169
170
# File 'lib/rover/data_frame.rb', line 164

# Returns a new DataFrame whose columns are vector.first(n) of each column.
# n defaults to nil and is passed through to the vectors unchanged.
def first(n = nil)
  leading = @vectors.map { |name, vector| [name, vector.first(n)] }.to_h
  DataFrame.new(leading)
end

#group(columns) ⇒ Object



293
294
295
# File 'lib/rover/data_frame.rb', line 293

# Wraps this data frame and the given columns in a new Group.
def group(columns)
  grouping = Group.new(self, columns)
  grouping
end

#head(n = 5) ⇒ Object



156
157
158
# File 'lib/rover/data_frame.rb', line 156

# Same as first(n), with n defaulting to 5.
def head(n = 5)
  leading = first(n)
  leading
end

#include?(key) ⇒ Boolean

Returns:

  • (Boolean)


152
153
154
# File 'lib/rover/data_frame.rb', line 152

# True when a column is stored under key.
def include?(key)
  @vectors.key?(key)
end

#inner_join(other, on: nil) ⇒ Object

see join for options



343
344
345
# File 'lib/rover/data_frame.rb', line 343

# Shorthand for join(other, on: on, how: "inner"); see join for options.
def inner_join(other, on: nil)
  joined = join(other, on: on, how: "inner")
  joined
end

#inspect ⇒ Object (also known as: to_s)

TODO handle long text better



243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
# File 'lib/rover/data_frame.rb', line 243

# Renders a plain-text preview of the data frame.
#
# Returns "#<Rover::DataFrame>" when there are no keys. Otherwise each column
# shows its name followed by its first five values, right-aligned to a width
# of at least 3, with a "..." line when size exceeds 5. Columns are laid out
# side by side, two spaces apart; when adding a column would push the
# current group past 120 characters, a new group of lines is started below,
# separated from the previous one by a blank line.
#
# TODO handle long text better
def inspect
  return "#<Rover::DataFrame>" if keys.empty?

  gap = 2
  out = []
  header_at = 0

  @vectors.each do |name, vector|
    cells = vector.first(5).to_a
    width = [3, ([name] + cells).map { |cell| cell.to_s.size }.max].max

    # out[-2] is the last non-blank line of the current group (the final
    # entry is always the blank separator), so it measures the group's width
    if out.empty? || out[-2].sum { |cell| cell.size + gap } + width > 120
      header_at = out.size
      out << []                          # header line
      [size, 5].min.times { out << [] }  # one line per previewed row
      out << [] if size > 5              # "..." line
      out << []                          # blank separator
    end

    out[header_at] << "%#{width}s" % name.to_s
    cells.each_with_index do |cell, offset|
      out[header_at + 1 + offset] << "%#{width}s" % cell.to_s
    end
    out[header_at + 6] << "%#{width}s" % "..." if size > 5
  end

  # drop the trailing blank separator
  out.pop
  out.map { |cells| cells.join(" " * gap) }.join("\n")
end

#keys ⇒ Object (also known as: names, vector_names)



131
132
133
# File 'lib/rover/data_frame.rb', line 131

# Returns the column keys as a new Array, in the order the columns are stored.
def keys
  column_keys = @vectors.keys
  column_keys
end

#last(n = nil) ⇒ Object



172
173
174
175
176
177
178
# File 'lib/rover/data_frame.rb', line 172

# Returns a new DataFrame whose columns are vector.last(n) of each column.
# n defaults to nil and is passed through to the vectors unchanged.
def last(n = nil)
  trailing = @vectors.map { |name, vector| [name, vector.last(n)] }.to_h
  DataFrame.new(trailing)
end

#left_join(other, on: nil) ⇒ Object

see join for options



348
349
350
# File 'lib/rover/data_frame.rb', line 348

# Shorthand for join(other, on: on, how: "left"); see join for options.
def left_join(other, on: nil)
  joined = join(other, on: on, how: "left")
  joined
end

#merge(other) ⇒ Object



331
332
333
# File 'lib/rover/data_frame.rb', line 331

# Non-destructive counterpart of merge!: merges other into a copy (dup) and
# returns the result of merge! on that copy.
def merge(other)
  copy = dup
  copy.merge!(other)
end

#merge!(other) ⇒ Object



335
336
337
338
339
340
# File 'lib/rover/data_frame.rb', line 335

# Copies every column of other into this data frame through []=, replacing
# columns that share a key, and returns self.
def merge!(other)
  other.vectors.each { |name, vector| self[name] = vector }
  self
end

#one_hot(drop: false) ⇒ Object

TODO raise error when collision



207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
# File 'lib/rover/data_frame.rb', line 207

# Returns a new DataFrame in which every column whose to_numo is a
# Numo::RObject is replaced by the result of vector.one_hot(drop: drop,
# prefix: "<key>_"); all other columns are copied over unchanged.
#
# An ArgumentError with the message "All elements must be strings" is
# re-raised with a message that also mentions numeric columns; any other
# ArgumentError is re-raised as-is.
#
# TODO raise error when collision
def one_hot(drop: false)
  vectors.each_with_object(DataFrame.new) do |(name, vector), frame|
    if vector.to_numo.is_a?(Numo::RObject)
      frame.merge!(vector.one_hot(drop: drop, prefix: "#{name}_"))
    else
      frame[name] = vector
    end
  end
rescue ArgumentError => e
  raise e unless e.message == "All elements must be strings"

  # better error message
  raise ArgumentError, "All elements must be numeric or strings"
end

#sample(*args, **kwargs) ⇒ Object



180
181
182
183
184
# File 'lib/rover/data_frame.rb', line 180

# Picks row indexes from 0...size with Array#sample, forwarding all
# positional and keyword arguments unchanged, and returns self[indexes].
def sample(*args, **kwargs)
  # TODO make more efficient
  row_indexes = size.times.to_a
  picked = row_indexes.sample(*args, **kwargs)
  self[picked]
end

#shapeObject



127
128
129
# File 'lib/rover/data_frame.rb', line 127

# Returns a two-element Array: [size, number of columns].
def shape
  row_count = size
  column_count = @vectors.size
  [row_count, column_count]
end

#size ⇒ Object (also known as: length, count)



107
108
109
# File 'lib/rover/data_frame.rb', line 107

# Returns the size of the first stored vector, or 0 when there are no
# vectors (or when that vector reports no size).
def size
  leading = @vectors.values.first
  return 0 if leading.nil?

  leading.size || 0
end

#sort_by(&block) ⇒ Object



289
290
291
# File 'lib/rover/data_frame.rb', line 289

# Non-destructive counterpart of sort_by!: forwards the block to sort_by! on
# a copy (dup) and returns that call's result.
def sort_by(&block)
  copy = dup
  copy.sort_by!(&block)
end

#sort_by! ⇒ Object



277
278
279
280
281
282
283
284
285
286
287
# File 'lib/rover/data_frame.rb', line 277

# Reorders the rows in place and returns self.
#
# The block is yielded each row as a Hash of column name => value and its
# return value is used as the sort key. Every column is then replaced,
# through []=, by vector.to_numo.at(sorted row indexes).
def sort_by!
  order = size.times.sort_by do |i|
    row = @vectors.map { |name, vector| [name, vector[i]] }.to_h
    yield row
  end

  @vectors.each { |name, vector| self[name] = vector.to_numo.at(order) }
  self
end

#tail(n = 5) ⇒ Object



160
161
162
# File 'lib/rover/data_frame.rb', line 160

# Same as last(n), with n defaulting to 5.
def tail(n = 5)
  trailing = last(n)
  trailing
end

#to_a ⇒ Object



186
187
188
189
190
191
192
# File 'lib/rover/data_frame.rb', line 186

# Collects everything each_row yields into a new Array, in iteration order.
def to_a
  rows = []
  each_row { |row| rows.push(row) }
  rows
end

#to_csv ⇒ Object



225
226
227
228
229
230
231
232
233
234
# File 'lib/rover/data_frame.rb', line 225

# Serializes the data frame to a CSV string: a header line with the keys,
# followed by one line per row index (0...size) whose cells are read from
# each column's to_numo. The csv library is required lazily.
def to_csv
  require "csv"
  columns = vectors.values.map(&:to_numo)
  CSV.generate do |out|
    out << keys
    size.times do |row|
      out << columns.map { |column| column[row] }
    end
  end
end

#to_h ⇒ Object



194
195
196
197
198
199
200
# File 'lib/rover/data_frame.rb', line 194

# Returns a new Hash mapping each column key to vector.to_a.
def to_h
  @vectors.map { |name, vector| [name, vector.to_a] }.to_h
end

#to_html ⇒ Object

for IRuby



237
238
239
240
# File 'lib/rover/data_frame.rb', line 237

# For IRuby: requires iruby lazily and passes to_h to IRuby::HTML.table,
# returning whatever that call returns.
def to_html
  require "iruby"
  table_data = to_h
  IRuby::HTML.table(table_data)
end

#to_numo ⇒ Object



202
203
204
# File 'lib/rover/data_frame.rb', line 202

# Converts every column with to_numo and combines them, in column order,
# via Numo::NArray.column_stack.
def to_numo
  columns = vectors.values.map(&:to_numo)
  Numo::NArray.column_stack(columns)
end

#types ⇒ Object



96
97
98
# File 'lib/rover/data_frame.rb', line 96

# Returns a new Hash mapping each column key to vector.type.
def types
  @vectors.each_with_object({}) do |(name, vector), result|
    result[name] = vector.type
  end
end

#vectors ⇒ Object

dup to prevent direct modification of keys



92
93
94
# File 'lib/rover/data_frame.rb', line 92

# Returns a shallow copy of the internal vectors hash, so callers cannot add
# or remove keys on the data frame directly. The vector objects themselves
# are shared with the copy, not duplicated.
def vectors
  snapshot = @vectors.dup
  snapshot
end