Class: Rover::DataFrame

Inherits:
Object
  • Object
show all
Defined in:
lib/rover/data_frame.rb

Instance Method Summary collapse

Constructor Details

#initialize(*args) ⇒ DataFrame

Returns a new instance of DataFrame.



3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
# File 'lib/rover/data_frame.rb', line 3

# Builds a data frame from one of the supported sources.
#
# The arguments are unpacked by process_args into the source data and an
# options hash. options[:types], when present, is looked up per column name
# and forwarded as the type: option when the column vector is built.
#
# Supported sources:
# * DataFrame - its column vectors are reused
# * Hash      - one column per key; every value ends up going through to_vector
# * Array     - every element must be a Hash (one per row)
# * ActiveRecord relation or model class (only when ActiveRecord is defined)
#
# Raises ArgumentError for any other source, for an array with non-hash
# elements, or when the resulting columns have different sizes.
def initialize(*args)
  data, options = process_args(args)
  types = options[:types] || {}
  @vectors = {}

  if data.is_a?(DataFrame)
    data.vectors.each { |name, vector| @vectors[name] = vector }
  elsif data.is_a?(Hash)
    data.to_h.each do |name, value|
      # values responding to to_a become vectors right away; the rest are kept for the next step
      @vectors[name] = value.respond_to?(:to_a) ? Vector.new(value, type: types[name]) : value
    end

    # handle scalars: size them after the first real vector (1 when there is none)
    target_size = @vectors.values.find { |value| value.is_a?(Vector) }&.size || 1
    @vectors.each_key do |name|
      @vectors[name] = to_vector(@vectors[name], size: target_size, type: types[name])
    end
  elsif data.is_a?(Array)
    raise ArgumentError, "Array elements must be hashes" unless data.all? { |row| row.is_a?(Hash) }

    # one column per distinct key, in order of first appearance;
    # rows without the key contribute whatever Hash#[] returns for it
    data.flat_map(&:keys).uniq.each do |name|
      @vectors[name] = to_vector(data.map { |row| row[name] }, type: types[name])
    end
  elsif defined?(ActiveRecord) && (data.is_a?(ActiveRecord::Relation) || (data.is_a?(Class) && data < ActiveRecord::Base))
    result = data.connection.select_all(data.all.to_sql)
    result.columns.each_with_index do |name, index|
      @vectors[name] = to_vector(result.rows.map { |row| row[index] }, type: types[name])
    end
  else
    raise ArgumentError, "Cannot cast to data frame: #{data.class.name}"
  end

  # check keys
  @vectors.each_key { |name| check_key(name) }

  # check sizes
  sizes = @vectors.values.map(&:size).uniq
  raise ArgumentError, "Different sizes: #{sizes}" if sizes.size > 1
end

Instance Method Details

#+(other) ⇒ Object



318
319
320
# File 'lib/rover/data_frame.rb', line 318

# Returns the result of calling concat(other) on a copy of this data frame
# made with dup, so the concatenation is applied to the copy rather than to
# the receiver.
def +(other)
  dup.concat(other)
end

#==(other) ⇒ Object

don’t check types



359
360
361
362
363
# File 'lib/rover/data_frame.rb', line 359

# Compares two data frames: they are equal when they have the same size,
# the same keys (compared with Array#==, so order matters), and every pair of
# same-named columns compares truthy with ==. Column types are not checked.
#
# Returns false when other is not a DataFrame. Previously such a comparison
# (for example df == nil) raised NoMethodError from other.size / other.keys.
def ==(other)
  return false unless other.is_a?(DataFrame)

  size == other.size &&
    keys == other.keys &&
    keys.all? { |k| self[k] == other[k] }
end

#[](where) ⇒ Object



64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
# File 'lib/rover/data_frame.rb', line 64

# Selects rows or columns, depending on the kind of selector:
#
# * a Vector whose to_numo is a Numo::Bit, a Numeric, a Range, or an Array
#   made up only of Integers is applied to every column (v[where]) and a new
#   DataFrame is built from the results
# * any other Array is treated as a list of column names and yields a new
#   DataFrame holding those columns
# * anything else is looked up as a single column name and returns the stored
#   column (nil when there is none)
def [](where)
  bit_mask = where.is_a?(Vector) && where.to_numo.is_a?(Numo::Bit)
  row_selector = bit_mask || where.is_a?(Numeric) || where.is_a?(Range) ||
    (where.is_a?(Array) && where.all? { |item| item.is_a?(Integer) })

  if row_selector
    return DataFrame.new(@vectors.map { |name, vector| [name, vector[where]] }.to_h)
  end

  # single column
  return @vectors[where] unless where.is_a?(Array)

  # multiple columns
  where.each_with_object(DataFrame.new) do |name, subset|
    subset[name] = @vectors[name]
  end
end

#[]=(k, v) ⇒ Object

Raises:

  • (ArgumentError)


101
102
103
104
105
106
# File 'lib/rover/data_frame.rb', line 101

# Stores v under column name k.
#
# The name is validated with check_key and the value is passed through
# to_vector (with the current size) before being stored.
#
# Raises ArgumentError when the data frame already has columns and the new
# column's size differs from the current size.
def []=(k, v)
  check_key(k)
  column = to_vector(v, size: size)
  if @vectors.any? && column.size != size
    raise ArgumentError, "Size mismatch: expected #{size}, got #{column.size}"
  end
  @vectors[k] = column
end

#any? ⇒ Boolean

should this check for columns as well?

Returns:

  • (Boolean)


115
116
117
# File 'lib/rover/data_frame.rb', line 115

# Returns true when size is greater than zero.
# Open question carried over from the original: should this check for
# columns as well?
def any?
  size.positive?
end

#clear ⇒ Object



124
125
126
# File 'lib/rover/data_frame.rb', line 124

# Removes every column by clearing the internal column hash.
# Returns the result of Hash#clear (the emptied hash), not the data frame.
def clear
  @vectors.clear
end

#concat(other) ⇒ Object

In-place, like Array#concat. TODO: make more performant.

Raises:

  • (ArgumentError)


324
325
326
327
328
329
330
331
332
333
334
335
# File 'lib/rover/data_frame.rb', line 324

# Appends the rows of other to this data frame in place (like Array#concat)
# and returns self.
#
# Columns present on only one side are padded with nil for the rows coming
# from the other side. Every column is rebuilt as a new Vector from plain
# arrays.
# TODO make more performant
#
# Raises ArgumentError unless other is a DataFrame.
def concat(other)
  raise ArgumentError, "Must be a data frame" unless other.is_a?(DataFrame)

  # capture the row count before any column is replaced
  original_rows = size

  vectors.each do |name, vector|
    incoming = other[name]
    appended = incoming ? incoming.to_a : Array.new(other.size)
    @vectors[name] = Vector.new(vector.to_a + appended)
  end

  # columns that exist only in other
  added_names = other.vector_names - vector_names
  added_names.each do |name|
    @vectors[name] = Vector.new(Array.new(original_rows) + other[name].to_a)
  end

  self
end

#delete(key) ⇒ Object



138
139
140
# File 'lib/rover/data_frame.rb', line 138

# Removes the column stored under key from the internal column hash and
# returns it (nil when there is no such column, as with Hash#delete).
def delete(key)
  @vectors.delete(key)
end

#dup ⇒ Object



310
311
312
313
314
315
316
# File 'lib/rover/data_frame.rb', line 310

# Returns a new DataFrame with every column of this one assigned to it
# through []=, in the same order.
def dup
  @vectors.each_with_object(DataFrame.new) do |(name, vector), copy|
    copy[name] = vector
  end
end

#each_row ⇒ Object



84
85
86
87
88
89
90
# File 'lib/rover/data_frame.rb', line 84

# Yields each row as a Hash of column name => value at that row index.
# Returns an Enumerator when no block is given.
def each_row
  return enum_for(:each_row) unless block_given?

  size.times do |index|
    row = {}
    @vectors.each { |name, vector| row[name] = vector[index] }
    yield row
  end
end

#empty? ⇒ Boolean

should this check for columns as well?

Returns:

  • (Boolean)


120
121
122
# File 'lib/rover/data_frame.rb', line 120

# Returns true when size is zero.
# Open question carried over from the original: should this check for
# columns as well?
def empty?
  size.zero?
end

#except(*keys) ⇒ Object



142
143
144
# File 'lib/rover/data_frame.rb', line 142

# Non-destructive counterpart of except!: makes a copy with dup, applies
# except!(*keys) to the copy, and returns the result.
def except(*keys)
  dup.except!(*keys)
end

#except!(*keys) ⇒ Object



146
147
148
149
150
151
# File 'lib/rover/data_frame.rb', line 146

# Deletes each of the given columns from this data frame in place (via
# delete) and returns self.
def except!(*keys)
  keys.each { |name| delete(name) }
  self
end

#first(n = nil) ⇒ Object



165
166
167
168
169
170
171
# File 'lib/rover/data_frame.rb', line 165

# Builds a new DataFrame whose columns are the result of calling first(n)
# on each column of this one.
def first(n = nil)
  leading = @vectors.map { |name, vector| [name, vector.first(n)] }.to_h
  DataFrame.new(leading)
end

#group(*columns) ⇒ Object



299
300
301
# File 'lib/rover/data_frame.rb', line 299

# Creates a Group from this data frame and the given columns. The column
# arguments are flattened first, so they may be passed individually or as
# arrays.
def group(*columns)
  Group.new(self, columns.flatten)
end

#head(n = 5) ⇒ Object



157
158
159
# File 'lib/rover/data_frame.rb', line 157

# Delegates to first(n); n defaults to 5.
def head(n = 5)
  first(n)
end

#include?(key) ⇒ Boolean

Returns:

  • (Boolean)


153
154
155
# File 'lib/rover/data_frame.rb', line 153

# Returns true when a column is stored under key in the internal column
# hash, false otherwise.
def include?(key)
  @vectors.include?(key)
end

#inner_join(other, on: nil) ⇒ Object

see join for options



349
350
351
# File 'lib/rover/data_frame.rb', line 349

# Shorthand for join(other, on: on, how: "inner").
# see join for options
def inner_join(other, on: nil)
  join(other, on: on, how: "inner")
end

#inspect ⇒ Object Also known as: to_s

TODO handle long text better



249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
# File 'lib/rover/data_frame.rb', line 249

def inspect
  return "#<Rover::DataFrame>" if keys.empty?

  lines = []
  line_start = 0
  spaces = 2

  summarize = size >= 30

  @vectors.each do |k, v|
    v = summarize ? v.first(5).to_a + ["..."] + v.last(5).to_a : v.to_a
    width = ([k] + v).map(&:to_s).map(&:size).max
    width = 3 if width < 3

    if lines.empty? || lines[-2].map { |l| l.size + spaces }.sum + width > 120
      line_start = lines.size
      lines << []
      v.size.times do |i|
        lines << []
      end
      lines << []
    end

    lines[line_start] << "%#{width}s" % k.to_s
    v.each_with_index do |v2, i|
      lines[line_start + 1 + i] << "%#{width}s" % v2.to_s
    end
  end

  lines.pop
  lines.map { |l| l.join(" " * spaces) }.join("\n")
end

#keys ⇒ Object Also known as: names, vector_names



132
133
134
# File 'lib/rover/data_frame.rb', line 132

# Returns the column names as an array, in the order of the internal
# column hash.
def keys
  @vectors.keys
end

#last(n = nil) ⇒ Object



173
174
175
176
177
178
179
# File 'lib/rover/data_frame.rb', line 173

# Builds a new DataFrame whose columns are the result of calling last(n)
# on each column of this one.
def last(n = nil)
  trailing = @vectors.map { |name, vector| [name, vector.last(n)] }.to_h
  DataFrame.new(trailing)
end

#left_join(other, on: nil) ⇒ Object

see join for options



354
355
356
# File 'lib/rover/data_frame.rb', line 354

# Shorthand for join(other, on: on, how: "left").
# see join for options
def left_join(other, on: nil)
  join(other, on: on, how: "left")
end

#merge(other) ⇒ Object



337
338
339
# File 'lib/rover/data_frame.rb', line 337

# Non-destructive counterpart of merge!: makes a copy with dup, applies
# merge!(other) to the copy, and returns the result.
def merge(other)
  dup.merge!(other)
end

#merge!(other) ⇒ Object



341
342
343
344
345
346
# File 'lib/rover/data_frame.rb', line 341

# Assigns every column of other to this data frame through []=, so columns
# with the same name are replaced and new ones are added. Returns self.
def merge!(other)
  other.vectors.each { |name, vector| self[name] = vector }
  self
end

#one_hot(drop: false) ⇒ Object

TODO raise error when collision



208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
# File 'lib/rover/data_frame.rb', line 208

# Returns a new DataFrame in which every column whose to_numo is a
# Numo::RObject is replaced by the columns produced by that vector's
# one_hot (prefixed with "<column name>_"); all other columns are copied
# over unchanged.
#
# An ArgumentError with the message "All elements must be strings" raised
# along the way is re-raised with a message that also mentions numeric
# columns; any other ArgumentError is re-raised as is.
#
# TODO raise error when collision
def one_hot(drop: false)
  vectors.each_with_object(DataFrame.new) do |(name, vector), encoded|
    if vector.to_numo.is_a?(Numo::RObject)
      encoded.merge!(vector.one_hot(drop: drop, prefix: "#{name}_"))
    else
      encoded[name] = vector
    end
  end
rescue ArgumentError => e
  raise unless e.message == "All elements must be strings"

  # better error message
  raise ArgumentError, "All elements must be numeric or strings"
end

#plot(x = nil, y = nil, type: nil) ⇒ Object

Raises:

  • (ArgumentError)


365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
# File 'lib/rover/data_frame.rb', line 365

# Builds a Vega-Lite chart from two columns.
#
# x and y default to the first and second column when the data frame has
# exactly two columns. When type is not given it is chosen from the columns:
# "scatter" when both are numeric, "column" when x has type :object and y is
# numeric.
#
# Raises ArgumentError when columns must be specified but are not, or when
# type is neither "scatter" nor "column". Raises RuntimeError when the type
# cannot be determined from the columns.
def plot(x = nil, y = nil, type: nil)
  require "vega"

  raise ArgumentError, "Must specify columns" unless keys.size == 2 || (x && y)

  x ||= keys[0]
  y ||= keys[1]

  unless type
    type =
      if self[x].numeric? && self[y].numeric?
        "scatter"
      elsif types[x] == :object && self[y].numeric?
        "column"
      else
        raise "Cannot determine type"
      end
  end

  data = self[[x, y]]

  # pick the mark and the encoding channels; the rest of the chart is shared
  mark, channels =
    case type
    when "scatter"
      [
        "circle",
        {
          x: {field: x, type: "quantitative", scale: {zero: false}},
          y: {field: y, type: "quantitative", scale: {zero: false}},
          size: {value: 60}
        }
      ]
    when "column"
      [
        "bar",
        {
          # TODO determine label angle
          x: {field: x, type: "nominal", sort: "none", axis: {labelAngle: 0}},
          y: {field: y, type: "quantitative"}
        }
      ]
    else
      raise ArgumentError, "Invalid type: #{type}"
    end

  Vega.lite
    .data(data)
    .mark(type: mark, tooltip: true)
    .encoding(**channels)
    .config(axis: {title: nil, labelFontSize: 12})
end

#sample(*args, **kwargs) ⇒ Object



181
182
183
184
185
# File 'lib/rover/data_frame.rb', line 181

# Picks row indexes with Array#sample (all arguments are forwarded to it)
# and returns the result of indexing this data frame with them.
def sample(*args, **kwargs)
  # TODO make more efficient
  self[Array(0...size).sample(*args, **kwargs)]
end

#shape ⇒ Object



128
129
130
# File 'lib/rover/data_frame.rb', line 128

# Returns a two-element array: [size, number of columns].
def shape
  row_count = size
  column_count = @vectors.size
  [row_count, column_count]
end

#size ⇒ Object Also known as: length, count



108
109
110
# File 'lib/rover/data_frame.rb', line 108

# Returns the size of the first column, or 0 when there are no columns.
def size
  first_column = @vectors.each_value.first
  first_column&.size || 0
end

#sort_by(&block) ⇒ Object



295
296
297
# File 'lib/rover/data_frame.rb', line 295

# Non-destructive counterpart of sort_by!: makes a copy with dup and
# forwards the block to sort_by! on the copy, returning the result.
def sort_by(&block)
  dup.sort_by!(&block)
end

#sort_by! ⇒ Object



283
284
285
286
287
288
289
290
291
292
293
# File 'lib/rover/data_frame.rb', line 283

# Reorders the rows in place and returns self.
#
# The block is called with each row (a Hash of column name => value) and
# its result is used as the sort key. Every column is then reassigned, via
# []=, to its to_numo representation indexed with the sorted row positions.
def sort_by!
  order = (0...size).sort_by do |index|
    yield @vectors.map { |name, vector| [name, vector[index]] }.to_h
  end

  @vectors.each { |name, vector| self[name] = vector.to_numo.at(order) }
  self
end

#tail(n = 5) ⇒ Object



161
162
163
# File 'lib/rover/data_frame.rb', line 161

# Delegates to last(n); n defaults to 5.
def tail(n = 5)
  last(n)
end

#to_a ⇒ Object



187
188
189
190
191
192
193
# File 'lib/rover/data_frame.rb', line 187

# Returns an array with one entry per row, in the form yielded by each_row.
def to_a
  each_row.to_a
end

#to_csv ⇒ Object



226
227
228
229
230
231
232
233
234
235
# File 'lib/rover/data_frame.rb', line 226

# Serializes the data frame to a CSV string: a header line with the column
# names followed by one line per row, with values read from each column's
# to_numo representation.
def to_csv
  require "csv"

  CSV.generate do |csv|
    csv << keys
    columns = vectors.values.map(&:to_numo)
    size.times do |row_index|
      csv << columns.map { |column| column[row_index] }
    end
  end
end

#to_h ⇒ Object



195
196
197
198
199
200
201
# File 'lib/rover/data_frame.rb', line 195

# Returns a Hash mapping each column name to that column's to_a.
def to_h
  @vectors.map { |name, vector| [name, vector.to_a] }.to_h
end

#to_html ⇒ Object

for IRuby



238
239
240
241
242
243
244
245
246
# File 'lib/rover/data_frame.rb', line 238

# Renders the data frame as an HTML table for IRuby.
#
# With 7 rows or fewer the whole frame is passed to IRuby::HTML.table.
# Otherwise only the first 5 and last 4 rows are passed, together with
# maxrows: 7.
def to_html
  require "iruby"

  return IRuby::HTML.table(to_h) unless size > 7

  # pass more rows than maxrows so that maxrows is applied
  # (presumably IRuby then elides the middle rows - TODO confirm)
  preview = self[0..4] + self[-4..-1]
  IRuby::HTML.table(preview.to_h, maxrows: 7)
end

#to_numo ⇒ Object



203
204
205
# File 'lib/rover/data_frame.rb', line 203

# Converts every column with to_numo and combines the results with
# Numo::NArray.column_stack.
def to_numo
  Numo::NArray.column_stack(vectors.values.map(&:to_numo))
end

#types ⇒ Object



97
98
99
# File 'lib/rover/data_frame.rb', line 97

# Returns a Hash mapping each column name to the type reported by its
# vector.
def types
  @vectors.each_with_object({}) do |(name, vector), result|
    result[name] = vector.type
  end
end

#vectors ⇒ Object

dup to prevent direct modification of keys



93
94
95
# File 'lib/rover/data_frame.rb', line 93

# Returns a shallow copy of the internal column hash.
# dup to prevent direct modification of keys: adding or removing entries on
# the returned hash does not affect the data frame, but the column objects
# themselves are shared with it.
def vectors
  @vectors.dup
end