Class: Daru::DataFrame

Inherits:
Object
  • Object
show all
Includes:
Maths::Arithmetic::DataFrame, Maths::Statistics::DataFrame, Plotting::DataFrame
Defined in:
lib/daru/dataframe.rb

Instance Attribute Summary collapse

Class Method Summary collapse

Instance Method Summary collapse

Methods included from Plotting::DataFrame

#plot

Methods included from Maths::Arithmetic::DataFrame

#%, #*, #+, #-, #/

Constructor Details

#initialize(source, opts = {}) ⇒ DataFrame

A DataFrame basically consists of an Array of Vector objects. Rows are addressed through the index and columns through the vectors index, both of which are Index objects. Arguments - source, opts (recognised keys: :order, :index, :name, :dtype).

Usage

df = Daru::DataFrame.new({a: [1,2,3,4], b: [6,7,8,9]}, order: [:b, :a], 
  index: [:a, :b, :c, :d], name: :spider_man)

# => 
# <Daru::DataFrame:80766980 @name = spider_man @size = 4>
#             b          a 
#  a          6          1 
#  b          7          2 
#  c          8          3 
#  d          9          4


82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
# File 'lib/daru/dataframe.rb', line 82

# Builds a DataFrame from +source+.
#
# source - an Array of row Hashes, or a Hash mapping vector names to Arrays
#          or to Daru::Vector objects. An empty source produces a frame
#          shaped only by opts[:order] and opts[:index].
# opts   - :order (vector names), :index (row index), :dtype (defaults to
#          Array) and :name (defaults to a random UUID, stored as a Symbol).
def initialize source, opts={}
  vectors = opts[:order]
  index   = opts[:index]
  @dtype  = opts[:dtype] || Array
  @name   = (opts[:name] || SecureRandom.uuid).to_sym
  @data   = []

  if source.empty?
    @vectors = Daru::Index.new vectors
    @index   = Daru::Index.new index
    create_empty_vectors
  else
    case source
    when Array
      # Vector names come from :order first, followed by any keys of the
      # first row that :order did not mention.
      first_keys = source[0].keys
      names      = vectors.nil? ? first_keys : (vectors + (first_keys - vectors)).uniq
      @vectors   = Daru::Index.new(names.map(&:to_sym))
      @index     = Daru::Index.new(index.nil? ? source.size : index)

      @vectors.each do |name|
        column = source.map do |hsh|
          # Rows may be keyed by Symbol or by String. Only fall back to the
          # String key when the Symbol lookup is nil, so that an explicit
          # +false+ value is kept instead of being replaced by nil.
          value = hsh[name]
          value.nil? ? hsh[name.to_s] : value
        end

        @data << column.dv(name, @index, @dtype)
      end
    when Hash
      create_vectors_index_with vectors, source
      if all_daru_vectors_in_source? source
        @index =
          if !index.nil?
            index.to_index
          elsif all_vectors_have_equal_indexes? source
            source.values[0].index.dup
          else
            # Union of every vector's index, sorted. Non-bang methods are
            # used deliberately: uniq! returns nil when it removes nothing,
            # which made the former flatten!.uniq!.sort! chain raise.
            Daru::Index.new(source.values.map { |vector| vector.index.to_a }.flatten.uniq.sort)
          end

        @vectors.each do |vector|
          @data << Daru::Vector.new([], name: vector, index: @index, dtype: @dtype)

          @index.each do |idx|
            begin
              @data[@vectors[vector]][idx] = source[vector][idx]
            rescue IndexError
              # If the index is not present in the vector under consideration
              # (in source) then an error is raised. Put a nil in that place if
              # that is the case.
              @data[@vectors[vector]][idx] = nil
            end
          end
        end
      else
        # Plain Arrays: default the index to the length of the first column.
        index = source.values[0].size if index.nil?
        @index = index.is_a?(Daru::Index) ? index.to_index : Daru::Index.new(index)

        @vectors.each do |name|
          @data << source[name].dup.dv(name, @index, @dtype)
        end
      end
    end
  end

  set_size
  validate
end

Dynamic Method Handling

This class handles dynamic methods through the method_missing method

#method_missing(name, *args, &block) ⇒ Object



494
495
496
497
498
499
500
501
502
# File 'lib/daru/dataframe.rb', line 494

# Dynamic vector access.
#
# A setter-style call (+df.foo = value+) inserts or replaces the vector named
# "foo"; a call whose name has_vector? recognises returns that vector; every
# other call is passed on to the default method_missing.
def method_missing(name, *args, &block)
  method_name = name.to_s

  # A setter ends in '=' and has a non-empty stem. Comparison operators such
  # as <= and >= also end in '=' but must never create a vector.
  setter = method_name.size > 1 && method_name.end_with?('=') &&
           !%w[<= >= == != ===].include?(method_name)

  if setter
    insert_or_modify_vector method_name.chomp('='), args[0]
  elsif self.has_vector? name
    self[name, :vector]
  else
    super(name, *args, &block)
  end
end

Instance Attribute Details

#indexObject (readonly)

The index of the rows of the DataFrame



59
60
61
# File 'lib/daru/dataframe.rb', line 59

# Reader for the row index of the DataFrame.
def index; @index; end

#nameObject (readonly)

The name of the DataFrame



62
63
64
# File 'lib/daru/dataframe.rb', line 62

# Reader for the name of the DataFrame.
def name; @name; end

#sizeObject (readonly)

The number of rows present in the DataFrame



65
66
67
# File 'lib/daru/dataframe.rb', line 65

# Reader for the number of rows present in the DataFrame.
def size; @size; end

#vectorsObject (readonly)

The vectors (columns) index of the DataFrame



56
57
58
# File 'lib/daru/dataframe.rb', line 56

# Reader for the vectors (columns) index of the DataFrame.
def vectors; @vectors; end

Class Method Details

.from_csv(path, opts = {}, &block) ⇒ Object

Load data from a CSV file. Arguments - path, options, block(optional)

Accepts a block for pre-conditioning of CSV data if any.



20
21
22
# File 'lib/daru/dataframe.rb', line 20

# Loads a DataFrame from the CSV file at +path+ by delegating to
# Daru::IO.from_csv. +opts+ and the optional block are passed through
# unchanged.
def from_csv path, opts={}, &block
  Daru::IO.from_csv(path, opts, &block)
end

.rows(source, opts = {}) ⇒ Object

Create DataFrame by specifying rows as an Array of Arrays or Array of Daru::Vector objects.



26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
# File 'lib/daru/dataframe.rb', line 26

# Creates a DataFrame by specifying rows as an Array of Arrays or an Array of
# Daru::Vector objects.
#
# source - the rows; all of them must have the same size.
# opts   - options passed on to Daru::DataFrame.new. When :order is absent
#          it is derived from the first row: the index of a Daru::Vector, or
#          "0", "1", ... for an Array.
#
# Raises SizeError when the rows differ in length.
def rows source, opts={}
  first = source[0]
  unless source.all? { |v| v.size == first.size }
    raise SizeError, "All vectors must have same length"
  end

  index = []
  order = opts[:order]
  unless order
    if first.is_a?(Daru::Vector) # assume that all are Vectors only
      # Vector names become the row labels of the new frame.
      index = source.map { |vec| vec.name }
      order = first.index.to_a
    elsif first.is_a?(Array)
      order = Array.new(first.size) { |i| i.to_s }
    end
  end

  # Merge into a copy so the caller's opts Hash is not modified.
  df = Daru::DataFrame.new({}, opts.merge(order: order))
  source.each_with_index do |row, idx|
    df[(index[idx] || idx), :row] = row
  end

  df
end

Instance Method Details

#==(other) ⇒ Object



489
490
491
492
# File 'lib/daru/dataframe.rb', line 489

# Two frames compare equal when their row indexes match, their sizes match,
# and every vector of this frame equals the same-named vector of +other+.
def == other
  same_shape = @index == other.index && @size == other.size
  same_shape && @vectors.all? { |name| self[name, :vector] == other[name, :vector] }
end

#[](*names, axis) ⇒ Object

Access a row or a vector. Specify the name of the row/vector followed by the axis (:row or :vector). Use of this method is not recommended for accessing rows or vectors. Use df.row[:a] for accessing the row with index ‘:a’ or df.vector[:vec] for accessing the vector with index ‘:vec’.



170
171
172
173
174
175
176
177
178
# File 'lib/daru/dataframe.rb', line 170

# Fetches rows or vectors. The last argument is the axis (:row or :vector);
# everything before it is handed to access_row / access_vector.
#
# Raises IndexError for any other axis.
def [](*names, axis)
  return access_vector(*names) if axis == :vector
  return access_row(*names) if axis == :row

  raise IndexError, "Expected axis to be row or vector not #{axis}"
end

#[]=(name, axis, vector) ⇒ Object

Insert a new row/vector of the specified name or modify an existing one. Instead of using this method directly, use df.row[:a] = [1,2,3] to set/create a row ‘:a’ to [1,2,3], or df.vector[:vec] = [1,2,3] for vectors.

In case a Daru::Vector is specified after the equals sign, the indexes of the vector will be matched against the row/vector indexes of the DataFrame before an insertion is performed. Unmatched indexes will be set to nil.



187
188
189
190
191
192
193
194
195
# File 'lib/daru/dataframe.rb', line 187

# Inserts or replaces the row or vector called +name+. +axis+ selects the
# target (:row or :vector) and +vector+ carries the new contents.
#
# Raises IndexError for any other axis.
def []=(name, axis ,vector)
  return insert_or_modify_vector(name, vector) if axis == :vector
  return insert_or_modify_row(name, vector) if axis == :row

  raise IndexError, "Expected axis to be row or vector, not #{axis}."
end

#delete_row(index) ⇒ Object



307
308
309
310
311
312
313
314
315
316
317
318
319
320
# File 'lib/daru/dataframe.rb', line 307

# Removes the row identified by +index+ from the row index and from every
# vector, then refreshes the stored size.
#
# Raises IndexError when the resolved index is not part of the row index.
def delete_row index
  idx = named_index_for index
  raise IndexError, "Index #{index} does not exist." unless @index.include? idx

  @index = (@index.to_a - [idx]).to_index
  each_vector { |vector| vector.delete_at idx }

  set_size
end

#delete_vector(vector) ⇒ Object

Delete a vector



298
299
300
301
302
303
304
305
# File 'lib/daru/dataframe.rb', line 298

# Deletes the vector named +vector+: drops its data and rebuilds the vectors
# index without that name.
#
# Raises IndexError when no such vector exists.
def delete_vector vector
  raise IndexError, "Vector #{vector} does not exist." unless @vectors.include? vector

  @data.delete_at @vectors[vector]
  @vectors = Daru::Index.new(@vectors.to_a - [vector])
end

#dtype=(dtype) ⇒ Object



480
481
482
483
484
485
486
487
# File 'lib/daru/dataframe.rb', line 480

# Stores the new dtype and replaces every vector with the result of calling
# coerce on it with that dtype.
def dtype= dtype
  @dtype = dtype

  @vectors.each do |name|
    slot = @vectors[name]
    converted = @data[slot].coerce(@dtype)
    @data[slot] = converted
  end
end

#dupObject

Duplicate the DataFrame entirely.



216
217
218
219
220
221
222
223
# File 'lib/daru/dataframe.rb', line 216

# Duplicates the DataFrame: each vector, the vectors index and the row index
# are dup'ed and fed to a new Daru::DataFrame with the same name and dtype.
def dup
  copies = {}
  @vectors.each { |name| copies[name] = @data[@vectors[name]].dup }

  Daru::DataFrame.new(copies, order: @vectors.dup, index: @index.dup, name: @name, dtype: @dtype)
end

#each_row(&block) ⇒ Object

Iterate over each row



242
243
244
245
246
247
248
# File 'lib/daru/dataframe.rb', line 242

# Yields every row, in row-index order, as returned by access_row.
# Returns the DataFrame itself.
def each_row(&block)
  @index.each { |label| yield access_row(label) }
  self
end

#each_row_with_index(&block) ⇒ Object



250
251
252
253
254
255
256
# File 'lib/daru/dataframe.rb', line 250

# Yields every row together with its index label, in row-index order.
# Returns the DataFrame itself.
def each_row_with_index(&block)
  @index.each { |label| yield(access_row(label), label) }
  self
end

#each_vector(&block) ⇒ Object

Iterate over each vector



226
227
228
229
230
# File 'lib/daru/dataframe.rb', line 226

# Passes every stored vector to the block, in storage order.
# Returns the DataFrame itself.
def each_vector(&block)
  tap { @data.each(&block) }
end

#each_vector_with_index(&block) ⇒ Object

Iterate over each vector along with the name of the vector



233
234
235
236
237
238
239
# File 'lib/daru/dataframe.rb', line 233

# Yields every vector together with its name, in the order of the vectors
# index. Returns the DataFrame itself.
def each_vector_with_index(&block)
  @vectors.each do |name|
    column = @data[@vectors[name]]
    yield column, name
  end

  self
end

#filter_rows(&block) ⇒ Object

Iterates over each row and retains it in a new DataFrame if the block returns true for that row.



345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
# File 'lib/daru/dataframe.rb', line 345

# Builds a new DataFrame (same vector order) holding only the rows for which
# the block returns a truthy value. The receiver is left untouched.
def filter_rows &block
  result = Daru::DataFrame.new({}, order: @vectors.to_a)

  # First pass: decide which row labels survive.
  kept = []
  @index.each { |label| kept << label if yield(access_row(label)) }

  # Second pass: copy the surviving rows across.
  kept.each { |label| result.row[label] = self[label, :row] }

  result
end

#filter_vectors(&block) ⇒ Object

Iterates over each vector and retains it in a new DataFrame if the block returns true for that vector.



363
364
365
366
367
368
# File 'lib/daru/dataframe.rb', line 363

# Returns a duplicate of the DataFrame on which keep_vector_if has been run
# with the given block. The receiver is left untouched.
def filter_vectors &block
  dup.tap { |copy| copy.keep_vector_if(&block) }
end

#has_vector?(name) ⇒ Boolean

Check if a vector is present

Returns:

  • (Boolean)


371
372
373
# File 'lib/daru/dataframe.rb', line 371

# Reports, as a strict boolean, whether looking +name+ up in the vectors
# index yields a truthy value.
def has_vector? name
  @vectors[name] ? true : false
end

#head(quantity = 10) ⇒ Object



375
376
377
# File 'lib/daru/dataframe.rb', line 375

# Returns the first +quantity+ rows (10 by default).
#
# The range is inclusive, so it must stop at quantity - 1; stopping at
# +quantity+ returned one row too many.
def head quantity=10
  self[0..(quantity - 1), :row]
end

#inspect(spacing = 10, threshold = 15) ⇒ Object

Pretty print in a nice table format for the command line (irb)



447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
# File 'lib/daru/dataframe.rb', line 447

def inspect spacing=10, threshold=15
  longest = [@name.to_s.size,
             @vectors.map(&:to_s).map(&:size).max, 
             @index  .map(&:to_s).map(&:size).max,
             @data   .map{ |v|  v.map(&:to_s).map(&:size).max }.max].max

  name      = @name || 'nil'
  content   = ""
  longest   = spacing if longest > spacing
  formatter = "\n"

  (@vectors.size + 1).times { formatter += "%#{longest}.#{longest}s " }
  content += "\n#<" + self.class.to_s + ":" + self.object_id.to_s + " @name = " + 
                name.to_s + " @size = " + @size.to_s + ">"
  content += sprintf formatter, "" , *@vectors.map(&:to_s)
  row_num  = 1

  self.each_row_with_index do |row, index|
    content += sprintf formatter, index.to_s, *row.to_hash.values.map { |e| (e || 'nil').to_s }
    row_num += 1
    if row_num > threshold
      dots = []

      (@vectors.size + 1).times { dots << "..." }
      content += sprintf formatter, *dots
      break
    end
  end
  content += "\n"

  content
end

#keep_row_if(&block) ⇒ Object



322
323
324
325
326
327
328
329
330
331
332
333
# File 'lib/daru/dataframe.rb', line 322

# Deletes, in place, every row for which the block returns a falsy value.
# Rows are judged first and deleted afterwards, so the row index is not
# changed while it is being walked.
def keep_row_if &block
  doomed = []
  @index.each { |label| doomed << label unless yield(access_row(label)) }

  doomed.each { |label| delete_row label }
end

#keep_vector_if(&block) ⇒ Object



335
336
337
338
339
340
341
# File 'lib/daru/dataframe.rb', line 335

# Yields each vector and its name; vectors for which the block returns a
# falsy value are removed in place through delete_vector.
def keep_vector_if &block
  @vectors.each do |name|
    next if yield(@data[@vectors[name]], name)

    delete_vector name
  end
end

#map_rows(&block) ⇒ Object

Map each row



279
280
281
282
283
284
285
286
# File 'lib/daru/dataframe.rb', line 279

# Returns a duplicate of the DataFrame in which every row has been replaced
# by the block's return value for that row.
def map_rows(&block)
  dup.tap do |copy|
    copy.each_row_with_index { |row, label| copy[label, :row] = yield(row) }
  end
end

#map_rows_with_index(&block) ⇒ Object



288
289
290
291
292
293
294
295
# File 'lib/daru/dataframe.rb', line 288

# Like map_rows, but the block also receives the index label of each row.
# Returns the modified duplicate.
def map_rows_with_index(&block)
  dup.tap do |copy|
    copy.each_row_with_index { |row, label| copy[label, :row] = yield(row, label) }
  end
end

#map_vectors(&block) ⇒ Object

Map each vector. Returns a DataFrame whose vectors are modified according to the value returned by the block.



260
261
262
263
264
265
266
267
# File 'lib/daru/dataframe.rb', line 260

# Returns a duplicate of the DataFrame in which every vector has been
# replaced by the block's return value for that vector.
def map_vectors(&block)
  dup.tap do |copy|
    copy.each_vector_with_index { |column, label| copy[label, :vector] = yield(column) }
  end
end

#map_vectors_with_index(&block) ⇒ Object



269
270
271
272
273
274
275
276
# File 'lib/daru/dataframe.rb', line 269

# Like map_vectors, but the block also receives the name of each vector.
# Returns the modified duplicate.
def map_vectors_with_index(&block)
  dup.tap do |copy|
    copy.each_vector_with_index { |column, label| copy[label, :vector] = yield(column, label) }
  end
end

#rowObject

Access a row or set/create a row. Refer #[] and #[]= docs for details.

Usage

df.row[:a] # access row named ':a'
df.row[:b] = [1,2,3] # set row ':b' to [1,2,3]


211
212
213
# File 'lib/daru/dataframe.rb', line 211

# Returns a new by-row accessor wrapped around this DataFrame; it backs the
# df.row[...] and df.row[...] = ... forms.
def row
  accessor = Daru::Accessors::DataFrameByRow
  accessor.new self
end

#tail(quantity = 10) ⇒ Object



379
380
381
# File 'lib/daru/dataframe.rb', line 379

# Returns the last +quantity+ rows (10 by default).
#
# The inclusive range ends at the last valid position, @size - 1, and the
# start is clamped to 0 so that asking for more rows than exist does not
# produce a negative start.
def tail quantity=10
  start = [@size - quantity, 0].max
  self[start..(@size - 1), :row]
end

#to_aObject

Converts the DataFrame into a two-element array. Index 0 holds an array of hashes, one per row, where each key is a vector name and each value is the corresponding element. Index 1 holds the row indexes of the DataFrame; each entry there corresponds to the hash at the same position in the array of hashes.



397
398
399
400
401
402
403
404
405
# File 'lib/daru/dataframe.rb', line 397

# Returns a two-element Array: first the rows converted with to_hash, then
# the row index converted with to_a.
def to_a
  row_hashes = []
  each_row { |row| row_hashes << row.to_hash }

  [row_hashes, @index.to_a]
end

#to_html(threshold = 30) ⇒ Object

Convert to html for IRuby.



416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
# File 'lib/daru/dataframe.rb', line 416

# Convert to html for IRuby.
#
# threshold - once a row whose position exceeds this number has been
#             written, a row of "..." cells is appended and output stops.
#
# NOTE(review): cell values are inserted without HTML escaping.
def to_html threshold=30
  parts = ['<table><tr><th></th>']
  @vectors.each { |vector| parts << "<th>#{vector}</th>" }
  parts << '</tr>'

  @index.each_with_index do |index, num|
    parts << "<tr><td>#{index}</td>"
    self.row[index].each { |element| parts << "<td>#{element}</td>" }
    parts << '</tr>'

    next unless num > threshold

    # One "..." cell for the row label plus one per vector. The count has to
    # come from @vectors.size; adding 1 to the index object itself is an error.
    parts << '<tr>' << ('<td>...</td>' * (@vectors.size + 1)) << '</tr>'
    break
  end
  parts << '</table>'

  parts.join
end

#to_json(no_index = true) ⇒ Object



407
408
409
410
411
412
413
# File 'lib/daru/dataframe.rb', line 407

# Serialises the result of to_a as JSON. With +no_index+ (the default) only
# the array of row hashes is emitted; otherwise the index array is included
# as well.
def to_json no_index=true
  payload = no_index ? to_a[0] : to_a
  payload.to_json
end

#to_sObject



442
443
444
# File 'lib/daru/dataframe.rb', line 442

# String form of the DataFrame: whatever to_html returns.
def to_s; to_html; end

#vectorObject

Access a vector or set/create a vector. Refer #[] and #[]= docs for details.

Usage

df.vector[:a] # access vector named ':a'
df.vector[:b] = [1,2,3] # set vector ':b' to [1,2,3]


202
203
204
# File 'lib/daru/dataframe.rb', line 202

# Returns a new by-vector accessor wrapped around this DataFrame; it backs
# the df.vector[...] and df.vector[...] = ... forms.
def vector
  accessor = Daru::Accessors::DataFrameByVector
  accessor.new self
end