Class: Daru::DataFrame

Inherits:
Object
  • Object
show all
Includes:
Maths::Arithmetic::DataFrame, Maths::Statistics::DataFrame, Plotting::DataFrame
Defined in:
lib/daru/dataframe.rb,
lib/daru/extensions/rserve.rb

Instance Attribute Summary collapse

Class Method Summary collapse

Instance Method Summary collapse

Methods included from Plotting::DataFrame

#plot

Methods included from Maths::Statistics::DataFrame

#correlation, #count, #covariance, #describe, #max, #mean, #min, #product, #standardize, #std, #sum

Methods included from Maths::Arithmetic::DataFrame

#%, #*, #**, #+, #-, #/, #exp, #round, #sqrt

Constructor Details

#initialize(source, opts = {}) ⇒ DataFrame

A DataFrame basically consists of an Array of Vector objects. Its rows and columns are labelled by two Index objects: `index` (rows) and `vectors` (columns).

Arguments

  • source - Source from which the DataFrame is to be initialized. Can be a Hash

of names and vectors (array or Daru::Vector), an array of arrays or array of Daru::Vectors.

Options

:order - An Array/Daru::Index/Daru::MultiIndex containing the order in which Vectors should appear in the DataFrame.

:index - An Array/Daru::Index/Daru::MultiIndex containing the order in which rows of the DataFrame will be named.

:name - A name for the DataFrame.

:clone - Specify as true or false. When set to false, and Vector objects are passed for the source, the Vector objects will not be duplicated when creating the DataFrame. Will have no effect if an Array is passed in the source, or if the passed Daru::Vectors have different indexes. Defaults to true.

Usage

df = Daru::DataFrame.new({a: [1,2,3,4], b: [6,7,8,9]}, order: [:b, :a], 
  index: [:a, :b, :c, :d], name: :spider_man)

# => 
# <Daru::DataFrame:80766980 @name = spider_man @size = 4>
#             b          a 
#  a          6          1 
#  b          7          2 
#  c          8          3 
#  d          9          4


222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
# File 'lib/daru/dataframe.rb', line 222

# Builds a DataFrame out of +source+.
#
# == Arguments
#
# * +source+ - Data to build from. The branches below handle an empty
#   source, an Array of Arrays, an Array of Daru::Vectors, an Array of
#   Hashes, and a Hash of names => Arrays or Daru::Vectors.
#
# == Options
#
# * +:order+ - Names/order of the vectors (columns).
# * +:index+ - Names of the rows.
# * +:name+  - Name of the DataFrame. A Numeric is kept as is, anything
#   else is converted with #to_sym; defaults to a random UUID symbol.
# * +:clone+ - Only an explicit +false+ disables cloning. It is forced back
#   to +true+ when no +:index+ is given and the source Vectors do not all
#   share equal indexes.
#
# Raises ArgumentError when an Array of Arrays is given and its size
# differs from the size of +:order+.
def initialize source, opts={}
  vectors = opts[:order]
  index   = opts[:index]
  clone   = opts[:clone] == false ? false : true
  @data   = []

  temp_name = opts[:name]
  @name   = temp_name.is_a?(Numeric) ? temp_name : (temp_name || SecureRandom.uuid).to_sym

  if source.empty?
    # Nothing to copy: only set up both indexes and empty vectors.
    @vectors = create_index vectors
    @index   = create_index index
    create_empty_vectors
  else
    case source
    when Array
      if source.all? { |s| s.is_a?(Array) }
        # Array of Arrays: each inner Array is one vector, named by :order.
        raise ArgumentError, "Number of vectors (#{vectors.size}) should \
          equal order size (#{source.size})" if source.size != vectors.size

        @index   = create_index(index || source[0].size)
        @vectors = create_index(vectors)

        @vectors.each_with_index do |vec,idx|
          @data << Daru::Vector.new(source[idx], index: @index)
        end
      elsif source.all? { |s| s.is_a?(Daru::Vector) }
        # Array of Vectors: pair them with :order and re-enter through the
        # Hash branch.
        hsh = {}
        vectors.each_with_index do |name, idx|
          hsh[name] = source[idx]
        end
        initialize(hsh, index: index, order: vectors, name: @name, clone: clone)
      else # array of hashes
        if vectors.nil?
          @vectors = Daru::Index.new source[0].keys.map(&:to_sym)
        else
          @vectors = Daru::Index.new (vectors + (source[0].keys - vectors)).uniq.map(&:to_sym)
        end
        @index = Daru::Index.new(index || source.size)

        @vectors.each do |name|
          v = []
          source.each do |hsh|
            # Keys may be Symbols or Strings.
            v << (hsh[name] || hsh[name.to_s])
          end

          @data << Daru::Vector.new(v, name: set_name(name), index: @index)
        end
      end
    when Hash
      create_vectors_index_with vectors, source
      if all_daru_vectors_in_source? source
        if !index.nil?
          @index = create_index index
        elsif all_vectors_have_equal_indexes?(source)
          @index = source.values[0].index.dup
        else
          all_indexes = []
          source.each_value do |vector|
            all_indexes << vector.index.to_a
          end
          # sort only if missing indexes detected
          # Non-bang calls on purpose: Array#uniq! returns nil when there is
          # nothing to remove (e.g. fully disjoint indexes), which made the
          # former bang chain fail with NoMethodError on nil.
          all_indexes = all_indexes.flatten.uniq.sort

          @index = Daru::Index.new all_indexes
          # The vectors must be realigned onto the merged index, so they
          # cannot be shared with the source.
          clone = true
        end

        if clone
          @vectors.each do |vector|
            @data << Daru::Vector.new([], name: vector, index: @index)

            @index.each do |idx|
              @data[@vectors[vector]][idx] = source[vector][idx]
            end
          end
        else
          @data.concat source.values
        end
      else
        @index = create_index(index || source.values[0].size)

        @vectors.each do |name|
          @data << Daru::Vector.new(source[name].dup, name: set_name(name), index: @index)
        end
      end
    end
  end

  set_size
  validate
  update
end

Dynamic Method Handling

This class handles dynamic methods through the method_missing method

#method_missing(name, *args, &block) ⇒ Object



1848
1849
1850
1851
1852
1853
1854
1855
1856
# File 'lib/daru/dataframe.rb', line 1848

# Dynamic vector access. A name containing "=" (e.g. `df.foo = value`)
# inserts or modifies the vector named by the matched text with every "="
# stripped; a name for which has_vector? is true returns that vector;
# anything else is forwarded to the default handler.
def method_missing(name, *args, &block)
  setter = name[/(.+)\=/]

  if setter
    insert_or_modify_vector(setter.delete("=").to_sym, args[0])
  elsif has_vector?(name)
    self[name, :vector]
  else
    super(name, *args, &block)
  end
end

Instance Attribute Details

#indexObject (readonly)

The index of the rows of the DataFrame



178
179
180
# File 'lib/daru/dataframe.rb', line 178

# Reader for @index, the index of the rows of the DataFrame.
def index
  @index
end

#nameObject (readonly)

The name of the DataFrame



181
182
183
# File 'lib/daru/dataframe.rb', line 181

# Reader for @name, the name of the DataFrame.
def name
  @name
end

#sizeObject (readonly)

The number of rows present in the DataFrame



184
185
186
# File 'lib/daru/dataframe.rb', line 184

# Reader for @size, the number of rows present in the DataFrame.
def size
  @size
end

#vectorsObject (readonly)

The vectors (columns) index of the DataFrame



175
176
177
# File 'lib/daru/dataframe.rb', line 175

# Reader for @vectors, the index of the vectors (columns) of the DataFrame.
def vectors
  @vectors
end

Class Method Details

._load(data) ⇒ Object



1780
1781
1782
1783
1784
1785
1786
# File 'lib/daru/dataframe.rb', line 1780

# Marshal hook: rebuilds a DataFrame from the byte string written by #_dump.
# The dumped Hash carries the :data, :index, :order and :name entries.
def self._load(data)
  dumped = Marshal.load(data)

  Daru::DataFrame.new(
    dumped[:data],
    index: dumped[:index],
    order: dumped[:order],
    name:  dumped[:name]
  )
end

.crosstab_by_assignation(rows, columns, values) ⇒ Object

Generates a new dataset, using three vectors

  • Rows

  • Columns

  • Values

For example, you have these values

x   y   v
a   a   0
a   b   1
b   a   1
b   b   0

You obtain

id  a   b
 a  0   1
 b  1   0

Useful to process outputs from databases



140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
# File 'lib/daru/dataframe.rb', line 140

# Builds a cross table from three equally sized vectors: the factors of
# +rows+ become the rows (first column :_id), the factors of +columns+
# become the remaining columns, and +values+ fills the cells. Cells that
# never receive a value stay nil.
#
# Raises RuntimeError when the three vectors differ in size.
def crosstab_by_assignation(rows, columns, values)
  unless rows.size == columns.size && rows.size == values.size
    raise "Three vectors should be equal size"
  end

  col_names = columns.factors
  width     = col_names.size

  # row factor => { column factor => nil }, filled in below
  table = rows.factors.each_with_object({}) do |row_name, acc|
    acc[row_name] = col_names.each_with_object({}) do |col_name, cells|
      cells[col_name] = nil
    end
  end

  values.each_index { |i| table[rows[i]][columns[i]] = values[i] }

  df = Daru::DataFrame.new({}, order: [:_id] + col_names.to_a)

  rows.factors.each do |row_name|
    record    = Array.new(width + 1)
    record[0] = row_name
    col_names.each_index { |i| record[i + 1] = table[row_name][col_names[i]] }

    df.add_row(record)
  end

  df.update
  df
end

.from_csv(path, opts = {}, &block) ⇒ Object

Load data from a CSV file. Specify an optional block to grab the CSV object and pre-condition it (for example use the `convert` or `header_convert` methods).

Arguments

  • path - Path of the file to load specified as a String.

Options

Accepts the same options as the Daru::DataFrame constructor and CSV.open() and uses those to eventually construct the resulting DataFrame.

Verbose Description

You can specify all the options to the `.from_csv` function that you do to the Ruby `CSV.read()` function, since this is what is used internally.

For example, if the columns in your CSV file are separated by something other than commas, you can use the `:col_sep` option. If you want to convert numeric values to numbers and not keep them as strings, you can use the `:converters` option and set it to `:numeric`.

The ‘.from_csv` function uses the following defaults for reading CSV files (that are passed into the `CSV.read()` function):

{
  :col_sep           => ',',
  :converters        => :numeric
}


48
49
50
# File 'lib/daru/dataframe.rb', line 48

# Thin wrapper: hands +path+, +opts+ and the optional block straight to
# Daru::IO.from_csv and returns its result.
def from_csv(path, opts = {}, &block)
  Daru::IO.from_csv(path, opts, &block)
end

.from_excel(path, opts = {}, &block) ⇒ Object

Read data from an Excel file into a DataFrame.

Arguments

  • path - Path of the file to be read.

Options

*:worksheet_id - ID of the worksheet that is to be read.



61
62
63
# File 'lib/daru/dataframe.rb', line 61

# Thin wrapper: hands +path+, +opts+ and the optional block straight to
# Daru::IO.from_excel and returns its result.
def from_excel(path, opts = {}, &block)
  Daru::IO.from_excel(path, opts, &block)
end

.from_plaintext(path, fields) ⇒ Object

Read the database from a plaintext file. For this method to work, the data should be present in a plain text file in columns. See spec/fixtures/bank2.dat for an example.

Arguments

  • path - Path of the file to be read.

  • fields - Vector names of the resulting database.

Usage

df = Daru::DataFrame.from_plaintext 'spec/fixtures/bank2.dat', [:v1,:v2,:v3,:v4,:v5,:v6]


87
88
89
# File 'lib/daru/dataframe.rb', line 87

# Thin wrapper: delegates to Daru::IO.from_plaintext with the file +path+
# and the vector names in +fields+.
def from_plaintext(path, fields)
  Daru::IO.from_plaintext(path, fields)
end

.from_sql(dbh, query) ⇒ Object

Read a database query and returns a Dataset

USE:

dbh = DBI.connect("DBI:Mysql:database:localhost", "user", "password")
Daru::DataFrame.from_sql(dbh, "SELECT * FROM test")


71
72
73
# File 'lib/daru/dataframe.rb', line 71

# Thin wrapper: delegates to Daru::IO.from_sql with the database handle
# +dbh+ and the +query+ string.
def from_sql(dbh, query)
  Daru::IO.from_sql(dbh, query)
end

.rows(source, opts = {}) ⇒ Object

Create DataFrame by specifying rows as an Array of Arrays or Array of Daru::Vector objects.



93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
# File 'lib/daru/dataframe.rb', line 93

# Creates a DataFrame from rows given as an Array of Arrays or an Array of
# Daru::Vector objects.
#
# When +opts+ has no :order, one is derived from the first row: the index
# of the first Vector, or "0", "1", ... for an Array row. Vector rows are
# inserted under their own names when that derivation ran, and under their
# position otherwise.
#
# The caller's +opts+ Hash is not modified.
#
# Raises SizeError unless every row has the same size as the first one.
def rows(source, opts = {})
  unless source.all? { |v| v.size == source[0].size }
    raise SizeError, "All vectors must have same length"
  end

  # Work on a copy so the derived :order never leaks into the caller's Hash
  # (and a frozen options Hash keeps working).
  opts  = opts.dup
  first = source[0]
  index = []

  opts[:order] ||=
    if first.is_a?(Daru::Vector) # assume that all are Vectors
      source.each { |vec| index << vec.name }
      first.index.to_a
    elsif first.is_a?(Array)
      Array.new(first.size) { |i| i.to_s }
    end

  if source.all? { |s| s.is_a?(Array) }
    Daru::DataFrame.new(source.transpose, opts)
  else # array of Daru::Vectors
    df = Daru::DataFrame.new({}, opts)
    source.each_with_index do |row, idx|
      df[(index[idx] || idx), :row] = row
    end
    df
  end
end

Instance Method Details

#==(other) ⇒ Object



1843
1844
1845
1846
# File 'lib/daru/dataframe.rb', line 1843

# Two DataFrames are equal when their row index, size and vector index are
# equal and every vector compares equal to the vector of the same name in
# +other+.
#
# Anything that is not a Daru::DataFrame (nil, Arrays, ...) is simply
# unequal; previously such comparisons raised NoMethodError because +other+
# was asked for #index.
def ==(other)
  other.is_a?(Daru::DataFrame) &&
    @index == other.index &&
    @size == other.size &&
    @vectors == other.vectors &&
    @vectors.all? { |vector| self[vector, :vector] == other[vector, :vector] }
end

#[](*names) ⇒ Object

Access row or vector. Specify name of row/vector followed by axis (:row, :vector). Defaults to :vector. Use of this method is not recommended for accessing rows or vectors. Use df.row[:a] for accessing the row with index :a or df.vector[:vec] for accessing the vector with index :vec.



320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
# File 'lib/daru/dataframe.rb', line 320

# Accesses vectors or rows by name. A trailing :vector or :row argument
# selects the axis (default :vector). Every name that responds to #to_sym
# is converted to a Symbol before the lookup.
def [](*names)
  axis = [:vector, :row].include?(names[-1]) ? names.pop : :vector
  keys = names.map { |e| e.respond_to?(:to_sym) ? e.to_sym : e }

  case axis
  when :vector then access_vector(*keys)
  when :row    then access_row(*keys)
  else raise IndexError, "Expected axis to be row or vector not #{axis}"
  end
end

#[]=(*args) ⇒ Object

Insert a new row/vector of the specified name or modify a previous row. Instead of using this method directly, use df.row[:a] = [1,2,3] to set/create a row :a to [1,2,3], or df.vector[:vec] = [1,2,3] for vectors.

In case a Daru::Vector is specified after the equals sign, the indexes of the vector will be matched against the row/vector indexes of the DataFrame before an insertion is performed. Unmatched indexes will be set to nil.



345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
# File 'lib/daru/dataframe.rb', line 345

# Inserts or modifies a vector or row. The last argument is the value, the
# arguments before it form the name. The axis is :row when :row appears
# anywhere in the arguments, otherwise :vector; both axis symbols are
# removed from the arguments before name and value are split.
def []=(*args)
  axis = args.include?(:row) ? :row : :vector
  args.reject! { |arg| arg == :vector || arg == :row }

  *name, vector = args
  name = name.map { |e| e.respond_to?(:to_sym) ? e.to_sym : e }

  case axis
  when :vector then insert_or_modify_vector(name, vector)
  when :row    then insert_or_modify_row(name, vector)
  else raise IndexError, "Expected axis to be row or vector, not #{axis}."
  end
end

#_dump(depth) ⇒ Object



1771
1772
1773
1774
1775
1776
1777
1778
# File 'lib/daru/dataframe.rb', line 1771

def _dump depth
  Marshal.dump({
    data:  @data, 
    index: @index.to_a, 
    order: @vectors.to_a,
    name:  @name
    })
end

#add_row(row, index = nil) ⇒ Object



377
378
379
# File 'lib/daru/dataframe.rb', line 377

# Assigns +row+ through the row accessor under +index+, or under the
# current @size when no index is given.
def add_row(row, index = nil)
  position = index || @size
  self.row[position] = row
end

#add_vector(n, vector) ⇒ Object



381
382
383
# File 'lib/daru/dataframe.rb', line 381

# Stores +vector+ under the name +n+; shorthand for `self[n] = vector`.
def add_vector(n, vector)
  self[n] = vector
end

#add_vectors_by_split(name, join = '-', sep = Daru::SPLIT_TOKEN) ⇒ Object



1068
1069
1070
1071
# File 'lib/daru/dataframe.rb', line 1068

# Splits the vector +name+ with split_by_separator(sep) and stores each
# resulting vector under the Symbol "<name><join><key>".
def add_vectors_by_split(name, join = '-', sep = Daru::SPLIT_TOKEN)
  pieces = self[name].split_by_separator(sep)

  pieces.each do |key, vec|
    column = (name.to_s + join + key.to_s).to_sym
    self[column] = vec
  end
end

#add_vectors_by_split_recode(name_, join = '-', sep = Daru::SPLIT_TOKEN) ⇒ Object



1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
# File 'lib/daru/dataframe.rb', line 1554

# Splits the vector +name_+ with split_by_separator(sep). Each resulting
# vector is renamed to "<name_>:<key>" and stored under the Symbol
# "<name_><join><counter>", with the counter starting at 1.
def add_vectors_by_split_recode(name_, join = '-', sep = Daru::SPLIT_TOKEN)
  pieces  = self[name_].split_by_separator(sep)
  counter = 1

  pieces.each do |key, vec|
    column = name_.to_s + join + counter.to_s
    vec.rename(name_.to_s + ":" + key.to_s)
    self[column.to_sym] = vec
    counter += 1
  end
end

#all?(axis = :vector, &block) ⇒ Boolean

Returns:

  • (Boolean)


1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
# File 'lib/daru/dataframe.rb', line 1106

# True when the block is truthy for every vector (axis :vector or :column)
# or for every row (axis :row).
#
# Raises ArgumentError for any other axis.
def all?(axis = :vector, &block)
  case axis
  when :vector, :column
    @data.all?(&block)
  when :row
    each_row { |row| return false unless yield(row) }
    true
  else
    raise ArgumentError, "Unidentified axis #{axis}"
  end
end

#any?(axis = :vector, &block) ⇒ Boolean

Returns:

  • (Boolean)


1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
# File 'lib/daru/dataframe.rb', line 1093

# True when the block is truthy for at least one vector (axis :vector or
# :column) or for at least one row (axis :row).
#
# Raises ArgumentError for any other axis.
def any?(axis = :vector, &block)
  case axis
  when :vector, :column
    @data.any?(&block)
  when :row
    each_row { |row| return true if yield(row) }
    false
  else
    raise ArgumentError, "Unidentified axis #{axis}"
  end
end

#bootstrap(n = nil) ⇒ Daru::DataFrame

Creates a DataFrame of n rows drawn at random from this DataFrame. If n is not given, the original number of rows is used.

Returns:



859
860
861
862
863
864
865
866
867
# File 'lib/daru/dataframe.rb', line 859

# Builds a new DataFrame with the same vectors, made of +n+ rows drawn at
# random from this DataFrame. +n+ defaults to the current number of rows.
#
# Each draw picks a position in 0...nrows. The former code drew from 0...n,
# which read past the last row when n > nrows and only ever sampled the
# first n rows when n < nrows.
def bootstrap(n=nil)
  total = nrows
  n ||= total

  ds_boot = Daru::DataFrame.new({}, order: @vectors)
  n.times do
    ds_boot.add_row(row[rand(total)])
  end
  ds_boot.update
  ds_boot
end

#clone(*vectors_to_clone) ⇒ Object

Returns a ‘view’ of the DataFrame, i.e the object ID’s of vectors are preserved.

Arguments

vectors_to_clone - Names of vectors to clone. Optional. Will return a view of the whole data frame otherwise.



435
436
437
438
439
440
441
442
443
444
# File 'lib/daru/dataframe.rb', line 435

# Without arguments this defers to the inherited #clone. With vector names
# (given separately or as Arrays, which are flattened) it builds a new
# DataFrame from those vectors with clone: false.
def clone(*vectors_to_clone)
  vectors_to_clone.flatten! if vectors_to_clone.any? { |a| a.is_a?(Array) }
  return super if vectors_to_clone.empty?

  view = vectors_to_clone.each_with_object({}) do |vec, acc|
    acc[vec] = self[vec]
  end

  Daru::DataFrame.new(view, clone: false)
end

#clone_only_validObject

Returns a ‘shallow’ copy of DataFrame if missing data is not present, or a full copy of only valid data if missing data is present.



448
449
450
451
452
453
454
# File 'lib/daru/dataframe.rb', line 448

# Returns dup_only_valid when the DataFrame has missing data, and clone
# otherwise.
def clone_only_valid
  has_missing_data? ? dup_only_valid : clone
end

#clone_structureObject

Only clone the structure of the DataFrame.



424
425
426
# File 'lib/daru/dataframe.rb', line 424

# Builds a DataFrame with no data that carries copies (dup) of this
# DataFrame's vector index and row index, and the same name.
def clone_structure
  Daru::DataFrame.new(
    [],
    order: @vectors.dup,
    index: @index.dup,
    name:  @name
  )
end

#collect(axis = :vector, &block) ⇒ Object

Iterate over a row or vector and return results in a Daru::Vector. Specify axis with :vector or :row. Default to :vector.

Description

The #collect iterator works similar to #map, the only difference being that it returns a Daru::Vector comprising of the results of each block run. The resultant Vector has the same index as that of the axis over which collect has iterated. It also accepts the optional axis argument.

Arguments

  • axis - The axis to iterate over. Can be :vector (or :column)

or :row. Default to :vector.



551
552
553
554
555
556
557
558
559
# File 'lib/daru/dataframe.rb', line 551

# Dispatches to collect_vectors (axis :vector or :column) or collect_rows
# (axis :row), passing the block along.
#
# Raises ArgumentError for any other axis.
def collect(axis = :vector, &block)
  case axis
  when :vector, :column then collect_vectors(&block)
  when :row             then collect_rows(&block)
  else raise ArgumentError, "Unknown axis #{axis}"
  end
end

#collect_matrix::Matrix

Generate a matrix, based on vector names of the DataFrame.

Returns:



813
814
815
816
817
818
819
820
821
822
823
824
# File 'lib/daru/dataframe.rb', line 813

# Yields every ordered pair of vector names (row name, column name) and
# returns a Matrix of the block results. Returns an Enumerator when no
# block is given.
def collect_matrix
  return to_enum(:collect_matrix) unless block_given?

  names = vectors.to_a
  cells = names.map do |row_name|
    names.map { |col_name| yield(row_name, col_name) }
  end

  Matrix.rows(cells)
end

#collect_row_with_index(&block) ⇒ Object



775
776
777
778
779
780
781
782
783
784
# File 'lib/daru/dataframe.rb', line 775

# Yields each row together with its index and returns the block results as
# a Daru::Vector indexed by @index. Returns an Enumerator when no block is
# given.
def collect_row_with_index(&block)
  return to_enum(:collect_row_with_index) unless block_given?

  results = []
  each_row_with_index { |row, i| results << yield(row, i) }

  Daru::Vector.new(results, index: @index)
end

#collect_rows(&block) ⇒ Object

Retrieves a Daru::Vector, based on the result of calculation performed on each row.



764
765
766
767
768
769
770
771
772
773
# File 'lib/daru/dataframe.rb', line 764

# Yields each row and returns the block results as a Daru::Vector indexed
# by @index. Returns an Enumerator when no block is given.
def collect_rows(&block)
  return to_enum(:collect_rows) unless block_given?

  results = []
  each_row { |row| results << yield(row) }

  Daru::Vector.new(results, index: @index)
end

#collect_vector_with_index(&block) ⇒ Object



799
800
801
802
803
804
805
806
807
808
# File 'lib/daru/dataframe.rb', line 799

# Yields each vector together with its name and returns the block results
# as a Daru::Vector indexed by @vectors. Returns an Enumerator when no
# block is given.
def collect_vector_with_index(&block)
  return to_enum(:collect_vector_with_index) unless block_given?

  results = []
  each_vector_with_index { |vec, i| results << yield(vec, i) }

  Daru::Vector.new(results, index: @vectors)
end

#collect_vectors(&block) ⇒ Object

Retrieves a Daru::Vector, based on the result of calculation performed on each vector.



788
789
790
791
792
793
794
795
796
797
# File 'lib/daru/dataframe.rb', line 788

# Yields each vector and returns the block results as a Daru::Vector
# indexed by @vectors. Returns an Enumerator when no block is given.
def collect_vectors(&block)
  return to_enum(:collect_vectors) unless block_given?

  results = []
  each_vector { |vec| results << yield(vec) }

  Daru::Vector.new(results, index: @vectors)
end

#column(name) ⇒ Object

Access a vector by name.



373
374
375
# File 'lib/daru/dataframe.rb', line 373

# Looks +name+ up through the vector accessor.
def column(name)
  vector[name]
end

#compute(text, &block) ⇒ Object

Returns a vector, based on a string with a calculation based on vector.

The calculation will be eval’ed, so you can put any variable or expression valid on ruby.

For example:

a = Daru::Vector.new [1,2]
b = Daru::Vector.new [3,4]
ds = Daru::DataFrame.new({:a => a,:b => b})
ds.compute("a+b")
=> Vector [4,6]


1000
1001
1002
1003
# File 'lib/daru/dataframe.rb', line 1000

# Evaluates a calculation in the context of this DataFrame. A block, when
# given, takes precedence over +text+.
#
# NOTE(review): +text+ is passed to instance_eval, so it runs as arbitrary
# Ruby code. Never feed it untrusted input.
def compute(text, &block)
  block_given? ? instance_eval(&block) : instance_eval(text)
end

#create_sql(table, charset = "UTF8") ⇒ Object

Create an SQL CREATE TABLE statement, based on a given Dataset

Arguments

  • table - String specifying name of the table that will created in SQL.

  • charset - Character set. Default is “UTF8”.

Usage

ds = Daru::DataFrame.new({
 :id   => Daru::Vector.new([1,2,3,4,5]),
 :name => Daru::Vector.new(%w{Alex Peter Susan Mary John})
})
ds.create_sql('names')
 ==>"CREATE TABLE names (id INTEGER,\n name VARCHAR (255)) CHARACTER SET=UTF8;"


1581
1582
1583
1584
1585
1586
1587
1588
1589
# File 'lib/daru/dataframe.rb', line 1581

# Returns a CREATE TABLE statement for +table+ with one column per vector,
# typed by that vector's db_type, and the given character set.
def create_sql(table, charset = "UTF8")
  header  = "CREATE TABLE #{table} ("
  columns = vectors.to_a.map { |field| field.to_s + " " + self[field].db_type }

  header + columns.join(",\n ") + ") CHARACTER SET=#{charset};"
end

#delete_row(index) ⇒ Object

Delete a row



840
841
842
843
844
845
846
847
848
849
850
851
852
853
# File 'lib/daru/dataframe.rb', line 840

# Deletes the row identified by +index+: the row index is rebuilt without
# it first, then the entry is removed from every vector, and finally the
# size is refreshed.
#
# Raises IndexError when the row does not exist.
def delete_row(index)
  idx = named_index_for(index)
  raise IndexError, "Index #{index} does not exist." unless @index.include?(idx)

  @index = reassign_index_as(@index.to_a - [idx])
  each_vector { |vec| vec.delete_at(idx) }

  set_size
end

#delete_vector(vector) ⇒ Object

Delete a vector



828
829
830
831
832
833
834
835
836
837
# File 'lib/daru/dataframe.rb', line 828

# Removes the vector named +vector+ from the data and rebuilds the vector
# index without it. Returns self.
#
# Raises IndexError when the vector does not exist.
def delete_vector(vector)
  raise IndexError, "Vector #{vector} does not exist." unless @vectors.include?(vector)

  @data.delete_at(@vectors[vector])
  @vectors = Daru::Index.new(@vectors.to_a - [vector])

  self
end

#dup(vectors_to_dup = nil) ⇒ Object

Duplicate the DataFrame entirely.

Arguments

  • vectors_to_dup - An Array specifying the names of Vectors to

be duplicated. Will duplicate the entire DataFrame if not specified.



400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
# File 'lib/daru/dataframe.rb', line 400

# Duplicates the DataFrame, or only the vectors named in +vectors_to_dup+.
# Every selected vector is dup'ed. The source is collected as an Array when
# the vector index is a MultiIndex and as a Hash otherwise; the new
# DataFrame gets a dup of the row index and the same name.
def dup(vectors_to_dup = nil)
  vectors_to_dup ||= @vectors

  if vectors.is_a?(MultiIndex)
    src = []
    vectors_to_dup.each { |vec| src << @data[@vectors[vec]].dup }
    new_order = Daru::MultiIndex.new(vectors_to_dup)
  else
    src = {}
    vectors_to_dup.each { |vec| src[vec] = @data[@vectors[vec]].dup }
    new_order = Daru::Index.new(vectors_to_dup)
  end

  Daru::DataFrame.new(src, order: new_order, index: @index.dup, name: @name, clone: true)
end

#dup_only_valid(vecs = nil) ⇒ Object

Creates a new duplicate dataframe containing only rows without a single missing value.



458
459
460
461
462
463
464
465
466
# File 'lib/daru/dataframe.rb', line 458

# Returns the rows that have no missing value in any vector. The rows are
# taken from self when +vecs+ is nil, otherwise from dup(vecs).
def dup_only_valid(vecs = nil)
  missing = @data.each_with_object([]) do |vec, acc|
    acc.concat(vec.missing_positions)
  end.uniq

  valid = @index.to_a - missing
  frame = vecs.nil? ? self : dup(vecs)

  frame.row[*valid]
end

#each(axis = :vector, &block) ⇒ Object

Iterate over each row or vector of the DataFrame. Specify axis by passing :vector or :row as the argument. Default to :vector.

Description

`#each` works exactly like Array#each. The default mode for `each` is to iterate over the columns of the DataFrame. To iterate over rows you must pass the axis, i.e. `:row`, as an argument.

Arguments

  • axis - The axis to iterate over. Can be :vector (or :column)

or :row. Default to :vector.



526
527
528
529
530
531
532
533
534
# File 'lib/daru/dataframe.rb', line 526

# Dispatches to each_vector (axis :vector or :column) or each_row
# (axis :row), passing the block along.
#
# Raises ArgumentError for any other axis.
def each(axis = :vector, &block)
  case axis
  when :vector, :column then each_vector(&block)
  when :row             then each_row(&block)
  else raise ArgumentError, "Unknown axis #{axis}"
  end
end

#each_row(&block) ⇒ Object

Iterate over each row



493
494
495
496
497
498
499
500
501
# File 'lib/daru/dataframe.rb', line 493

# Yields every row, looked up through access_row in @index order. Returns
# self, or an Enumerator when no block is given.
def each_row(&block)
  return to_enum(:each_row) unless block_given?

  @index.each { |label| yield access_row(label) }
  self
end

#each_row_with_index(&block) ⇒ Object



503
504
505
506
507
508
509
510
511
# File 'lib/daru/dataframe.rb', line 503

# Yields every row together with its index label, in @index order. Returns
# self, or an Enumerator when no block is given.
def each_row_with_index(&block)
  return to_enum(:each_row_with_index) unless block_given?

  @index.each { |label| yield access_row(label), label }
  self
end

#each_vector(&block) ⇒ Object Also known as: each_column

Iterate over each vector



469
470
471
472
473
474
475
# File 'lib/daru/dataframe.rb', line 469

# Passes every vector in @data to the block. Returns self, or an Enumerator
# when no block is given.
def each_vector(&block)
  if block_given?
    @data.each(&block)
    self
  else
    to_enum(:each_vector)
  end
end

#each_vector_with_index(&block) ⇒ Object Also known as: each_column_with_index

Iterate over each vector along with the name of the vector



480
481
482
483
484
485
486
487
488
# File 'lib/daru/dataframe.rb', line 480

# Yields every vector together with its name, in @vectors order. Returns
# self, or an Enumerator when no block is given.
def each_vector_with_index(&block)
  return to_enum(:each_vector_with_index) unless block_given?

  @vectors.each { |name| yield @data[@vectors[name]], name }
  self
end

#filter(axis = :vector, &block) ⇒ Object

Retain vectors or rows if the block returns a truthy value.

Description

For filtering out certain rows/vectors based on their values, use the #filter method. By default it iterates over vectors and keeps those vectors for which the block returns true. It accepts an optional axis argument which lets you specify whether you want to iterate over vectors or rows.

Arguments

  • axis - The axis to map over. Can be :vector (or :column) or :row.

Default to :vector.

Usage

# Filter vectors

df.filter do |vector|
  vector.type == :numeric and vector.median < 50
end

# Filter rows

df.filter(:row) do |row|
  row[:a] + row[:d] < 100
end


656
657
658
659
660
661
662
# File 'lib/daru/dataframe.rb', line 656

# Dispatches to filter_vectors (axis :vector or :column) or filter_rows
# (axis :row), passing the block along.
#
# Raises ArgumentError for any other axis, in line with #each, #map and
# #collect. An unknown axis used to fall through and silently return nil.
def filter(axis = :vector, &block)
  case axis
  when :vector, :column then filter_vectors(&block)
  when :row             then filter_rows(&block)
  else raise ArgumentError, "Unknown axis #{axis}"
  end
end

#filter_rows(&block) ⇒ Object

Iterates over each row and retains it in a new DataFrame if the block returns true for that row.



902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
# File 'lib/daru/dataframe.rb', line 902

# Yields every row and copies the rows for which the block is truthy, under
# their original index labels, into a new DataFrame with the same vector
# order. Returns an Enumerator when no block is given.
def filter_rows(&block)
  return to_enum(:filter_rows) unless block_given?

  result = Daru::DataFrame.new({}, order: @vectors.to_a)

  kept = []
  @index.each { |label| kept << label if yield(access_row(label)) }
  kept.each { |label| result.row[label] = self[label, :row] }

  result
end

#filter_vector(vec) ⇒ Object

Creates a new vector with the data of a given field, taken from the rows for which the block returns true



891
892
893
894
895
896
897
898
# File 'lib/daru/dataframe.rb', line 891

# Collects row[vec] from every row for which the block is truthy and
# returns the values as a new Daru::Vector.
def filter_vector(vec)
  kept = []
  each_row { |row| kept << row[vec] if yield(row) }

  Daru::Vector.new(kept)
end

#filter_vectors(&block) ⇒ Object

Iterates over each vector and retains it in a new DataFrame if the block returns true for that vector.



922
923
924
925
926
927
928
929
# File 'lib/daru/dataframe.rb', line 922

# Returns a duplicate of the DataFrame on which keep_vector_if has been run
# with the block. Returns an Enumerator when no block is given.
def filter_vectors(&block)
  return to_enum(:filter_vectors) unless block_given?

  copy = dup
  copy.keep_vector_if(&block)
  copy
end

#group_by(vectors) ⇒ Object

Group elements by vector to perform operations on them. Returns a Daru::Core::GroupBy object. See the Daru::Core::GroupBy docs for a detailed list of possible operations.

Arguments

  • vectors - An Array containing names of vectors to group by.

Usage

df = Daru::DataFrame.new({
  a: %w{foo bar foo bar   foo bar foo foo},
  b: %w{one one two three two two one three},
  c:   [1  ,2  ,3  ,1    ,3  ,6  ,3  ,8],
  d:   [11 ,22 ,33 ,44   ,55 ,66 ,77 ,88]
})
df.group_by([:a,:b,:c]).groups
#=> {["bar", "one", 2]=>[1],
# ["bar", "three", 1]=>[3],
# ["bar", "two", 6]=>[5],
# ["foo", "one", 1]=>[0],
# ["foo", "one", 3]=>[6],
# ["foo", "three", 8]=>[7],
# ["foo", "two", 3]=>[2, 4]}


1186
1187
1188
1189
1190
1191
1192
# File 'lib/daru/dataframe.rb', line 1186

# Returns a Daru::Core::GroupBy for this DataFrame and the given vector
# names. A single Symbol is wrapped into an Array first.
#
# Raises ArgumentError for the first name that is not a vector.
def group_by(vectors)
  vectors = [vectors] if vectors.is_a?(Symbol)

  vectors.each do |v|
    next if has_vector?(v)
    raise ArgumentError, "Vector #{v} does not exist"
  end

  Daru::Core::GroupBy.new(self, vectors)
end

#has_missing_data?Boolean Also known as: flawed?

Returns:

  • (Boolean)


1024
1025
1026
# File 'lib/daru/dataframe.rb', line 1024

# True when at least one vector in @data reports missing data.
def has_missing_data?
  @data.any?(&:has_missing_data?)
end

#has_vector?(vector) ⇒ Boolean

Check if a vector is present

Returns:

  • (Boolean)


1089
1090
1091
# File 'lib/daru/dataframe.rb', line 1089

# True when looking +vector+ up in @vectors gives a truthy result. An Array
# is splatted into the lookup.
def has_vector?(vector)
  @vectors[*vector] ? true : false
end

#head(quantity = 10) ⇒ Object

The first ten elements of the DataFrame

Parameters:

  • quantity (Fixnum) (defaults to: 10)

    (10) The number of elements to display from the top.



1122
1123
1124
# File 'lib/daru/dataframe.rb', line 1122

# Returns the rows selected by the range 0..(quantity - 1); +quantity+
# defaults to 10.
def head(quantity = 10)
  last_position = quantity - 1
  self[0..last_position, :row]
end

#inspect(spacing = 10, threshold = 15) ⇒ Object

Pretty print in a nice table format for the command line (irb/pry/iruby)



1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
# File 'lib/daru/dataframe.rb', line 1810

# Renders the DataFrame as a fixed-width text table.
#
# * +spacing+   - Maximum width of a column; longer cells are truncated.
# * +threshold+ - Number of rows printed before the output is cut off with
#   a line of "...".
#
# Returns the table as a String.
def inspect spacing=10, threshold=15
  # Widest text among the name, the vector names, the index labels and
  # the cell values.
  longest = [@name.to_s.size,
             (@vectors.map(&:to_s).map(&:size).max || 0), 
             (@index  .map(&:to_s).map(&:size).max || 0),
             (@data   .map{ |v| v.map(&:to_s).map(&:size).max}.max || 0)].max

  name      = @name || 'nil'
  content   = ""
  # Cap the column width at +spacing+.
  longest   = spacing if longest > spacing
  formatter = "\n"

  # One "%N.Ns " slot per vector plus one for the index column; the
  # precision truncates cells to the column width.
  (@vectors.size + 1).times { formatter += "%#{longest}.#{longest}s " }
  content += "\n#<" + self.class.to_s + ":" + self.object_id.to_s + " @name = " + 
                name.to_s + " @size = " + @size.to_s + ">"
  # Header line: empty index cell followed by the vector names.
  content += sprintf formatter, "" , *@vectors.map(&:to_s)
  row_num  = 1

  self.each_row_with_index do |row, index|
    content += sprintf formatter, index.to_s, *row.to_hash.values.map { |e| (e || 'nil').to_s }
    row_num += 1
    # After +threshold+ rows, print a line of dots and stop.
    if row_num > threshold
      dots = []

      (@vectors.size + 1).times { dots << "..." }
      content += sprintf formatter, *dots
      break
    end
  end
  content += "\n"

  content
end

#join(other_ds, fields_1 = [], fields_2 = [], type = :left) ⇒ Daru::DataFrame

Join 2 DataFrames by the given fields. type is one of :left and :inner; the default is :left.

Untested! Use at your own risk.

Returns:



1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
# File 'lib/daru/dataframe.rb', line 1439

# Joins this DataFrame with +other_ds+. Rows are matched by comparing the
# values of +fields_1+ (here) with the values of +fields_2+ (in +other_ds+).
# The result has this DataFrame's vectors followed by the vectors of
# +other_ds+ that are not in +fields_2+.
#
# Rows without a match are kept, with nil in the new fields, only when
# +type+ is :left; otherwise they are dropped.
def join(other_ds, fields_1 = [], fields_2 = [], type = :left)
  fields_new = other_ds.vectors.to_a - fields_2
  fields     = vectors.to_a + fields_new

  # join key => list of Hashes holding the new fields of each matching row
  lookup = {}
  other_ds.each_row do |row|
    key   = row.to_hash.select { |k, _v| fields_2.include?(k) }.values
    extra = row.to_hash.select { |k, _v| fields_new.include?(k) }

    (lookup[key] ||= []) << extra
  end

  new_ds = DataFrame.new({}, order: fields)

  each_row do |row|
    key      = row.to_hash.select { |k, _v| fields_1.include?(k) }.values
    new_case = row.to_hash
    matches  = lookup[key]

    if matches
      matches.each do |extra|
        new_ds.add_row(Daru::Vector.new(new_case.merge(extra)))
      end
    elsif type == :left
      fields_new.each { |field| new_case[field] = nil }
      new_ds.add_row(Daru::Vector.new(new_case))
    end
  end

  new_ds
end

#keep_row_if(&block) ⇒ Object



869
870
871
872
873
874
875
876
877
878
879
880
# File 'lib/daru/dataframe.rb', line 869

# Yields every row and deletes, after the whole pass, the rows for which
# the block was falsy.
def keep_row_if(&block)
  doomed = []
  @index.each { |label| doomed << label unless yield(access_row(label)) }

  doomed.each { |label| delete_row(label) }
end

#keep_vector_if(&block) ⇒ Object



882
883
884
885
886
887
888
# File 'lib/daru/dataframe.rb', line 882

# Yields each vector together with its name and deletes the vectors for
# which the block returns a falsy value.
def keep_vector_if &block
  @vectors.each do |name|
    # The position is looked up through the current @vectors on every pass.
    delete_vector(name) unless yield(@data[@vectors[name]], name)
  end
end

#map(axis = :vector, &block) ⇒ Object

Map over each vector or row of the data frame according to the argument specified. Will return an Array of the resulting elements. To map over each row/vector and get a DataFrame, see #recode.

Description

The #map iterator works like Array#map. The value returned by each run of the block is added to an Array and the Array is returned. This method also accepts an axis argument, like #each. The default is :vector.

Arguments

  • axis - The axis to map over. Can be :vector (or :column) or :row.

Default to :vector.



577
578
579
580
581
582
583
584
585
# File 'lib/daru/dataframe.rb', line 577

# Map over the vectors or the rows and return the block results in an Array.
#
# == Arguments
#
# * +axis+ - :vector (or :column) to map over vectors, :row to map over rows.
#   Any other value raises ArgumentError. Default to :vector.
def map axis=:vector, &block
  case axis
  when :vector, :column then map_vectors(&block)
  when :row             then map_rows(&block)
  else
    raise ArgumentError, "Unknown axis #{axis}"
  end
end

#map!(axis = :vector, &block) ⇒ Object

Destructive map. Modifies the DataFrame. Each run of the block must return a Daru::Vector. You can specify the axis to map over as the argument. Default to :vector.

Arguments

  • axis - The axis to map over. Can be :vector (or :column) or :row.

Default to :vector.



595
596
597
598
599
600
601
# File 'lib/daru/dataframe.rb', line 595

# Destructive map. Dispatches to #map_vectors! or #map_rows! depending on
# the axis.
#
# == Arguments
#
# * +axis+ - :vector (or :column) to map over vectors, :row to map over rows.
#   Default to :vector.
#
# Raises ArgumentError for any other axis, in line with #map, instead of
# silently doing nothing.
def map! axis=:vector, &block
  if axis == :vector or axis == :column
    map_vectors!(&block)
  elsif axis == :row
    map_rows!(&block)
  else
    raise ArgumentError, "Unknown axis #{axis}"
  end
end

#map_rows(&block) ⇒ Object

Map each row



728
729
730
731
732
733
734
735
736
737
# File 'lib/daru/dataframe.rb', line 728

# Yields each row to the block and returns an Array of the block results.
# Returns an Enumerator when no block is given.
def map_rows(&block)
  return to_enum(:map_rows) unless block_given?

  [].tap do |results|
    each_row { |current| results << yield(current) }
  end
end

#map_rows!(&block) ⇒ Object



750
751
752
753
754
755
756
757
758
759
760
# File 'lib/daru/dataframe.rb', line 750

# Destructive form of #map_rows. Replaces every row with the Daru::Vector
# returned by the block and returns self. Raises TypeError when the block
# returns anything else. Returns an Enumerator when no block is given.
def map_rows!(&block)
  return to_enum(:map_rows!) unless block_given?

  # Iterate over a copy of the index because rows are reassigned on the way.
  index.dup.each do |label|
    replacement = yield self.row[label]
    unless replacement.is_a?(Daru::Vector)
      raise TypeError, "Returned object must be Daru::Vector not #{replacement.class}"
    end
    self.row[label] = replacement
  end

  self
end

#map_rows_with_index(&block) ⇒ Object



739
740
741
742
743
744
745
746
747
748
# File 'lib/daru/dataframe.rb', line 739

# Yields each row together with its index and returns an Array of the
# block results. Returns an Enumerator when no block is given.
def map_rows_with_index(&block)
  return to_enum(:map_rows_with_index) unless block_given?

  collected = []
  each_row_with_index { |current, label| collected.push(yield(current, label)) }
  collected
end

#map_vectors(&block) ⇒ Object

Map each vector and return an Array.



691
692
693
694
695
696
697
698
699
700
# File 'lib/daru/dataframe.rb', line 691

# Yields every element of @data (the vectors) to the block and returns an
# Array of the block results. Returns an Enumerator when no block is given.
def map_vectors(&block)
  return to_enum(:map_vectors) unless block_given?

  results = []
  @data.each { |vec| results.push(yield(vec)) }
  results
end

#map_vectors!(&block) ⇒ Object

Destructive form of #map_vectors



703
704
705
706
707
708
709
710
711
712
713
# File 'lib/daru/dataframe.rb', line 703

# Destructive form of #map_vectors. Replaces every vector with the
# Daru::Vector returned by the block and returns self. Raises TypeError
# when the block returns anything else. Returns an Enumerator when no block
# is given.
def map_vectors!(&block)
  return to_enum(:map_vectors!) unless block_given?

  # Iterate over a copy of the vector index because vectors are reassigned.
  vectors.dup.each do |name|
    replacement = yield self[name]
    unless replacement.is_a?(Daru::Vector)
      raise TypeError, "Must return a Daru::Vector not #{replacement.class}"
    end
    self[name] = replacement
  end

  self
end

#map_vectors_with_index(&block) ⇒ Object

Map vectors along with their index (vector name).



716
717
718
719
720
721
722
723
724
725
# File 'lib/daru/dataframe.rb', line 716

# Yields each vector together with its name and returns an Array of the
# block results. Returns an Enumerator when no block is given.
def map_vectors_with_index(&block)
  return to_enum(:map_vectors_with_index) unless block_given?

  [].tap do |results|
    each_vector_with_index { |vec, label| results << yield(vec, label) }
  end
end

#merge(other_df) ⇒ Daru::DataFrame

Merge vectors from two DataFrames. In case of name collision, the vectors names are changed to x_1, x_2 .…

Returns:



1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
# File 'lib/daru/dataframe.rb', line 1416

# Merge the vectors of this DataFrame and +other_df+ into a new DataFrame.
# Both DataFrames must have the same number of rows, otherwise a
# RuntimeError is raised. The combined vector names are passed through
# recode_repeated and converted to symbols.
def merge other_df
  unless nrows == other_df.nrows
    raise "Number of rows must be equal in this: #{nrows} and other: #{other_df.nrows}"
  end

  combined_names = @vectors.to_a + other_df.vectors.to_a
  merged_fields  = combined_names.recode_repeated.map(&:to_sym)
  merged         = DataFrame.new({}, order: merged_fields)

  # Rows are concatenated position by position.
  nrows.times do |pos|
    merged.add_row(self.row[pos].to_a + other_df.row[pos].to_a)
  end

  merged.update
  merged
end

#missing_values_rows(missing_values = [nil]) ⇒ Object Also known as: vector_missing_values

Return a vector with the number of missing values in each row.

Arguments

  • missing_values - An Array of the values that should be

treated as ‘missing’. The default missing value is nil.



1011
1012
1013
1014
1015
1016
1017
1018
1019
# File 'lib/daru/dataframe.rb', line 1011

# Return a Daru::Vector holding, for each row, the number of missing values.
#
# == Arguments
#
# * +missing_values+ - An Array of the values that are treated as missing.
#   It is assigned to each row before the row's missing positions are
#   counted. Default to [nil].
def missing_values_rows missing_values=[nil]
  counts = []
  each_row do |current|
    current.missing_values = missing_values
    counts.push(current.missing_positions.size)
  end

  Daru::Vector.new(counts, index: @index, name: :"#{@name}_missing_rows")
end

#ncolsObject

The number of vectors



1084
1085
1086
# File 'lib/daru/dataframe.rb', line 1084

# The number of vectors, i.e. the second entry of #shape.
def ncols
  _rows, cols = shape
  cols
end

#nest(*tree_keys, &block) ⇒ Object

Return a nested hash using vector names as keys and an array constructed of hashes with other values. If block provided, is used to provide the values, with parameters row of dataset, current last hash on hierarchy and name of the key to include



1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
# File 'lib/daru/dataframe.rb', line 1034

# Build a nested Hash keyed by the values of the given vectors.
#
# == Arguments
#
# * +tree_keys+ - Vector names, given either as separate arguments or as a
#   single Array. All but the last key create intermediate Hash levels; the
#   last key selects the leaf.
#
# Without a block each leaf is an Array of row hashes from which the tree
# keys have been removed. With a block, the leaf is set to the value the
# block returns when called with the row, the innermost Hash and the leaf
# key value.
def nest *tree_keys, &block
  tree_keys = tree_keys[0] if tree_keys[0].is_a? Array
  tree = {}

  each_row do |row|
    # Walk (and create on demand) the intermediate levels of the tree.
    node = tree_keys[0, tree_keys.size-1].inject(tree) do |level, key|
      level[row[key]] ||= {}
    end

    leaf = row[tree_keys.last]
    if block
      node[leaf] = block.call(row, node, leaf)
    else
      node[leaf] ||= []
      node[leaf].push(row.to_hash.delete_if { |key, _| tree_keys.include? key })
    end
  end

  tree
end

#nrowsObject

The number of rows



1079
1080
1081
# File 'lib/daru/dataframe.rb', line 1079

# The number of rows, i.e. the first entry of #shape.
def nrows
  rows, _cols = shape
  rows
end

#numeric_vector_namesObject



1231
1232
1233
1234
1235
1236
1237
1238
# File 'lib/daru/dataframe.rb', line 1231

# Return an Array with the +name+ of every vector whose type is :numeric.
def numeric_vector_names
  names = []
  each_vector { |vec| names.push(vec.name) if vec.type == :numeric }
  names
end

#numeric_vectorsObject

Return the indexes of all the numeric vectors. Vectors that contain nils along with numbers are included.



1222
1223
1224
1225
1226
1227
1228
1229
# File 'lib/daru/dataframe.rb', line 1222

# Return an Array with the index (as yielded by #each_vector_with_index)
# of every vector whose type is :numeric.
def numeric_vectors
  found = []
  each_vector_with_index { |vec, label| found.push(label) if vec.type == :numeric }
  found
end

#one_to_many(parent_fields, pattern) ⇒ Object

Creates a new dataset for one to many relations on a dataset, based on pattern of field names.

for example, you have a survey for number of children with this structure:

id, name, child_name_1, child_age_1, child_name_2, child_age_2

with

ds.one_to_many([:id], "child_%v_%n")

the field of first parameters will be copied verbatim to new dataset, and fields which responds to second pattern will be added one case for each different %n.

Usage

cases=[
  ['1','george','red',10,'blue',20,nil,nil],
  ['2','fred','green',15,'orange',30,'white',20],
  ['3','alfred',nil,nil,nil,nil,nil,nil]
]
ds=Daru::DataFrame.rows(cases, order: [:id, :name, :car_color1, :car_value1, :car_color2, :car_value2, :car_color3, :car_value3])
ds.one_to_many([:id],'car_%v%n').to_matrix
=> Matrix[
   ["red", "1", 10],
   ["blue", "1", 20],
   ["green", "2", 15],
   ["orange", "2", 30],
   ["white", "2", 20]
   ]


1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
# File 'lib/daru/dataframe.rb', line 1505

# Creates a new DataFrame for a one-to-many relation, based on a pattern of
# vector names.
#
# == Arguments
#
# * +parent_fields+ - Array of vector names copied verbatim into every
#   generated row. The Array itself is not modified.
# * +pattern+ - String in which "%v" stands for the variable name and "%n"
#   for the repetition number, e.g. "car_%v%n".
#
# For every row and every number from 1 to the highest number found in the
# vector names, one row is added when at least one of the matching fields
# is not nil. The number is stored in the :_col_id field.
def one_to_many(parent_fields, pattern)
  re      = Regexp.new pattern.gsub("%v","(.+?)").gsub("%n","(\\d+?)")
  # Build a new Array: pushing onto parent_fields would change the caller's
  # argument and make :_col_id look like a parent field further down.
  ds_vars = parent_fields + [:_col_id]
  vars    = []
  max_n   = 0
  h       = {}
  parent_fields.each { |field| h[field] = Daru::Vector.new([]) }
  # Adding _col_id
  h[:_col_id] = Daru::Vector.new([])

  # Collect the variable names and the highest repetition number.
  @vectors.each do |f|
    match = re.match(f.to_s)
    next unless match

    unless vars.include? match[1]
      vars.push(match[1])
      h[match[1]] = Daru::Vector.new([])
    end
    max_n = match[2].to_i if max_n < match[2].to_i
  end
  ds = DataFrame.new(h, order: ds_vars+vars)

  each_row do |row|
    verbatim = {}
    parent_fields.each { |f| verbatim[f] = row[f] }
    # Reserve the slot so the keys follow the order of the new DataFrame.
    verbatim[:_col_id] = nil

    1.upto(max_n) do |n|
      row_out  = verbatim.dup
      any_data = false
      vars.each do |v|
        data = row[pattern.gsub("%v",v.to_s).gsub("%n",n.to_s).to_sym]
        row_out[v] = data
        any_data = true unless data.nil?
      end
      next unless any_data

      row_out[:_col_id] = n
      ds.add_row(row_out)
    end
  end
  ds.update
  ds
end

#only_numerics(opts = {}) ⇒ Object

Return a DataFrame of only the numerical Vectors. If clone: false is specified as option, only a view of the Vectors will be returned. Defaults to clone: true.



1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
# File 'lib/daru/dataframe.rb', line 1243

# Return a DataFrame built from the numeric vectors only.
#
# == Options
#
# * +:clone+ - Passed on to the new DataFrame. Anything other than an
#   explicit false is treated as true.
def only_numerics opts={}
  cln     = opts[:clone] != false
  names   = numeric_vectors
  columns = names.map { |name| self[name] }

  # Keep the kind of index that the vectors of this DataFrame use.
  order =
    if @vectors.is_a?(MultiIndex)
      MultiIndex.new(names)
    else
      Index.new(names)
    end

  Daru::DataFrame.new(columns, clone: cln, order: order, index: @index)
end

#pivot_table(opts = {}) ⇒ Object

Pivots a data frame on specified vectors and applies an aggregate function to quickly generate a summary.

Options

:index - Keys to group by on the pivot table row index. Pass vector names contained in an Array.

:vectors - Keys to group by on the pivot table column index. Pass vector names contained in an Array.

:agg - Function to aggregate the grouped values. Default to :mean. Can use any of the statistics functions applicable on Vectors that can be found in the Daru::Statistics::Vector module.

:values - Columns to aggregate. Will consider all numeric columns not specified in :index or :vectors. Optional.

Usage

df = Daru::DataFrame.new({
  a: ['foo'  ,  'foo',  'foo',  'foo',  'foo',  'bar',  'bar',  'bar',  'bar'], 
  b: ['one'  ,  'one',  'one',  'two',  'two',  'one',  'one',  'two',  'two'],
  c: ['small','large','large','small','small','large','small','large','small'],
  d: [1,2,2,3,3,4,5,6,7],
  e: [2,4,4,6,6,8,10,12,14]
})
df.pivot_table(index: [:a], vectors: [:b], agg: :sum, values: :e)

#=> 
# #<Daru::DataFrame:88342020 @name = 08cdaf4e-b154-4186-9084-e76dd191b2c9 @size = 2>
#            [:e, :one] [:e, :two] 
#     [:bar]         18         26 
#     [:foo]         10         12

Raises:

  • (ArgumentError)


1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
# File 'lib/daru/dataframe.rb', line 1349

# Pivot the DataFrame on the given vectors and aggregate the grouped values.
#
# == Options
#
# * +:index+   - Vector names to group by (rows). Required and non-empty,
#   otherwise ArgumentError is raised.
# * +:vectors+ - Vector names that form the column keys. Default [].
# * +:agg+     - Name of the aggregation method. Default :mean.
# * +:values+  - A Symbol or an Array of vector names to aggregate. When
#   omitted, the numeric vectors not used in :index or :vectors are taken.
#
# Raises IndexError when there is nothing to aggregate.
def pivot_table opts={}
  raise ArgumentError, "Specify grouping index" if !opts[:index] or opts[:index].empty?

  index   = opts[:index]
  vectors = opts[:vectors] || []
  aggregate_function = opts[:agg] || :mean
  # Normalize :values to an Array of vector names.
  values = 
  if opts[:values].is_a?(Symbol)
    [opts[:values]]
  elsif opts[:values].is_a?(Array)
    opts[:values]
  else # nil
    (@vectors.to_a - (index | vectors)) & numeric_vector_names
  end
  
  raise IndexError, "No numeric vectors to aggregate" if values.empty?

  grouped  = group_by(index)

  unless vectors.empty?
    # super_hash maps group name => { [value, *column keys] => collected values }.
    super_hash = {}
    values.each do |value|
      grouped.groups.each do |group_name, row_numbers|
        super_hash[group_name] ||= {}

        row_numbers.each do |num|
          # Column key: the value vector name followed by this row's entry
          # in each of the :vectors.
          arry = []
          arry << value
          vectors.each { |v| arry << self[v][num] }
          sub_hash = super_hash[group_name]
          sub_hash[arry] ||= []

          sub_hash[arry] << self[value][num]
        end
      end
    end

    # Replace every collected Array by its aggregate.
    super_hash.each_value do |sub_hash|
      sub_hash.each do |group_name, aggregates|
        sub_hash[group_name] = Daru::Vector.new(aggregates).send(aggregate_function)
      end
    end

    df_index = Daru::MultiIndex.new(symbolize(super_hash.keys))

    # The column index is the set of all column keys seen in any group.
    vector_indexes = []
    super_hash.each_value do |sub_hash|
      vector_indexes.concat sub_hash.keys
    end
    df_vectors = Daru::MultiIndex.new symbolize(vector_indexes.uniq)
    pivoted_dataframe = Daru::DataFrame.new({}, index: df_index, order: df_vectors)

    # Fill the cells for which an aggregate exists.
    super_hash.each do |row_index, sub_h|
      sub_h.each do |vector_index, val|
        pivoted_dataframe[symbolize(vector_index)][symbolize(row_index)] = val
      end
    end
    return pivoted_dataframe
  else
    # Without :vectors the aggregation is delegated to the grouped object.
    grouped.send(aggregate_function)
  end
end

#recast(opts = {}) ⇒ Object

Change dtypes of vectors by supplying a hash of :vector_name => :new_dtype

Usage

df = Daru::DataFrame.new({a: [1,2,3], b: [1,2,3], c: [1,2,3]})
df.recast a: :nmatrix, c: :nmatrix


1793
1794
1795
1796
1797
# File 'lib/daru/dataframe.rb', line 1793

# Change the dtypes of vectors. +opts+ is a Hash of vector name => dtype;
# #cast is called with that dtype on each named vector.
def recast opts={}
  opts.each_pair do |name, target_dtype|
    vector[name].cast(dtype: target_dtype)
  end
end

#recode(axis = :vector, &block) ⇒ Object

Maps over the DataFrame and returns a DataFrame. Each run of the block must return a Daru::Vector object. You can specify the axis to map over. Default to :vector.

Description

Recode works similarly to #map, but an important difference between the two is that recode returns a modified Daru::DataFrame instead of an Array. For this reason, #recode expects every run of the block to return a Daru::Vector.

Just like map and each, recode also accepts an optional axis argument.

Arguments

  • axis - The axis to map over. Can be :vector (or :column) or :row.

Default to :vector.



620
621
622
623
624
625
626
# File 'lib/daru/dataframe.rb', line 620

# Maps over the DataFrame and returns a DataFrame. Dispatches to
# #recode_vectors or #recode_rows depending on the axis.
#
# == Arguments
#
# * +axis+ - :vector (or :column) to recode vectors, :row to recode rows.
#   Default to :vector.
#
# Raises ArgumentError for any other axis, in line with #map, instead of
# silently returning nil.
def recode axis=:vector, &block
  if axis == :vector or axis == :column
    recode_vectors(&block)
  elsif axis == :row
    recode_rows(&block)
  else
    raise ArgumentError, "Unknown axis #{axis}"
  end
end

#recode_rows(&block) ⇒ Object



677
678
679
680
681
682
683
684
685
686
687
688
# File 'lib/daru/dataframe.rb', line 677

# Yields every row of a copy of this DataFrame to the block, stores the
# returned Daru::Vector as the new row and returns the copy. Raises
# TypeError when the block returns anything else. Returns an Enumerator
# when no block is given.
def recode_rows &block
  return to_enum(:recode_rows) unless block_given?

  copy = self.dup
  copy.each_row_with_index do |current, label|
    replacement = yield current
    unless replacement.is_a?(Daru::Vector)
      raise TypeError, "Every iteration must return Daru::Vector not #{replacement.class}"
    end
    copy.row[label] = replacement
  end

  copy
end

#recode_vectors(&block) ⇒ Object



664
665
666
667
668
669
670
671
672
673
674
675
# File 'lib/daru/dataframe.rb', line 664

# Yields every vector of a copy of this DataFrame to the block, stores the
# returned Daru::Vector under the same name and returns the copy. Raises
# TypeError when the block returns anything else. Returns an Enumerator
# when no block is given.
def recode_vectors &block
  return to_enum(:recode_vectors) unless block_given?

  copy = self.dup
  copy.each_vector_with_index do |vec, label|
    replacement = yield vec
    unless replacement.is_a?(Daru::Vector)
      raise TypeError, "Every iteration must return Daru::Vector not #{replacement.class}"
    end
    # The label is splatted so that an Array label is passed as several keys.
    copy[*label] = replacement
  end

  copy
end

#reindex(new_index) ⇒ Object

Non-destructive version of #reindex!



1216
1217
1218
# File 'lib/daru/dataframe.rb', line 1216

# Non-destructive version of #reindex!: duplicates this DataFrame and
# reindexes the duplicate.
def reindex new_index
  copy = self.dup
  copy.reindex!(new_index)
end

#reindex!(new_index) ⇒ Object

Change the index of the DataFrame and its underlying vectors. Destructive.

Parameters:

  • new_index (Symbol, Array)

    Specify an Array if

Raises:

  • (ArgumentError)


1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
# File 'lib/daru/dataframe.rb', line 1204

# Change the index of the DataFrame and of its underlying vectors.
# Destructive; returns self.
#
# == Arguments
#
# * +new_index+ - The new index. Passing :seq builds the index from the
#   DataFrame size instead. An Array whose size differs from the DataFrame
#   size raises ArgumentError.
def reindex! new_index
  if new_index.is_a?(Array) && new_index.size != @size
    raise ArgumentError, "Index size must equal dataframe size"
  end

  index_source = new_index == :seq ? @size : new_index
  @index = possibly_multi_index?(index_source)
  @data.map! { |vec| vec.reindex(possibly_multi_index?(@index.to_a)) }

  self
end

#reindex_vectors!(new_vectors) ⇒ Object

Raises:

  • (ArgumentError)


1194
1195
1196
1197
1198
1199
# File 'lib/daru/dataframe.rb', line 1194

# Replace the vector index by a Daru::Index built from +new_vectors+
# (converted to symbols) and the value the current index returns for each
# of them. Raises ArgumentError when the number of names differs from the
# number of vectors.
def reindex_vectors! new_vectors
  unless @vectors.size == new_vectors.size
    raise ArgumentError, "Number of vectors passed into function (#{new_vectors.size}) should equal that present in the DataFrame (#{@vectors.size})"
  end

  names     = new_vectors.map(&:to_sym)
  positions = new_vectors.map { |name| @vectors[name] }
  @vectors  = Daru::Index.new names, positions
end

#rename(new_name) ⇒ Object



1716
1717
1718
1719
1720
1721
1722
# File 'lib/daru/dataframe.rb', line 1716

# Set the name of the DataFrame. A Numeric name is stored as it is (and nil
# is returned); every other name is converted to a Symbol, which is also
# the return value.
def rename new_name
  return @name = new_name.to_sym unless new_name.is_a?(Numeric)

  @name = new_name
  nil
end

#report_building(b) ⇒ Object

:nodoc: #



1260
1261
1262
1263
1264
1265
1266
1267
1268
# File 'lib/daru/dataframe.rb', line 1260

# Adds a section named after this DataFrame to the builder +b+: one line
# with the number of rows, then for every vector a line with its name
# followed by the vector itself handed to +parse_element+.
def report_building(b) # :nodoc: #
  b.section(:name=>@name) do |section|
    section.text "Number of rows: #{nrows}"
    @vectors.each do |name|
      section.text "Element:[#{name}]"
      section.parse_element(self[name])
    end
  end
end

#rowObject

Access a row or set/create a row. Refer #[] and #[]= docs for details.

Usage

df.row[:a] # access row named ':a'
df.row[:b] = [1,2,3] # set row ':b' to [1,2,3]


390
391
392
# File 'lib/daru/dataframe.rb', line 390

# Returns a new Daru::Accessors::DataFrameByRow wrapping this DataFrame.
# It is the object through which rows are read and assigned.
def row
  Daru::Accessors::DataFrameByRow.new self
end

#save(filename) ⇒ Object

Use marshalling to save dataframe to a file.



1767
1768
1769
# File 'lib/daru/dataframe.rb', line 1767

# Save this DataFrame to +filename+ by delegating to Daru::IO.save.
def save filename
  Daru::IO.save(self, filename)
end

#shapeObject

Return the number of rows and columns of the DataFrame in an Array.



1074
1075
1076
# File 'lib/daru/dataframe.rb', line 1074

# Return [number of rows, number of vectors], taken from the sizes of the
# row index and the vector index.
def shape
  row_count    = @index.size
  vector_count = @vectors.size
  [row_count, vector_count]
end

#sort(vector_order, opts = {}) ⇒ Object

Non-destructive version of #sort!



1311
1312
1313
# File 'lib/daru/dataframe.rb', line 1311

# Non-destructive version of #sort!: duplicates this DataFrame and sorts
# the duplicate with the same arguments.
def sort vector_order, opts={}
  copy = self.dup
  copy.sort!(vector_order, opts)
end

#sort!(vector_order, opts = {}) ⇒ Object

Sorts a dataframe (ascending/descending) according to the given sequence of vectors, using the attributes provided in the blocks.

Usage

df = Daru::DataFrame.new({a: [-3,2,-1,4], b: [4,3,2,1]})

#<Daru::DataFrame:140630680 @name = 04e00197-f8d5-4161-bca2-93266bfabc6f @size = 4>
#            a          b 
# 0         -3          4 
# 1          2          3 
# 2         -1          2 
# 3          4          1 
df.sort([:a], by: { a: lambda { |a,b| a.abs <=> b.abs } })

Parameters:

  • order (Array)

    The order of vector names in which the DataFrame should be sorted.

  • opts (Hash) (defaults to: {})

    The options to sort with.

Options Hash (opts):

  • :ascending (TrueClass, FalseClass, Array) — default: true

    Sort in ascending or descending order. Specify Array corresponding to order for multiple sort orders.

  • :by (Hash) — default: {|a, b| a <=> b}

    Specify the attributes of objects to be used for sorting, for each vector name in order, as a hash of vector name and lambda pairs. If a lambda is not specified for a vector, the default will be used.

Raises:

  • (ArgumentError)


1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
# File 'lib/daru/dataframe.rb', line 1295

def sort! vector_order, opts={}
  raise ArgumentError, "Required atleast one vector name" if vector_order.size < 1
  opts = {
    ascending: true,
    type: :quick_sort,
    by: {}
  }.merge(opts)

  opts[:by]        = create_logic_blocks vector_order, opts[:by]
  opts[:ascending] = sort_order_array vector_order, opts[:ascending]
  index = @index.to_a
  send(opts[:type], vector_order, index, opts[:by], opts[:ascending])
  reindex! index
end

#summary(method = :to_text) ⇒ Object

Generate a summary of this DataFrame with ReportBuilder.



1256
1257
1258
# File 'lib/daru/dataframe.rb', line 1256

# Generate a summary of this DataFrame with ReportBuilder: the DataFrame is
# added to a builder created with no_title: true and the builder is then
# sent +method+ (default :to_text).
def summary(method = :to_text)
  builder = ReportBuilder.new(no_title: true)
  builder.add(self).send(method)
end

#tail(quantity = 10) ⇒ Object

The last ten elements of the DataFrame

Parameters:

  • quantity (Fixnum) (defaults to: 10)

    (10) The number of elements to display from the bottom.



1129
1130
1131
# File 'lib/daru/dataframe.rb', line 1129

# The last +quantity+ rows of the DataFrame (10 by default).
#
# The start position is clamped to 0, so asking for more rows than the
# DataFrame holds selects all rows rather than building a range with a
# negative start.
def tail quantity=10
  first = [@size - quantity, 0].max
  self[first..(@size-1), :row]
end

#to_aObject

Converts the DataFrame into an Array with two elements. The first element is an array of hashes, one per row, where each key is a vector name and each value is the corresponding element. The second element is an array containing the index of each row. Each entry of the index array corresponds to the hash at the same position in the first array.



1632
1633
1634
1635
1636
1637
1638
1639
1640
# File 'lib/daru/dataframe.rb', line 1632

# Returns a two-element Array: first the Array of row hashes (one
# +to_hash+ result per row), then the row index as an Array.
def to_a
  row_hashes = []
  each_row { |current| row_hashes.push(current.to_hash) }

  [row_hashes, @index.to_a]
end

#to_gslObject

Convert all numeric vectors to GSL::Matrix



1592
1593
1594
1595
1596
1597
1598
1599
# File 'lib/daru/dataframe.rb', line 1592

# Convert all numeric vectors to a GSL::Matrix. The vectors are turned into
# Arrays, transposed into rows and passed to GSL::Matrix.alloc.
def to_gsl
  columns = []
  numeric_vectors.each { |name| columns.push(self[name].to_a) }

  GSL::Matrix.alloc(*columns.transpose)
end

#to_hashObject

Converts DataFrame to a hash with keys as vector names and values as the corresponding vectors.



1652
1653
1654
1655
1656
1657
1658
1659
# File 'lib/daru/dataframe.rb', line 1652

# Converts the DataFrame to a Hash that maps each vector name to the
# entry of @data at the same position.
def to_hash
  result = {}
  @vectors.each_with_index { |name, position| result[name] = @data[position] }
  result
end

#to_html(threshold = 30) ⇒ Object

Convert to html for IRuby.



1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
# File 'lib/daru/dataframe.rb', line 1662

# Convert to html for IRuby.
#
# == Arguments
#
# * +threshold+ - Once the (zero-based) position of a rendered row exceeds
#   this number, the remaining rows are replaced by a row of "..." cells
#   followed by the last row of the DataFrame. Default to 30.
#
# The table starts with a caption row (object id, number of rows and
# columns) spanning all columns, followed by a header row with the vector
# names.
def to_html threshold=30
  # Every fragment of the caption row has to be joined with +; a missing
  # operator would silently drop the closing </th></tr> tags.
  html = "<table>" + 
    "<tr>" +
      "<th colspan=\"#{@vectors.size+1}\">" + 
        "Daru::DataFrame:#{self.object_id} " + " rows: #{nrows} " + " cols: #{ncols}" +
      "</th>" +
    "</tr>"
  html +='<tr><th></th>'
  @vectors.each { |vector| html += '<th>' + vector.to_s + '</th>' }
  html += '</tr>'

  @index.each_with_index do |index, num|
    html += '<tr>'
    html += '<td>' + index.to_s + '</td>'

    self.row[index].each do |element|
      html += '<td>' + element.to_s + '</td>'
    end

    html += '</tr>'
    if num > threshold
      html += '<tr>'
      # One placeholder cell for the index column plus one per vector.
      (@vectors.size + 1).times { html += '<td>...</td>' }
      html += '</tr>'

      last_index = @index.to_a.last
      last_row = self.row[last_index]
      html += '<tr>'
      html += "<td>" + last_index.to_s + "</td>"
      (0..(ncols - 1)).to_a.each do |i|
        html += '<td>' + last_row[i].to_s + '</td>' 
      end
      html += '</tr>'
      break
    end
  end
  html += '</table>'

  html
end

#to_json(no_index = true) ⇒ Object



1642
1643
1644
1645
1646
1647
1648
# File 'lib/daru/dataframe.rb', line 1642

# Serialize the DataFrame to JSON. With +no_index+ true (the default) only
# the array of row hashes (first element of #to_a) is serialized; otherwise
# the complete #to_a result, including the index, is.
def to_json no_index=true
  serializable = no_index ? self.to_a[0] : self.to_a
  serializable.to_json
end

#to_matrixObject

Convert all vectors of type :numeric into a Matrix.



1602
1603
1604
1605
1606
1607
1608
1609
# File 'lib/daru/dataframe.rb', line 1602

# Convert all vectors of type :numeric into a Matrix, one column per vector.
def to_matrix
  numeric_columns = []
  each_vector do |vec|
    next unless vec.type == :numeric
    numeric_columns.push(vec.to_a)
  end

  Matrix.columns(numeric_columns)
end

#to_nmatrixObject

Convert all vectors of type :numeric and not containing nils into an NMatrix.



1617
1618
1619
1620
1621
1622
1623
1624
1625
# File 'lib/daru/dataframe.rb', line 1617

# Convert all vectors of type :numeric that have no missing positions into
# an NMatrix (via Array#to_nm on the transposed columns).
def to_nmatrix
  complete_columns = []
  each_vector do |vec|
    next unless vec.type == :numeric
    next unless vec.missing_positions.size == 0

    complete_columns.push(vec.to_a)
  end

  complete_columns.transpose.to_nm
end

#to_nyaplotdfObject

Return a Nyaplot::DataFrame from the data of this DataFrame.



1612
1613
1614
# File 'lib/daru/dataframe.rb', line 1612

# Return a Nyaplot::DataFrame built from the row hashes of this DataFrame
# (the first element of #to_a).
def to_nyaplotdf
  row_hashes = to_a[0]
  Nyaplot::DataFrame.new(row_hashes)
end

#to_REXPObject



5
6
7
8
9
10
11
12
13
# File 'lib/daru/extensions/rserve.rb', line 5

# Convert the DataFrame to an Rserve data frame expression: every vector is
# wrapped with Rserve::REXP::Wrapper, the wrapped vectors are collected in
# an Rserve::Rlist named after the vectors (as strings), and the list is
# handed to Rserve::REXP.create_data_frame.
def to_REXP
  names   = @vectors.to_a
  wrapped = names.map { |name| Rserve::REXP::Wrapper.wrap(self[name].to_a) }
  rlist   = Rserve::Rlist.new(wrapped, names.map(&:to_s))

  Rserve::REXP.create_data_frame(rlist)
end

#to_sObject



1703
1704
1705
# File 'lib/daru/dataframe.rb', line 1703

def to_s
  to_html
end

#transposeObject

Transpose a DataFrame, transposing the elements and swapping the row and column indexes.



1800
1801
1802
1803
1804
1805
1806
1807
# File 'lib/daru/dataframe.rb', line 1800

# Transpose the DataFrame: the returned DataFrame is built from the
# transposed element arrays, uses the vector names as its index and the row
# index as its vector order, and keeps the dtype and name.
def transpose
  columns = []
  each_vector { |vec| columns.push(vec.to_a) }

  Daru::DataFrame.new(
    columns.transpose,
    index: @vectors, order: @index, dtype: @dtype, name: @name
  )
end

#updateObject

Method for updating the metadata (i.e. missing value positions) of the vectors after assignment/deletion etc. are complete. This is provided so that time is not wasted in creating the metadata for the vector each time assignment/deletion of elements is done. Updating data this way is called lazy loading. To set or unset lazy loading, see the .lazy_update= method.



1712
1713
1714
# File 'lib/daru/dataframe.rb', line 1712

# Calls #update on every vector in @data, but only when Daru.lazy_update is
# set; otherwise does nothing and returns nil.
def update
  return unless Daru.lazy_update

  @data.each(&:update)
end

#vectorObject

Access a vector or set/create a vector. Refer #[] and #[]= docs for details.

Usage

df.vector[:a] # access vector named ':a'
df.vector[:b] = [1,2,3] # set vector ':b' to [1,2,3]


368
369
370
# File 'lib/daru/dataframe.rb', line 368

# Returns a new Daru::Accessors::DataFrameByVector wrapping this DataFrame.
# It is the object through which vectors are read and assigned.
def vector
  Daru::Accessors::DataFrameByVector.new self
end

#vector_by_calculation(&block) ⇒ Object

DSL for yielding each row and returning a Daru::Vector based on the value each run of the block returns.

Usage

a1 = Daru::Vector.new([1, 2, 3, 4, 5, 6, 7])
a2 = Daru::Vector.new([10, 20, 30, 40, 50, 60, 70])
a3 = Daru::Vector.new([100, 200, 300, 400, 500, 600, 700])
ds = Daru::DataFrame.new({ :a => a1, :b => a2, :c => a3 })
total = ds.vector_by_calculation { a + b + c }
# <Daru::Vector:82314050 @name = nil @size = 7 >
#   nil
# 0 111
# 1 222
# 2 333
# 3 444
# 4 555
# 5 666
# 6 777


979
980
981
982
983
984
985
986
# File 'lib/daru/dataframe.rb', line 979

# Evaluates the block in the context of every row (instance_eval) and
# returns a Daru::Vector of the results, indexed like this DataFrame.
def vector_by_calculation &block
  computed = []
  each_row { |current| computed << current.instance_eval(&block) }

  Daru::Vector.new(computed, index: @index)
end

#vector_count_characters(vecs = nil) ⇒ Object



1058
1059
1060
1061
1062
1063
1064
1065
1066
# File 'lib/daru/dataframe.rb', line 1058

# For every row, count the characters of the string form of the values in
# the given vectors (all vectors when +vecs+ is nil). The result is
# whatever #collect_row_with_index builds from the per-row totals.
def vector_count_characters vecs=nil
  names = vecs || @vectors.to_a

  collect_row_with_index do |current, _label|
    # nil.to_s is the empty String, so nil values contribute 0 characters.
    names.inject(0) { |total, name| total + current[name].to_s.size }
  end
end

#vector_mean(max_missing = 0) ⇒ Object

Calculate mean of the rows of the dataframe.

Arguments

  • max_missing - The maximum number of elements in the row that can be

missing for the mean calculation to happen. Rows with more missing elements get nil. Default to 0.



1152
1153
1154
1155
1156
1157
1158
1159
1160
# File 'lib/daru/dataframe.rb', line 1152

# Calculate the mean of each row and return the means in a Daru::Vector
# indexed like this DataFrame and named "mean_<name>".
#
# == Arguments
#
# * +max_missing+ - Rows with more missing positions than this number get
#   nil in place of a mean. Default to 0.
def vector_mean max_missing=0
  means = Daru::Vector.new [0]*@size, index: @index, name: "mean_#{@name}"

  each_row_with_index do |current, label|
    means[label] =
      if current.missing_positions.size > max_missing
        nil
      else
        current.mean
      end
  end

  means
end

#vector_sum(vecs = nil) ⇒ Object

Returns a vector with the sum of all vectors specified in the argument. If the vecs parameter is nil, all numeric vectors are summed.



1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
# File 'lib/daru/dataframe.rb', line 1135

# Returns the sum of the given vectors (all numeric vectors when +vecs+ is
# nil). The sum starts from a Daru::Vector of zeros that carries the index,
# name and dtype of this DataFrame, and every vector is added with +.
def vector_sum vecs=nil
  names = vecs || numeric_vectors
  zeros = Daru::Vector.new [0]*@size, index: @index, name: @name, dtype: @dtype

  names.inject(zeros) { |total, name| total + self[name] }
end

#verify(*tests) ⇒ Object

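Test each row with one or more tests. Each test is an Array of the form [message, vector_names, proc]: the proc receives the row and must return a truthy value for the row to pass, e.g. *Proc.new {|row| row[:a] > 0}*. An optional leading Symbol selects the vector used to identify rows in the error messages (the first vector by default).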

The function returns an array with all errors.



935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
# File 'lib/daru/dataframe.rb', line 935

# Run every test against every row and return an Array of error messages.
#
# == Arguments
#
# * +tests+ - Each test is indexed as [message, vector names, callable].
#   The callable receives the row; a falsy result produces an error entry
#   of the form "<row number> [<id value>]: <message> (<name>=<value>, ...)".
#   An optional leading Symbol names the vector whose value identifies the
#   row; the first vector is used otherwise.
def verify(*tests)
  id =
    if tests[0].is_a?(Symbol)
      tests.shift
    else
      @vectors.first
    end

  failures   = []
  row_number = 0
  each(:row) do |row|
    row_number += 1
    tests.each do |check|
      next if check[2].call(row)

      # The value list is appended only when the test names some vectors.
      details = ""
      if check[1].size>0
        details = " (" + check[1].collect{ |k| "#{k}=#{row[k]}" }.join(", ") + ")"
      end
      failures.push("#{row_number} [#{row[id]}]: #{check[0]}#{details}")
    end
  end
  failures
end

#write_csv(filename, opts = {}) ⇒ Object

Write this DataFrame to a CSV file.

Arguments

  • filename - Path of CSV file where the DataFrame is to be saved.

Options

  • convert_comma - If set to true, will convert any commas in any

of the data to full stops (‘.’). All the options accepted by CSV.read() can also be passed into this function.



1736
1737
1738
# File 'lib/daru/dataframe.rb', line 1736

# Write this DataFrame to a CSV file by delegating to
# Daru::IO.dataframe_write_csv with the file name and the options.
def write_csv filename, opts={}
  Daru::IO.dataframe_write_csv(self, filename, opts)
end

#write_excel(filename, opts = {}) ⇒ Object

Write this dataframe to an Excel Spreadsheet

Arguments

  • filename - The path of the file where the DataFrame should be written.



1745
1746
1747
# File 'lib/daru/dataframe.rb', line 1745

# Write this DataFrame to an Excel spreadsheet by delegating to
# Daru::IO.dataframe_write_excel with the file name and the options.
def write_excel filename, opts={}
  Daru::IO.dataframe_write_excel(self, filename, opts)
end

#write_sql(dbh, table) ⇒ Object

Insert each case of the Dataset on the selected table

Arguments

  • dbh - DBI database connection object.

  • table - Name of the table the rows are inserted into.

Usage

ds = Daru::DataFrame.new({:id=>Daru::Vector.new([1,2,3]), :name=>Daru::Vector.new(["a","b","c"])})
dbh = DBI.connect("DBI:Mysql:database:localhost", "user", "password")
ds.write_sql(dbh,"test")


1761
1762
1763
# File 'lib/daru/dataframe.rb', line 1761

# Write this DataFrame to a database table by delegating to
# Daru::IO.dataframe_write_sql with the connection handle and table name.
def write_sql dbh, table
  Daru::IO.dataframe_write_sql(self, dbh, table)
end