Class: Daru::DataFrame

Inherits:
Object
  • Object
show all
Includes:
Maths::Arithmetic::DataFrame, Maths::Statistics::DataFrame, Plotting::DataFrame
Defined in:
lib/daru/monkeys.rb,
lib/daru/dataframe.rb,
lib/daru/extensions/rserve.rb

Instance Attribute Summary collapse

Class Method Summary collapse

Instance Method Summary collapse

Methods included from Plotting::DataFrame

#plot

Methods included from Maths::Statistics::DataFrame

#acf, #correlation, #count, #covariance, #cumsum, #describe, #ema, #max, #mean, #median, #min, #mode, #percent_change, #product, #range, #rolling_count, #rolling_max, #rolling_mean, #rolling_median, #rolling_min, #rolling_std, #rolling_variance, #standardize, #std, #sum, #variance_sample

Methods included from Maths::Arithmetic::DataFrame

#%, #*, #**, #+, #-, #/, #exp, #round, #sqrt

Constructor Details

#initialize(source, opts = {}) ⇒ DataFrame

DataFrame basically consists of an Array of Vector objects. These objects are indexed by row and by column through the index and vectors Index objects, respectively.

Arguments

  • source - Source from which the DataFrame is to be initialized. Can be a Hash

of names and vectors (array or Daru::Vector), an array of arrays or array of Daru::Vectors.

Options

:order - An Array/Daru::Index/Daru::MultiIndex containing the order in which Vectors should appear in the DataFrame.

:index - An Array/Daru::Index/Daru::MultiIndex containing the order in which rows of the DataFrame will be named.

:name - A name for the DataFrame.

:clone - Specify as true or false. When set to false, and Vector objects are passed for the source, the Vector objects will not be duplicated when creating the DataFrame. Will have no effect if Array is passed in the source, or if the passed Daru::Vectors have different indexes. Defaults to true.

Usage

df = Daru::DataFrame.new({a: [1,2,3,4], b: [6,7,8,9]}, order: [:b, :a],
  index: [:a, :b, :c, :d], name: :spider_man)

# =>
# <Daru::DataFrame:80766980 @name = spider_man @size = 4>
#             b          a
#  a          6          1
#  b          7          2
#  c          8          3
#  d          9          4


240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
# File 'lib/daru/dataframe.rb', line 240

# Builds the DataFrame from +source+.
#
# +source+ may be empty, an Array of Arrays, an Array of Daru::Vectors, an
# Array of Hashes, or a Hash of names to Arrays/Daru::Vectors.
#
# Options read from +opts+:
# * :order - names (and order) of the vectors
# * :index - names of the rows
# * :name  - name of the DataFrame; a random UUID is used when absent
# * :clone - pass false to reuse the supplied Daru::Vector objects instead
#   of duplicating them (forced back to true when their indexes differ)
#
# Raises ArgumentError when an Array of Arrays does not have as many
# entries as :order.
def initialize source, opts={}
  vectors = opts[:order]
  index   = opts[:index]
  clone   = opts[:clone] != false # anything but an explicit false clones
  @data   = []

  @name = opts[:name] || SecureRandom.uuid

  if source.empty?
    @vectors = try_create_index vectors
    @index   = try_create_index index
    create_empty_vectors
  else
    case source
    when Array
      if source.all? { |s| s.is_a?(Array) }
        raise ArgumentError, "Number of vectors (#{vectors.size}) should \
          equal order size (#{source.size})" if source.size != vectors.size

        @index   = try_create_index(index || source[0].size)
        @vectors = try_create_index(vectors)

        @vectors.each_with_index do |_vec,idx|
          @data << Daru::Vector.new(source[idx], index: @index)
        end
      elsif source.all? { |s| s.is_a?(Daru::Vector) }
        # Re-dispatch as a Hash keyed by the names given in :order.
        hsh = {}
        vectors.each_with_index do |name, idx|
          hsh[name] = source[idx]
        end
        initialize(hsh, index: index, order: vectors, name: @name, clone: clone)
      else # array of hashes
        @vectors =
          if vectors.nil?
            Daru::Index.new source[0].keys
          else
            Daru::Index.new((vectors + (source[0].keys - vectors)).uniq)
          end
        @index = Daru::Index.new(index || source.size)

        @vectors.each do |name|
          v = []
          source.each do |h|
            # keys may be given either as the name itself or as its String form
            v << (h[name] || h[name.to_s])
          end

          @data << Daru::Vector.new(v, name: set_name(name), index: @index)
        end
      end
    when Hash
      create_vectors_index_with vectors, source
      if all_daru_vectors_in_source? source
        vectors_have_same_index = all_vectors_have_equal_indexes?(source)
        if !index.nil?
          @index = try_create_index index
        elsif vectors_have_same_index
          @index = source.values[0].index.dup
        else
          # Union of every vector's index, sorted. The non-bang methods are
          # used on purpose: uniq! returns nil when there is nothing to
          # remove, which made the former flatten!.uniq!.sort! chain raise
          # NoMethodError for vectors with disjoint indexes.
          all_indexes = source.values.map { |vector| vector.index.to_a }.flatten.uniq.sort

          @index = Daru::Index.new all_indexes
          clone = true
        end

        if clone
          @vectors.each do |vector|
            # avoids matching indexes of vectors if all the supplied vectors
            # have the same index.
            if vectors_have_same_index
              v = source[vector].dup
            else
              v = Daru::Vector.new([], name: vector, metadata: source[vector].metadata.dup, index: @index)

              # Align on the merged index; positions the vector lacks become nil.
              @index.each do |idx|
                v[idx] = source[vector].index.include?(idx) ? source[vector][idx] : nil
              end
            end
            @data << v
          end
        else
          @data.concat source.values
        end
      else
        @index = try_create_index(index || source.values[0].size)

        @vectors.each do |name|
          # Carry metadata over only for values that expose it.
          meta_opt = source[name].respond_to?(:metadata) ? {metadata: source[name].metadata.dup} : {}
          @data << Daru::Vector.new(source[name].dup, name: set_name(name), **meta_opt, index: @index)
        end
      end
    end
  end

  set_size
  validate
  update
end

Dynamic Method Handling

This class handles dynamic methods through the method_missing method

#method_missing(name, *args, &block) ⇒ Object



2061
2062
2063
2064
2065
2066
2067
2068
2069
# File 'lib/daru/dataframe.rb', line 2061

# Lets vectors be read and written as if they were methods:
# a name containing '=' assigns args[0] to the vector of that name,
# the name of an existing vector returns that vector, and anything
# else is passed up to super.
def method_missing(name, *args, &block)
  setter = name[/(.+)\=/]

  if setter
    insert_or_modify_vector setter.delete('=').to_sym, args[0]
  elsif has_vector? name
    self[name]
  else
    super(name, *args, &block)
  end
end

Instance Attribute Details

#indexObject

The index of the rows of the DataFrame



196
197
198
# File 'lib/daru/dataframe.rb', line 196

# Reader for @index, the index of the rows of the DataFrame.
def index
  @index
end

#nameObject (readonly)

The name of the DataFrame



199
200
201
# File 'lib/daru/dataframe.rb', line 199

# Reader for @name, the name of the DataFrame.
def name
  @name
end

#sizeObject (readonly)

The number of rows present in the DataFrame



202
203
204
# File 'lib/daru/dataframe.rb', line 202

# Reader for @size, the number of rows present in the DataFrame.
def size
  @size
end

#vectorsObject

The vectors (columns) index of the DataFrame



193
194
195
# File 'lib/daru/dataframe.rb', line 193

# Reader for @vectors, the vectors (columns) index of the DataFrame.
def vectors
  @vectors
end

Class Method Details

._load(data) ⇒ Object



1985
1986
1987
1988
1989
1990
1991
# File 'lib/daru/dataframe.rb', line 1985

# Rebuilds a DataFrame from a Marshal-dumped Hash holding the
# :data, :index, :order and :name entries.
def self._load data
  dumped  = Marshal.load data
  options = {
    index: dumped[:index],
    order: dumped[:order],
    name:  dumped[:name]
  }

  Daru::DataFrame.new(dumped[:data], options)
end

.crosstab_by_assignation(rows, columns, values) ⇒ Object

Generates a new dataset, using three vectors

  • Rows

  • Columns

  • Values

For example, you have these values

x   y   v
a   a   0
a   b   1
b   a   1
b   b   0

You obtain

id  a   b
 a  0   1
 b  1   0

Useful to process outputs from databases



160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
# File 'lib/daru/dataframe.rb', line 160

# Builds a cross-tabulated DataFrame from three equally sized vectors:
# the factors of +rows+ become the rows (stored in an :_id vector), the
# factors of +columns+ become the remaining vectors, and each cell holds
# the matching entry of +values+ (nil when no entry was assigned).
#
# Raises RuntimeError when the three sizes differ.
def crosstab_by_assignation rows, columns, values
  if rows.size != columns.size || rows.size!=values.size
    raise 'Three vectors should be equal size'
  end

  col_factors = columns.factors
  width       = col_factors.size

  # row factor => { column factor => value }, every cell starting as nil
  table = rows.factors.each_with_object({}) do |row_key, acc|
    acc[row_key] = col_factors.each_with_object({}) { |col_key, cells| cells[col_key] = nil }
  end

  values.each_index { |i| table[rows[i]][columns[i]] = values[i] }

  df = Daru::DataFrame.new({}, order: [:_id] + col_factors.to_a)

  rows.factors.each do |row_key|
    record    = Array.new(width+1)
    record[0] = row_key
    col_factors.each_index { |i| record[i+1] = table[row_key][col_factors[i]] }

    df.add_row(record)
  end

  df.update
  df
end

.from_activerecord(relation, *fields) ⇒ Object

Read a dataframe from AR::Relation

USE:

# When Post model is defined as:
class Post < ActiveRecord::Base
  scope :active, -> { where.not(published_at: nil) }
end

# You can load active posts into a dataframe by:
Daru::DataFrame.from_activerecord(Post.active, :title, :published_at)

Parameters:

  • relation (ActiveRecord::Relation)

    An AR::Relation object from which data is loaded

Returns:

  • A dataframe containing the data loaded from the relation



92
93
94
# File 'lib/daru/dataframe.rb', line 92

# Delegates to Daru::IO.from_activerecord, forwarding the relation and
# the field names unchanged.
def from_activerecord relation, *fields
  Daru::IO.from_activerecord relation, *fields
end

.from_csv(path, opts = {}, &block) ⇒ Object

Load data from a CSV file. Specify an optional block to grab the CSV object and pre-condition it (for example use the `convert` or `header_convert` methods).

Arguments

  • path - Path of the file to load specified as a String.

Options

Accepts the same options as the Daru::DataFrame constructor and CSV.open() and uses those to eventually construct the resulting DataFrame.

Verbose Description

You can specify all the options to the `.from_csv` function that you do to the Ruby `CSV.read()` function, since this is what is used internally.

For example, if the columns in your CSV file are separated by something other than commas, you can use the `:col_sep` option. If you want to convert numeric values to numbers and not keep them as strings, you can use the `:converters` option and set it to `:numeric`.

The `.from_csv` function uses the following defaults for reading CSV files (that are passed into the `CSV.read()` function):

{
  :col_sep           => ',',
  :converters        => :numeric
}


44
45
46
# File 'lib/daru/dataframe.rb', line 44

# Delegates to Daru::IO.from_csv, forwarding the path, the options and
# the optional block unchanged.
def from_csv path, opts={}, &block
  Daru::IO.from_csv path, opts, &block
end

.from_excel(path, opts = {}, &block) ⇒ Object

Read data from an Excel file into a DataFrame.

Arguments

  • path - Path of the file to be read.

Options

*:worksheet_id - ID of the worksheet that is to be read.



57
58
59
# File 'lib/daru/dataframe.rb', line 57

# Delegates to Daru::IO.from_excel, forwarding the path, the options and
# the optional block unchanged.
def from_excel path, opts={}, &block
  Daru::IO.from_excel path, opts, &block
end

.from_plaintext(path, fields) ⇒ Object

Read the database from a plaintext file. For this method to work, the data should be present in a plain text file in columns. See spec/fixtures/bank2.dat for an example.

Arguments

  • path - Path of the file to be read.

  • fields - Vector names of the resulting database.

Usage

df = Daru::DataFrame.from_plaintext 'spec/fixtures/bank2.dat', [:v1,:v2,:v3,:v4,:v5,:v6]


108
109
110
# File 'lib/daru/dataframe.rb', line 108

# Delegates to Daru::IO.from_plaintext, forwarding the path and the
# vector names unchanged.
def from_plaintext path, fields
  Daru::IO.from_plaintext path, fields
end

.from_sql(dbh, query) ⇒ Object

Reads a database query and returns a Dataset

USE:

dbh = DBI.connect("DBI:Mysql:database:localhost", "user", "password")
Daru::DataFrame.from_sql(dbh, "SELECT * FROM test")

Parameters:

  • dbh (DBI::DatabaseHandle)

    A DBI connection to be used to run the query

  • query (String)

    The query to be executed

Returns:

  • A dataframe containing the data resulting from the query



72
73
74
# File 'lib/daru/dataframe.rb', line 72

# Delegates to Daru::IO.from_sql, forwarding the database handle and the
# query unchanged.
def from_sql dbh, query
  Daru::IO.from_sql dbh, query
end

.rows(source, opts = {}) ⇒ Object

Create DataFrame by specifying rows as an Array of Arrays or Array of Daru::Vector objects.

Raises:



114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
# File 'lib/daru/dataframe.rb', line 114

# Creates a DataFrame from rows given as an Array of Arrays or an Array
# of Daru::Vectors.
#
# When opts[:order] is not supplied it is derived from the first row:
# the index of a Daru::Vector, or "0", "1", ... for an Array. For
# Daru::Vector rows the vector names are used as row names.
#
# Raises SizeError unless every row has the size of the first one.
def rows source, opts={}
  first_row = source.first

  unless source.all? { |v| v.size == first_row.size }
    raise SizeError, 'All vectors must have same length'
  end

  row_names = []
  unless opts[:order]
    opts[:order] =
      if first_row.is_a?(Daru::Vector) # assume that all are Vectors
        row_names = source.map(&:name)
        first_row.index.to_a
      elsif first_row.is_a?(Array)
        Array.new(first_row.size, &:to_s)
      end
  end

  if source.all? { |s| s.is_a?(Array) }
    Daru::DataFrame.new(source.transpose, opts)
  else # array of Daru::Vectors
    frame = Daru::DataFrame.new({}, opts)
    source.each_with_index do |row, pos|
      frame[row_names[pos] || pos, :row] = row
    end
    frame
  end
end

Instance Method Details

#==(other) ⇒ Object



2053
2054
2055
2056
2057
2058
2059
# File 'lib/daru/dataframe.rb', line 2053

# Compares with +other+: the classes, sizes, row indexes and vector
# indexes must all match, and every vector named in @vectors must
# compare equal to the vector of the same name in +other+.
def == other
  self.class == other.class   &&
    @size    == other.size    &&
    @index   == other.index   &&
    @vectors == other.vectors &&
    @vectors.to_a.all? { |v| self[v] == other[v] }
end

#[](*names) ⇒ Object

Access row or vector. Specify name of row/vector followed by axis(:row, :vector). Defaults to :vector. Use of this method is not recommended for accessing rows. Use df.row for accessing row with index ':a'.



352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
# File 'lib/daru/dataframe.rb', line 352

# Accesses vectors or rows. A trailing :vector or :row argument selects
# the axis and is stripped from the names; without one the axis is
# :vector.
def [](*names)
  axis = :vector
  if names[-1] == :vector || names[-1] == :row
    axis  = names[-1]
    names = names[0..-2]
  end

  case axis
  when :vector then access_vector(*names)
  when :row    then access_row(*names)
  else
    raise IndexError, "Expected axis to be row or vector not #{axis}"
  end
end

#[]=(*args) ⇒ Object

Insert a new row/vector of the specified name or modify a previous row. Instead of using this method directly, use df.row = [1,2,3] to set/create a row ':a' to [1,2,3], or df.vector = [1,2,3] for vectors.

In case a Daru::Vector is specified after the equality sign, the indexes of the vector will be matched against the row/vector indexes of the DataFrame before an insertion is performed. Unmatched indexes will be set to nil.



376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
# File 'lib/daru/dataframe.rb', line 376

# Inserts or modifies a vector or a row. The axis is :row when :row
# appears anywhere in the arguments, otherwise :vector. Both axis
# markers are removed; the last remaining argument is the value and
# everything before it is the name.
def []=(*args)
  axis = args.include?(:row) ? :row : :vector
  %i[vector row].each { |marker| args.delete marker }

  *name, vector = args

  case axis
  when :vector then insert_or_modify_vector name, vector
  when :row    then insert_or_modify_row name, vector
  else
    raise IndexError, "Expected axis to be row or vector, not #{axis}."
  end
end

#_dump(_depth) ⇒ Object



1976
1977
1978
1979
1980
1981
1982
1983
# File 'lib/daru/dataframe.rb', line 1976

# Serializes the DataFrame with Marshal as a Hash of the raw data, the
# row index and vector order (both as Arrays) and the name.
def _dump(_depth)
  payload = {
    data:  @data,
    index: @index.to_a,
    order: @vectors.to_a,
    name:  @name
  }

  Marshal.dump(payload)
end

#add_row(row, index = nil) ⇒ Object



398
399
400
# File 'lib/daru/dataframe.rb', line 398

# Assigns +row+ through the row accessor under +index+, or under the
# current @size when no index is given.
def add_row row, index=nil
  self.row[index || @size] = row
end

#add_vector(n, vector) ⇒ Object



402
403
404
# File 'lib/daru/dataframe.rb', line 402

# Assigns +vector+ under the name +n+ via #[]=.
def add_vector n, vector
  self[n] = vector
end

#add_vectors_by_split(name, join = '-', sep = Daru::SPLIT_TOKEN) ⇒ Object



1075
1076
1077
1078
# File 'lib/daru/dataframe.rb', line 1075

# Splits the vector +name+ with split_by_separator(sep) and stores each
# resulting vector under the Symbol "<name><join><key>".
def add_vectors_by_split(name,join='-',sep=Daru::SPLIT_TOKEN)
  pieces = self[name].split_by_separator(sep)
  pieces.each do |key, vec|
    label = name.to_s + join + key.to_s
    self[label.to_sym] = vec
  end
end

#add_vectors_by_split_recode(name_, join = '-', sep = Daru::SPLIT_TOKEN) ⇒ Object



1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
# File 'lib/daru/dataframe.rb', line 1761

# Splits the vector +name_+ with split_by_separator(sep). Each resulting
# vector is renamed to "<name_>:<key>" and stored under the Symbol
# "<name_><join><n>", where n counts the pieces starting at 1.
def add_vectors_by_split_recode(name_, join='-', sep=Daru::SPLIT_TOKEN)
  pieces   = self[name_].split_by_separator(sep)
  position = 1
  pieces.each do |key, vec|
    field = name_.to_s + join + position.to_s
    vec.rename "#{name_}:#{key}"
    self[field.to_sym] = vec
    position += 1
  end
end

#all?(axis = :vector, &block) ⇒ Boolean

Works like Array#all?

Examples:

Using all?

df = Daru::DataFrame.new({a: [1,2,3,4,5], b: ['a', 'b', 'c', 'd', 'e']})
df.all?(:row) do |row|
  row[:a] < 10
end #=> true

Parameters:

  • axis (Symbol) (defaults to: :vector)

    (:vector) The axis to iterate over. Can be :vector or :row. A Daru::Vector object is yielded in the block.

Returns:

  • (Boolean)


1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
# File 'lib/daru/dataframe.rb', line 1131

# Works like Array#all? over the chosen axis: vectors for :vector or
# :column, rows for :row.
#
# Raises ArgumentError for any other axis.
def all? axis=:vector, &block
  case axis
  when :vector, :column
    @data.all?(&block)
  when :row
    # stop at the first row the block rejects
    each_row { |row| return false unless yield(row) }
    true
  else
    raise ArgumentError, "Unidentified axis #{axis}"
  end
end

#any?(axis = :vector, &block) ⇒ Boolean

Works like Array#any?.

Examples:

Using any?

df = Daru::DataFrame.new({a: [1,2,3,4,5], b: ['a', 'b', 'c', 'd', 'e']})
df.any?(:row) do |row|
  row[:a] < 3 and row[:b] == 'b'
end #=> true

Parameters:

  • axis (Symbol) (defaults to: :vector)

    (:vector) The axis to iterate over. Can be :vector or :row. A Daru::Vector object is yielded in the block.

Returns:

  • (Boolean)


1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
# File 'lib/daru/dataframe.rb', line 1109

# Works like Array#any? over the chosen axis: vectors for :vector or
# :column, rows for :row.
#
# Raises ArgumentError for any other axis.
def any? axis=:vector, &block
  case axis
  when :vector, :column
    @data.any?(&block)
  when :row
    # stop at the first row the block accepts
    each_row { |row| return true if yield(row) }
    false
  else
    raise ArgumentError, "Unidentified axis #{axis}"
  end
end

#bootstrap(n = nil) ⇒ Daru::DataFrame

Creates a DataFrame with random data, of size n. If n is not given, uses the original number of rows.

Returns:



877
878
879
880
881
882
883
884
885
# File 'lib/daru/dataframe.rb', line 877

# Builds a new DataFrame of +n+ rows, each picked at random (with
# repetition) from this DataFrame. +n+ defaults to nrows.
#
# The random position is drawn from the number of rows actually present
# rather than from +n+: drawing from +n+ asked for rows that do not exist
# when n > nrows and never reached the later rows when n < nrows.
def bootstrap(n=nil)
  available = nrows
  n ||= available
  ds_boot = Daru::DataFrame.new({}, order: @vectors)
  n.times do
    ds_boot.add_row(row[rand(available)])
  end
  ds_boot.update
  ds_boot
end

#clone(*vectors_to_clone) ⇒ Object

Returns a 'view' of the DataFrame, i.e the object ID's of vectors are preserved.

Arguments

vectors_to_clone - Names of vectors to clone. Optional. Will return a view of the whole data frame otherwise.



445
446
447
448
449
450
451
452
453
# File 'lib/daru/dataframe.rb', line 445

# Returns a new DataFrame built with clone: false from the named
# vectors, so the vector objects themselves are reused. Nested Arrays
# of names are flattened; with no names every vector is used.
def clone *vectors_to_clone
  vectors_to_clone.flatten! if vectors_to_clone.any? { |a| a.is_a?(Array) }
  vectors_to_clone = @vectors.to_a if vectors_to_clone.empty?

  view = {}
  vectors_to_clone.each { |vec| view[vec] = self[vec] }

  Daru::DataFrame.new(view, clone: false)
end

#clone_only_validObject

Returns a 'shallow' copy of DataFrame if missing data is not present, or a full copy of only valid data if missing data is present.



457
458
459
460
461
462
463
# File 'lib/daru/dataframe.rb', line 457

# Returns dup_only_valid when missing data is present, otherwise clone.
def clone_only_valid
  has_missing_data? ? dup_only_valid : clone
end

#clone_structureObject

Only clone the structure of the DataFrame.



434
435
436
# File 'lib/daru/dataframe.rb', line 434

# Builds a DataFrame from an empty source, reusing copies of the vector
# order and row index together with the same name.
def clone_structure
  Daru::DataFrame.new([], order: @vectors.dup, index: @index.dup, name: @name)
end

#collect(axis = :vector, &block) ⇒ Object

Iterate over a row or vector and return results in a Daru::Vector. Specify axis with :vector or :row. Default to :vector.

Description

The #collect iterator works similar to #map, the only difference being that it returns a Daru::Vector comprising of the results of each block run. The resultant Vector has the same index as that of the axis over which collect has iterated. It also accepts the optional axis argument.

Arguments

  • axis - The axis to iterate over. Can be :vector (or :column)

or :row. Default to :vector.



567
568
569
570
571
572
573
574
575
# File 'lib/daru/dataframe.rb', line 567

# Dispatches to collect_vectors (:vector or :column) or collect_rows
# (:row), handing over the block.
#
# Raises ArgumentError for any other axis.
def collect axis=:vector, &block
  case axis
  when :vector, :column then collect_vectors(&block)
  when :row             then collect_rows(&block)
  else
    raise ArgumentError, "Unknown axis #{axis}"
  end
end

#collect_matrix::Matrix

Generate a matrix, based on vector names of the DataFrame.

Returns:



830
831
832
833
834
835
836
837
838
839
840
841
# File 'lib/daru/dataframe.rb', line 830

# Yields every (row name, column name) pair of vector names and returns
# the block results as a Matrix. Returns an Enumerator without a block.
def collect_matrix
  return to_enum(:collect_matrix) unless block_given?

  names = vectors.to_a
  cells = names.map do |row_name|
    names.map { |col_name| yield row_name, col_name }
  end

  Matrix.rows(cells)
end

#collect_row_with_indexObject



792
793
794
795
796
797
798
799
800
801
# File 'lib/daru/dataframe.rb', line 792

# Yields each row with its index and returns the block results as a
# Daru::Vector indexed by @index. Returns an Enumerator without a block.
def collect_row_with_index
  return to_enum(:collect_row_with_index) unless block_given?

  results = []
  each_row_with_index { |row, i| results << yield(row, i) }

  Daru::Vector.new(results, index: @index)
end

#collect_rowsObject

Retrieves a Daru::Vector, based on the result of calculation performed on each row.



781
782
783
784
785
786
787
788
789
790
# File 'lib/daru/dataframe.rb', line 781

# Yields each row and returns the block results as a Daru::Vector
# indexed by @index. Returns an Enumerator without a block.
def collect_rows
  return to_enum(:collect_rows) unless block_given?

  results = []
  each_row { |row| results << yield(row) }

  Daru::Vector.new(results, index: @index)
end

#collect_vector_with_indexObject



816
817
818
819
820
821
822
823
824
825
# File 'lib/daru/dataframe.rb', line 816

# Yields each vector with its name and returns the block results as a
# Daru::Vector indexed by @vectors. Returns an Enumerator without a block.
def collect_vector_with_index
  return to_enum(:collect_vector_with_index) unless block_given?

  results = []
  each_vector_with_index { |vec, i| results << yield(vec, i) }

  Daru::Vector.new(results, index: @vectors)
end

#collect_vectorsObject

Retrieves a Daru::Vector, based on the result of calculation performed on each vector.



805
806
807
808
809
810
811
812
813
814
# File 'lib/daru/dataframe.rb', line 805

# Yields each vector and returns the block results as a Daru::Vector
# indexed by @vectors. Returns an Enumerator without a block.
def collect_vectors
  return to_enum(:collect_vectors) unless block_given?

  results = []
  each_vector { |vec| results << yield(vec) }

  Daru::Vector.new(results, index: @vectors)
end

#column(name) ⇒ Object

Access a vector by name.



394
395
396
# File 'lib/daru/dataframe.rb', line 394

# Looks +name+ up through the vector accessor.
def column name
  vector[name]
end

#compute(text, &block) ⇒ Object

Returns a vector, based on a string with a calculation based on vector.

The calculation will be eval'ed, so you can put any variable or expression valid on ruby.

For example:

a = Daru::Vector.new [1,2]
b = Daru::Vector.new [3,4]
ds = Daru::DataFrame.new({:a => a,:b => b})
ds.compute("a+b")
=> Vector [4,6]


1007
1008
1009
1010
# File 'lib/daru/dataframe.rb', line 1007

# Evaluates a calculation in the context of this DataFrame and returns
# its result. A block takes precedence over +text+.
#
# NOTE: both forms go through instance_eval, so +text+ is executed as
# arbitrary Ruby code.
def compute text, &block
  block ? instance_eval(&block) : instance_eval(text)
end

#concat(other_df) ⇒ Object

Concatenate another DataFrame along corresponding columns. If columns do not exist in both dataframes, they are filled with nils



1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
# File 'lib/daru/dataframe.rb', line 1238

# Returns a new DataFrame with the rows of +other_df+ appended below
# this one. A vector present in only one of the two frames is padded
# with nils for the rows of the other.
def concat other_df
  names = @vectors.to_a

  # own vectors first, extended by the other frame's values (or nils)
  columns = names.map do |v|
    tail = other_df.vectors.include?(v) ? other_df[v].to_a : [nil] * other_df.size
    self[v].dup.to_a.concat(tail)
  end

  # vectors that exist only in the other frame get leading nils
  other_df.vectors.each do |v|
    next if names.include?(v)
    names << v
    columns << ([nil] * size).concat(other_df[v].to_a)
  end

  Daru::DataFrame.new(columns, order: names)
end

#create_sql(table, charset = 'UTF8') ⇒ Object

Create a SQL statement, based on a given Dataset

Arguments

  • table - String specifying name of the table that will be created in SQL.

  • charset - Character set. Default is “UTF8”.

Examples:


ds = Daru::DataFrame.new({
 :id   => Daru::Vector.new([1,2,3,4,5]),
 :name => Daru::Vector.new(%w{Alex Peter Susan Mary John})
})
ds.create_sql('names')
 #=>"CREATE TABLE names (id INTEGER,\n name VARCHAR (255)) CHARACTER SET=UTF8;"


1788
1789
1790
1791
1792
1793
1794
1795
1796
# File 'lib/daru/dataframe.rb', line 1788

# Returns a CREATE TABLE statement for +table+ with one column per
# vector, typed by each vector's db_type, in the given +charset+.
def create_sql(table,charset='UTF8')
  columns = vectors.to_a.map { |field| "#{field} " + self[field].db_type }

  "CREATE TABLE #{table} (" + columns.join(",\n ") + ") CHARACTER SET=#{charset};"
end

#delete_row(index) ⇒ Object

Delete a row

Raises:

  • (IndexError)


861
862
863
864
865
866
867
868
869
870
871
# File 'lib/daru/dataframe.rb', line 861

# Removes the row at +index+ from the row index and from every vector,
# then refreshes the size.
#
# Raises IndexError when the row does not exist.
def delete_row index
  idx = named_index_for index

  unless @index.include? idx
    raise IndexError, "Index #{index} does not exist."
  end

  @index = Daru::Index.new(@index.to_a - [idx])
  each_vector { |vector| vector.delete_at idx }

  set_size
end

#delete_vector(vector) ⇒ Object

Delete a vector

Raises:

  • (IndexError)


844
845
846
847
848
849
850
851
# File 'lib/daru/dataframe.rb', line 844

# Removes +vector+ from the data and from the vectors index.
# Returns self.
#
# Raises IndexError when the vector does not exist.
def delete_vector vector
  unless @vectors.include?(vector)
    raise IndexError, "Vector #{vector} does not exist."
  end

  position = @vectors[vector]
  @data.delete_at position

  remaining = @vectors.to_a - [vector]
  @vectors  = Daru::Index.new remaining

  self
end

#delete_vectors(*vectors) ⇒ Object

Deletes a list of vectors



854
855
856
857
858
# File 'lib/daru/dataframe.rb', line 854

# Calls delete_vector for each given name, in order. Returns self.
def delete_vectors *vectors
  vectors.each do |vec|
    delete_vector vec
  end

  self
end

#dup(vectors_to_dup = nil) ⇒ Object

Duplicate the DataFrame entirely.

Arguments

  • vectors_to_dup - An Array specifying the names of Vectors to

be duplicated. Will duplicate the entire DataFrame if not specified.



421
422
423
424
425
426
427
428
429
430
431
# File 'lib/daru/dataframe.rb', line 421

# Returns a new DataFrame holding duplicates of the named vectors (all
# vectors when +vectors_to_dup+ is not given), with a copy of the row
# index and the same name.
def dup vectors_to_dup=nil
  vectors_to_dup ||= @vectors.to_a

  copies    = vectors_to_dup.map { |vec| @data[@vectors[vec]].dup }
  new_order = Daru::Index.new(vectors_to_dup)

  Daru::DataFrame.new copies, order: new_order, index: @index.dup, name: @name, clone: true
end

#dup_only_valid(vecs = nil) ⇒ Object

Creates a new duplicate dataframe containing only rows without a single missing value.



467
468
469
470
471
472
473
474
# File 'lib/daru/dataframe.rb', line 467

# Returns the rows whose index is not among the missing positions of
# any vector. When +vecs+ is given, the rows are taken from a dup
# restricted to those vectors.
def dup_only_valid vecs=nil
  rows_with_nil = @data.flat_map(&:missing_positions).uniq
  valid_rows    = @index.to_a - rows_with_nil

  frame = vecs.nil? ? self : dup(vecs)
  frame.row[*valid_rows]
end

#each(axis = :vector, &block) ⇒ Object

Iterate over each row or vector of the DataFrame. Specify axis by passing :vector or :row as the argument. Default to :vector.

Description

`#each` works exactly like Array#each. The default mode for `each` is to iterate over the columns of the DataFrame. To iterate over rows you must pass the axis, i.e `:row` as an argument.

Arguments

  • axis - The axis to iterate over. Can be :vector (or :column)

or :row. Default to :vector.



542
543
544
545
546
547
548
549
550
# File 'lib/daru/dataframe.rb', line 542

# Dispatches to each_vector (:vector or :column) or each_row (:row),
# handing over the block.
#
# Raises ArgumentError for any other axis.
def each axis=:vector, &block
  case axis
  when :vector, :column then each_vector(&block)
  when :row             then each_row(&block)
  else
    raise ArgumentError, "Unknown axis #{axis}"
  end
end

#each_index(&block) ⇒ Object

Iterate over each index of the DataFrame.



477
478
479
480
481
482
# File 'lib/daru/dataframe.rb', line 477

# Passes each entry of @index to the block and returns self.
# Returns an Enumerator without a block.
def each_index &block
  if block_given?
    @index.each(&block)
    self
  else
    to_enum(:each_index)
  end
end

#each_rowObject

Iterate over each row



509
510
511
512
513
514
515
516
517
# File 'lib/daru/dataframe.rb', line 509

# Yields the row for every entry of @index and returns self.
# Returns an Enumerator without a block.
def each_row
  return to_enum(:each_row) unless block_given?

  @index.each { |label| yield access_row(label) }

  self
end

#each_row_with_indexObject



519
520
521
522
523
524
525
526
527
# File 'lib/daru/dataframe.rb', line 519

# Yields every row together with its index entry and returns self.
# Returns an Enumerator without a block.
def each_row_with_index
  return to_enum(:each_row_with_index) unless block_given?

  @index.each { |label| yield access_row(label), label }

  self
end

#each_vector(&block) ⇒ Object Also known as: each_column

Iterate over each vector



485
486
487
488
489
490
491
# File 'lib/daru/dataframe.rb', line 485

# Passes each vector in @data to the block and returns self.
# Returns an Enumerator without a block.
def each_vector(&block)
  if block_given?
    @data.each(&block)
    self
  else
    to_enum(:each_vector)
  end
end

#each_vector_with_indexObject Also known as: each_column_with_index

Iterate over each vector along with the name of the vector



496
497
498
499
500
501
502
503
504
# File 'lib/daru/dataframe.rb', line 496

# Yields every vector together with its name and returns self.
# Returns an Enumerator without a block.
def each_vector_with_index
  return to_enum(:each_vector_with_index) unless block_given?

  @vectors.each do |name|
    position = @vectors[name]
    yield @data[position], name
  end

  self
end

#filter(axis = :vector, &block) ⇒ Object

Retain vectors or rows if the block returns a truthy value.

Description

For filtering out certain rows/vectors based on their values, use the #filter method. By default it iterates over vectors and keeps those vectors for which the block returns true. It accepts an optional axis argument which lets you specify whether you want to iterate over vectors or rows.

Arguments

  • axis - The axis to map over. Can be :vector (or :column) or :row.

Default to :vector.

Usage

# Filter vectors

df.filter do |vector|
  vector.type == :numeric and vector.median < 50
end

# Filter rows

df.filter(:row) do |row|
  row[:a] + row[:d] < 100
end


672
673
674
675
676
677
678
# File 'lib/daru/dataframe.rb', line 672

# Retains the vectors (:vector or :column) or rows (:row) for which the
# block returns a truthy value, by dispatching to filter_vectors or
# filter_rows.
#
# Raises ArgumentError for any other axis, matching #each and #collect;
# an unknown axis previously fell through and silently returned nil.
def filter axis=:vector, &block
  if axis == :vector || axis == :column
    filter_vectors(&block)
  elsif axis == :row
    filter_rows(&block)
  else
    raise ArgumentError, "Unknown axis #{axis}"
  end
end

#filter_rowsObject

Iterates over each row and retains it in a new DataFrame if the block returns true for that row.



920
921
922
923
924
925
926
# File 'lib/daru/dataframe.rb', line 920

# Yields every row, collects the block results in index order and
# passes them to #where. Returns an Enumerator without a block.
def filter_rows
  return to_enum(:filter_rows) unless block_given?

  mask = @index.map do |label|
    yield access_row(label)
  end

  where mask
end

#filter_vector(vec) ⇒ Object

creates a new vector with the data of a given field which the block returns true



909
910
911
912
913
914
915
916
# File 'lib/daru/dataframe.rb', line 909

# Returns a new Daru::Vector holding the values of vector +vec+ from the
# rows for which the block returns a truthy value. The new vector gets a
# copy of the source vector's metadata.
#
# The metadata argument previously read `self[vec]..dup`, which builds a
# Range instead of copying the metadata.
def filter_vector vec
  d = []
  each_row do |row|
    d.push(row[vec]) if yield row
  end

  Daru::Vector.new(d, metadata: self[vec].metadata.dup)
end

#filter_vectors(&block) ⇒ Object

Iterates over each vector and retains it in a new DataFrame if the block returns true for that vector.



930
931
932
933
934
935
936
937
# File 'lib/daru/dataframe.rb', line 930

# Duplicates the DataFrame, applies keep_vector_if with the block to the
# copy and returns it. Returns an Enumerator without a block.
def filter_vectors &block
  return to_enum(:filter_vectors) unless block_given?

  dup.tap { |copy| copy.keep_vector_if(&block) }
end

#group_by(*vectors) ⇒ Object

Group elements by vector to perform operations on them. Returns a Daru::Core::GroupBy object. See the Daru::Core::GroupBy docs for a detailed list of possible operations.

Arguments

  • vectors - An Array containing names of vectors to group by.

Usage

df = Daru::DataFrame.new({
  a: %w{foo bar foo bar   foo bar foo foo},
  b: %w{one one two three two two one three},
  c:   [1  ,2  ,3  ,1    ,3  ,6  ,3  ,8],
  d:   [11 ,22 ,33 ,44   ,55 ,66 ,77 ,88]
})
df.group_by([:a,:b,:c]).groups
#=> {["bar", "one", 2]=>[1],
# ["bar", "three", 1]=>[3],
# ["bar", "two", 6]=>[5],
# ["foo", "one", 1]=>[0],
# ["foo", "one", 3]=>[6],
# ["foo", "three", 8]=>[7],
# ["foo", "two", 3]=>[2, 4]}


1215
1216
1217
1218
1219
1220
1221
1222
# File 'lib/daru/dataframe.rb', line 1215

# Returns a Daru::Core::GroupBy for the given vector names (nested
# Arrays of names are flattened).
#
# Raises ArgumentError for the first name that is not a vector.
def group_by *vectors
  vectors.flatten!

  vectors.each do |v|
    next if has_vector?(v)
    raise(ArgumentError, "Vector #{v} does not exist")
  end

  Daru::Core::GroupBy.new(self, vectors)
end

#has_missing_data?Boolean Also known as: flawed?

Returns:

  • (Boolean)


1031
1032
1033
# File 'lib/daru/dataframe.rb', line 1031

# True when any vector in @data reports has_missing_data?; the double
# negation coerces the result to a strict boolean.
def has_missing_data?
  !!@data.any?(&:has_missing_data?)
end

#has_vector?(vector) ⇒ Boolean

Check if a vector is present

Returns:

  • (Boolean)


1096
1097
1098
# File 'lib/daru/dataframe.rb', line 1096

# True when the vectors index includes +vector+.
def has_vector? vector
  @vectors.include? vector
end

#head(quantity = 10) ⇒ Object Also known as: first

The first ten elements of the DataFrame

Parameters:

  • quantity (Fixnum) (defaults to: 10)

    (10) The number of elements to display from the top.



1147
1148
1149
# File 'lib/daru/dataframe.rb', line 1147

# Returns the rows in the range 0..(quantity - 1), looked up on the
# :row axis. +quantity+ defaults to 10.
def head quantity=10
  last_position = quantity - 1
  self[0..last_position, :row]
end

#inspect(spacing = 10, threshold = 15) ⇒ Object

Pretty print in a nice table format for the command line (irb/pry/iruby)



2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
# File 'lib/daru/dataframe.rb', line 2015

# Builds a plain-text table of the DataFrame for console display.
#
# @param spacing [Integer] maximum column width; wider cells are truncated.
# @param threshold [Integer] number of rows written before the output is
#   cut off with a row of '...'.
# @return [String] header line, vector names, then one line per row.
def inspect spacing=10, threshold=15
  # Widest string among the name, the vector names, the index labels and
  # every data element (0 stands in for empty collections).
  longest = [@name.to_s.size,
             (@vectors.map(&:to_s).map(&:size).max || 0),
             (@index  .map(&:to_s).map(&:size).max || 0),
             (@data   .map { |v| v.map(&:to_s).map(&:size).max }.max || 0)].max

  name      = @name || 'nil'
  content   = ''
  # Cap the column width at `spacing`.
  longest   = spacing if longest > spacing
  formatter = "\n"

  # One "%N.Ns " slot per vector plus one for the index column; the
  # precision makes the format both pad and truncate to N characters.
  (@vectors.size + 1).times { formatter += "%#{longest}.#{longest}s " }
  content += "\n#<" + self.class.to_s + ':' + object_id.to_s + ' @name = ' +
             name.to_s + ' @size = ' + @size.to_s + '>'
  content += formatter % ['', *@vectors.map(&:to_s)]
  row_num  = 1

  each_row_with_index do |row, index|
    content += formatter % [index.to_s, *row.to_h.values.map { |e| (e || 'nil').to_s }]
    row_num += 1
    next if row_num <= threshold

    # `threshold` rows have been written: add an ellipsis row and stop.
    dots = []

    (@vectors.size + 1).times { dots << '...' }
    content += formatter % dots
    break
  end
  content += "\n"

  content
end

#join(other_df, opts = {}) ⇒ Daru::DataFrame

Join 2 DataFrames with SQL style joins. Currently supports inner, left outer, right outer and full outer joins.

Examples:

Inner Join

left = Daru::DataFrame.new({
  :id   => [1,2,3,4],
  :name => ['Pirate', 'Monkey', 'Ninja', 'Spaghetti']
})
right = Daru::DataFrame.new({
  :id => [1,2,3,4],
  :name => ['Rutabaga', 'Pirate', 'Darth Vader', 'Ninja']
})
left.join(right, how: :inner, on: [:name])
#=>
##<Daru::DataFrame:82416700 @name = 74c0811b-76c6-4c42-ac93-e6458e82afb0 @size = 2>
#                 id_1       name       id_2
#         0          1     Pirate          2
#         1          3      Ninja          4

Parameters:

  • other_df (Daru::DataFrame)

    Another DataFrame on which the join is to be performed.

  • opts (Hash) (defaults to: {})

    Options Hash

  • :how (Hash)

    a customizable set of options

  • :on (Hash)

    a customizable set of options

Returns:



1682
1683
1684
# File 'lib/daru/dataframe.rb', line 1682

# Joins this DataFrame with another one by delegating to
# Daru::Core::Merge.join.
#
# @param other_df [Daru::DataFrame] the DataFrame to join with.
# @param opts [Hash] options passed through unchanged (the docs above
#   list :how and :on).
# @return [Object] whatever Daru::Core::Merge.join returns.
def join(other_df, opts = {})
  merger = Daru::Core::Merge
  merger.join(self, other_df, opts)
end

#keep_row_ifObject



887
888
889
890
891
892
893
894
895
896
897
898
# File 'lib/daru/dataframe.rb', line 887

# Keeps only the rows for which the block returns a truthy value.
#
# Every index label is visited first and the block is given
# access_row(label); rows the block rejects are passed to delete_row only
# after the whole index has been walked.
#
# @yield [row] the row fetched through access_row.
# @return [Array] the labels that were deleted.
def keep_row_if
  rejected = []

  @index.each do |label|
    rejected << label unless yield(access_row(label))
  end

  rejected.each { |label| delete_row(label) }
end

#keep_vector_ifObject



900
901
902
903
904
905
906
# File 'lib/daru/dataframe.rb', line 900

# Keeps only the vectors for which the block returns a truthy value.
#
# The names to drop are collected while walking @vectors and are deleted
# only after the walk has finished, mirroring keep_row_if. Deleting inside
# the loop would change @vectors/@data while they are being iterated, which
# can skip vectors.
#
# @yield [vector, name] the vector at @data[@vectors[name]] and its name.
# @return [Array] the names that were deleted.
def keep_vector_if
  rejected = []

  @vectors.each do |vector|
    rejected << vector unless yield(@data[@vectors[vector]], vector)
  end

  rejected.each { |vector| delete_vector vector }
end

#map(axis = :vector, &block) ⇒ Object

Map over each vector or row of the data frame according to the argument specified. Will return an Array of the resulting elements. To map over each row/vector and get a DataFrame, see #recode.

Description

The #map iterator works like Array#map. The value returned by each run of the block is added to an Array and the Array is returned. This method also accepts an axis argument, like #each. The default is :vector.

Arguments

  • axis - The axis to map over. Can be :vector (or :column) or :row.

Default to :vector.



593
594
595
596
597
598
599
600
601
# File 'lib/daru/dataframe.rb', line 593

# Maps over vectors or rows and returns the collected block results.
#
# @param axis [Symbol] :vector or :column dispatches to map_vectors,
#   :row dispatches to map_rows. Defaults to :vector.
# @return [Object] the result of the method dispatched to.
# @raise [ArgumentError] for any other axis value.
def map(axis = :vector, &block)
  return map_vectors(&block) if axis == :vector || axis == :column
  return map_rows(&block) if axis == :row

  raise ArgumentError, "Unknown axis #{axis}"
end

#map!(axis = :vector, &block) ⇒ Object

Destructive map. Modifies the DataFrame. Each run of the block must return a Daru::Vector. You can specify the axis to map over as the argument. Default to :vector.

Arguments

  • axis - The axis to map over. Can be :vector (or :column) or :row.

Default to :vector.



611
612
613
614
615
616
617
# File 'lib/daru/dataframe.rb', line 611

# Destructive map over vectors or rows.
#
# @param axis [Symbol] :vector or :column dispatches to map_vectors!,
#   :row dispatches to map_rows!. Defaults to :vector.
# @return [Object] the result of the method dispatched to.
# @raise [ArgumentError] for any other axis value. This matches #map;
#   an unknown axis used to return nil without doing anything.
def map!(axis = :vector, &block)
  if axis == :vector || axis == :column
    map_vectors!(&block)
  elsif axis == :row
    map_rows!(&block)
  else
    raise ArgumentError, "Unknown axis #{axis}"
  end
end

#map_rowsObject

Map each row



745
746
747
748
749
750
751
752
753
754
# File 'lib/daru/dataframe.rb', line 745

# Maps each row through the block and collects the results.
#
# @yield [row] each row produced by each_row.
# @return [Array, Enumerator] block results in row order, or an
#   Enumerator when no block is given.
def map_rows
  return to_enum(:map_rows) unless block_given?

  collected = []
  each_row { |row| collected << yield(row) }
  collected
end

#map_rows!Object



767
768
769
770
771
772
773
774
775
776
777
# File 'lib/daru/dataframe.rb', line 767

# Replaces every row with the Daru::Vector returned by the block.
#
# Iterates over a copy of the index, so the assignments made through
# row[label]= do not affect the labels being walked.
#
# @yield [row] the current row, read through row[label].
# @return [self, Enumerator] self, or an Enumerator when no block is given.
# @raise [TypeError] if the block returns anything but a Daru::Vector.
def map_rows!
  return to_enum(:map_rows!) unless block_given?

  index.dup.each do |label|
    replacement = yield(row[label])
    unless replacement.is_a?(Daru::Vector)
      raise TypeError, "Returned object must be Daru::Vector not #{replacement.class}"
    end

    row[label] = replacement
  end

  self
end

#map_rows_with_indexObject



756
757
758
759
760
761
762
763
764
765
# File 'lib/daru/dataframe.rb', line 756

# Maps each row together with its index label through the block.
#
# @yield [row, index] each pair produced by each_row_with_index.
# @return [Array, Enumerator] block results in row order, or an
#   Enumerator when no block is given.
def map_rows_with_index
  return to_enum(:map_rows_with_index) unless block_given?

  collected = []
  each_row_with_index { |row, index| collected << yield(row, index) }
  collected
end

#map_vectorsObject

Map each vector and return an Array.



708
709
710
711
712
713
714
715
716
717
# File 'lib/daru/dataframe.rb', line 708

# Maps each vector in @data through the block and returns an Array.
#
# @yield [vector] each element of @data, in order.
# @return [Array, Enumerator] block results, or an Enumerator when no
#   block is given.
def map_vectors
  return to_enum(:map_vectors) unless block_given?

  @data.map { |vec| yield(vec) }
end

#map_vectors!Object

Destructive form of #map_vectors



720
721
722
723
724
725
726
727
728
729
730
# File 'lib/daru/dataframe.rb', line 720

# Destructive form of #map_vectors: replaces every vector with the
# Daru::Vector returned by the block.
#
# Iterates over a copy of the vector names, reading each vector through
# self[name] and writing the replacement back through self[name]=.
#
# @yield [vector] the current vector.
# @return [self, Enumerator] self, or an Enumerator when no block is given.
# @raise [TypeError] if the block returns anything but a Daru::Vector.
def map_vectors!
  return to_enum(:map_vectors!) unless block_given?

  vectors.dup.each do |name|
    replacement = yield(self[name])
    unless replacement.is_a?(Daru::Vector)
      raise TypeError, "Must return a Daru::Vector not #{replacement.class}"
    end

    self[name] = replacement
  end

  self
end

#map_vectors_with_indexObject

Map vectors along with the index.



733
734
735
736
737
738
739
740
741
742
# File 'lib/daru/dataframe.rb', line 733

# Maps each vector together with its name through the block.
#
# @yield [vector, name] each pair produced by each_vector_with_index.
# @return [Array, Enumerator] block results, or an Enumerator when no
#   block is given.
def map_vectors_with_index
  return to_enum(:map_vectors_with_index) unless block_given?

  collected = []
  each_vector_with_index { |vector, name| collected << yield(vector, name) }
  collected
end

#merge(other_df) ⇒ Daru::DataFrame

Merge vectors from two DataFrames. In case of name collision, the vectors names are changed to x_1, x_2 .…

Returns:



1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
# File 'lib/daru/dataframe.rb', line 1640

# Merges the vectors of two DataFrames with the same number of rows.
#
# Vector names from both frames are concatenated, passed through
# recode_repeated and converted to symbols; each output row is this
# frame's row followed by the other frame's row at the same position.
#
# @param other_df [Daru::DataFrame] the DataFrame to merge with.
# @return [Daru::DataFrame] the merged DataFrame.
# @raise [RuntimeError] if the two frames have different row counts.
def merge(other_df)
  unless nrows == other_df.nrows
    raise "Number of rows must be equal in this: #{nrows} and other: #{other_df.nrows}"
  end

  combined_names = (@vectors.to_a + other_df.vectors.to_a).recode_repeated.map(&:to_sym)
  merged = DataFrame.new({}, order: combined_names)

  nrows.times do |position|
    merged.add_row(self.row[position].to_a + other_df.row[position].to_a)
  end

  merged.update
  merged
end

#missing_values_rows(missing_values = [nil]) ⇒ Object Also known as: vector_missing_values

Return a vector with the number of missing values in each row.

Arguments

  • missing_values - An Array of the values that should be

treated as 'missing'. The default missing value is nil.



1018
1019
1020
1021
1022
1023
1024
1025
1026
# File 'lib/daru/dataframe.rb', line 1018

# Counts the missing values in each row.
#
# For every row, missing_values is assigned to the row and the size of
# its missing_positions is recorded.
#
# @param missing_values [Array] values treated as missing (default [nil]).
# @return [Daru::Vector] the counts, indexed by @index and named
#   "<name>_missing_rows".
def missing_values_rows(missing_values = [nil])
  counts = []

  each_row do |row|
    row.missing_values = missing_values
    counts << row.missing_positions.size
  end

  Daru::Vector.new(counts, index: @index, name: "#{@name}_missing_rows")
end

#ncolsObject

The number of vectors



1091
1092
1093
# File 'lib/daru/dataframe.rb', line 1091

# The number of vectors, i.e. the second element of #shape.
#
# @return [Object] shape[1].
def ncols
  _row_count, column_count = shape
  column_count
end

#nest(*tree_keys, &block) ⇒ Object

Return a nested hash using vector names as keys and an array constructed of hashes with other values. If block provided, is used to provide the values, with parameters row of dataset, current last hash on hierarchy and name of the key to include



1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
# File 'lib/daru/dataframe.rb', line 1041

# Builds a nested Hash keyed by the values of the given vectors.
#
# All keys but the last create one Hash level each. The last key selects
# the leaf: without a block the leaf is an Array collecting row.to_h minus
# the tree keys; with a block the leaf is whatever the block returns.
#
# @param tree_keys [Array] vector names, as a list or a single Array.
# @yield [row, current, name] the row, the innermost Hash reached so far
#   and the leaf key.
# @return [Hash] the nested structure.
def nest(*tree_keys, &block)
  tree_keys = tree_keys[0] if tree_keys[0].is_a? Array
  tree = {}

  each_row do |row|
    # Descend through (and create) one Hash level per non-leaf key.
    branch = tree_keys[0, tree_keys.size - 1].reduce(tree) do |node, key|
      node[row[key]] ||= {}
    end
    leaf = row[tree_keys.last]

    if block
      branch[leaf] = yield(row, branch, leaf)
    else
      branch[leaf] ||= []
      branch[leaf].push(row.to_h.delete_if { |key, _value| tree_keys.include? key })
    end
  end

  tree
end

#nrowsObject

The number of rows



1086
1087
1088
# File 'lib/daru/dataframe.rb', line 1086

# The number of rows, i.e. the first element of #shape.
#
# @return [Object] shape[0].
def nrows
  row_count, _column_count = shape
  row_count
end

#numeric_vector_namesObject



1372
1373
1374
1375
1376
1377
1378
1379
# File 'lib/daru/dataframe.rb', line 1372

# Lists the names of the vectors whose type is :numeric.
#
# @return [Array] names from @vectors for which self[name].type == :numeric,
#   in @vectors order.
def numeric_vector_names
  @vectors.each_with_object([]) do |name, numeric_names|
    numeric_names << name if self[name].type == :numeric
  end
end

#numeric_vectorsObject

Return the indexes of all the numeric vectors. Will include vectors with nils along with numbers.



1363
1364
1365
1366
1367
1368
1369
1370
# File 'lib/daru/dataframe.rb', line 1363

# Lists the indexes of the vectors whose type is :numeric.
#
# @return [Array] the second value yielded by each_vector_with_index for
#   every vector whose type is :numeric.
def numeric_vectors
  numeric_names = []

  each_vector_with_index do |vector, name|
    next unless vector.type == :numeric

    numeric_names << name
  end

  numeric_names
end

#one_to_many(parent_fields, pattern) ⇒ Object

Creates a new dataset for one to many relations on a dataset, based on pattern of field names.

for example, you have a survey for number of children with this structure:

id, name, child_name_1, child_age_1, child_name_2, child_age_2

with

ds.one_to_many([:id], "child_%v_%n")

the field of first parameters will be copied verbatim to new dataset, and fields which responds to second pattern will be added one case for each different %n.

Examples:

cases=[
  ['1','george','red',10,'blue',20,nil,nil],
  ['2','fred','green',15,'orange',30,'white',20],
  ['3','alfred',nil,nil,nil,nil,nil,nil]
]
ds=Daru::DataFrame.rows(cases, order: [:id, :name, :car_color1, :car_value1, :car_color2, :car_value2, :car_color3, :car_value3])
ds.one_to_many([:id],'car_%v%n').to_matrix
#=> Matrix[
#   ["red", "1", 10],
#   ["blue", "1", 20],
#   ["green", "2", 15],
#   ["orange", "2", 30],
#   ["white", "2", 20]
#   ]


1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
# File 'lib/daru/dataframe.rb', line 1713

# Expands numbered "wide" fields into one output row per number.
#
# @param parent_fields [Array] vector names copied unchanged to every
#   output row.
# @param pattern [String] field-name pattern in which '%v' marks the
#   variable name and '%n' the running number, e.g. 'car_%v%n'.
# @return [DataFrame] a new DataFrame with the parent fields, a '_col_id'
#   vector holding the number, and one vector per variable name found.
def one_to_many(parent_fields, pattern)
  # '%v' becomes a lazy capture of the variable name, '%n' a capture of
  # the digits. The remainder of the pattern is used as regexp source as-is.
  re      = Regexp.new pattern.gsub('%v','(.+?)').gsub('%n','(\\d+?)')
  ds_vars = parent_fields.dup
  vars    = []
  max_n   = 0
  h       = parent_fields.each_with_object({}) { |v, a|
    a[v] = Daru::Vector.new([])
  }
  # Adding _col_id
  h['_col_id'] = Daru::Vector.new([])
  ds_vars.push('_col_id')

  # Scan the vector names: collect each distinct variable name ($1) and
  # remember the largest number ($2) seen.
  @vectors.each do |f|
    next unless f =~ re
    unless vars.include? $1
      vars.push($1)
      h[$1] = Daru::Vector.new([])
    end

    max_n = $2.to_i if max_n < $2.to_i
  end
  ds = DataFrame.new(h, order: ds_vars+vars)

  each_row do |row|
    row_out = {}
    parent_fields.each do |f|
      row_out[f] = row[f]
    end

    # Try every number 1..max_n; row_out is reused, so each pass
    # overwrites the variable values written by the previous one.
    max_n.times do |n1|
      n = n1+1
      any_data = false
      vars.each do |v|
        data = row[pattern.gsub('%v',v.to_s).gsub('%n',n.to_s)]
        row_out[v] = data
        any_data = true unless data.nil?
      end

      # Only emit a row when at least one variable has a non-nil value.
      if any_data
        row_out['_col_id'] = n
        ds.add_row(row_out)
      end
    end
  end
  ds.update
  ds
end

#only_numerics(opts = {}) ⇒ Object

Return a DataFrame of only the numerical Vectors. If clone: false is specified as option, only a view of the Vectors will be returned. Defaults to clone: true.



1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
# File 'lib/daru/dataframe.rb', line 1384

# Builds a DataFrame containing only the numeric vectors.
#
# @param opts [Hash] :clone is forwarded to the new DataFrame; it is
#   false only when explicitly passed as false, true otherwise.
# @return [Daru::DataFrame] built from the vectors named by
#   numeric_vectors, keeping this frame's @index.
def only_numerics(opts = {})
  cln = opts[:clone] != false
  names = numeric_vectors
  picked = names.map { |name| self[name] }

  order = Index.new(names)
  Daru::DataFrame.new(picked, clone: cln, order: order, index: @index)
end

#pivot_table(opts = {}) ⇒ Object

Pivots a data frame on specified vectors and applies an aggregate function to quickly generate a summary.

Options

:index - Keys to group by on the pivot table row index. Pass vector names contained in an Array.

:vectors - Keys to group by on the pivot table column index. Pass vector names contained in an Array.

:agg - Function to aggregate the grouped values. Default to :mean. Can use any of the statistics functions applicable on Vectors that can be found in the Daru::Statistics::Vector module.

:values - Columns to aggregate. Will consider all numeric columns not specified in :index or :vectors. Optional.

Usage

df = Daru::DataFrame.new({
  a: ['foo'  ,  'foo',  'foo',  'foo',  'foo',  'bar',  'bar',  'bar',  'bar'],
  b: ['one'  ,  'one',  'one',  'two',  'two',  'one',  'one',  'two',  'two'],
  c: ['small','large','large','small','small','large','small','large','small'],
  d: [1,2,2,3,3,4,5,6,7],
  e: [2,4,4,6,6,8,10,12,14]
})
df.pivot_table(index: [:a], vectors: [:b], agg: :sum, values: :e)

#=>
# #<Daru::DataFrame:88342020 @name = 08cdaf4e-b154-4186-9084-e76dd191b2c9 @size = 2>
#            [:e, :one] [:e, :two]
#     [:bar]         18         26
#     [:foo]         10         12

Raises:

  • (ArgumentError)


1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
# File 'lib/daru/dataframe.rb', line 1570

# Pivots the DataFrame on the given vectors and aggregates the values.
#
# @param opts [Hash] :index (required, non-empty) names to group rows by;
#   :vectors names to spread across columns (default []); :agg aggregate
#   method name sent to the grouped data (default :mean); :values a Symbol
#   or Array of vectors to aggregate.
# @return [Object] the result of sending :agg to the GroupBy object when
#   :vectors is empty, otherwise a new Daru::DataFrame with MultiIndex
#   rows and columns.
# @raise [ArgumentError] if :index is missing or empty.
# @raise [IndexError] if there are no vectors to aggregate.
def pivot_table opts={}
  raise ArgumentError,
    'Specify grouping index' if !opts[:index] || opts[:index].empty?

  index   = opts[:index]
  vectors = opts[:vectors] || []
  aggregate_function = opts[:agg] || :mean
  # Normalise :values to an Array. Anything that is neither a Symbol nor
  # an Array falls back to the numeric vectors not used for grouping.
  values =
    if opts[:values].is_a?(Symbol)
      [opts[:values]]
    elsif opts[:values].is_a?(Array)
      opts[:values]
    else # nil
      (@vectors.to_a - (index | vectors)) & numeric_vector_names
    end

  raise IndexError, 'No numeric vectors to aggregate' if values.empty?

  grouped = group_by(index)

  if vectors.empty?
    grouped.send(aggregate_function)
  else
    # super_hash: row group => { [value_name, *column_group] => [raw values] }
    super_hash = {}
    values.each do |value|
      grouped.groups.each do |group_name, row_numbers|
        super_hash[group_name] ||= {}

        row_numbers.each do |num|
          arry = []
          arry << value
          vectors.each { |v| arry << self[v][num] }
          sub_hash = super_hash[group_name]
          sub_hash[arry] ||= []

          sub_hash[arry] << self[value][num]
        end
      end
    end

    # Collapse each list of raw values into a single aggregate.
    super_hash.each_value do |sub_hash|
      sub_hash.each do |group_name, aggregates|
        sub_hash[group_name] = Daru::Vector.new(aggregates).send(aggregate_function)
      end
    end

    df_index = Daru::MultiIndex.from_tuples super_hash.keys

    # Column tuples are the union of the keys seen in any row group.
    vector_indexes = []
    super_hash.each_value do |sub_hash|
      vector_indexes.concat sub_hash.keys
    end

    df_vectors = Daru::MultiIndex.from_tuples vector_indexes.uniq
    pivoted_dataframe = Daru::DataFrame.new({}, index: df_index, order: df_vectors)

    super_hash.each do |row_index, sub_h|
      sub_h.each do |vector_index, val|
        # pivoted_dataframe[symbolize(vector_index)][symbolize(row_index)] = val
        pivoted_dataframe[vector_index][row_index] = val
      end
    end
    return pivoted_dataframe
  end
end

#recast(opts = {}) ⇒ Object

Change dtypes of vectors by supplying a hash of :vector_name => :new_dtype

Usage

df = Daru::DataFrame.new({a: [1,2,3], b: [1,2,3], c: [1,2,3]})
df.recast a: :nmatrix, c: :nmatrix


1998
1999
2000
2001
2002
# File 'lib/daru/dataframe.rb', line 1998

# Changes the dtype of the named vectors.
#
# @param opts [Hash] vector name => new dtype; each vector is fetched with
#   self[name] and sent cast(dtype: dtype).
# @return [Hash] the opts hash that was passed in.
def recast(opts = {})
  opts.each_pair { |vector_name, dtype| self[vector_name].cast(dtype: dtype) }
end

#recode(axis = :vector, &block) ⇒ Object

Maps over the DataFrame and returns a DataFrame. Each run of the block must return a Daru::Vector object. You can specify the axis to map over. Default to :vector.

Description

Recode works similarly to #map, but an important difference between the two is that recode returns a modified Daru::DataFrame instead of an Array. For this reason, #recode expects that every run of the block to return a Daru::Vector.

Just like map and each, recode also accepts an optional axis argument.

Arguments

  • axis - The axis to map over. Can be :vector (or :column) or :row.

Default to :vector.



636
637
638
639
640
641
642
# File 'lib/daru/dataframe.rb', line 636

# Maps over the DataFrame and returns a DataFrame.
#
# @param axis [Symbol] :vector or :column dispatches to recode_vectors,
#   :row dispatches to recode_rows. Defaults to :vector.
# @return [Object] the result of the method dispatched to.
# @raise [ArgumentError] for any other axis value. This matches #map;
#   an unknown axis used to return nil without doing anything.
def recode(axis = :vector, &block)
  if axis == :vector || axis == :column
    recode_vectors(&block)
  elsif axis == :row
    recode_rows(&block)
  else
    raise ArgumentError, "Unknown axis #{axis}"
  end
end

#recode_rowsObject



694
695
696
697
698
699
700
701
702
703
704
705
# File 'lib/daru/dataframe.rb', line 694

# Returns a copy of the DataFrame whose rows are replaced by the
# Daru::Vector the block returns for each of them.
#
# @yield [row] each row of the copy.
# @return [Object, Enumerator] the modified copy (from dup), or an
#   Enumerator when no block is given.
# @raise [TypeError] if the block returns anything but a Daru::Vector.
def recode_rows
  return to_enum(:recode_rows) unless block_given?

  copy = dup
  copy.each_row_with_index do |current_row, label|
    replacement = yield(current_row)
    unless replacement.is_a?(Daru::Vector)
      raise TypeError, "Every iteration must return Daru::Vector not #{replacement.class}"
    end

    copy.row[label] = replacement
  end

  copy
end

#recode_vectorsObject



680
681
682
683
684
685
686
687
688
689
690
691
692
# File 'lib/daru/dataframe.rb', line 680

# Returns a copy of the DataFrame whose vectors are replaced by the
# Daru::Vector the block returns for each of them.
#
# @yield [vector] each vector of the copy.
# @return [Object, Enumerator] the modified copy (from dup), or an
#   Enumerator when no block is given.
# @raise [TypeError] if the block returns anything but a Daru::Vector.
def recode_vectors
  return to_enum(:recode_vectors) unless block_given?

  copy = dup
  copy.each_vector_with_index do |vector, name|
    replacement = yield(vector)
    unless replacement.is_a?(Daru::Vector)
      raise TypeError, "Every iteration must return Daru::Vector not #{replacement.class}"
    end

    # The name is splatted so an Array name is passed as several keys.
    copy[*name] = replacement
  end

  copy
end

#reindex(new_index) ⇒ Object

Change the index of the DataFrame and preserve the labels of the previous indexing. New index can be Daru::Index or any of its subclasses.

Examples:

Reindexing DataFrame

df = Daru::DataFrame.new({a: [1,2,3,4], b: [11,22,33,44]},
  index: ['a','b','c','d'])
#=>
##<Daru::DataFrame:83278130 @name = b19277b8-c548-41da-ad9a-2ad8c060e273 @size = 4>
#                    a          b
#         a          1         11
#         b          2         22
#         c          3         33
#         d          4         44
df.reindex Daru::Index.new(['b', 0, 'a', 'g'])
#=>
##<Daru::DataFrame:83177070 @name = b19277b8-c548-41da-ad9a-2ad8c060e273 @size = 4>
#                    a          b
#         b          2         22
#         0        nil        nil
#         a          1         11
#         g        nil        nil

Parameters:

  • new_index (Daru::Index)

    The new Index for reindexing the DataFrame.

Raises:

  • (ArgumentError)


1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
# File 'lib/daru/dataframe.rb', line 1289

# Builds a new DataFrame ordered by new_index.
#
# Labels already present in @index keep their row; any other label gets
# a row of nils.
#
# @param new_index [Daru::Index] the index for the new DataFrame.
# @return [Daru::DataFrame] a new frame with the same vectors and name.
# @raise [ArgumentError] if new_index is not a Daru::Index.
def reindex(new_index)
  unless new_index.is_a?(Daru::Index)
    raise ArgumentError, 'Must pass the new index of type Index or its '\
      "subclasses, not #{new_index.class}"
  end

  reindexed = Daru::DataFrame.new({}, order: @vectors, index: new_index, name: @name)
  new_index.each do |label|
    reindexed.row[label] = @index.include?(label) ? row[label] : [nil] * ncols
  end

  reindexed
end

#reindex_vectors(new_vectors) ⇒ Object

Raises:

  • (ArgumentError)


1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
# File 'lib/daru/dataframe.rb', line 1224

# Builds a new DataFrame whose vectors follow new_vectors.
#
# Names already present in @vectors keep their vector; any other name
# gets a column of nils.
#
# @param new_vectors [Daru::Index] the vector index for the new DataFrame.
# @return [Daru::DataFrame] a new frame with the same row index and name.
# @raise [ArgumentError] if new_vectors is not a Daru::Index.
def reindex_vectors(new_vectors)
  # The message must interpolate new_vectors: the previously referenced
  # name `new_index` does not exist in this method, so the type check
  # failed with NameError instead of ArgumentError.
  unless new_vectors.is_a?(Daru::Index)
    raise ArgumentError, 'Must pass the new index of type Index or its '\
      "subclasses, not #{new_vectors.class}"
  end

  cl = Daru::DataFrame.new({}, order: new_vectors, index: @index, name: @name)
  new_vectors.each do |vec|
    cl[vec] = @vectors.include?(vec) ? self[vec] : [nil] * nrows
  end

  cl
end

#rename(new_name) ⇒ Object

Rename the DataFrame.



1926
1927
1928
# File 'lib/daru/dataframe.rb', line 1926

# Renames the DataFrame.
#
# @param new_name [Object] the value stored in @name.
# @return [Object] the new name.
def rename(new_name)
  @name = new_name
end

#rename_vectors(name_map) ⇒ Object

Renames the vectors

Arguments

  • name_map - A hash where the keys are the existing vector names and

    the values are the new names.  If a vector is renamed
    to a vector name that is already in use, the existing
    one is overwritten.
    

Usage

df = Daru::DataFrame.new({ a: [1,2,3,4], b: [:a,:b,:c,:d], c: [11,22,33,44] })
df.rename_vectors :a => :alpha, :c => :gamma
df.vectors.to_a #=> [:alpha, :b, :gamma]


1353
1354
1355
1356
1357
1358
1359
# File 'lib/daru/dataframe.rb', line 1353

# Renames vectors according to name_map.
#
# A vector that already carries one of the new names is deleted first, so
# the renamed vector replaces it. Entries that map a name to itself are
# ignored for that check.
#
# @param name_map [Hash] existing vector name => new name.
# @return [Daru::Index] the index assigned to vectors.
def rename_vectors(name_map)
  real_renames = name_map.reject { |old_name, new_name| old_name == new_name }
  clobbered = real_renames.values & vectors.to_a
  delete_vectors(*clobbered)

  renamed = vectors.to_a.map { |name| name_map[name] || name }
  self.vectors = Daru::Index.new(renamed)
end

#report_building(b) ⇒ Object

:nodoc: #



1400
1401
1402
1403
1404
1405
1406
1407
1408
# File 'lib/daru/dataframe.rb', line 1400

# Fills a report builder with a section describing this DataFrame: the
# row count, then a text line and a parsed element for every vector.
#
# @param b [Object] builder responding to section(name:) with a block.
def report_building(b) # :nodoc: #
  b.section(name: @name) do |section|
    section.text "Number of rows: #{nrows}"

    @vectors.each do |vector_name|
      section.text "Element:[#{vector_name}]"
      section.parse_element(self[vector_name])
    end
  end
end

#rowObject

Access a row or set/create a row. Refer #[] and #[]= docs for details.

Usage

df.row[:a] # access row named ':a'
df.row[:b] = [1,2,3] # set row ':b' to [1,2,3]


411
412
413
# File 'lib/daru/dataframe.rb', line 411

# Returns a row accessor for this DataFrame.
#
# @return [Daru::Accessors::DataFrameByRow] a new accessor wrapping self.
def row
  accessor_class = Daru::Accessors::DataFrameByRow
  accessor_class.new(self)
end

#save(filename) ⇒ Object

Use marshalling to save dataframe to a file.



1972
1973
1974
# File 'lib/daru/dataframe.rb', line 1972

# Saves this DataFrame to a file by delegating to Daru::IO.save.
#
# @param filename [Object] passed through as the destination.
# @return [Object] whatever Daru::IO.save returns.
def save(filename)
  Daru::IO.save(self, filename)
end

#set_index(new_index, opts = {}) ⇒ Object

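Set a particular vector (column) as the new index of the DataFrame. The vector is removed from the DataFrame unless the :keep option is set.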

Raises:

  • (ArgumentError)


1257
1258
1259
1260
1261
1262
1263
1264
1265
# File 'lib/daru/dataframe.rb', line 1257

# Uses the values of one vector as the new row index.
#
# @param new_index [Object] name of the vector whose values become the index.
# @param opts [Hash] with a truthy :keep the vector is left in place;
#   otherwise it is removed through delete_vector.
# @return [self]
# @raise [ArgumentError] if the vector's unique count differs from @size.
def set_index(new_index, opts = {})
  unless @size == self[new_index].uniq.size
    raise ArgumentError, 'All elements in new index must be unique.'
  end

  self.index = Daru::Index.new(self[new_index].to_a)
  delete_vector(new_index) unless opts[:keep]

  self
end

#shapeObject

Return the number of rows and columns of the DataFrame in an Array.



1081
1082
1083
# File 'lib/daru/dataframe.rb', line 1081

# Returns the dimensions of the DataFrame.
#
# @return [Array] two elements: @index.size followed by @vectors.size.
def shape
  row_count = @index.size
  column_count = @vectors.size
  [row_count, column_count]
end

#sort(vector_order, opts = {}) ⇒ Object

Non-destructive version of #sort!



1532
1533
1534
# File 'lib/daru/dataframe.rb', line 1532

# Non-destructive version of #sort!: sorts a duplicate of this DataFrame.
#
# @param vector_order [Array] forwarded to sort!.
# @param opts [Hash] forwarded to sort!.
# @return [Object] the result of calling sort! on the duplicate.
def sort(vector_order, opts = {})
  copy = dup
  copy.sort!(vector_order, opts)
end

#sort!(vector_order, opts = {}) ⇒ Object

Sorts a dataframe (ascending/descending) in the given priority sequence of vectors, with or without a block.

Examples:

Sort a dataframe with a vector sequence.


df = Daru::DataFrame.new({a: [1,2,1,2,3], b: [5,4,3,2,1]})

df.sort [:a, :b]
# =>
# <Daru::DataFrame:30604000 @name = d6a9294e-2c09-418f-b646-aa9244653444 @size = 5>
#                   a          b
#        2          1          3
#        0          1          5
#        3          2          2
#        1          2          4
#        4          3          1

Sort a dataframe without a block. Here nils will be handled automatically.


df = Daru::DataFrame.new({a: [-3,nil,-1,nil,5], b: [4,3,2,1,4]})

df.sort([:a])
# =>
# <Daru::DataFrame:14810920 @name = c07fb5c7-2201-458d-b679-6a1f7ebfe49f @size = 5>
#                    a          b
#         1        nil          3
#         3        nil          1
#         0         -3          4
#         2         -1          2
#         4          5          4

Sort a dataframe with a block with nils handled automatically.


df = Daru::DataFrame.new({a: [nil,-1,1,nil,-1,1], b: ['aaa','aa',nil,'baaa','x',nil] })

df.sort [:b], by: {b: lambda { |a| a.length } }
# NoMethodError: undefined method `length' for nil:NilClass
# from (pry):8:in `block in __pry__'

df.sort [:b], by: {b: lambda { |a| a.length } }, handle_nils: true

# =>
# <Daru::DataFrame:28469540 @name = 5f986508-556f-468b-be0c-88cc3534445c @size = 6>
#                    a          b
#         2          1        nil
#         5          1        nil
#         4         -1          x
#         1         -1         aa
#         0        nil        aaa
#         3        nil       baaa

Sort a dataframe with a block with nils handled manually.


df = Daru::DataFrame.new({a: [nil,-1,1,nil,-1,1], b: ['aaa','aa',nil,'baaa','x',nil] })

# To print nils at the bottom one can use lambda { |a| (a.nil?)?[1]:[0,a.length] }
df.sort [:b], by: {b: lambda { |a| (a.nil?)?[1]:[0,a.length] } }, handle_nils: true

# =>
#<Daru::DataFrame:22214180 @name = cd7703c7-1dca-4560-840b-5ea51a852ef9 @size = 6>
#                 a          b
#      4         -1          x
#      1         -1         aa
#      0        nil        aaa
#      3        nil       baaa
#      2          1        nil
#      5          1        nil

Parameters:

  • order (Array)

    The order of vector names in which the DataFrame should be sorted.

  • opts (Hash) (defaults to: {})

    The options to sort with.

Options Hash (opts):

  • :ascending (TrueClass, FalseClass, Array) — default: true

    Sort in ascending or descending order. Specify Array corresponding to order for multiple sort orders.

  • :by (Hash) — default: lambda{|a| a }

    Specify attributes of objects to be used for sorting, for each vector name in order as a hash of vector name and lambda expressions. In case a lambda for a vector is not specified, the default will be used.

  • :handle_nils (TrueClass, FalseClass, Array) — default: false

    Handle nils automatically or not when a block is provided. If set to True, nils will appear at top after sorting.

Raises:

  • (ArgumentError)


1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
# File 'lib/daru/dataframe.rb', line 1493

# Sorts the DataFrame in place by the given vectors, in priority order.
#
# @param vector_order [Array] vector names to sort by.
# @param opts [Hash] :ascending (default true), :handle_nils (default
#   false) and :by (default {}); merged over those defaults.
# @return [self]
# @raise [ArgumentError] if vector_order is empty.
def sort!(vector_order, opts = {})
  raise ArgumentError, 'Required atleast one vector name' if vector_order.empty?
  opts = {
    ascending: true,
    handle_nils: false,
    by: {}
  }.merge(opts)

  opts[:ascending] = sort_order_array vector_order, opts[:ascending]
  opts[:handle_nils] = handle_nils_array vector_order, opts[:handle_nils]
  blocks = create_logic_blocks vector_order, opts[:by], opts[:ascending]

  block = lambda do |r1, r2|
    # Build left and right array to compare two rows
    left = build_array_from_blocks vector_order, opts, blocks, r1, r2
    right = build_array_from_blocks vector_order, opts, blocks, r2, r1

    # Resolve conflict by Index if all attributes are same
    left << r1
    right << r2
    left <=> right
  end

  # Sort row positions rather than rows, then rebuild index and data
  # from the resulting permutation.
  idx = (0..@index.size-1).sort(&block)

  old_index = @index.to_a
  self.index = Daru::Index.new(idx.map { |i| old_index[i] })

  vectors.each do |v|
    # Carry each vector's own metadata over to its rebuilt replacement.
    # The listing read `self[v]..dup`, which parses as a Range from the
    # vector to a copy of the whole DataFrame.
    @data[@vectors[v]] = Daru::Vector.new(
      idx.map { |i| @data[@vectors[v]].data[i] },
      name: self[v].name, metadata: self[v].metadata.dup, index: index
    )
  end

  self
end

#summary(method = :to_text) ⇒ Object

Generate a summary of this DataFrame with ReportBuilder.



1396
1397
1398
# File 'lib/daru/dataframe.rb', line 1396

# Generates a summary of this DataFrame with ReportBuilder.
#
# @param method [Symbol] the output method sent to the builder
#   (default :to_text).
# @return [Object] whatever the builder returns for that method.
def summary(method = :to_text)
  report = ReportBuilder.new(no_title: true)
  report.add(self).send(method)
end

#tail(quantity = 10) ⇒ Object Also known as: last

The last ten elements of the DataFrame

Parameters:

  • quantity (Fixnum) (defaults to: 10)

    (10) The number of elements to display from the bottom.



1156
1157
1158
# File 'lib/daru/dataframe.rb', line 1156

# Returns the last rows of the DataFrame.
#
# @param quantity [Integer] number of rows to take from the bottom
#   (default 10). Asking for more rows than @size starts at the first row.
# @return [Object] whatever #[] returns for the selected row range.
def tail(quantity = 10)
  # Clamp the start so quantity > @size cannot yield a negative position.
  first_position = [@size - quantity, 0].max
  self[first_position..(@size - 1), :row]
end

#to_aObject

Converts the DataFrame into an array of hashes where key is vector name and value is the corresponding element. The 0th index of the array contains the array of hashes while the 1st index contains the indexes of each row of the dataframe. Each element in the index array corresponds to its row in the array of hashes, which has the same index.



1839
1840
1841
1842
1843
1844
1845
1846
1847
# File 'lib/daru/dataframe.rb', line 1839

# Converts the DataFrame to a two-element Array.
#
# @return [Array] element 0 is an Array with row.to_h for every row,
#   element 1 is @index.to_a.
def to_a
  row_hashes = []
  each_row { |row| row_hashes << row.to_h }

  [row_hashes, @index.to_a]
end

#to_gslObject

Convert all numeric vectors to GSL::Matrix



1799
1800
1801
1802
1803
1804
1805
1806
# File 'lib/daru/dataframe.rb', line 1799

# Converts all numeric vectors to a GSL::Matrix.
#
# The vectors named by numeric_vectors are turned into arrays, transposed
# into rows and splatted into GSL::Matrix.alloc.
#
# @return [Object] whatever GSL::Matrix.alloc returns.
def to_gsl
  numeric_columns = numeric_vectors.map { |name| self[name].to_a }

  GSL::Matrix.alloc(*numeric_columns.transpose)
end

#to_hObject

Converts DataFrame to a hash (explicit) with keys as vector names and values as the corresponding vectors.



1861
1862
1863
1864
1865
1866
1867
1868
# File 'lib/daru/dataframe.rb', line 1861

# Converts the DataFrame to a Hash of vector name => vector.
#
# @return [Hash] each name in @vectors mapped to the element of @data at
#   the same position.
def to_h
  mapping = {}
  @vectors.each_with_index { |name, position| mapping[name] = @data[position] }
  mapping
end

#to_hashObject

NOTE: This alias will soon be removed. Use to_h in all future work.



99
# File 'lib/daru/monkeys.rb', line 99

# Alias of to_h that is scheduled for removal; use to_h in new code.
alias :to_hash :to_h

#to_html(threshold = 30) ⇒ Object

Convert to html for IRuby.



1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
# File 'lib/daru/dataframe.rb', line 1871

# Renders the DataFrame as an HTML table for IRuby.
#
# Rows are written from the top. Once a row position greater than
# threshold has been written, the remaining rows are summarised by an
# ellipsis row followed by the last row. The last row is never written
# twice, and the ellipsis row appears only when rows were actually
# skipped.
#
# @param threshold [Integer] highest row position written before the
#   output is truncated (default 30).
# @return [String] the HTML markup.
def to_html(threshold = 30)
  html = '<table>' \
    '<tr>' \
      "<th colspan=\"#{@vectors.size + 1}\">" \
        "Daru::DataFrame:#{object_id} " + " rows: #{nrows} " + " cols: #{ncols}" \
      '</th>' \
    '</tr>'
  html << '<tr><th></th>'
  @vectors.each { |vector| html << '<th>' << vector.to_s << '</th>' }
  html << '</tr>'

  last_position = @index.size - 1

  @index.each_with_index do |index, num|
    html << '<tr>'
    html << '<td>' << index.to_s << '</td>'
    row[index].each { |element| html << '<td>' << element.to_s << '</td>' }
    html << '</tr>'

    next if num <= threshold
    # The row just written was already the last one: nothing to summarise.
    break if num == last_position

    # Ellipsis only when at least one row sits between this one and the last.
    if num < last_position - 1
      html << '<tr>'
      (@vectors.size + 1).times { html << '<td>...</td>' }
      html << '</tr>'
    end

    last_index = @index.to_a.last
    last_row = row[last_index]
    html << '<tr>'
    html << '<td>' << last_index.to_s << '</td>'
    ncols.times { |i| html << '<td>' << last_row[i].to_s << '</td>' }
    html << '</tr>'
    break
  end
  html << '</table>'

  html
end

#to_json(no_index = true) ⇒ Object

Convert to json. If no_index is true (the default), the index will NOT be included in the JSON thus created; pass false to include it.



1851
1852
1853
1854
1855
1856
1857
# File 'lib/daru/dataframe.rb', line 1851

# Converts the DataFrame to JSON.
#
# @param no_index [Boolean] when truthy (the default) only the row hashes
#   (to_a[0]) are serialised; otherwise the whole to_a result, which also
#   carries the index, is serialised.
# @return [String] the JSON text.
def to_json(no_index = true)
  payload = no_index ? to_a[0] : to_a
  payload.to_json
end

#to_matrixObject

Convert all vectors of type :numeric into a Matrix.



1809
1810
1811
1812
1813
1814
1815
1816
# File 'lib/daru/dataframe.rb', line 1809

# Converts all vectors of type :numeric into a Matrix.
#
# @return [Object] the result of Matrix.columns given the numeric vectors
#   as arrays.
def to_matrix
  numeric_columns = []
  each_vector do |vector|
    next unless vector.type == :numeric

    numeric_columns << vector.to_a
  end

  Matrix.columns(numeric_columns)
end

#to_nmatrixObject

Convert all vectors of type :numeric and not containing nils into an NMatrix.



1824
1825
1826
1827
1828
1829
1830
1831
1832
# File 'lib/daru/dataframe.rb', line 1824

# Converts all vectors of type :numeric that have no missing positions
# into an NMatrix.
#
# @return [Object] the result of calling to_nm on the transposed arrays.
def to_nmatrix
  complete_columns = []
  each_vector do |vector|
    next unless vector.type == :numeric && vector.missing_positions.empty?

    complete_columns << vector.to_a
  end

  complete_columns.transpose.to_nm
end

#to_nyaplotdfObject

Return a Nyaplot::DataFrame from the data of this DataFrame.



1819
1820
1821
# File 'lib/daru/dataframe.rb', line 1819

# Returns a Nyaplot::DataFrame built from the row hashes of this DataFrame.
#
# @return [Nyaplot::DataFrame] constructed from the first element of to_a.
def to_nyaplotdf
  row_hashes = to_a[0]
  Nyaplot::DataFrame.new(row_hashes)
end

#to_REXPObject

rubocop:disable Style/MethodName



5
6
7
8
9
10
11
12
13
# File 'lib/daru/extensions/rserve.rb', line 5

# Converts the DataFrame to an Rserve data frame expression.
#
# Each vector is wrapped with Rserve::REXP::Wrapper.wrap and collected in
# an Rserve::Rlist together with the vector names as strings.
#
# @return [Object] whatever Rserve::REXP.create_data_frame returns.
def to_REXP # rubocop:disable Style/MethodName
  names = @vectors.to_a
  wrapped = names.map { |name| Rserve::REXP::Wrapper.wrap(self[name].to_a) }
  list = Rserve::Rlist.new(wrapped, names.map(&:to_s))

  Rserve::REXP.create_data_frame(list)
end

#to_sObject



1912
1913
1914
# File 'lib/daru/dataframe.rb', line 1912

# String form of the DataFrame; identical to the output of #to_html.
#
# @return [Object] the result of to_html.
def to_s
  to_html
end

#transposeObject

Transpose a DataFrame, transposing elements and row, column indexing.



2005
2006
2007
2008
2009
2010
2011
2012
# File 'lib/daru/dataframe.rb', line 2005

# Transposes the DataFrame: elements are transposed and the row and
# vector indexes swap roles.
#
# @return [Daru::DataFrame] a new frame whose index is @vectors and whose
#   order is @index, keeping @dtype and @name.
def transpose
  columns = []
  each_vector { |vec| columns << vec.to_a }

  Daru::DataFrame.new(columns.transpose, index: @vectors, order: @index, dtype: @dtype, name: @name)
end

#updateObject

Method for updating the metadata (i.e. missing value positions) of the vectors after assignment/deletion etc. are complete. This is provided so that time is not wasted in recreating the metadata for a vector each time an element is assigned or deleted. Updating data this way is called lazy loading. To set or unset lazy loading, see the Daru.lazy_update= method.



1921
1922
1923
# File 'lib/daru/dataframe.rb', line 1921

# Refresh the metadata of every element of @data by calling #update on it.
#
# Nothing happens (and nil is returned) unless Daru.lazy_update is truthy;
# otherwise the result of @data.each is returned.
def update
  return unless Daru.lazy_update

  @data.each(&:update)
end

#vectorObject



344
345
346
347
# File 'lib/daru/dataframe.rb', line 344

# Deprecated accessor: prints a deprecation notice to $stderr and forwards
# to #[].
#
# names - the arguments to hand to #[] unchanged.
#
# Returns whatever #[] returns for the given names.
def vector(*names)
  # The splat is named so that the caller's arguments actually reach #[].
  $stderr.puts '#vector has been deprecated in favour of #[]. Please use that.'
  self[*names]
end

#vector_by_calculation(&block) ⇒ Object

DSL for yielding each row and returning a Daru::Vector based on the value each run of the block returns.

Usage

a1 = Daru::Vector.new([1, 2, 3, 4, 5, 6, 7])
a2 = Daru::Vector.new([10, 20, 30, 40, 50, 60, 70])
a3 = Daru::Vector.new([100, 200, 300, 400, 500, 600, 700])
ds = Daru::DataFrame.new({ :a => a1, :b => a2, :c => a3 })
total = ds.vector_by_calculation { a + b + c }
# <Daru::Vector:82314050 @name = nil @size = 7 >
#   nil
# 0 111
# 1 222
# 2 333
# 3 444
# 4 555
# 5 666
# 6 777


986
987
988
989
990
991
992
993
# File 'lib/daru/dataframe.rb', line 986

# Evaluate the block once per row and collect the results in a Daru::Vector.
#
# The block is run with instance_eval on each row yielded by #each_row, so
# bare method names inside the block are resolved against the row.
#
# Returns a Daru::Vector of the block results, built with index: @index.
def vector_by_calculation(&block)
  results = []
  each_row { |row| results << row.instance_eval(&block) }

  Daru::Vector.new(results, index: @index)
end

#vector_count_characters(vecs = nil) ⇒ Object



1065
1066
1067
1068
1069
1070
1071
1072
1073
# File 'lib/daru/dataframe.rb', line 1065

# For every row, total the length of the string form of the values stored
# under the given vector names.
#
# vecs - names of the vectors to count; defaults to @vectors.to_a.
#
# A nil value contributes 0 because nil.to_s is the empty string.
#
# Returns whatever #collect_rows returns for the per-row totals.
def vector_count_characters(vecs = nil)
  vecs ||= @vectors.to_a

  collect_rows do |row|
    vecs.inject(0) { |total, name| total + row[name].to_s.size }
  end
end

#vector_mean(max_missing = 0) ⇒ Object

Calculate mean of the rows of the dataframe.

Arguments

  • max_missing - The maximum number of elements in the row that can be missing for the mean calculation to happen. Defaults to 0.



1181
1182
1183
1184
1185
1186
1187
1188
1189
# File 'lib/daru/dataframe.rb', line 1181

# Compute the mean of each row.
#
# max_missing - the largest number of missing positions a row may have and
#               still get a mean; rows with more get nil. Defaults to 0.
#
# Returns a Daru::Vector indexed by @index and named "mean_<@name>".
def vector_mean(max_missing = 0)
  means = Daru::Vector.new([0] * @size, index: @index, name: "mean_#{@name}")

  each_row_with_index do |row, idx|
    too_sparse = row.missing_positions.size > max_missing
    means[idx] = too_sparse ? nil : row.mean
  end

  means
end

#vector_sum(vecs = nil) ⇒ Object

Returns a vector with the sum of all vectors specified in the argument. If the vecs parameter is omitted, all numeric vectors are summed.



1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
# File 'lib/daru/dataframe.rb', line 1164

# Add up the named vectors element-wise.
#
# vecs - names of the vectors to add; defaults to #numeric_vectors.
#
# The running total starts as a Daru::Vector of @size zeros built with
# index: @index, name: @name and dtype: @dtype; each named vector is added
# to it with +.
#
# Returns the final total (the zero vector when vecs is empty).
def vector_sum(vecs = nil)
  vecs ||= numeric_vectors
  zeros = Daru::Vector.new([0] * @size, index: @index, name: @name, dtype: @dtype)

  vecs.inject(zeros) { |total, vec_name| total + self[vec_name] }
end

#verify(*tests) ⇒ Object

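Test each row with one or more tests. Each test is an Array of the form [message, fields, proc], where proc has the form *Proc.new {|row| row[:x] > 0}* and returns true when the row passes; fields lists the row keys whose values are appended to the error message. An optional leading Symbol argument selects the vector used to identify rows in the error messages (defaults to the first vector).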

The function returns an array with all errors.



943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
# File 'lib/daru/dataframe.rb', line 943

# Run every test against every row and collect a message per failure.
#
# tests - an optional leading Symbol naming the vector used to identify a
#         row in messages (defaults to @vectors.first), followed by tests.
#         Each test is indexed as: [0] the message, [1] a list of row keys
#         whose values are appended to the message, [2] a callable that
#         receives the row and returns truthy when the row passes.
#
# Returns an Array of Strings of the form
# "<row number> [<id value>]: <message> (<key>=<value>, ...)", where row
# numbers start at 1 and the parenthesised part is omitted when the key
# list is empty.
def verify(*tests)
  id = tests[0].is_a?(Symbol) ? tests.shift : @vectors.first

  errors = []
  row_number = 0
  each(:row) do |row|
    row_number += 1
    tests.each do |test|
      next if test[2].call(row)

      fields = test[1]
      detail = fields.empty? ? '' : " (#{fields.map { |k| "#{k}=#{row[k]}" }.join(', ')})"
      errors << "#{row_number} [#{row[id]}]: #{test[0]}#{detail}"
    end
  end
  errors
end

#where(bool_array) ⇒ Object

Query a DataFrame by passing a Daru::Core::Query::BoolArray object.



2049
2050
2051
# File 'lib/daru/dataframe.rb', line 2049

# Filter this DataFrame with a boolean array.
#
# bool_array - handed, together with self, to Daru::Core::Query.df_where.
#
# Returns whatever Daru::Core::Query.df_where returns.
def where(bool_array)
  Daru::Core::Query.df_where(self, bool_array)
end

#write_csv(filename, opts = {}) ⇒ Object

Write this DataFrame to a CSV file.

Arguments

  • filename - Path of CSV file where the DataFrame is to be saved.

Options

  • convert_comma - If set to true, will convert any commas in any of the data to full stops ('.'). All the options accepted by CSV.read() can also be passed into this function.



1942
1943
1944
# File 'lib/daru/dataframe.rb', line 1942

# Write this DataFrame to a CSV file.
#
# filename - passed through to Daru::IO.dataframe_write_csv.
# opts     - options Hash passed through unchanged (defaults to {}).
#
# Returns whatever Daru::IO.dataframe_write_csv returns.
def write_csv(filename, opts = {})
  Daru::IO.dataframe_write_csv(self, filename, opts)
end

#write_excel(filename, opts = {}) ⇒ Object

Write this dataframe to an Excel Spreadsheet

Arguments

  • filename - The path of the file where the DataFrame should be written.



1951
1952
1953
# File 'lib/daru/dataframe.rb', line 1951

# Write this DataFrame to an Excel spreadsheet.
#
# filename - passed through to Daru::IO.dataframe_write_excel.
# opts     - options Hash passed through unchanged (defaults to {}).
#
# Returns whatever Daru::IO.dataframe_write_excel returns.
def write_excel(filename, opts = {})
  Daru::IO.dataframe_write_excel(self, filename, opts)
end

#write_sql(dbh, table) ⇒ Object

Insert each row of the DataFrame into the selected table.

Arguments

  • dbh - DBI database connection object.

  • table - Name of the table the rows are inserted into.

Usage

ds = Daru::DataFrame.new({:id=>Daru::Vector.new([1,2,3]), :name=>Daru::Vector.new(["a","b","c"])})
dbh = DBI.connect("DBI:Mysql:database:localhost", "user", "password")
ds.write_sql(dbh,"test")


1967
1968
1969
# File 'lib/daru/dataframe.rb', line 1967

# Write this DataFrame to a database table.
#
# dbh   - database handle, passed through to Daru::IO.dataframe_write_sql.
# table - table identifier, passed through unchanged.
#
# Returns whatever Daru::IO.dataframe_write_sql returns.
def write_sql(dbh, table)
  Daru::IO.dataframe_write_sql(self, dbh, table)
end