Class: Daru::DataFrame

Inherits:
Object show all
Extended by:
Gem::Deprecate
Includes:
Maths::Arithmetic::DataFrame, Maths::Statistics::DataFrame, Plotting::DataFrame::NyaplotLibrary
Defined in:
lib/daru/dataframe.rb,
lib/daru/extensions/rserve.rb

Overview

rubocop:disable Metrics/ClassLength

Instance Attribute Summary collapse

Class Method Summary collapse

Instance Method Summary collapse

Methods included from Plotting::DataFrame::NyaplotLibrary

#plot

Methods included from Maths::Statistics::DataFrame

#acf, #correlation, #count, #covariance, #cumsum, #describe, #ema, #max, #mean, #median, #min, #mode, #percent_change, #product, #range, #rolling_count, #rolling_max, #rolling_mean, #rolling_median, #rolling_min, #rolling_std, #rolling_variance, #standardize, #std, #sum, #variance_sample

Methods included from Maths::Arithmetic::DataFrame

#%, #*, #**, #+, #-, #/, #exp, #round, #sqrt

Constructor Details

#initialize(source, opts = {}) ⇒ DataFrame

DataFrame basically consists of an Array of Vector objects. These objects are indexed by row and column by vectors and index Index objects.

Arguments

  • source - Source from the DataFrame is to be initialized. Can be a Hash

of names and vectors (array or Daru::Vector), an array of arrays or array of Daru::Vectors.

Options

:order - An Array/Daru::Index/Daru::MultiIndex containing the order in which Vectors should appear in the DataFrame.

:index - An Array/Daru::Index/Daru::MultiIndex containing the order in which rows of the DataFrame will be named.

:name - A name for the DataFrame.

:clone - Specify as true or false. When set to false, and Vector objects are passed for the source, the Vector objects will not duplicated when creating the DataFrame. Will have no effect if Array is passed in the source, or if the passed Daru::Vectors have different indexes. Default to true.

Usage

df = Daru::DataFrame.new({a: [1,2,3,4], b: [6,7,8,9]}, order: [:b, :a],
  index: [:a, :b, :c, :d], name: :spider_man)

# =>
# <Daru::DataFrame:80766980 @name = spider_man @size = 4>
#             b          a
#  a          6          1
#  b          7          2
#  c          8          3
#  d          9          4

df = Daru::DataFrame.new([[1,2,3,4],[6,7,8,9]], name: :bat_man)

# =>
# #<Daru::DataFrame: bat_man (4x2)>
#             0          1
#  0          1          6
#  1          2          7
#  2          3          8
#  3          4          9

# Dataframe having Index name

df = Daru::DataFrame.new({a: [1,2,3,4], b: [6,7,8,9]}, order: [:b, :a],
  index: Daru::Index.new([:a, :b, :c, :d], name: 'idx_name'),
  name: :spider_man)

# =>
# <Daru::DataFrame:80766980 @name = spider_man @size = 4>
# idx_name            b          a
#        a          6          1
#        b          7          2
#        c          8          3
#        d          9          4

idx = Daru::Index.new [100, 99, 101, 1, 2], name: "s1"
=> #<Daru::Index(5): s1 {100, 99, 101, 1, 2}>

df = Daru::DataFrame.new({b: [11,12,13,14,15], a: [1,2,3,4,5],
  c: [11,22,33,44,55]},
  order: [:a, :b, :c],
  index: idx)
 # =>
 #<Daru::DataFrame(5x3)>
 #   s1   a   b   c
 #  100   1  11  11
 #   99   2  12  22
 #  101   3  13  33
 #    1   4  14  44
 #    2   5  15  55


332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
# File 'lib/daru/dataframe.rb', line 332

def initialize source, opts={} # rubocop:disable Metrics/MethodLength
  vectors, index = opts[:order], opts[:index] # FIXME: just keyword arges after Ruby 2.1
  @data = []
  @name = opts[:name]

  case source
  when ->(s) { s.empty? }
    @vectors = Index.coerce vectors
    @index   = Index.coerce index
    create_empty_vectors
  when Array
    initialize_from_array source, vectors, index, opts
  when Hash
    initialize_from_hash source, vectors, index, opts
  end

  set_size
  validate
  update
  self.plotting_library = Daru.plotting_library
end

Dynamic Method Handling

This class handles dynamic methods through the method_missing method

#method_missing(name, *args, &block) ⇒ Object



2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
# File 'lib/daru/dataframe.rb', line 2054

def method_missing(name, *args, &block)
  case
  when name =~ /(.+)\=/
    name = name[/(.+)\=/].delete('=')
    name = name.to_sym unless has_vector?(name)
    insert_or_modify_vector [name], args[0]
  when has_vector?(name)
    self[name]
  when has_vector?(name.to_s)
    self[name.to_s]
  else
    super
  end
end

Instance Attribute Details

#dataObject (readonly)

TOREMOVE



243
244
245
# File 'lib/daru/dataframe.rb', line 243

def data
  @data
end

#indexObject

The index of the rows of the DataFrame



246
247
248
# File 'lib/daru/dataframe.rb', line 246

def index
  @index
end

#nameObject (readonly)

The name of the DataFrame



249
250
251
# File 'lib/daru/dataframe.rb', line 249

def name
  @name
end

#sizeObject (readonly)

The number of rows present in the DataFrame



252
253
254
# File 'lib/daru/dataframe.rb', line 252

def size
  @size
end

#vectorsObject

The vectors (columns) index of the DataFrame



241
242
243
# File 'lib/daru/dataframe.rb', line 241

def vectors
  @vectors
end

Class Method Details

._load(data) ⇒ Object



1980
1981
1982
1983
1984
1985
1986
# File 'lib/daru/dataframe.rb', line 1980

def self._load data
  h = Marshal.load data
  Daru::DataFrame.new(h[:data],
    index: h[:index],
    order: h[:order],
    name:  h[:name])
end

.crosstab_by_assignation(rows, columns, values) ⇒ Object

Generates a new dataset, using three vectors

  • Rows

  • Columns

  • Values

For example, you have these values

x   y   v
a   a   0
a   b   1
b   a   1
b   b   0

You obtain

id  a   b
 a  0   1
 b  1   0

Useful to process outputs from databases



199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
# File 'lib/daru/dataframe.rb', line 199

def crosstab_by_assignation rows, columns, values
  raise 'Three vectors should be equal size' if
    rows.size != columns.size || rows.size!=values.size

  data = Hash.new { |h, col|
    h[col] = rows.factors.map { |r| [r, nil] }.to_h
  }
  columns.zip(rows, values).each { |c, r, v| data[c][r] = v }

  # FIXME: in fact, WITHOUT this line you'll obtain more "right"
  # data: with vectors having "rows" as an index...
  data = data.map { |c, r| [c, r.values] }.to_h
  data[:_id] = rows.factors

  DataFrame.new(data)
end

.from_activerecord(relation, *fields) ⇒ Object

Read a dataframe from AR::Relation

USE:

# When Post model is defined as:
class Post < ActiveRecord::Base
  scope :active, -> { where.not(published_at: nil) }
end

# You can load active posts into a dataframe by:
Daru::DataFrame.from_activerecord(Post.active, :title, :published_at)

Parameters:

  • relation (ActiveRecord::Relation)

    An AR::Relation object from which data is loaded

Returns:

  • A dataframe containing the data loaded from the relation



100
101
102
# File 'lib/daru/dataframe.rb', line 100

def from_activerecord relation, *fields
  Daru::IO.from_activerecord relation, *fields
end

.from_csv(path, opts = {}, &block) ⇒ Object

Load data from a CSV file. Specify an optional block to grab the CSV object and pre-condition it (for example use the ‘convert` or `header_convert` methods).

Arguments

  • path - Local path / Remote URL of the file to load specified as a String.

Options

Accepts the same options as the Daru::DataFrame constructor and CSV.open() and uses those to eventually construct the resulting DataFrame.

Verbose Description

You can specify all the options to the ‘.from_csv` function that you do to the Ruby `CSV.read()` function, since this is what is used internally.

For example, if the columns in your CSV file are separated by something other that commas, you can use the ‘:col_sep` option. If you want to convert numeric values to numbers and not keep them as strings, you can use the `:converters` option and set it to `:numeric`.

The ‘.from_csv` function uses the following defaults for reading CSV files (that are passed into the `CSV.read()` function):

{
  :col_sep           => ',',
  :converters        => :numeric
}


47
48
49
# File 'lib/daru/dataframe.rb', line 47

def from_csv path, opts={}, &block
  Daru::IO.from_csv path, opts, &block
end

.from_excel(path, opts = {}, &block) ⇒ Object

Read data from an Excel file into a DataFrame.

Arguments

  • path - Path of the file to be read.

Options

*:worksheet_id - ID of the worksheet that is to be read.



60
61
62
# File 'lib/daru/dataframe.rb', line 60

def from_excel path, opts={}, &block
  Daru::IO.from_excel path, opts, &block
end

.from_html(path, fields = {}) ⇒ Object

Read the table data from a remote html file. Please note that this module works only for static table elements on a HTML page, and won’t work in cases where the data is being loaded into the HTML table by Javascript.

By default - all <th> tag elements in the first proper row are considered as the order, and all the <th> tag elements in the first column are considered as the index.

Arguments

  • path [String] - URL of the target HTML file.

  • fields [Hash] -

    :match - A String to match and choose a particular table(s) from multiple tables of a HTML page.

    :order - An Array which would act as the user-defined order, to override the parsed Daru::DataFrame.

    :index - An Array which would act as the user-defined index, to override the parsed Daru::DataFrame.

    :name - A String that manually assigns a name to the scraped Daru::DataFrame, for user’s preference.

Returns

An Array of Daru::DataFrames, with each dataframe corresponding to a HTML table on that webpage.

Usage

dfs = Daru::DataFrame.from_html("http://www.moneycontrol.com/", match: "Sun Pharma")
dfs.count
# => 4

dfs.first
#
# => <Daru::DataFrame(5x4)>
#          Company      Price     Change Value (Rs
#     0 Sun Pharma     502.60     -65.05   2,117.87
#     1   Reliance    1356.90      19.60     745.10
#     2 Tech Mahin     379.45     -49.70     650.22
#     3        ITC     315.85       6.75     621.12
#     4       HDFC    1598.85      50.95     553.91


159
160
161
# File 'lib/daru/dataframe.rb', line 159

def from_html path, fields={}
  Daru::IO.from_html path, fields
end

.from_plaintext(path, fields) ⇒ Object

Read the database from a plaintext file. For this method to work, the data should be present in a plain text file in columns. See spec/fixtures/bank2.dat for an example.

Arguments

  • path - Path of the file to be read.

  • fields - Vector names of the resulting database.

Usage

df = Daru::DataFrame.from_plaintext 'spec/fixtures/bank2.dat', [:v1,:v2,:v3,:v4,:v5,:v6]


116
117
118
# File 'lib/daru/dataframe.rb', line 116

def from_plaintext path, fields
  Daru::IO.from_plaintext path, fields
end

.from_sql(dbh, query) ⇒ Object

Read a database query and returns a Dataset

USE:

dbh = DBI.connect("DBI:Mysql:database:localhost", "user", "password")
Daru::DataFrame.from_sql(dbh, "SELECT * FROM test")

#Alternatively

require 'dbi'
Daru::DataFrame.from_sql("path/to/sqlite.db", "SELECT * FROM test")

Parameters:

  • dbh (DBI::DatabaseHandle, String)

    A DBI connection OR Path to a SQlite3 database.

  • query (String)

    The query to be executed

Returns:

  • A dataframe containing the data resulting from the query



80
81
82
# File 'lib/daru/dataframe.rb', line 80

def from_sql dbh, query
  Daru::IO.from_sql dbh, query
end

.rows(source, opts = {}) ⇒ Object

Create DataFrame by specifying rows as an Array of Arrays or Array of Daru::Vector objects.

Raises:



165
166
167
168
169
170
171
172
173
174
175
176
177
178
# File 'lib/daru/dataframe.rb', line 165

def rows source, opts={}
  raise SizeError, 'All vectors must have same length' \
    unless source.all? { |v| v.size == source.first.size }

  opts[:order] ||= guess_order(source)

  if ArrayHelper.array_of?(source, Array) || source.empty?
    DataFrame.new(source.transpose, opts)
  elsif ArrayHelper.array_of?(source, Vector)
    from_vector_rows(source, opts)
  else
    raise ArgumentError, "Can't create DataFrame from #{source}"
  end
end

Instance Method Details

#==(other) ⇒ Object



2029
2030
2031
2032
2033
2034
2035
# File 'lib/daru/dataframe.rb', line 2029

def == other
  self.class == other.class   &&
    @size    == other.size    &&
    @index   == other.index   &&
    @vectors == other.vectors &&
    @vectors.to_a.all? { |v| self[v] == other[v] }
end

#[](*names) ⇒ Object

Access row or vector. Specify name of row/vector followed by axis(:row, :vector). Defaults to :vector. Use of this method is not recommended for accessing rows. Use df.row for accessing row with index ‘:a’.



372
373
374
375
# File 'lib/daru/dataframe.rb', line 372

def [](*names)
  axis = extract_axis(names, :vector)
  dispatch_to_axis axis, :access, *names
end

#[]=(*args) ⇒ Object

Insert a new row/vector of the specified name or modify a previous row. Instead of using this method directly, use df.row = [1,2,3] to set/create a row ‘:a’ to [1,2,3], or df.vector = [1,2,3] for vectors.

In case a Daru::Vector is specified after the equality the sign, the indexes of the vector will be matched against the row/vector indexes of the DataFrame before an insertion is performed. Unmatched indexes will be set to nil.



516
517
518
519
520
521
522
# File 'lib/daru/dataframe.rb', line 516

def []=(*args)
  vector = args.pop
  axis = extract_axis(args)
  names = args

  dispatch_to_axis axis, :insert_or_modify, names, vector
end

#_dump(_depth) ⇒ Object



1971
1972
1973
1974
1975
1976
1977
1978
# File 'lib/daru/dataframe.rb', line 1971

def _dump(_depth)
  Marshal.dump(
    data:  @data,
    index: @index.to_a,
    order: @vectors.to_a,
    name:  @name
  )
end

#access_row_tuples_by_indexs(*indexes) ⇒ Object

returns array of row tuples at given index(s)



2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
# File 'lib/daru/dataframe.rb', line 2114

def access_row_tuples_by_indexs *indexes
  positions = @index.pos(*indexes)

  return populate_row_for(positions) if positions.is_a? Numeric

  res = []
  new_rows = @data.map { |vec| vec[*indexes] }
  indexes.each do |index|
    tuples = []
    new_rows.map { |row| tuples += [row[index]] }
    res << tuples
  end
  res
end

#add_row(row, index = nil) ⇒ Object



524
525
526
# File 'lib/daru/dataframe.rb', line 524

def add_row row, index=nil
  self.row[index || @size] = row
end

#add_vector(n, vector) ⇒ Object



528
529
530
# File 'lib/daru/dataframe.rb', line 528

def add_vector n, vector
  self[n] = vector
end

#add_vectors_by_split(name, join = '-', sep = Daru::SPLIT_TOKEN) ⇒ Object



1176
1177
1178
1179
1180
# File 'lib/daru/dataframe.rb', line 1176

def add_vectors_by_split(name,join='-',sep=Daru::SPLIT_TOKEN)
  self[name]
    .split_by_separator(sep)
    .each { |k,v| self["#{name}#{join}#{k}".to_sym] = v }
end

#add_vectors_by_split_recode(nm, join = '-', sep = Daru::SPLIT_TOKEN) ⇒ Object



1777
1778
1779
1780
1781
1782
1783
1784
# File 'lib/daru/dataframe.rb', line 1777

def add_vectors_by_split_recode(nm, join='-', sep=Daru::SPLIT_TOKEN)
  self[nm]
    .split_by_separator(sep)
    .each_with_index do |(k, v), i|
      v.rename "#{nm}:#{k}"
      self["#{nm}#{join}#{i + 1}".to_sym] = v
    end
end

#all?(axis = :vector, &block) ⇒ Boolean

Works like Array#all?

Examples:

Using all?

df = Daru::DataFrame.new({a: [1,2,3,4,5], b: ['a', 'b', 'c', 'd', 'e']})
df.all?(:row) do |row|
  row[:a] < 10
end #=> true

Parameters:

  • axis (Symbol) (defaults to: :vector)

    (:vector) The axis to iterate over. Can be :vector or :row. A Daru::Vector object is yielded in the block.

Returns:

  • (Boolean)


1233
1234
1235
1236
1237
1238
1239
1240
1241
# File 'lib/daru/dataframe.rb', line 1233

def all? axis=:vector, &block
  if %i[vector column].include?(axis)
    @data.all?(&block)
  elsif axis == :row
    each_row.all?(&block)
  else
    raise ArgumentError, "Unidentified axis #{axis}"
  end
end

#any?(axis = :vector, &block) ⇒ Boolean

Works like Array#any?.

Examples:

Using any?

df = Daru::DataFrame.new({a: [1,2,3,4,5], b: ['a', 'b', 'c', 'd', 'e']})
df.any?(:row) do |row|
  row[:a] < 3 and row[:b] == 'b'
end #=> true

Parameters:

  • axis (Symbol) (defaults to: :vector)

    (:vector) The axis to iterate over. Can be :vector or :row. A Daru::Vector object is yielded in the block.

Returns:

  • (Boolean)


1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
# File 'lib/daru/dataframe.rb', line 1211

def any? axis=:vector, &block
  if %i[vector column].include?(axis)
    @data.any?(&block)
  elsif axis == :row
    each_row do |row|
      return true if yield(row)
    end
    false
  else
    raise ArgumentError, "Unidentified axis #{axis}"
  end
end

#at(*positions) ⇒ Daru::Vector, Daru::DataFrame

Retrive vectors by positions

Examples:

df = Daru::DataFrame.new({
  a: [1, 2, 3],
  b: ['a', 'b', 'c']
})
df.at 0
# => #<Daru::Vector(3)>
#       a
#   0   1
#   1   2
#   2   3

Parameters:

  • *positions (Array<Integer>)

    positions of vectors to retrive

Returns:



454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
# File 'lib/daru/dataframe.rb', line 454

def at *positions
  if AXES.include? positions.last
    axis = positions.pop
    return row_at(*positions) if axis == :row
  end

  original_positions = positions
  positions = coerce_positions(*positions, ncols)
  validate_positions(*positions, ncols)

  if positions.is_a? Integer
    @data[positions].dup
  else
    Daru::DataFrame.new positions.map { |pos| @data[pos].dup },
      index: @index,
      order: @vectors.at(*original_positions),
      name: @name
  end
end

#bootstrap(n = nil) ⇒ Daru::DataFrame

Creates a DataFrame with the random data, of n size. If n not given, uses original number of rows.

Returns:



982
983
984
985
986
987
988
989
990
# File 'lib/daru/dataframe.rb', line 982

def bootstrap(n=nil)
  n ||= nrows
  Daru::DataFrame.new({}, order: @vectors).tap do |df_boot|
    n.times do
      df_boot.add_row(row[rand(n)])
    end
    df_boot.update
  end
end

#clone(*vectors_to_clone) ⇒ Object

Returns a ‘view’ of the DataFrame, i.e the object ID’s of vectors are preserved.

Arguments

vectors_to_clone - Names of vectors to clone. Optional. Will return a view of the whole data frame otherwise.



568
569
570
571
572
573
574
# File 'lib/daru/dataframe.rb', line 568

def clone *vectors_to_clone
  vectors_to_clone.flatten! if ArrayHelper.array_of?(vectors_to_clone, Array)
  vectors_to_clone = @vectors.to_a if vectors_to_clone.empty?

  h = vectors_to_clone.map { |vec| [vec, self[vec]] }.to_h
  Daru::DataFrame.new(h, clone: false, order: vectors_to_clone, name: @name)
end

#clone_only_validObject

Returns a ‘shallow’ copy of DataFrame if missing data is not present, or a full copy of only valid data if missing data is present.



578
579
580
581
582
583
584
# File 'lib/daru/dataframe.rb', line 578

def clone_only_valid
  if include_values?(*Daru::MISSING_VALUES)
    reject_values(*Daru::MISSING_VALUES)
  else
    clone
  end
end

#clone_structureObject

Only clone the structure of the DataFrame.



557
558
559
# File 'lib/daru/dataframe.rb', line 557

def clone_structure
  Daru::DataFrame.new([], order: @vectors.dup, index: @index.dup, name: @name)
end

#collect(axis = :vector, &block) ⇒ Object

Iterate over a row or vector and return results in a Daru::Vector. Specify axis with :vector or :row. Default to :vector.

Description

The #collect iterator works similar to #map, the only difference being that it returns a Daru::Vector comprising of the results of each block run. The resultant Vector has the same index as that of the axis over which collect has iterated. It also accepts the optional axis argument.

Arguments

  • axis - The axis to iterate over. Can be :vector (or :column)

or :row. Default to :vector.



739
740
741
# File 'lib/daru/dataframe.rb', line 739

def collect axis=:vector, &block
  dispatch_to_axis_pl axis, :collect, &block
end

#collect_matrix::Matrix

Generate a matrix, based on vector names of the DataFrame.

:nocov: FIXME: Even not trying to cover this: I can’t get, how it is expected to work.… – zverok

Returns:



934
935
936
937
938
939
940
941
942
943
944
945
# File 'lib/daru/dataframe.rb', line 934

def collect_matrix
  return to_enum(:collect_matrix) unless block_given?

  vecs = vectors.to_a
  rows = vecs.collect { |row|
    vecs.collect { |col|
      yield row,col
    }
  }

  Matrix.rows(rows)
end

#collect_row_with_index(&block) ⇒ Object



908
909
910
911
912
# File 'lib/daru/dataframe.rb', line 908

def collect_row_with_index &block
  return to_enum(:collect_row_with_index) unless block_given?

  Daru::Vector.new(each_row_with_index.map(&block), index: @index)
end

#collect_rows(&block) ⇒ Object

Retrieves a Daru::Vector, based on the result of calculation performed on each row.



902
903
904
905
906
# File 'lib/daru/dataframe.rb', line 902

def collect_rows &block
  return to_enum(:collect_rows) unless block_given?

  Daru::Vector.new(each_row.map(&block), index: @index)
end

#collect_vector_with_index(&block) ⇒ Object



922
923
924
925
926
# File 'lib/daru/dataframe.rb', line 922

def collect_vector_with_index &block
  return to_enum(:collect_vector_with_index) unless block_given?

  Daru::Vector.new(each_vector_with_index.map(&block), index: @vectors)
end

#collect_vectors(&block) ⇒ Object

Retrives a Daru::Vector, based on the result of calculation performed on each vector.



916
917
918
919
920
# File 'lib/daru/dataframe.rb', line 916

def collect_vectors &block
  return to_enum(:collect_vectors) unless block_given?

  Daru::Vector.new(each_vector.map(&block), index: @vectors)
end

#compute(text, &block) ⇒ Object

Returns a vector, based on a string with a calculation based on vector.

The calculation will be eval’ed, so you can put any variable or expression valid on ruby.

For example:

a = Daru::Vector.new [1,2]
b = Daru::Vector.new [3,4]
ds = Daru::DataFrame.new({:a => a,:b => b})
ds.compute("a+b")
=> Vector [4,6]


1101
1102
1103
1104
# File 'lib/daru/dataframe.rb', line 1101

def compute text, &block
  return instance_eval(&block) if block_given?
  instance_eval(text)
end

#concat(other_df) ⇒ Object

Concatenate another DataFrame along corresponding columns. If columns do not exist in both dataframes, they are filled with nils



1342
1343
1344
1345
1346
1347
1348
1349
1350
# File 'lib/daru/dataframe.rb', line 1342

def concat other_df
  vectors = (@vectors.to_a + other_df.vectors.to_a).uniq

  data = vectors.map do |v|
    get_vector_anyways(v).dup.concat(other_df.get_vector_anyways(v))
  end

  Daru::DataFrame.new(data, order: vectors)
end

#create_sql(table, charset = 'UTF8') ⇒ Object

Create a sql, basen on a given Dataset

Arguments

  • table - String specifying name of the table that will created in SQL.

  • charset - Character set. Default is “UTF8”.

Examples:


ds = Daru::DataFrame.new({
 :id   => Daru::Vector.new([1,2,3,4,5]),
 :name => Daru::Vector.new(%w{Alex Peter Susan Mary John})
})
ds.create_sql('names')
 #=>"CREATE TABLE names (id INTEGER,\n name VARCHAR (255)) CHARACTER SET=UTF8;"


1802
1803
1804
1805
1806
1807
1808
1809
1810
# File 'lib/daru/dataframe.rb', line 1802

def create_sql(table,charset='UTF8')
  sql    = "CREATE TABLE #{table} ("
  fields = vectors.to_a.collect do |f|
    v = self[f]
    f.to_s + ' ' + v.db_type
  end

  sql + fields.join(",\n ")+") CHARACTER SET=#{charset};"
end

#delete_row(index) ⇒ Object

Delete a row

Raises:

  • (IndexError)


966
967
968
969
970
971
972
973
974
975
976
# File 'lib/daru/dataframe.rb', line 966

def delete_row index
  idx = named_index_for index

  raise IndexError, "Index #{index} does not exist." unless @index.include? idx
  @index = Daru::Index.new(@index.to_a - [idx])
  each_vector do |vector|
    vector.delete_at idx
  end

  set_size
end

#delete_vector(vector) ⇒ Object

Delete a vector

Raises:

  • (IndexError)


949
950
951
952
953
954
955
956
# File 'lib/daru/dataframe.rb', line 949

def delete_vector vector
  raise IndexError, "Vector #{vector} does not exist." unless @vectors.include?(vector)

  @data.delete_at @vectors[vector]
  @vectors = Daru::Index.new @vectors.to_a - [vector]

  self
end

#delete_vectors(*vectors) ⇒ Object

Deletes a list of vectors



959
960
961
962
963
# File 'lib/daru/dataframe.rb', line 959

def delete_vectors *vectors
  Array(vectors).each { |vec| delete_vector vec }

  self
end

#dup(vectors_to_dup = nil) ⇒ Object

Duplicate the DataFrame entirely.

Arguments

  • vectors_to_dup - An Array specifying the names of Vectors to

be duplicated. Will duplicate the entire DataFrame if not specified.



547
548
549
550
551
552
553
554
# File 'lib/daru/dataframe.rb', line 547

def dup vectors_to_dup=nil
  vectors_to_dup = @vectors.to_a unless vectors_to_dup

  src = vectors_to_dup.map { |vec| @data[@vectors.pos(vec)].dup }
  new_order = Daru::Index.new(vectors_to_dup)

  Daru::DataFrame.new src, order: new_order, index: @index.dup, name: @name, clone: true
end

#dup_only_valid(vecs = nil) ⇒ Object

Creates a new duplicate dataframe containing only rows without a single missing value.



588
589
590
591
592
593
594
595
# File 'lib/daru/dataframe.rb', line 588

def dup_only_valid vecs=nil
  rows_with_nil = @data.map { |vec| vec.indexes(*Daru::MISSING_VALUES) }
                       .inject(&:concat)
                       .uniq

  row_indexes = @index.to_a
  (vecs.nil? ? self : dup(vecs)).row[*(row_indexes - rows_with_nil)]
end

#each(axis = :vector, &block) ⇒ Object

Iterate over each row or vector of the DataFrame. Specify axis by passing :vector or :row as the argument. Default to :vector.

Description

‘#each` works exactly like Array#each. The default mode for `each` is to iterate over the columns of the DataFrame. To iterate over rows you must pass the axis, i.e `:row` as an argument.

Arguments

  • axis - The axis to iterate over. Can be :vector (or :column)

or :row. Default to :vector.



720
721
722
# File 'lib/daru/dataframe.rb', line 720

def each axis=:vector, &block
  dispatch_to_axis axis, :each, &block
end

#each_index(&block) ⇒ Object

Iterate over each index of the DataFrame.



654
655
656
657
658
659
660
# File 'lib/daru/dataframe.rb', line 654

def each_index &block
  return to_enum(:each_index) unless block_given?

  @index.each(&block)

  self
end

#each_rowObject

Iterate over each row



687
688
689
690
691
692
693
694
695
# File 'lib/daru/dataframe.rb', line 687

def each_row
  return to_enum(:each_row) unless block_given?

  @index.size.times do |pos|
    yield row_at(pos)
  end

  self
end

#each_row_with_indexObject



697
698
699
700
701
702
703
704
705
# File 'lib/daru/dataframe.rb', line 697

def each_row_with_index
  return to_enum(:each_row_with_index) unless block_given?

  @index.each do |index|
    yield access_row(index), index
  end

  self
end

#each_vector(&block) ⇒ Object Also known as: each_column

Iterate over each vector



663
664
665
666
667
668
669
# File 'lib/daru/dataframe.rb', line 663

def each_vector(&block)
  return to_enum(:each_vector) unless block_given?

  @data.each(&block)

  self
end

#each_vector_with_indexObject Also known as: each_column_with_index

Iterate over each vector alongwith the name of the vector



674
675
676
677
678
679
680
681
682
# File 'lib/daru/dataframe.rb', line 674

def each_vector_with_index
  return to_enum(:each_vector_with_index) unless block_given?

  @vectors.each do |vector|
    yield @data[@vectors[vector]], vector
  end

  self
end

#filter(axis = :vector, &block) ⇒ Object

Retain vectors or rows if the block returns a truthy value.

Description

For filtering out certain rows/vectors based on their values, use the #filter method. By default it iterates over vectors and keeps those vectors for which the block returns true. It accepts an optional axis argument which lets you specify whether you want to iterate over vectors or rows.

Arguments

  • axis - The axis to map over. Can be :vector (or :column) or :row.

Default to :vector.

Usage

# Filter vectors

df.filter do |vector|
  vector.type == :numeric and vector.median < 50
end

# Filter rows

df.filter(:row) do |row|
  row[:a] + row[:d] < 100
end


828
829
830
# File 'lib/daru/dataframe.rb', line 828

def filter axis=:vector, &block
  dispatch_to_axis_pl axis, :filter, &block
end

#filter_rowsObject

Iterates over each row and retains it in a new DataFrame if the block returns true for that row.



1011
1012
1013
1014
1015
1016
1017
# File 'lib/daru/dataframe.rb', line 1011

def filter_rows
  return to_enum(:filter_rows) unless block_given?

  keep_rows = @index.map { |index| yield access_row(index) }

  where keep_rows
end

#filter_vector(vec, &block) ⇒ Object

creates a new vector with the data of a given field which the block returns true



1005
1006
1007
# File 'lib/daru/dataframe.rb', line 1005

def filter_vector vec, &block
  Daru::Vector.new(each_row.select(&block).map { |row| row[vec] })
end

#filter_vectors(&block) ⇒ Object

Iterates over each vector and retains it in a new DataFrame if the block returns true for that vector.



1021
1022
1023
1024
1025
# File 'lib/daru/dataframe.rb', line 1021

def filter_vectors &block
  return to_enum(:filter_vectors) unless block_given?

  dup.tap { |df| df.keep_vector_if(&block) }
end

#get_vector_anyways(v) ⇒ Object



1336
1337
1338
# File 'lib/daru/dataframe.rb', line 1336

def get_vector_anyways(v)
  @vectors.include?(v) ? self[v].to_a : [nil] * size
end

#group_by(*vectors) ⇒ Object

Group elements by vector to perform operations on them. Returns a Daru::Core::GroupBy object.See the Daru::Core::GroupBy docs for a detailed list of possible operations.

Arguments

  • vectors - An Array contatining names of vectors to group by.

Usage

df = Daru::DataFrame.new({
  a: %w{foo bar foo bar   foo bar foo foo},
  b: %w{one one two three two two one three},
  c:   [1  ,2  ,3  ,1    ,3  ,6  ,3  ,8],
  d:   [11 ,22 ,33 ,44   ,55 ,66 ,77 ,88]
})
df.group_by([:a,:b,:c]).groups
#=> {["bar", "one", 2]=>[1],
# ["bar", "three", 1]=>[3],
# ["bar", "two", 6]=>[5],
# ["foo", "one", 1]=>[0],
# ["foo", "one", 3]=>[6],
# ["foo", "three", 8]=>[7],
# ["foo", "two", 3]=>[2, 4]}


1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
# File 'lib/daru/dataframe.rb', line 1311

def group_by *vectors
  vectors.flatten!
  # FIXME: wouldn't it better to do vectors - @vectors here and
  # raise one error with all non-existent vector names?.. - zverok, 2016-05-18
  vectors.each { |v|
    raise(ArgumentError, "Vector #{v} does not exist") unless has_vector?(v)
  }

  vectors = [@vectors.first] if vectors.empty?

  Daru::Core::GroupBy.new(self, vectors)
end

#has_missing_data?Boolean Also known as: flawed?

Returns:

  • (Boolean)


1123
1124
1125
# File 'lib/daru/dataframe.rb', line 1123

def has_missing_data?
  @data.any? { |vec| vec.include_values?(*Daru::MISSING_VALUES) }
end

#has_vector?(vector) ⇒ Boolean

Check if a vector is present

Returns:

  • (Boolean)


1198
1199
1200
# File 'lib/daru/dataframe.rb', line 1198

def has_vector? vector
  @vectors.include? vector
end

#head(quantity = 10) ⇒ Object Also known as: first

The first ten elements of the DataFrame

Parameters:

  • quantity (Fixnum) (defaults to: 10)

    (10) The number of elements to display from the top.



1246
1247
1248
# File 'lib/daru/dataframe.rb', line 1246

def head quantity=10
  row.at 0..(quantity-1)
end

#include_values?(*values) ⇒ true, false

Check if any of given values occur in the data frame

Examples:

df = Daru::DataFrame.new({
  a: [1,    2,          3,   nil,        Float::NAN, nil, 1,   7],
  b: [:a,  :b,          nil, Float::NAN, nil,        3,   5,   8],
  c: ['a',  Float::NAN, 3,   4,          3,          5,   nil, 7]
}, index: 11..18)
df.include_values? nil
# => true

Parameters:

  • *values (Array)

    values to check for

Returns:

  • (true, false)

    true if any of the given values occur in the dataframe, false otherwise



1142
1143
1144
# File 'lib/daru/dataframe.rb', line 1142

def include_values?(*values)
  @data.any? { |vec| vec.include_values?(*values) }
end

#inspect(spacing = 10, threshold = 15) ⇒ Object

Pretty print in a nice table format for the command line (irb/pry/iruby)



2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
# File 'lib/daru/dataframe.rb', line 2011

def inspect spacing=10, threshold=15
  name_part = @name ? ": #{@name} " : ''

  "#<#{self.class}#{name_part}(#{nrows}x#{ncols})>\n" +
    Formatters::Table.format(
      each_row.lazy,
      row_headers: row_headers,
      headers: headers,
      threshold: threshold,
      spacing: spacing
    )
end

#interact_code(vector_names, full) ⇒ Object



2073
2074
2075
2076
2077
2078
2079
2080
2081
# File 'lib/daru/dataframe.rb', line 2073

def interact_code vector_names, full
  dfs = vector_names.zip(full).map do |vec_name, f|
    self[vec_name].contrast_code(full: f).each.to_a
  end

  all_vectors = recursive_product(dfs)
  Daru::DataFrame.new all_vectors,
    order: all_vectors.map(&:name)
end

#join(other_df, opts = {}) ⇒ Daru::DataFrame

Join 2 DataFrames with SQL style joins. Currently supports inner, left outer, right outer and full outer joins.

Examples:

Inner Join

left = Daru::DataFrame.new({
  :id   => [1,2,3,4],
  :name => ['Pirate', 'Monkey', 'Ninja', 'Spaghetti']
})
right = Daru::DataFrame.new({
  :id => [1,2,3,4],
  :name => ['Rutabaga', 'Pirate', 'Darth Vader', 'Ninja']
})
left.join(right, how: :inner, on: [:name])
#=>
##<Daru::DataFrame:82416700 @name = 74c0811b-76c6-4c42-ac93-e6458e82afb0 @size = 2>
#                 id_1       name       id_2
#         0          1     Pirate          2
#         1          3      Ninja          4

Parameters:

  • other_df (Daru::DataFrame)

    Another DataFrame on which the join is to be performed.

  • opts (Hash) (defaults to: {})

    Options Hash

  • :how (Hash)

    a customizable set of options

  • :on (Hash)

    a customizable set of options

  • :indicator (Hash)

    a customizable set of options

Returns:



1725
1726
1727
# File 'lib/daru/dataframe.rb', line 1725

def join(other_df,opts={})
  Daru::Core::Merge.join(self, other_df, opts)
end

#keep_row_ifObject



992
993
994
995
996
# File 'lib/daru/dataframe.rb', line 992

def keep_row_if
  @index
    .reject { |idx| yield access_row(idx) }
    .each { |idx| delete_row idx }
end

#keep_vector_ifObject



998
999
1000
1001
1002
# File 'lib/daru/dataframe.rb', line 998

def keep_vector_if
  @vectors.each do |vector|
    delete_vector(vector) unless yield(@data[@vectors[vector]], vector)
  end
end

#map(axis = :vector, &block) ⇒ Object

Map over each vector or row of the data frame according to the argument specified. Will return an Array of the resulting elements. To map over each row/vector and get a DataFrame, see #recode.

Description

The #map iterator works like Array#map. The value returned by each run of the block is added to an Array and the Array is returned. This method also accepts an axis argument, like #each. The default is :vector.

Arguments

  • axis - The axis to map over. Can be :vector (or :column) or :row.

Default to :vector.



759
760
761
# File 'lib/daru/dataframe.rb', line 759

def map axis=:vector, &block
  dispatch_to_axis_pl axis, :map, &block
end

#map!(axis = :vector, &block) ⇒ Object

Destructive map. Modifies the DataFrame. Each run of the block must return a Daru::Vector. You can specify the axis to map over as the argument. Default to :vector.

Arguments

  • axis - The axis to map over. Can be :vector (or :column) or :row.

Default to :vector.



771
772
773
774
775
776
777
# File 'lib/daru/dataframe.rb', line 771

def map! axis=:vector, &block
  if %i[vector column].include?(axis)
    map_vectors!(&block)
  elsif axis == :row
    map_rows!(&block)
  end
end

#map_rows(&block) ⇒ Object

Map each row



878
879
880
881
882
# File 'lib/daru/dataframe.rb', line 878

def map_rows &block
  return to_enum(:map_rows) unless block_given?

  each_row.map(&block)
end

#map_rows!Object



890
891
892
893
894
895
896
897
898
# File 'lib/daru/dataframe.rb', line 890

def map_rows!
  return to_enum(:map_rows!) unless block_given?

  index.dup.each do |i|
    row[i] = should_be_vector!(yield(row[i]))
  end

  self
end

#map_rows_with_index(&block) ⇒ Object



884
885
886
887
888
# File 'lib/daru/dataframe.rb', line 884

def map_rows_with_index &block
  return to_enum(:map_rows_with_index) unless block_given?

  each_row_with_index.map(&block)
end

#map_vectors(&block) ⇒ Object

Map each vector and return an Array.



853
854
855
856
857
# File 'lib/daru/dataframe.rb', line 853

def map_vectors &block
  return to_enum(:map_vectors) unless block_given?

  @data.map(&block)
end

#map_vectors!Object

Destructive form of #map_vectors



860
861
862
863
864
865
866
867
868
# File 'lib/daru/dataframe.rb', line 860

def map_vectors!
  return to_enum(:map_vectors!) unless block_given?

  vectors.dup.each do |n|
    self[n] = should_be_vector!(yield(self[n]))
  end

  self
end

#map_vectors_with_index(&block) ⇒ Object

Map vectors alongwith the index.



871
872
873
874
875
# File 'lib/daru/dataframe.rb', line 871

def map_vectors_with_index &block
  return to_enum(:map_vectors_with_index) unless block_given?

  each_vector_with_index.map(&block)
end

#merge(other_df) ⇒ Daru::DataFrame

Merge vectors from two DataFrames. In case of name collision, the vectors names are changed to x_1, x_2 .…

Returns:



1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
# File 'lib/daru/dataframe.rb', line 1679

def merge other_df # rubocop:disable Metrics/AbcSize
  unless nrows == other_df.nrows
    raise ArgumentError,
      "Number of rows must be equal in this: #{nrows} and other: #{other_df.nrows}"
  end

  new_fields = (@vectors.to_a + other_df.vectors.to_a)
  new_fields = ArrayHelper.recode_repeated(new_fields)

  DataFrame.new({}, order: new_fields).tap do |df_new|
    (0...nrows).each do |i|
      df_new.add_row row[i].to_a + other_df.row[i].to_a
    end

    df_new.update
  end
end

#missing_values_rows(missing_values = [nil]) ⇒ Object Also known as: vector_missing_values

Return a vector with the number of missing values in each row.

Arguments

  • missing_values - An Array of the values that should be

treated as ‘missing’. The default missing value is nil.



1112
1113
1114
1115
1116
1117
1118
# File 'lib/daru/dataframe.rb', line 1112

def missing_values_rows missing_values=[nil]
  number_of_missing = each_row.map do |row|
    row.indexes(*missing_values).size
  end

  Daru::Vector.new number_of_missing, index: @index, name: "#{@name}_missing_rows"
end

#ncolsObject

The number of vectors



1193
1194
1195
# File 'lib/daru/dataframe.rb', line 1193

def ncols
  @vectors.size
end

#nest(*tree_keys, &_block) ⇒ Object

Return a nested hash using vector names as keys and an array constructed of hashes with other values. If block provided, is used to provide the values, with parameters row of dataset, current last hash on hierarchy and name of the key to include



1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
# File 'lib/daru/dataframe.rb', line 1150

def nest *tree_keys, &_block
  tree_keys = tree_keys[0] if tree_keys[0].is_a? Array

  each_row.each_with_object({}) do |row, current|
    # Create tree
    *keys, last = tree_keys
    current = keys.inject(current) { |c, f| c[row[f]] ||= {} }
    name = row[last]

    if block_given?
      current[name] = yield(row, current, name)
    else
      current[name] ||= []
      current[name].push(row.to_h.delete_if { |key,_value| tree_keys.include? key })
    end
  end
end

#nrowsObject

The number of rows



1188
1189
1190
# File 'lib/daru/dataframe.rb', line 1188

def nrows
  @index.size
end

#numeric_vector_namesObject



1486
1487
1488
# File 'lib/daru/dataframe.rb', line 1486

def numeric_vector_names
  @vectors.select { |v| self[v].numeric? }
end

#numeric_vectorsObject

Return the indexes of all the numeric vectors. Will include vectors with nils alongwith numbers.



1479
1480
1481
1482
1483
1484
# File 'lib/daru/dataframe.rb', line 1479

def numeric_vectors
  # FIXME: Why _with_index ?..
  each_vector_with_index
    .select { |vec, _i| vec.numeric? }
    .map(&:last)
end

#one_to_many(parent_fields, pattern) ⇒ Object

Creates a new dataset for one to many relations on a dataset, based on pattern of field names.

for example, you have a survey for number of children with this structure:

id, name, child_name_1, child_age_1, child_name_2, child_age_2

with

ds.one_to_many([:id], "child_%v_%n"

the field of first parameters will be copied verbatim to new dataset, and fields which responds to second pattern will be added one case for each different %n.

Examples:

cases=[
  ['1','george','red',10,'blue',20,nil,nil],
  ['2','fred','green',15,'orange',30,'white',20],
  ['3','alfred',nil,nil,nil,nil,nil,nil]
]
ds=Daru::DataFrame.rows(cases, order:
  [:id, :name,
   :car_color1, :car_value1,
   :car_color2, :car_value2,
   :car_color3, :car_value3])
ds.one_to_many([:id],'car_%v%n').to_matrix
#=> Matrix[
#   ["red", "1", 10],
#   ["blue", "1", 20],
#   ["green", "2", 15],
#   ["orange", "2", 30],
#   ["white", "2", 20]
#   ]


1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
# File 'lib/daru/dataframe.rb', line 1760

def one_to_many(parent_fields, pattern)
  vars, numbers = one_to_many_components(pattern)

  DataFrame.new([], order: [*parent_fields, '_col_id', *vars]).tap do |ds|
    each_row do |row|
      verbatim = parent_fields.map { |f| [f, row[f]] }.to_h
      numbers.each do |n|
        generated = one_to_many_row row, n, vars, pattern
        next if generated.values.all?(&:nil?)

        ds.add_row(verbatim.merge(generated).merge('_col_id' => n))
      end
    end
    ds.update
  end
end

#only_numerics(opts = {}) ⇒ Object

Return a DataFrame of only the numerical Vectors. If clone: false is specified as option, only a view of the Vectors will be returned. Defaults to clone: true.



1493
1494
1495
1496
1497
1498
1499
# File 'lib/daru/dataframe.rb', line 1493

def only_numerics opts={}
  cln = opts[:clone] == false ? false : true
  arry = numeric_vectors.map { |v| self[v] }

  order = Index.new(numeric_vectors)
  Daru::DataFrame.new(arry, clone: cln, order: order, index: @index)
end

#order=(order_array) ⇒ Object

Reorder the vectors in a dataframe

Examples:

df = Daru::DataFrame({
  a: [1, 2, 3],
  b: [4, 5, 6]
}, order: [:a, :b])
df.order = [:b, :a]
df
# => #<Daru::DataFrame(3x2)>
#       b   a
#   0   4   1
#   1   5   2
#   2   6   3

Parameters:

  • order_array (Array)

    new order of the vectors

Raises:

  • (ArgumentError)


1083
1084
1085
1086
1087
# File 'lib/daru/dataframe.rb', line 1083

def order=(order_array)
  raise ArgumentError, 'Invalid order' unless
    order_array.sort == vectors.to_a.sort
  initialize(to_h, order: order_array)
end

#pivot_table(opts = {}) ⇒ Object

Pivots a data frame on specified vectors and applies an aggregate function to quickly generate a summary.

Options

:index - Keys to group by on the pivot table row index. Pass vector names contained in an Array.

:vectors - Keys to group by on the pivot table column index. Pass vector names contained in an Array.

:agg - Function to aggregate the grouped values. Default to :mean. Can use any of the statistics functions applicable on Vectors that can be found in the Daru::Statistics::Vector module.

:values - Columns to aggregate. Will consider all numeric columns not specified in :index or :vectors. Optional.

Usage

df = Daru::DataFrame.new({
  a: ['foo'  ,  'foo',  'foo',  'foo',  'foo',  'bar',  'bar',  'bar',  'bar'],
  b: ['one'  ,  'one',  'one',  'two',  'two',  'one',  'one',  'two',  'two'],
  c: ['small','large','large','small','small','large','small','large','small'],
  d: [1,2,2,3,3,4,5,6,7],
  e: [2,4,4,6,6,8,10,12,14]
})
df.pivot_table(index: [:a], vectors: [:b], agg: :sum, values: :e)

#=>
# #<Daru::DataFrame:88342020 @name = 08cdaf4e-b154-4186-9084-e76dd191b2c9 @size = 2>
#            [:e, :one] [:e, :two]
#     [:bar]         18         26
#     [:foo]         10         12

Raises:

  • (ArgumentError)


1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
# File 'lib/daru/dataframe.rb', line 1658

def pivot_table opts={}
  raise ArgumentError, 'Specify grouping index' if Array(opts[:index]).empty?

  index               = opts[:index]
  vectors             = opts[:vectors] || []
  aggregate_function  = opts[:agg] || :mean
  values              = prepare_pivot_values index, vectors, opts
  raise IndexError, 'No numeric vectors to aggregate' if values.empty?

  grouped = group_by(index)
  return grouped.send(aggregate_function) if vectors.empty?

  super_hash = make_pivot_hash grouped, vectors, values, aggregate_function

  pivot_dataframe super_hash
end

#plotting_library=(lib) ⇒ Object



354
355
356
357
358
359
360
361
362
363
364
365
366
367
# File 'lib/daru/dataframe.rb', line 354

def plotting_library= lib
  case lib
  when :gruff, :nyaplot
    @plotting_library = lib
    if Daru.send("has_#{lib}?".to_sym)
      extend Module.const_get(
        "Daru::Plotting::DataFrame::#{lib.to_s.capitalize}Library"
      )
    end
  else
    raise ArguementError, "Plotting library #{lib} not supported. "\
      'Supported libraries are :nyaplot and :gruff'
  end
end

#recast(opts = {}) ⇒ Object

Change dtypes of vectors by supplying a hash of :vector_name => :new_dtype

Usage

df = Daru::DataFrame.new({a: [1,2,3], b: [1,2,3], c: [1,2,3]})
df.recast a: :nmatrix, c: :nmatrix


1993
1994
1995
1996
1997
# File 'lib/daru/dataframe.rb', line 1993

def recast opts={}
  opts.each do |vector_name, dtype|
    self[vector_name].cast(dtype: dtype)
  end
end

#recode(axis = :vector, &block) ⇒ Object

Maps over the DataFrame and returns a DataFrame. Each run of the block must return a Daru::Vector object. You can specify the axis to map over. Default to :vector.

Description

Recode works similarly to #map, but an important difference between the two is that recode returns a modified Daru::DataFrame instead of an Array. For this reason, #recode expects that every run of the block to return a Daru::Vector.

Just like map and each, recode also accepts an optional axis argument.

Arguments

  • axis - The axis to map over. Can be :vector (or :column) or :row.

Default to :vector.



796
797
798
# File 'lib/daru/dataframe.rb', line 796

def recode axis=:vector, &block
  dispatch_to_axis_pl axis, :recode, &block
end

#recode_rowsObject



842
843
844
845
846
847
848
849
850
# File 'lib/daru/dataframe.rb', line 842

def recode_rows
  block_given? or return to_enum(:recode_rows)

  dup.tap do |df|
    df.each_row_with_index do |r, i|
      df.row[i] = should_be_vector!(yield(r))
    end
  end
end

#recode_vectorsObject



832
833
834
835
836
837
838
839
840
# File 'lib/daru/dataframe.rb', line 832

def recode_vectors
  block_given? or return to_enum(:recode_vectors)

  dup.tap do |df|
    df.each_vector_with_index do |v, i|
      df[*i] = should_be_vector!(yield(v))
    end
  end
end

#reindex(new_index) ⇒ Object

Change the index of the DataFrame and preserve the labels of the previous indexing. New index can be Daru::Index or any of its subclasses.

Examples:

Reindexing DataFrame

df = Daru::DataFrame.new({a: [1,2,3,4], b: [11,22,33,44]},
  index: ['a','b','c','d'])
#=>
##<Daru::DataFrame:83278130 @name = b19277b8-c548-41da-ad9a-2ad8c060e273 @size = 4>
#                    a          b
#         a          1         11
#         b          2         22
#         c          3         33
#         d          4         44
df.reindex Daru::Index.new(['b', 0, 'a', 'g'])
#=>
##<Daru::DataFrame:83177070 @name = b19277b8-c548-41da-ad9a-2ad8c060e273 @size = 4>
#                    a          b
#         b          2         22
#         0        nil        nil
#         a          1         11
#         g        nil        nil

Parameters:

  • new_index (Daru::Index)

    The new Index for reindexing the DataFrame.



1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
# File 'lib/daru/dataframe.rb', line 1398

def reindex new_index
  unless new_index.is_a?(Daru::Index)
    raise ArgumentError, 'Must pass the new index of type Index or its '\
      "subclasses, not #{new_index.class}"
  end

  cl = Daru::DataFrame.new({}, order: @vectors, index: new_index, name: @name)
  new_index.each_with_object(cl) do |idx, memo|
    memo.row[idx] = @index.include?(idx) ? row[idx] : [nil]*ncols
  end
end

#reindex_vectors(new_vectors) ⇒ Object



1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
# File 'lib/daru/dataframe.rb', line 1324

def reindex_vectors new_vectors
  unless new_vectors.is_a?(Daru::Index)
    raise ArgumentError, 'Must pass the new index of type Index or its '\
      "subclasses, not #{new_index.class}"
  end

  cl = Daru::DataFrame.new({}, order: new_vectors, index: @index, name: @name)
  new_vectors.each_with_object(cl) do |vec, memo|
    memo[vec] = @vectors.include?(vec) ? self[vec] : [nil]*nrows
  end
end

#reject_values(*values) ⇒ Daru::DataFrame

Returns a dataframe in which rows with any of the mentioned values

are ignored.

Examples:

df = Daru::DataFrame.new({
  a: [1,    2,          3,   nil,        Float::NAN, nil, 1,   7],
  b: [:a,  :b,          nil, Float::NAN, nil,        3,   5,   8],
  c: ['a',  Float::NAN, 3,   4,          3,          5,   nil, 7]
}, index: 11..18)
df.reject_values nil, Float::NAN
# => #<Daru::DataFrame(2x3)>
#       a   b   c
#   11   1   a   a
#   18   7   8   7

Parameters:

  • *values (Array)

    values to reject to form the new dataframe

Returns:

  • (Daru::DataFrame)

    Data Frame with only rows which doesn’t contain the mentioned values



614
615
616
617
618
619
620
621
622
623
624
# File 'lib/daru/dataframe.rb', line 614

def reject_values(*values)
  positions =
    size.times.to_a - @data.flat_map { |vec| vec.positions(*values) }
  # Handle the case when positions size is 1 and #row_at wouldn't return a df
  if positions.size == 1
    pos = positions.first
    row_at(pos..pos)
  else
    row_at(*positions)
  end
end

#rename(new_name) ⇒ Object Also known as: name=

Rename the DataFrame.



1918
1919
1920
1921
# File 'lib/daru/dataframe.rb', line 1918

def rename new_name
  @name = new_name
  self
end

#rename_vectors(name_map) ⇒ Object

Renames the vectors

Arguments

  • name_map - A hash where the keys are the exising vector names and

    the values are the new names.  If a vector is renamed
    to a vector name that is already in use, the existing
    one is overwritten.
    

Usage

df = Daru::DataFrame.new({ a: [1,2,3,4], b: [:a,:b,:c,:d], c: [11,22,33,44] })
df.rename_vectors :a => :alpha, :c => :gamma
df.vectors.to_a #=> [:alpha, :b, :gamma]


1469
1470
1471
1472
1473
1474
1475
# File 'lib/daru/dataframe.rb', line 1469

def rename_vectors name_map
  existing_targets = name_map.reject { |k,v| k == v }.values & vectors.to_a
  delete_vectors(*existing_targets)

  new_names = vectors.to_a.map { |v| name_map[v] ? name_map[v] : v }
  self.vectors = Daru::Index.new new_names
end

#replace_values(old_values, new_value) ⇒ Daru::DataFrame

Replace specified values with given value

Examples:

df = Daru::DataFrame.new({
  a: [1,    2,          3,   nil,        Float::NAN, nil, 1,   7],
  b: [:a,  :b,          nil, Float::NAN, nil,        3,   5,   8],
  c: ['a',  Float::NAN, 3,   4,          3,          5,   nil, 7]
}, index: 11..18)
df.replace_values nil, Float::NAN
# => #<Daru::DataFrame(8x3)>
#       a   b   c
#   11   1   a   a
#   12   2   b NaN
#   13   3 NaN   3
#   14 NaN NaN   4
#   15 NaN NaN   3
#   16 NaN   3   5
#   17   1   5 NaN
#   18   7   8   7

Parameters:

  • old_values (Array)

    values to replace with new value

  • new_value (object)

    new value to replace with

Returns:

  • (Daru::DataFrame)

    Data Frame itself with old values replace with new value



648
649
650
651
# File 'lib/daru/dataframe.rb', line 648

def replace_values old_values, new_value
  @data.each { |vec| vec.replace_values old_values, new_value }
  self
end

#respond_to_missing?(name, include_private = false) ⇒ Boolean

Returns:

  • (Boolean)


2069
2070
2071
# File 'lib/daru/dataframe.rb', line 2069

def respond_to_missing?(name, include_private=false)
  name.to_s.end_with?('=') || has_vector?(name) || super
end

#rowObject

Access a row or set/create a row. Refer #[] and #[]= docs for details.

Usage

df.row[:a] # access row named ':a'
df.row[:b] = [1,2,3] # set row ':b' to [1,2,3]


537
538
539
# File 'lib/daru/dataframe.rb', line 537

def row
  Daru::Accessors::DataFrameByRow.new(self)
end

#row_at(*positions) ⇒ Daru::Vector, Daru::DataFrame

Retrive rows by positions

Examples:

df = Daru::DataFrame.new({
  a: [1, 2, 3],
  b: ['a', 'b', 'c']
})
df.row_at 1, 2
# => #<Daru::DataFrame(2x2)>
#       a   b
#   1   2   b
#   2   3   c

Parameters:

  • *positions (Array<Integer>)

    positions of rows to retrive

Returns:



390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
# File 'lib/daru/dataframe.rb', line 390

def row_at *positions
  original_positions = positions
  positions = coerce_positions(*positions, nrows)
  validate_positions(*positions, nrows)

  if positions.is_a? Integer
    return Daru::Vector.new @data.map { |vec| vec.at(*positions) },
      index: @vectors
  else
    new_rows = @data.map { |vec| vec.at(*original_positions) }
    return Daru::DataFrame.new new_rows,
      index: @index.at(*original_positions),
      order: @vectors
  end
end

#save(filename) ⇒ Object

Use marshalling to save dataframe to a file.



1967
1968
1969
# File 'lib/daru/dataframe.rb', line 1967

def save filename
  Daru::IO.save self, filename
end

#set_at(positions, vector) ⇒ Object

Set vectors by positions

Examples:

df = Daru::DataFrame.new({
  a: [1, 2, 3],
  b: ['a', 'b', 'c']
})
df.set_at [0], ['x', 'y', 'z']
df
#=> #<Daru::DataFrame(3x2)>
#       a   b
#   0   x   a
#   1   y   b
#   2   z   c

Parameters:

  • positions (Array<Integer>)

    positions of vectors to set

  • vector (Array, Daru::Vector)

    vector to be assigned

Raises:



489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
# File 'lib/daru/dataframe.rb', line 489

def set_at positions, vector
  if positions.last == :row
    positions.pop
    return set_row_at(positions, vector)
  end

  validate_positions(*positions, ncols)
  vector =
    if vector.is_a? Daru::Vector
      vector.reindex @index
    else
      Daru::Vector.new vector
    end

  raise SizeError, 'Vector length should match index length' if
    vector.size != @index.size

  positions.each { |pos| @data[pos] = vector }
end

#set_index(new_index, opts = {}) ⇒ Object

Set a particular column as the new DF

Raises:

  • (ArgumentError)


1366
1367
1368
1369
1370
1371
1372
1373
1374
# File 'lib/daru/dataframe.rb', line 1366

def set_index new_index, opts={}
  raise ArgumentError, 'All elements in new index must be unique.' if
    @size != self[new_index].uniq.size

  self.index = Daru::Index.new(self[new_index].to_a)
  delete_vector(new_index) unless opts[:keep]

  self
end

#set_row_at(positions, vector) ⇒ Object

Set rows by positions

Examples:

df = Daru::DataFrame.new({
  a: [1, 2, 3],
  b: ['a', 'b', 'c']
})
df.set_row_at [0, 1], ['x', 'x']
df
#=> #<Daru::DataFrame(3x2)>
#       a   b
#   0   x   x
#   1   x   x
#   2   3   c

Parameters:

  • positions (Array<Integer>)

    positions of rows to set

Raises:



421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
# File 'lib/daru/dataframe.rb', line 421

def set_row_at positions, vector
  validate_positions(*positions, nrows)
  vector =
    if vector.is_a? Daru::Vector
      vector.reindex @vectors
    else
      Daru::Vector.new vector
    end

  raise SizeError, 'Vector length should match row length' if
    vector.size != @vectors.size

  @data.each_with_index do |vec, pos|
    vec.set_at(positions, vector.at(pos))
  end
  @index = @data[0].index
  set_size
end

#shapeObject

Return the number of rows and columns of the DataFrame in an Array.



1183
1184
1185
# File 'lib/daru/dataframe.rb', line 1183

def shape
  [nrows, ncols]
end

#sort(vector_order, opts = {}) ⇒ Object

Non-destructive version of #sort!



1620
1621
1622
# File 'lib/daru/dataframe.rb', line 1620

def sort vector_order, opts={}
  dup.sort! vector_order, opts
end

#sort!(vector_order, opts = {}) ⇒ Object

Sorts a dataframe (ascending/descending) in the given pripority sequence of vectors, with or without a block.

Examples:

Sort a dataframe with a vector sequence.


df = Daru::DataFrame.new({a: [1,2,1,2,3], b: [5,4,3,2,1]})

df.sort [:a, :b]
# =>
# <Daru::DataFrame:30604000 @name = d6a9294e-2c09-418f-b646-aa9244653444 @size = 5>
#                   a          b
#        2          1          3
#        0          1          5
#        3          2          2
#        1          2          4
#        4          3          1

Sort a dataframe without a block. Here nils will be handled automatically.


df = Daru::DataFrame.new({a: [-3,nil,-1,nil,5], b: [4,3,2,1,4]})

df.sort([:a])
# =>
# <Daru::DataFrame:14810920 @name = c07fb5c7-2201-458d-b679-6a1f7ebfe49f @size = 5>
#                    a          b
#         1        nil          3
#         3        nil          1
#         0         -3          4
#         2         -1          2
#         4          5          4

Sort a dataframe with a block with nils handled automatically.


df = Daru::DataFrame.new({a: [nil,-1,1,nil,-1,1], b: ['aaa','aa',nil,'baaa','x',nil] })

df.sort [:b], by: {b: lambda { |a| a.length } }
# NoMethodError: undefined method `length' for nil:NilClass
# from (pry):8:in `block in __pry__'

df.sort [:b], by: {b: lambda { |a| a.length } }, handle_nils: true

# =>
# <Daru::DataFrame:28469540 @name = 5f986508-556f-468b-be0c-88cc3534445c @size = 6>
#                    a          b
#         2          1        nil
#         5          1        nil
#         4         -1          x
#         1         -1         aa
#         0        nil        aaa
#         3        nil       baaa

Sort a dataframe with a block with nils handled manually.


df = Daru::DataFrame.new({a: [nil,-1,1,nil,-1,1], b: ['aaa','aa',nil,'baaa','x',nil] })

# To print nils at the bottom one can use lambda { |a| (a.nil?)[1]:[0,a.length] }
df.sort [:b], by: {b: lambda { |a| (a.nil?)?[1]:[0,a.length] } }, handle_nils: true

# =>
#<Daru::DataFrame:22214180 @name = cd7703c7-1dca-4560-840b-5ea51a852ef9 @size = 6>
#                 a          b
#      4         -1          x
#      1         -1         aa
#      0        nil        aaa
#      3        nil       baaa
#      2          1        nil
#      5          1        nil

Parameters:

  • order (Array)

    The order of vector names in which the DataFrame should be sorted.

  • opts (Hash) (defaults to: {})

    The options to sort with.

Options Hash (opts):

  • :ascending (TrueClass, FalseClass, Array) — default: true

    Sort in ascending or descending order. Specify Array corresponding to order for multiple sort orders.

  • :by (Hash) — default: lambda{|a| a }

    Specify attributes of objects to to be used for sorting, for each vector name in order as a hash of vector name and lambda expressions. In case a lambda for a vector is not specified, the default will be used.

  • :handle_nils (TrueClass, FalseClass, Array) — default: false

    Handle nils automatically or not when a block is provided. If set to True, nils will appear at top after sorting.

Raises:

  • (ArgumentError)


1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
# File 'lib/daru/dataframe.rb', line 1596

def sort! vector_order, opts={}
  raise ArgumentError, 'Required atleast one vector name' if vector_order.empty?

  # To enable sorting with categorical data,
  # map categories to integers preserving their order
  old = convert_categorical_vectors vector_order
  block = sort_prepare_block vector_order, opts

  order = @index.size.times.sort(&block)
  new_index = @index.reorder order

  # To reverse map mapping of categorical data to integers
  restore_categorical_vectors old

  @data.each do |vector|
    vector.reorder! order
  end

  self.index = new_index

  self
end

#split_by_category(cat_name) ⇒ Array

Split the dataframe into many dataframes based on category vector

Examples:

df = Daru::DataFrame.new({
  a: [1, 2, 3],
  b: ['a', 'a', 'b']
})
df.to_category :b
df.split_by_category :b
# => [#<Daru::DataFrame: a (2x1)>
#       a
#   0   1
#   1   2,
# #<Daru::DataFrame: b (1x1)>
#       a
#   2   3]

Parameters:

  • cat_name (object)

    name of category vector to split the dataframe

Returns:

  • (Array)

    array of dataframes split by category with category vector used to split not included

Raises:

  • (ArguementError)


2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
# File 'lib/daru/dataframe.rb', line 2101

def split_by_category cat_name
  cat_dv = self[cat_name]
  raise ArguementError, "#{cat_name} is not a category vector" unless
    cat_dv.category?

  cat_dv.categories.map do |cat|
    where(cat_dv.eq cat)
      .rename(cat)
      .delete_vector cat_name
  end
end

#summaryString

Generate a summary of this DataFrame based on individual vectors in the DataFrame

Returns:

  • (String)

    String containing the summary of the DataFrame



1503
1504
1505
1506
1507
1508
1509
1510
1511
# File 'lib/daru/dataframe.rb', line 1503

def summary
  summary = "= #{name}"
  summary << "\n  Number of rows: #{nrows}"
  @vectors.each do |v|
    summary << "\n  Element:[#{v}]\n"
    summary << self[v].summary(1)
  end
  summary
end

#tail(quantity = 10) ⇒ Object Also known as: last

The last ten elements of the DataFrame

Parameters:

  • quantity (Fixnum) (defaults to: 10)

    (10) The number of elements to display from the bottom.



1255
1256
1257
1258
# File 'lib/daru/dataframe.rb', line 1255

def tail quantity=10
  start = [-quantity, -size].max
  row.at start..-1
end

#to_aObject

Converts the DataFrame into an array of hashes where key is vector name and value is the corresponding element. The 0th index of the array contains the array of hashes while the 1th index contains the indexes of each row of the dataframe. Each element in the index array corresponds to its row in the array of hashes, which has the same index.



1850
1851
1852
# File 'lib/daru/dataframe.rb', line 1850

def to_a
  [each_row.map(&:to_h), @index.to_a]
end

#to_category(*names) ⇒ Daru::DataFrame

Converts the specified non category type vectors to category type vectors

Examples:

df = Daru::DataFrame.new({
  a: [1, 2, 3],
  b: ['a', 'a', 'b']
})
df.to_category :b
df[:b].type
# => :category

Parameters:

  • *names (Array)

    names of non category type vectors to be converted

Returns:

  • (Daru::DataFrame)

    data frame in which specified vectors have been converted to category type



2049
2050
2051
2052
# File 'lib/daru/dataframe.rb', line 2049

def to_category *names
  names.each { |n| self[n] = self[n].to_category }
  self
end

#to_dfself

Returns the dataframe. This can be convenient when the user does not know whether the object is a vector or a dataframe.

Returns:

  • (self)

    the dataframe



1815
1816
1817
# File 'lib/daru/dataframe.rb', line 1815

def to_df
  self
end

#to_gslObject

Convert all numeric vectors to GSL::Matrix



1820
1821
1822
1823
1824
# File 'lib/daru/dataframe.rb', line 1820

def to_gsl
  numerics_as_arrays = numeric_vectors.map { |n| self[n].to_a }

  GSL::Matrix.alloc(*numerics_as_arrays.transpose)
end

#to_hObject

Converts DataFrame to a hash (explicit) with keys as vector names and values as the corresponding vectors.



1866
1867
1868
1869
1870
# File 'lib/daru/dataframe.rb', line 1866

def to_h
  @vectors
    .each_with_index
    .map { |vec_name, idx| [vec_name, @data[idx]] }.to_h
end

#to_html(threshold = 30) ⇒ Object

Convert to html for IRuby.



1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
# File 'lib/daru/dataframe.rb', line 1873

def to_html(threshold=30)
  table_thead = to_html_thead
  table_tbody = to_html_tbody(threshold)
  path = if index.is_a?(MultiIndex)
           File.expand_path('../iruby/templates/dataframe_mi.html.erb', __FILE__)
         else
           File.expand_path('../iruby/templates/dataframe.html.erb', __FILE__)
         end
  ERB.new(File.read(path).strip).result(binding)
end

#to_html_tbody(threshold = 30) ⇒ Object



1894
1895
1896
1897
1898
1899
1900
1901
1902
# File 'lib/daru/dataframe.rb', line 1894

def to_html_tbody(threshold=30)
  table_tbody_path =
    if index.is_a?(MultiIndex)
      File.expand_path('../iruby/templates/dataframe_mi_tbody.html.erb', __FILE__)
    else
      File.expand_path('../iruby/templates/dataframe_tbody.html.erb', __FILE__)
    end
  ERB.new(File.read(table_tbody_path).strip).result(binding)
end

#to_html_theadObject



1884
1885
1886
1887
1888
1889
1890
1891
1892
# File 'lib/daru/dataframe.rb', line 1884

def to_html_thead
  table_thead_path =
    if index.is_a?(MultiIndex)
      File.expand_path('../iruby/templates/dataframe_mi_thead.html.erb', __FILE__)
    else
      File.expand_path('../iruby/templates/dataframe_thead.html.erb', __FILE__)
    end
  ERB.new(File.read(table_thead_path).strip).result(binding)
end

#to_json(no_index = true) ⇒ Object

Convert to json. If no_index is false then the index will NOT be included in the JSON thus created.



1856
1857
1858
1859
1860
1861
1862
# File 'lib/daru/dataframe.rb', line 1856

def to_json no_index=true
  if no_index
    to_a[0].to_json
  else
    to_a.to_json
  end
end

#to_matrixObject

Convert all vectors of type :numeric into a Matrix.



1827
1828
1829
# File 'lib/daru/dataframe.rb', line 1827

def to_matrix
  Matrix.columns each_vector.select(&:numeric?).map(&:to_a)
end

#to_nmatrixObject

Convert all vectors of type :numeric and not containing nils into an NMatrix.



1839
1840
1841
1842
1843
# File 'lib/daru/dataframe.rb', line 1839

def to_nmatrix
  each_vector.select do |vector|
    vector.numeric? && !vector.include_values?(*Daru::MISSING_VALUES)
  end.map(&:to_a).transpose.to_nm
end

#to_nyaplotdfObject

Return a Nyaplot::DataFrame from the data of this DataFrame. :nocov:



1833
1834
1835
# File 'lib/daru/dataframe.rb', line 1833

def to_nyaplotdf
  Nyaplot::DataFrame.new(to_a[0])
end

#to_REXPObject

rubocop:disable Style/MethodName



5
6
7
8
9
10
11
12
13
# File 'lib/daru/extensions/rserve.rb', line 5

def to_REXP # rubocop:disable Style/MethodName
  names = @vectors.to_a
  data  = names.map do |f|
    Rserve::REXP::Wrapper.wrap(self[f].to_a)
  end
  l = Rserve::Rlist.new(data, names.map(&:to_s))

  Rserve::REXP.create_data_frame(l)
end

#to_sObject



1904
1905
1906
# File 'lib/daru/dataframe.rb', line 1904

def to_s
  "#<#{self.class}#{': ' + @name.to_s if @name}(#{nrows}x#{ncols})>"
end

#transposeObject

Transpose a DataFrame, tranposing elements and row, column indexing.



2000
2001
2002
2003
2004
2005
2006
2007
2008
# File 'lib/daru/dataframe.rb', line 2000

def transpose
  Daru::DataFrame.new(
    each_vector.map(&:to_a).transpose,
    index: @vectors,
    order: @index,
    dtype: @dtype,
    name: @name
  )
end

#union(other_df) ⇒ Object

Concatenates another DataFrame as #concat. Additionally it tries to preserve the index. If the indices contain common elements, #union will overwrite the according rows in the first dataframe.



1356
1357
1358
1359
1360
1361
1362
1363
# File 'lib/daru/dataframe.rb', line 1356

def union other_df
  index = (@index.to_a + other_df.index.to_a).uniq
  df = row[*(@index.to_a - other_df.index.to_a)]

  df = df.concat(other_df)
  df.index = Daru::Index.new(index)
  df
end

#updateObject

Method for updating the metadata (i.e. missing value positions) of the after assingment/deletion etc. are complete. This is provided so that time is not wasted in creating the metadata for the vector each time assignment/deletion of elements is done. Updating data this way is called lazy loading. To set or unset lazy loading, see the .lazy_update= method.



1913
1914
1915
# File 'lib/daru/dataframe.rb', line 1913

def update
  @data.each(&:update) if Daru.lazy_update
end

#vector_by_calculation(&block) ⇒ Object

DSL for yielding each row and returning a Daru::Vector based on the value each run of the block returns.

Usage

a1 = Daru::Vector.new([1, 2, 3, 4, 5, 6, 7])
a2 = Daru::Vector.new([10, 20, 30, 40, 50, 60, 70])
a3 = Daru::Vector.new([100, 200, 300, 400, 500, 600, 700])
ds = Daru::DataFrame.new({ :a => a1, :b => a2, :c => a3 })
total = ds.vector_by_calculation { a + b + c }
# <Daru::Vector:82314050 @name = nil @size = 7 >
#   nil
# 0 111
# 1 222
# 2 333
# 3 444
# 4 555
# 5 666
# 6 777


1063
1064
1065
1066
1067
# File 'lib/daru/dataframe.rb', line 1063

def vector_by_calculation &block
  a = each_row.map { |r| r.instance_eval(&block) }

  Daru::Vector.new a, index: @index
end

#vector_count_characters(vecs = nil) ⇒ Object



1168
1169
1170
1171
1172
1173
1174
# File 'lib/daru/dataframe.rb', line 1168

def vector_count_characters vecs=nil
  vecs ||= @vectors.to_a

  collect_rows do |row|
    vecs.map { |v| row[v].to_s.size }.inject(:+)
  end
end

#vector_mean(max_missing = 0) ⇒ Object

Calculate mean of the rows of the dataframe.

Arguments

  • max_missing - The maximum number of elements in the row that can be

zero for the mean calculation to happen. Default to 0.



1277
1278
1279
1280
1281
1282
1283
1284
1285
# File 'lib/daru/dataframe.rb', line 1277

def vector_mean max_missing=0
  # FIXME: in vector_sum we preserve created vector dtype, but
  # here we are not. Is this by design or ...? - zverok, 2016-05-18
  mean_vec = Daru::Vector.new [0]*@size, index: @index, name: "mean_#{@name}"

  each_row_with_index.each_with_object(mean_vec) do |(row, i), memo|
    memo[i] = row.indexes(*Daru::MISSING_VALUES).size > max_missing ? nil : row.mean
  end
end

#vector_sum(vecs = nil) ⇒ Object

Returns a vector with sum of all vectors specified in the argument. If vecs parameter is empty, sum all numeric vector.



1264
1265
1266
1267
1268
1269
# File 'lib/daru/dataframe.rb', line 1264

def vector_sum vecs=nil
  vecs ||= numeric_vectors
  sum = Daru::Vector.new [0]*@size, index: @index, name: @name, dtype: @dtype

  vecs.inject(sum) { |memo, n| memo + self[n] }
end

#verify(*tests) ⇒ Object

Test each row with one or more tests. Each test is a Proc with the form *Proc.new {|row| row > 0}*

The function returns an array with all errors.

FIXME: description here is too sparse. As far as I can get, it should tell something about that each test is [descr, fields, block], and that first value may be column name to output. - zverok, 2016-05-18



1035
1036
1037
1038
1039
1040
1041
1042
# File 'lib/daru/dataframe.rb', line 1035

def verify(*tests)
  id = tests.first.is_a?(Symbol) ? tests.shift : @vectors.first

  each_row_with_index.map do |row, i|
    tests.reject { |*_, block| block.call(row) }
         .map { |test| verify_error_message row, test, id, i }
  end.flatten
end

#where(bool_array) ⇒ Object

Query a DataFrame by passing a Daru::Core::Query::BoolArray object.



2025
2026
2027
# File 'lib/daru/dataframe.rb', line 2025

def where bool_array
  Daru::Core::Query.df_where self, bool_array
end

#write_csv(filename, opts = {}) ⇒ Object

Write this DataFrame to a CSV file.

Arguements

  • filename - Path of CSV file where the DataFrame is to be saved.

Options

  • convert_comma - If set to true, will convert any commas in any

of the data to full stops (‘.’). All the options accepted by CSV.read() can also be passed into this function.



1937
1938
1939
# File 'lib/daru/dataframe.rb', line 1937

def write_csv filename, opts={}
  Daru::IO.dataframe_write_csv self, filename, opts
end

#write_excel(filename, opts = {}) ⇒ Object

Write this dataframe to an Excel Spreadsheet

Arguments

  • filename - The path of the file where the DataFrame should be written.



1946
1947
1948
# File 'lib/daru/dataframe.rb', line 1946

def write_excel filename, opts={}
  Daru::IO.dataframe_write_excel self, filename, opts
end

#write_sql(dbh, table) ⇒ Object

Insert each case of the Dataset on the selected table

Arguments

  • dbh - DBI database connection object.

  • query - Query string.

Usage

ds = Daru::DataFrame.new({:id=>Daru::Vector.new([1,2,3]), :name=>Daru::Vector.new(["a","b","c"])})
dbh = DBI.connect("DBI:Mysql:database:localhost", "user", "password")
ds.write_sql(dbh,"test")


1962
1963
1964
# File 'lib/daru/dataframe.rb', line 1962

def write_sql dbh, table
  Daru::IO.dataframe_write_sql self, dbh, table
end