Class: Daru::Core::GroupBy

Inherits:

Object
Daru::Core::GroupBy

Defined in: lib/daru/core/group_by.rb

Constant Summary collapse

TUPLE_SORTER =

lambda do |left, right|
  # A missing tuple sorts after a present one. The right-hand side is checked
  # first, so two missing tuples compare as -1.
  return -1 unless right
  return 1 unless left

  # nil entries are dropped from both tuples before they are compared.
  left_values  = left.compact
  right_values = right.compact

  if left_values.length == right_values.length
    # Array#<=> gives nil when elements cannot be compared; treat that as a tie.
    (left_values <=> right_values) || 0
  else
    # Tuples of different (compacted) length are ordered by length alone.
    left_values.length <=> right_values.length
  end
end

Class Method Summary collapse

Instance Method Summary collapse

#aggregate(options = {}) ⇒ Daru::DataFrame

Function to use for aggregating the data.
#count ⇒ Object

Count groups, excludes missing values.
#df ⇒ Object (also: #grouped_df)

lazy accessor/attr_reader for the attribute df.
#each_group ⇒ Object

Iterate over each group created by group_by.
#first ⇒ Object

Get the first group.
#get_group(group) ⇒ Object

Returns one of the selected groups as a DataFrame.
#groups ⇒ Object (also: #groups_by_idx)

lazy accessor/attr_reader for the attribute groups.
#head(quantity = 5) ⇒ Object

Get the first ‘n’ rows of each group.
#initialize(context, names) ⇒ GroupBy constructor

A new instance of GroupBy.
#inspect ⇒ Object
#last ⇒ Object

Get the last group.
#max ⇒ Object

Find the max element of each numeric vector group.
#mean ⇒ Object

Calculate mean of numeric groups, excluding missing values.
#median ⇒ Object

Calculate the median of numeric groups, excluding missing values.
#min ⇒ Object

Find the min element of each numeric vector group.
#reduce(init = nil) {|block| ... } ⇒ Object

Iteratively applies a function to the values in a group and accumulates the result.
#size ⇒ Object

Get a Daru::Vector of the size of each group.
#std ⇒ Object

Calculate sample standard deviation of numeric vector groups, excluding missing values.
#sum ⇒ Object

Calculate sum of numeric groups, excluding missing values.
#tail(quantity = 5) ⇒ Object

Get the last ‘n’ rows of each group.

Constructor Details

#initialize(context, names) ⇒ `GroupBy`

Returns a new instance of GroupBy.

# File 'lib/daru/core/group_by.rb', line 92

# Captures the grouped object and precomputes the group => row positions map.
#
# @param context the object being grouped; it must respond to +vectors+
#   (presumably a Daru::DataFrame -- confirm against callers)
# @param names [Array] names of the vectors to group by
def initialize(context, names)
  @context = context # TODO: maybe rename in @original_df or @grouped_db
  @group_vectors = names

  # Every vector of the context that is not used as a grouping key.
  @non_group_vectors = @context.vectors.to_a - @group_vectors

  # FIXME: It feels like we don't want to sort here. Ruby's #group_by
  # never sorts:
  #
  #   ['test', 'me', 'please'].group_by(&:size)
  #   #  => {4=>["test"], 2=>["me"], 6=>["please"]}
  #
  # - zverok, 2016-09-12
  @groups_by_pos = GroupBy.get_positions_group_map_for_df(@context, @group_vectors, sort: true)
end

Class Method Details

.df_from_group_map(df, group_map, remaining_vectors, from_position: true) ⇒ `Object`

# File 'lib/daru/core/group_by.rb', line 44

# Builds a frame whose rows are arranged group by group and whose index is a
# MultiIndex made of each group key followed by the row's own identifier.
#
# @param df source frame; must support +df[*names].to_df.get_sub_dataframe+
# @param group_map [Hash] group key (Array) => list of row identifiers
#   (positions, or index labels when +from_position+ is false)
# @param remaining_vectors [Array] names of the vectors to keep
# @param from_position [Boolean] forwarded as +by_position:+ to
#   +get_sub_dataframe+
# @return [Daru::DataFrame, nil] nil when +group_map+ is empty
def df_from_group_map(df, group_map, remaining_vectors, from_position: true)
  return nil if group_map == {}

  # One tuple per row: the group key with the row identifier appended.
  new_index = group_map.flat_map { |group, values| values.map { |val| group + [val] } }
  new_index = Daru::MultiIndex.from_tuples(new_index)

  return Daru::DataFrame.new({}, index: new_index) if remaining_vectors == []

  # Flatten exactly one level: the values are lists of row identifiers, and an
  # identifier may itself be an Array (a tuple label) that must stay intact.
  new_rows_order = group_map.values.flatten(1)
  new_df = df[*remaining_vectors].to_df.get_sub_dataframe(new_rows_order, by_position: from_position)
  new_df.index = new_index

  new_df
end

.get_positions_group_for_aggregation(multi_index, level = -1) ⇒ `Object`

# File 'lib/daru/core/group_by.rb', line 22

# Groups the positions of the entries of +multi_index+ by the entry that is
# left once layer +level+ has been removed.
#
# @param multi_index [Daru::MultiIndex] index to regroup; it is duplicated
#   first, so the caller's object is not handed to +remove_layer+
# @param level [Integer] layer passed to +remove_layer+ (defaults to -1)
# @return [Hash] result of +get_positions_group_map_on+ (unsorted)
# @raise [RuntimeError] when +multi_index+ is not a Daru::MultiIndex
def get_positions_group_for_aggregation(multi_index, level=-1)
  unless multi_index.is_a?(Daru::MultiIndex)
    # Kept as a RuntimeError for backward compatibility, but with a message
    # that says what went wrong.
    raise "expected a Daru::MultiIndex, got #{multi_index.class}"
  end

  new_index = multi_index.dup
  # NOTE(review): the return value is ignored, so this relies on remove_layer
  # mutating the copy in place -- confirm.
  new_index.remove_layer(level) # TODO: recheck code of Daru::MultiIndex#remove_layer

  get_positions_group_map_on(new_index.each_with_index)
end

.get_positions_group_map_for_df(df, group_by_keys, sort: true) ⇒ `Object`

# File 'lib/daru/core/group_by.rb', line 32

# Maps every distinct combination of values in the +group_by_keys+ vectors of
# +df+ to the positions of the rows holding that combination.
#
# @param df frame to group; must support +df[*keys].to_df.each_row+
# @param group_by_keys [Array] names of the vectors to group by
# @param sort [Boolean] forwarded to +get_positions_group_map_on+
# @return [Hash] result of +get_positions_group_map_on+
def get_positions_group_map_for_df(df, group_by_keys, sort: true)
  key_columns = df[*group_by_keys].to_df
  # Each row of the key columns becomes one plain Array (the group key).
  row_tuples = key_columns.each_row.map(&:to_a)

  get_positions_group_map_on(row_tuples.each_with_index, sort: sort)
end

.get_positions_group_map_on(indexes_with_positions, sort: false) ⇒ `Object`

# File 'lib/daru/core/group_by.rb', line 6

# Collects positions under the key they are paired with.
#
# @param indexes_with_positions [Enumerable] yields (key, position) pairs
# @param sort [Boolean] when truthy, the keys of the result are ordered with
#   Daru::Core::GroupBy::TUPLE_SORTER; otherwise first-seen order is kept
# @return [Hash] key => Array of positions, in the order they were yielded
def get_positions_group_map_on(indexes_with_positions, sort: false)
  positions_by_key = indexes_with_positions.each_with_object({}) do |(idx, position), acc|
    (acc[idx] ||= []) << position
  end

  # TODO: maybe add a more "stable" sorting option?
  return positions_by_key unless sort

  ordered_keys = positions_by_key.keys.sort(&Daru::Core::GroupBy::TUPLE_SORTER)
  ordered_keys.each_with_object({}) { |key, ordered| ordered[key] = positions_by_key[key] }
end

.group_map_from_positions_to_indexes(positions_group_map, index) ⇒ `Object`



39
40
41

# File 'lib/daru/core/group_by.rb', line 39

# Converts a group => positions map into a group => index entries map.
#
# @param positions_group_map [Hash] group key => Array of positions
# @param index object answering +at(position)+
# @return [Hash] same keys, each position replaced by +index.at(position)+
def group_map_from_positions_to_indexes(positions_group_map, index)
  positions_group_map.each_with_object({}) do |(group, positions), entries_by_group|
    entries_by_group[group] = positions.map { |pos| index.at(pos) }
  end
end

Instance Method Details

#aggregate(options = {}) ⇒ `Daru::DataFrame`

Function to use for aggregating the data. `group_by` uses Daru::DataFrame#aggregate.

Examples:


df = Daru::DataFrame.new(
  name: ['Ram','Krishna','Ram','Krishna','Krishna'],
  visited: ['Hyderabad', 'Delhi', 'Mumbai', 'Raipur', 'Banglore'])

=> #<Daru::DataFrame(5x2)>
                name   visited
         0       Ram Hyderabad
         1   Krishna     Delhi
         2       Ram    Mumbai
         3   Krishna    Raipur
         4   Krishna  Banglore

df.group_by(:name)
=> #<Daru::DataFrame(5x1)>
                       visited
   Krishna         1     Delhi
                   3    Raipur
                   4  Banglore
       Ram         0 Hyderabad
                   2    Mumbai

df.group_by(:name).aggregate(visited: -> (vec){vec.to_a.join(',')})
=> #<Daru::DataFrame(2x1)>
               visited
    Krishna Delhi,Raipur,Banglore
        Ram Hyderabad,Mumbai

Parameters:

options (Hash) (defaults to: {}) —

A hash mapping each column wanted in the resulting DataFrame to the aggregation applied to it.

Returns:

(Daru::DataFrame)

# File 'lib/daru/core/group_by.rb', line 342

# Aggregates the grouped data by delegating to +@context.aggregate+.
#
# @param options [Hash] passed through unchanged to +@context.aggregate+
# @return whatever +@context.aggregate+ returns (documented as a
#   Daru::DataFrame)
def aggregate(options={})
  # Computed up front, before the context asks for the grouping data.
  grouped_index = get_grouped_index

  # The block hands the context the row positions of every group together
  # with the index to use for the aggregated result.
  @context.aggregate(options) do
    [@groups_by_pos.values, grouped_index]
  end
end

#count ⇒ `Object`

Count groups, excludes missing values.

Examples:

Using count

df = Daru::DataFrame.new({
  a: %w{foo bar foo bar   foo bar foo foo},
  b: %w{one one two three two two one three},
  c:   [1  ,2  ,3  ,1    ,3  ,6  ,3  ,8],
  d:   [11 ,22 ,33 ,44   ,55 ,66 ,77 ,88]
})
df.group_by([:a, :b]).count
# =>
# #<Daru::DataFrame:76900210 @name = 7b9cf55d-17f8-48c7-b03a-2586c6e5ec5a @size = 6>
#                           c          d
# ["bar", "one"]            1          1
# ["bar", "two"]            1          1
# ["bar", "three"]          1          1
# ["foo", "one"]            2          2
# ["foo", "three"]          1          1
# ["foo", "two"]            2          2

# File 'lib/daru/core/group_by.rb', line 221

# Builds a frame with one column per non-grouping vector, each column being
# the result of +size+ (the per-group sizes).
#
# NOTE(review): every column receives the same +size+ object; nothing in this
# method filters out missing values.
#
# @return [Daru::DataFrame]
def count
  column_count = @non_group_vectors.size
  repeated_sizes = Array.new(column_count, size)

  Daru::DataFrame.new(repeated_sizes, order: @non_group_vectors)
end

#df ⇒ `Object` Also known as: grouped_df

lazy accessor/attr_reader for the attribute df



67
68
69

# File 'lib/daru/core/group_by.rb', line 67

# Lazily builds the grouped frame and caches it in +@df+.
# A nil (or false) result is not cached, so it is recomputed on the next call.
#
# @return result of +GroupBy.df_from_group_map+ for the context, the group
#   positions and the non-grouping vectors
def df
  return @df if @df

  @df = GroupBy.df_from_group_map(@context, @groups_by_pos, @non_group_vectors)
end

#each_group ⇒ `Object`

Iterate over each group created by group_by. A DataFrame is yielded in block.

# File 'lib/daru/core/group_by.rb', line 74

# Yields every group, in the key order of +groups+, as whatever +get_group+
# returns for that key.
#
# @yield [group] the result of +get_group+ for one group key
# @return [Enumerator] when no block is given
# @return [Array] the group keys, when a block is given
def each_group
  return enum_for(:each_group) unless block_given?

  group_keys = groups.keys
  group_keys.each { |key| yield get_group(key) }
end

#first ⇒ `Object`

Get the first group



117
118
119

# File 'lib/daru/core/group_by.rb', line 117

# Shorthand for +head(1)+; returns whatever +head+ returns for a quantity of 1.
def first
  head(1)
end

#get_group(group) ⇒ `Object`

Returns one of the selected groups as a DataFrame.

Examples:

Getting a group


df = Daru::DataFrame.new({
      a: %w{foo bar foo bar   foo bar foo foo},
      b: %w{one one two three two two one three},
      c:   [1  ,2  ,3  ,1    ,3  ,6  ,3  ,8],
      d:   [11 ,22 ,33 ,44   ,55 ,66 ,77 ,88]
    })
df.group_by([:a, :b]).get_group ['bar','two']
#=>
##<Daru::DataFrame:83258980 @name = 687ee3f6-8874-4899-97fa-9b31d84fa1d5 @size = 1>
#                    a          b          c          d
#         5        bar        two          6         66

Parameters:

group (Array) —

The group that is to be selected from those grouped.

# File 'lib/daru/core/group_by.rb', line 258

# Returns the rows of one group as a DataFrame.
#
# Row data is fetched by position (from +@groups_by_pos+), while the resulting
# frame is labelled with the matching index entries (from +groups_by_idx+).
# Keeping the two apart matters when the context's index is not the default
# 0..n-1 range: index labels are not valid positions into the row array.
#
# @param group [Array] key of the group to select
# @return [Daru::DataFrame] the group's rows, with all of the context's vectors
def get_group(group)
  positions = @groups_by_pos[group]
  labels    = groups_by_idx[group]

  # Vectors are column-wise; transposing gives one Array per row.
  row_values = @context.each_vector.map(&:to_a).transpose
  rows       = positions.map { |pos| row_values[pos] }

  Daru::DataFrame.rows(rows, index: labels, order: @context.vectors)
end

#groups ⇒ `Object` Also known as: groups_by_idx

lazy accessor/attr_reader for the attribute groups



61
62
63

# File 'lib/daru/core/group_by.rb', line 61

# Lazily builds the group => index entries map and caches it in +@groups+.
#
# @return result of +GroupBy.group_map_from_positions_to_indexes+ for the
#   group positions and the context's index
def groups
  return @groups if @groups

  @groups = GroupBy.group_map_from_positions_to_indexes(@groups_by_pos, @context.index)
end

#head(quantity = 5) ⇒ `Object`

Get the first ‘n’ rows of each group

Examples:

Usage of head

df = Daru::DataFrame.new({
  a: %w{foo bar foo bar   foo bar foo foo},
  b: %w{one one two three two two one three},
  c:   [1  ,2  ,3  ,1    ,3  ,6  ,3  ,8],
  d:   [11 ,22 ,33 ,44   ,55 ,66 ,77 ,88]
})
df.group_by([:a, :b]).head(1)
# =>
# #<Daru::DataFrame:82745170 @name = d7003f75-5eb9-4967-9303-c08dd9160224 @size = 6>
#                     a          b          c          d
#          1        bar        one          2         22
#          3        bar      three          1         44
#          5        bar        two          6         66
#          0        foo        one          1         11
#          7        foo      three          8         88
#          2        foo        two          3         33

Parameters:

quantity (Fixnum) (defaults to: 5) —

(5) The number of groups.



145
146
147

# File 'lib/daru/core/group_by.rb', line 145

# Selects from the start of every group by delegating to
# +select_groups_from(:first, quantity)+ (defined elsewhere in this class).
#
# @param quantity [Integer] amount forwarded to +select_groups_from+
#   (defaults to 5)
def head(quantity=5)
  select_groups_from(:first, quantity)
end

#inspect ⇒ `Object`



302
303
304

# File 'lib/daru/core/group_by.rb', line 302

# Inspecting a GroupBy shows the inspection of its grouped frame.
#
# @return result of calling +inspect+ on +grouped_df+
def inspect
  frame = grouped_df
  frame.inspect
end

#last ⇒ `Object`

Get the last group



122
123
124

# File 'lib/daru/core/group_by.rb', line 122

# Shorthand for +tail(1)+; returns whatever +tail+ returns for a quantity of 1.
def last
  tail(1)
end

#max ⇒ `Object`

Find the max element of each numeric vector group.



233
234
235

# File 'lib/daru/core/group_by.rb', line 233

# Per-group maximum of the numeric vectors, as documented for this class.
# The work is delegated to +apply_method+ (not shown here) with +:numeric+
# and +:max+.
def max
  apply_method(:numeric, :max)
end

#mean ⇒ `Object`

Calculate mean of numeric groups, excluding missing values.

Examples:

Usage of mean

df = Daru::DataFrame.new({
  a: %w{foo bar foo bar   foo bar foo foo},
  b: %w{one one two three two two one three},
  c:   [1  ,2  ,3  ,1    ,3  ,6  ,3  ,8],
  d:   [11 ,22 ,33 ,44   ,55 ,66 ,77 ,88]
})
df.group_by([:a, :b]).mean
# =>
# #<Daru::DataFrame:81097450 @name = 0c32983f-3e06-451f-a9c9-051cadfe7371 @size = 6>
#                         c          d
# ["bar", "one"]          2         22
# ["bar", "three"]        1         44
# ["bar", "two"]          6         66
# ["foo", "one"]        2.0       44.0
# ["foo", "three"]        8         88
# ["foo", "two"]        3.0       44.0



189
190
191

# File 'lib/daru/core/group_by.rb', line 189

# Per-group mean of the numeric vectors, as documented for this class.
# The work is delegated to +apply_method+ (not shown here) with +:numeric+
# and +:mean+.
def mean
  apply_method(:numeric, :mean)
end

#median ⇒ `Object`

Calculate the median of numeric groups, excluding missing values.



194
195
196

# File 'lib/daru/core/group_by.rb', line 194

# Per-group median of the numeric vectors, as documented for this class.
# The work is delegated to +apply_method+ (not shown here) with +:numeric+
# and +:median+.
def median
  apply_method(:numeric, :median)
end

#min ⇒ `Object`

Find the min element of each numeric vector group.



238
239
240

# File 'lib/daru/core/group_by.rb', line 238

# Per-group minimum of the numeric vectors, as documented for this class.
# The work is delegated to +apply_method+ (not shown here) with +:numeric+
# and +:min+.
def min
  apply_method(:numeric, :min)
end

#reduce(init = nil) {|block| ... } ⇒ `Object`

Iteratively applies a function to the values in a group and accumulates the result.

Examples:

Usage of reduce

df = Daru::DataFrame.new({
  a: ['a','b'] * 3,
  b: [1,2,3] * 2,
  c: 'A'..'F'
})
df.group_by([:a]).reduce('') { |result, row| result += row[:c]; result }
# =>
# #<Daru::Vector:70343147159900 @name = nil @size = 2 >
#     nil
#   a ACE
#   b BDF

Parameters:

init (nil) (defaults to: nil) —

The initial value of the accumulator.

Yield Parameters:

block (Proc) —

A proc or lambda that accepts two arguments. The first argument is the accumulated result. The second argument is a DataFrame row.

# File 'lib/daru/core/group_by.rb', line 285

# Folds the rows of every group into a single value.
#
# For each group the accumulator starts at +init+; the block is then called
# once per row of the group, in row order, and its return value becomes the
# next accumulator.
#
# NOTE: the same +init+ object is the starting value of every group, so a
# block that mutates it in place will leak state between groups.
#
# @param init initial accumulator value (defaults to nil)
# @yieldparam accumulated the accumulator so far
# @yieldparam row the context row for the current position
# @yieldreturn the new accumulator
# @return [Daru::Vector] one reduced value per group, indexed by
#   +get_grouped_index+ of the group keys
def reduce(init=nil)
  # Positions are translated to index labels exactly once, from the raw
  # position map. (The label map must not be fed through this lookup again:
  # labels are only valid positions for a default 0..n-1 index.)
  index_labels = @context.index.to_a

  result_hash = @groups_by_pos.each_with_object({}) do |(group, positions), h|
    grouped_result = init
    positions.each do |pos|
      grouped_result = yield(grouped_result, @context.row[index_labels[pos]])
    end

    h[group] = grouped_result
  end

  index = get_grouped_index(result_hash.keys)

  Daru::Vector.new(result_hash.values, index: index)
end

#size ⇒ `Object`

Get a Daru::Vector of the size of each group.

# File 'lib/daru/core/group_by.rb', line 109

# Number of rows in every group.
#
# @return [Daru::Vector] named +:size+, indexed by +get_grouped_index+, holding
#   the length of each group's position list
def size
  grouped_index = get_grouped_index

  group_sizes = @groups_by_pos.map { |_group, positions| positions.size }
  Daru::Vector.new(group_sizes, index: grouped_index, name: :size)
end

#std ⇒ `Object`

Calculate sample standard deviation of numeric vector groups, excluding missing values.



228
229
230

# File 'lib/daru/core/group_by.rb', line 228

# Per-group sample standard deviation of the numeric vectors, as documented
# for this class. The work is delegated to +apply_method+ (not shown here)
# with +:numeric+ and +:std+.
def std
  apply_method(:numeric, :std)
end

#sum ⇒ `Object`

Calculate sum of numeric groups, excluding missing values.



199
200
201

# File 'lib/daru/core/group_by.rb', line 199

# Per-group sum of the numeric vectors, as documented for this class.
# The work is delegated to +apply_method+ (not shown here) with +:numeric+
# and +:sum+.
def sum
  apply_method(:numeric, :sum)
end

#tail(quantity = 5) ⇒ `Object`

Get the last ‘n’ rows of each group

Examples:

Usage of tail

df = Daru::DataFrame.new({
  a: %w{foo bar foo bar   foo bar foo foo},
  b: %w{one one two three two two one three},
  c:   [1  ,2  ,3  ,1    ,3  ,6  ,3  ,8],
  d:   [11 ,22 ,33 ,44   ,55 ,66 ,77 ,88]
})
df.group_by([:a, :b]).tail(1)
# =>
# #<Daru::DataFrame:82378270 @name = 0623db46-5425-41bd-a843-99baac3d1d9a @size = 6>
#                     a          b          c          d
#          1        bar        one          2         22
#          3        bar      three          1         44
#          5        bar        two          6         66
#          6        foo        one          3         77
#          7        foo      three          8         88
#          4        foo        two          3         55

Parameters:

quantity (Fixnum) (defaults to: 5) —

(5) The number of groups.



168
169
170

# File 'lib/daru/core/group_by.rb', line 168

# Selects from the end of every group by delegating to
# +select_groups_from(:last, quantity)+ (defined elsewhere in this class).
#
# @param quantity [Integer] amount forwarded to +select_groups_from+
#   (defaults to 5)
def tail(quantity=5)
  select_groups_from(:last, quantity)
end

Class: Daru::Core::GroupBy

Constant Summary collapse

Class Method Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(context, names) ⇒ GroupBy

Class Method Details

.df_from_group_map(df, group_map, remaining_vectors, from_position: true) ⇒ Object

.get_positions_group_for_aggregation(multi_index, level = -1) ⇒ Object

.get_positions_group_map_for_df(df, group_by_keys, sort: true) ⇒ Object

.get_positions_group_map_on(indexes_with_positions, sort: false) ⇒ Object

.group_map_from_positions_to_indexes(positions_group_map, index) ⇒ Object

Instance Method Details

#aggregate(options = {}) ⇒ Daru::DataFrame

#count ⇒ Object

#df ⇒ Object Also known as: grouped_df

#each_group ⇒ Object

#first ⇒ Object

#get_group(group) ⇒ Object

#groups ⇒ Object Also known as: groups_by_idx

#head(quantity = 5) ⇒ Object

#inspect ⇒ Object

#last ⇒ Object

#max ⇒ Object

#mean ⇒ Object

#median ⇒ Object

#min ⇒ Object

#reduce(init = nil) {|block| ... } ⇒ Object

#size ⇒ Object

#std ⇒ Object

#sum ⇒ Object

#tail(quantity = 5) ⇒ Object

#initialize(context, names) ⇒ `GroupBy`

.df_from_group_map(df, group_map, remaining_vectors, from_position: true) ⇒ `Object`

.get_positions_group_for_aggregation(multi_index, level = -1) ⇒ `Object`

.get_positions_group_map_for_df(df, group_by_keys, sort: true) ⇒ `Object`

.get_positions_group_map_on(indexes_with_positions, sort: false) ⇒ `Object`

.group_map_from_positions_to_indexes(positions_group_map, index) ⇒ `Object`

#aggregate(options = {}) ⇒ `Daru::DataFrame`

#count ⇒ `Object`

#df ⇒ `Object` Also known as: grouped_df

#each_group ⇒ `Object`

#first ⇒ `Object`

#get_group(group) ⇒ `Object`

#groups ⇒ `Object` Also known as: groups_by_idx

#head(quantity = 5) ⇒ `Object`

#inspect ⇒ `Object`

#last ⇒ `Object`

#max ⇒ `Object`

#mean ⇒ `Object`

#median ⇒ `Object`

#min ⇒ `Object`

#reduce(init = nil) {|block| ... } ⇒ `Object`

#size ⇒ `Object`

#std ⇒ `Object`

#sum ⇒ `Object`

#tail(quantity = 5) ⇒ `Object`