Class: Daru::Core::GroupBy

Inherits:
Object show all
Extended by:
Gem::Deprecate
Defined in:
lib/daru/core/group_by.rb

Constant Summary collapse

# Comparator for group-key tuples (arrays), usable with Array#sort.
# A falsy +right+ gives -1; otherwise a falsy +left+ gives 1. Both tuples are
# then stripped of nils: tuples of equal stripped length are compared with
# <=> (0 when <=> cannot compare them), tuples of different stripped length
# are ordered by that length.
TUPLE_SORTER =
lambda do |left, right|
  if !right
    -1
  elsif !left
    1
  else
    lhs = left.compact
    rhs = right.compact
    if lhs.length == rhs.length
      (lhs <=> rhs) || 0
    else
      lhs.length <=> rhs.length
    end
  end
end

Instance Attribute Summary collapse

Class Method Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(context, names) ⇒ GroupBy

Returns a new instance of GroupBy.



99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
# File 'lib/daru/core/group_by.rb', line 99

# Stores the source frame and the grouping vector names, and precomputes the
# group -> row positions map.
#
# @param context the frame being grouped; must respond to #vectors
# @param names [Array] names of the vectors to group by
def initialize(context, names)
  @context = context # TODO: maybe rename in @original_df

  @group_vectors     = names
  @non_group_vectors = context.vectors.to_a - names

  # FIXME: It feels like we don't want to sort here. Ruby's #group_by
  # never sorts:
  #
  #   ['test', 'me', 'please'].group_by(&:size)
  #   #  => {4=>["test"], 2=>["me"], 6=>["please"]}
  #
  # - zverok, 2016-09-12
  @groups_by_pos = GroupBy.get_positions_group_map_for_df(context, names, sort: true)
end

Instance Attribute Details

#group_vectors ⇒ Object (readonly)

The group_by was done over the vectors in group_vectors; the remaining vectors are the non_group_vectors



65
66
67
# File 'lib/daru/core/group_by.rb', line 65

# Reader for @group_vectors.
def group_vectors
  @group_vectors
end

#non_group_vectors ⇒ Object (readonly)

The group_by was done over the vectors in group_vectors; the remaining vectors are the non_group_vectors



65
66
67
# File 'lib/daru/core/group_by.rb', line 65

# Reader for @non_group_vectors.
def non_group_vectors
  @non_group_vectors
end

Class Method Details

.df_from_group_map(df, group_map, remaining_vectors, from_position: true) ⇒ Object



48
49
50
51
52
53
54
55
56
57
58
59
60
61
# File 'lib/daru/core/group_by.rb', line 48

# Builds a frame whose index is a MultiIndex of (group key + row reference)
# tuples and whose rows follow the order of the group map.
#
# @param df source frame; indexed with the remaining vector names
# @param group_map [Hash] group key (Array) => row references
# @param remaining_vectors [Array] vectors to keep in the result
# @param from_position [Boolean] forwarded as +by_position+ to get_sub_dataframe
# @return [Daru::DataFrame, nil] nil when the group map is an empty Hash
def df_from_group_map(df, group_map, remaining_vectors, from_position: true)
  return nil if group_map == {}

  tuples = []
  group_map.each do |group, values|
    values.each { |val| tuples << (group + [val]) }
  end
  new_index = Daru::MultiIndex.from_tuples(tuples)

  # No data columns left: return an empty frame that only carries the index.
  return Daru::DataFrame.new({}, index: new_index) if remaining_vectors == []

  rows_in_group_order = group_map.values.flatten
  subset = df[*remaining_vectors].to_df
  subset.get_sub_dataframe(rows_in_group_order, by_position: from_position).tap do |frame|
    frame.index = new_index
  end
end

.get_positions_group_for_aggregation(multi_index, level = -1) ⇒ Object



26
27
28
29
30
31
32
33
# File 'lib/daru/core/group_by.rb', line 26

# Groups the positions of a multi-index by its tuples after one layer has
# been removed.
#
# @param multi_index [Daru::MultiIndex] index to group; only a duplicate of it
#   is handed to remove_layer
# @param level [Integer] layer passed to Daru::MultiIndex#remove_layer
# @return the result of group_by_index_to_positions for the reduced index
# @raise [RuntimeError] if multi_index is not a Daru::MultiIndex
def get_positions_group_for_aggregation(multi_index, level=-1)
  unless multi_index.is_a?(Daru::MultiIndex)
    # Same exception class as a bare `raise`, but with a message that names the problem.
    raise "expected a Daru::MultiIndex, got #{multi_index.class}"
  end

  new_index = multi_index.dup
  new_index.remove_layer(level) # TODO: recheck code of Daru::MultiIndex#remove_layer

  group_by_index_to_positions(new_index.each_with_index)
end

.get_positions_group_map_for_df(df, group_by_keys, sort: true) ⇒ Object



36
37
38
39
40
# File 'lib/daru/core/group_by.rb', line 36

# Maps each distinct combination of values in the grouping vectors to the
# positions of the rows holding it.
#
# @param df frame to group; indexed with the grouping keys
# @param group_by_keys [Array] names of the vectors to group by
# @param sort [Boolean] forwarded to group_by_index_to_positions
# @return the result of group_by_index_to_positions
def get_positions_group_map_for_df(df, group_by_keys, sort: true)
  key_rows = df[*group_by_keys].to_df.each_row.map(&:to_a)

  group_by_index_to_positions(key_rows.each_with_index, sort: sort)
end

.group_by_index_to_positions(indexes_with_positions, sort: false) ⇒ Object Also known as: get_positions_group_map_on



8
9
10
11
12
13
14
15
16
17
18
19
20
21
# File 'lib/daru/core/group_by.rb', line 8

# Collects, for every distinct key, the positions it was paired with.
#
# @param indexes_with_positions an #each-able yielding (key, position) pairs
# @param sort [Boolean] when true, keys are ordered with TUPLE_SORTER;
#   otherwise first-seen order is kept
# @return [Hash] key => Array of positions
def group_by_index_to_positions(indexes_with_positions, sort: false)
  grouped = {}

  indexes_with_positions.each do |key, pos|
    grouped[key] = [] unless grouped.key?(key)
    grouped[key] << pos
  end

  return grouped unless sort # TODO: maybe add a more "stable" sorting option?

  ordered_keys = grouped.keys.sort(&Daru::Core::GroupBy::TUPLE_SORTER)
  ordered_keys.each_with_object({}) { |key, result| result[key] = grouped[key] }
end

.group_map_from_positions_to_indexes(positions_group_map, index) ⇒ Object



43
44
45
# File 'lib/daru/core/group_by.rb', line 43

# Translates a group => positions map into a group => index labels map.
#
# @param positions_group_map [Hash] group key => Array of positions
# @param index object whose #at(position) returns the label at that position
# @return [Hash] group key => Array of labels, in the same key order
def group_map_from_positions_to_indexes(positions_group_map, index)
  pairs = positions_group_map.map do |group_key, positions|
    labels = positions.map { |position| index.at(position) }
    [group_key, labels]
  end

  pairs.to_h
end

Instance Method Details

#aggregate(options = {}) ⇒ Daru::DataFrame

Function to use for aggregating the data. `group_by` uses Daru::DataFrame#aggregate.

Examples:


df = Daru::DataFrame.new(
  name: ['Ram','Krishna','Ram','Krishna','Krishna'],
  visited: ['Hyderabad', 'Delhi', 'Mumbai', 'Raipur', 'Banglore'])

=> #<Daru::DataFrame(5x2)>
                name   visited
         0       Ram Hyderabad
         1   Krishna     Delhi
         2       Ram    Mumbai
         3   Krishna    Raipur
         4   Krishna  Banglore

df.group_by(:name)
=> #<Daru::DataFrame(5x1)>
                       visited
   Krishna         1     Delhi
                   3    Raipur
                   4  Banglore
       Ram         0 Hyderabad
                   2    Mumbai

df.group_by(:name).aggregate(visited: -> (vec){vec.to_a.join(',')})
=> #<Daru::DataFrame(2x1)>
               visited
    Krishna Delhi,Raipur,Banglore
        Ram Hyderabad,Mumbai

Parameters:

  • options (Hash) (defaults to: {})

    options for the columns you want in the resulting DataFrame

Returns:



349
350
351
352
353
# File 'lib/daru/core/group_by.rb', line 349

# Aggregates the grouped data by delegating to the original frame's
# #aggregate, supplying the grouped row positions and the grouped index
# through the block.
#
# @param options [Hash] forwarded unchanged to @context.aggregate
# @return whatever @context.aggregate returns
def aggregate(options={})
  grouped_index = get_grouped_index

  @context.aggregate(options) do
    [@groups_by_pos.values, grouped_index]
  end
end

#count ⇒ Object

Count groups, excludes missing values.

Examples:

Using count

df = Daru::DataFrame.new({
  a: %w{foo bar foo bar   foo bar foo foo},
  b: %w{one one two three two two one three},
  c:   [1  ,2  ,3  ,1    ,3  ,6  ,3  ,8],
  d:   [11 ,22 ,33 ,44   ,55 ,66 ,77 ,88]
})
df.group_by([:a, :b]).count
# =>
# #<Daru::DataFrame:76900210 @name = 7b9cf55d-17f8-48c7-b03a-2586c6e5ec5a @size = 6>
#                           c          d
# ["bar", "one"]            1          1
# ["bar", "two"]            1          1
# ["bar", "three"]          1          1
# ["foo", "one"]            2          2
# ["foo", "three"]          1          1
# ["foo", "two"]            2          2


228
229
230
231
# File 'lib/daru/core/group_by.rb', line 228

# Builds a DataFrame with one column per non-group vector; every column is
# the value returned by #size.
#
# @return [Daru::DataFrame]
def count
  column_total = @non_group_vectors.size
  size_columns = Array.new(column_total, size)

  Daru::DataFrame.new(size_columns, order: @non_group_vectors)
end

#df ⇒ Object Also known as: grouped_df

lazy accessor/attr_reader for the attribute df



74
75
76
# File 'lib/daru/core/group_by.rb', line 74

# Lazily builds and memoizes the grouped frame from the original frame, the
# group -> positions map and the non-group vectors.
def df
  return @df if @df

  @df = GroupBy.df_from_group_map(@context, @groups_by_pos, @non_group_vectors)
end

#each_group ⇒ Object

Iterate over each group created by group_by. A DataFrame is yielded to the block.



81
82
83
84
85
86
87
# File 'lib/daru/core/group_by.rb', line 81

# Yields the result of get_group for every group key, in the order of
# groups.keys. Without a block, returns an enumerator over the same values.
def each_group
  if block_given?
    groups.keys.each { |group_key| yield get_group(group_key) }
  else
    to_enum(:each_group)
  end
end

#first ⇒ Object

Get the first row of each group (equivalent to head(1))



124
125
126
# File 'lib/daru/core/group_by.rb', line 124

# Shorthand for head(1).
def first
  head(1)
end

#get_group(group) ⇒ Object

Returns one of the selected groups as a DataFrame.

Examples:

Getting a group


df = Daru::DataFrame.new({
      a: %w{foo bar foo bar   foo bar foo foo},
      b: %w{one one two three two two one three},
      c:   [1  ,2  ,3  ,1    ,3  ,6  ,3  ,8],
      d:   [11 ,22 ,33 ,44   ,55 ,66 ,77 ,88]
    })
df.group_by([:a, :b]).get_group ['bar','two']
#=>
##<Daru::DataFrame:83258980 @name = 687ee3f6-8874-4899-97fa-9b31d84fa1d5 @size = 1>
#                    a          b          c          d
#         5        bar        two          6         66

Parameters:

  • group (Array)

    The group that is to be selected from those grouped.



265
266
267
268
269
270
271
272
273
274
# File 'lib/daru/core/group_by.rb', line 265

# Returns one of the groups as a DataFrame containing every vector of the
# original frame.
#
# @param group [Array] key of the group to select
# @return the result of Daru::DataFrame.rows for the group's rows, indexed by
#   the rows' original index labels
def get_group group
  # Row data is fetched by position. Index labels (groups_by_idx) coincide
  # with positions only for a default 0..n-1 index, so they are used solely
  # to label the resulting rows.
  positions = @groups_by_pos[group]
  indexes   = groups_by_idx[group]
  elements  = @context.each_vector.map(&:to_a)
  transpose = elements.transpose
  rows      = positions.map { |pos| transpose[pos] }

  Daru::DataFrame.rows(
    rows, index: indexes, order: @context.vectors
  )
end

#groups ⇒ Object Also known as: groups_by_idx

lazy accessor/attr_reader for the attribute groups



68
69
70
# File 'lib/daru/core/group_by.rb', line 68

# Lazily builds and memoizes the group -> index labels map, derived from the
# group -> positions map and the original frame's index.
def groups
  return @groups if @groups

  @groups = GroupBy.group_map_from_positions_to_indexes(@groups_by_pos, @context.index)
end

#head(quantity = 5) ⇒ Object

Get the first ‘n’ rows of each group

Examples:

Usage of head

df = Daru::DataFrame.new({
  a: %w{foo bar foo bar   foo bar foo foo},
  b: %w{one one two three two two one three},
  c:   [1  ,2  ,3  ,1    ,3  ,6  ,3  ,8],
  d:   [11 ,22 ,33 ,44   ,55 ,66 ,77 ,88]
})
df.group_by([:a, :b]).head(1)
# =>
# #<Daru::DataFrame:82745170 @name = d7003f75-5eb9-4967-9303-c08dd9160224 @size = 6>
#                     a          b          c          d
#          1        bar        one          2         22
#          3        bar      three          1         44
#          5        bar        two          6         66
#          0        foo        one          1         11
#          7        foo      three          8         88
#          2        foo        two          3         33

Parameters:

  • quantity (Fixnum) (defaults to: 5)

    (5) The number of rows to take from each group.



152
153
154
# File 'lib/daru/core/group_by.rb', line 152

# Delegates to select_groups_from with :first and the requested quantity.
#
# @param quantity [Integer] amount forwarded to select_groups_from (default 5)
def head(quantity = 5)
  select_groups_from(:first, quantity)
end

#inspect ⇒ Object



309
310
311
# File 'lib/daru/core/group_by.rb', line 309

# Returns the #inspect output of grouped_df.
def inspect
  grouped_df.inspect
end

#last ⇒ Object

Get the last row of each group (equivalent to tail(1))



129
130
131
# File 'lib/daru/core/group_by.rb', line 129

# Shorthand for tail(1).
def last
  tail(1)
end

#max ⇒ Object

Find the max element of each numeric vector group.



240
241
242
# File 'lib/daru/core/group_by.rb', line 240

# Delegates to apply_method with :numeric and :max.
def max
  apply_method(:numeric, :max)
end

#mean ⇒ Object

Calculate mean of numeric groups, excluding missing values.

Examples:

Usage of mean

df = Daru::DataFrame.new({
  a: %w{foo bar foo bar   foo bar foo foo},
  b: %w{one one two three two two one three},
  c:   [1  ,2  ,3  ,1    ,3  ,6  ,3  ,8],
  d:   [11 ,22 ,33 ,44   ,55 ,66 ,77 ,88]
})
df.group_by([:a, :b]).mean
# =>
# #<Daru::DataFrame:81097450 @name = 0c32983f-3e06-451f-a9c9-051cadfe7371 @size = 6>
#                         c          d
# ["bar", "one"]          2         22
# ["bar", "three"]        1         44
# ["bar", "two"]          6         66
# ["foo", "one"]        2.0       44.0
# ["foo", "three"]        8         88
# ["foo", "two"]        3.0       44.0


196
197
198
# File 'lib/daru/core/group_by.rb', line 196

# Delegates to apply_method with :numeric and :mean.
def mean
  apply_method(:numeric, :mean)
end

#median ⇒ Object

Calculate the median of numeric groups, excluding missing values.



201
202
203
# File 'lib/daru/core/group_by.rb', line 201

# Delegates to apply_method with :numeric and :median.
def median
  apply_method(:numeric, :median)
end

#min ⇒ Object

Find the min element of each numeric vector group.



245
246
247
# File 'lib/daru/core/group_by.rb', line 245

# Delegates to apply_method with :numeric and :min.
def min
  apply_method(:numeric, :min)
end

#reduce(init = nil) {|block| ... } ⇒ Object

Iteratively applies a function to the values in a group and accumulates the result.

Examples:

Usage of reduce

df = Daru::DataFrame.new({
  a: ['a','b'] * 3,
  b: [1,2,3] * 2,
  c: 'A'..'F'
})
df.group_by([:a]).reduce('') { |result, row| result += row[:c]; result }
# =>
# #<Daru::Vector:70343147159900 @name = nil @size = 2 >
#     nil
#   a ACE
#   b BDF

Parameters:

  • init (nil) (defaults to: nil)

    The initial value of the accumulator.

Yield Parameters:

  • block (Proc)

    A proc or lambda that accepts two arguments. The first argument is the accumulated result. The second argument is a DataFrame row.



292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
# File 'lib/daru/core/group_by.rb', line 292

# Folds the rows of every group into a single value.
#
# @param init initial accumulator; the same object is used as the starting
#   value for every group
# @yield [accumulator, row] called once per row of a group, in group order
# @yieldreturn the accumulator to pass to the next row
# @return [Daru::Vector] one accumulated value per group, indexed by the
#   result of get_grouped_index for the group keys
def reduce(init=nil)
  result_hash = groups_by_idx.each_with_object({}) do |(group, indices), h|
    # groups_by_idx already holds index labels, so they go straight to
    # @context.row[]. Translating them again through @context.index.to_a
    # treated labels as positions, which is only valid for a default
    # 0..n-1 index.
    h[group] = indices.inject(init) do |grouped_result, idx|
      yield(grouped_result, @context.row[idx])
    end
  end

  index = get_grouped_index(result_hash.keys)

  Daru::Vector.new(result_hash.values, index: index)
end

#size ⇒ Object

Get a Daru::Vector of the size of each group.



116
117
118
119
120
121
# File 'lib/daru/core/group_by.rb', line 116

# Builds a Daru::Vector named :size holding the number of row positions in
# each group, indexed by the grouped index.
#
# @return [Daru::Vector]
def size
  grouped_index = get_grouped_index
  group_sizes = @groups_by_pos.values.map { |positions| positions.size }

  Daru::Vector.new(group_sizes, index: grouped_index, name: :size)
end

#std ⇒ Object

Calculate sample standard deviation of numeric vector groups, excluding missing values.



235
236
237
# File 'lib/daru/core/group_by.rb', line 235

# Delegates to apply_method with :numeric and :std.
def std
  apply_method(:numeric, :std)
end

#sum ⇒ Object

Calculate sum of numeric groups, excluding missing values.



206
207
208
# File 'lib/daru/core/group_by.rb', line 206

# Delegates to apply_method with :numeric and :sum.
def sum
  apply_method(:numeric, :sum)
end

#tail(quantity = 5) ⇒ Object

Get the last ‘n’ rows of each group

Examples:

Usage of tail

df = Daru::DataFrame.new({
  a: %w{foo bar foo bar   foo bar foo foo},
  b: %w{one one two three two two one three},
  c:   [1  ,2  ,3  ,1    ,3  ,6  ,3  ,8],
  d:   [11 ,22 ,33 ,44   ,55 ,66 ,77 ,88]
})
df.group_by([:a, :b]).tail(1)
# =>
# #<Daru::DataFrame:82378270 @name = 0623db46-5425-41bd-a843-99baac3d1d9a @size = 6>
#                     a          b          c          d
#          1        bar        one          2         22
#          3        bar      three          1         44
#          5        bar        two          6         66
#          6        foo        one          3         77
#          7        foo      three          8         88
#          4        foo        two          3         55

Parameters:

  • quantity (Fixnum) (defaults to: 5)

    (5) The number of rows to take from each group.



175
176
177
# File 'lib/daru/core/group_by.rb', line 175

# Delegates to select_groups_from with :last and the requested quantity.
#
# @param quantity [Integer] amount forwarded to select_groups_from (default 5)
def tail(quantity = 5)
  select_groups_from(:last, quantity)
end