Class: Daru::Core::GroupBy

Inherits:
Object show all
Defined in:
lib/daru/core/group_by.rb

Constant Summary collapse

# Comparator for group-key tuples (arrays that may contain nils).
#
# Ordering rules, in priority order:
# * a falsy +right+ sorts the pair as -1, then a falsy +left+ as 1;
# * nils are stripped from both tuples before comparing;
# * tuples of equal (compacted) length are compared element-wise, with
#   incomparable tuples treated as equal (0);
# * otherwise the shorter compacted tuple sorts first.
TUPLE_SORTER = lambda { |left, right|
  if !right
    -1
  elsif !left
    1
  else
    lhs = left.compact
    rhs = right.compact

    if lhs.length == rhs.length
      # Array#<=> returns nil for incomparable elements; fall back to 0.
      (lhs <=> rhs) || 0
    else
      lhs.length <=> rhs.length
    end
  end
}

Instance Attribute Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(context, names) ⇒ GroupBy



24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
# File 'lib/daru/core/group_by.rb', line 24

# Sets up the grouping state for +context+ keyed by the vectors in +names+.
#
# @param context [Object] the frame being grouped; must respond to
#   #vectors and #[] (presumably a Daru::DataFrame — confirm with caller)
# @param names [Array] names of the vectors to group by
def initialize(context, names)
  @context = context
  @groups = {}
  # Every vector that is not itself a grouping key.
  @non_group_vectors = context.vectors.to_a - names

  # One array of values per grouping vector, zipped row-wise into key tuples.
  key_columns = names.map { |name| context[name].to_a }
  leading, *remaining = key_columns
  tuples = leading.zip(*remaining)
  # FIXME: It feels like we don't want to sort here. Ruby's #group_by
  # never sorts:
  #
  #   ['test', 'me', 'please'].group_by(&:size)
  #   #  => {4=>["test"], 2=>["me"], 6=>["please"]}
  #
  # - zverok, 2016-09-12
  init_groups_df(tuples, names)
end

Instance Attribute Details

#dfObject (readonly)

Returns the value of attribute df.



4
5
6
# File 'lib/daru/core/group_by.rb', line 4

# Reader for the +@df+ instance variable.
#
# @return [Object] whatever is stored in @df; presumably the grouped
#   frame built during construction — TODO confirm where it is assigned
def df
  @df
end

#groupsObject (readonly)

Returns the value of attribute groups.



4
5
6
# File 'lib/daru/core/group_by.rb', line 4

# Reader for the +@groups+ instance variable.
#
# @return [Hash] starts out as an empty Hash in the constructor; other
#   methods read it as group key => array of row positions
def groups
  @groups
end

Instance Method Details

#aggregate(options = {}) ⇒ Daru::DataFrame

Function to use for aggregating the data. `group_by` uses Daru::DataFrame#aggregate.

Examples:


df = Daru::DataFrame.new(
  name: ['Ram','Krishna','Ram','Krishna','Krishna'],
  visited: ['Hyderabad', 'Delhi', 'Mumbai', 'Raipur', 'Banglore'])

=> #<Daru::DataFrame(5x2)>
                name   visited
         0       Ram Hyderabad
         1   Krishna     Delhi
         2       Ram    Mumbai
         3   Krishna    Raipur
         4   Krishna  Banglore

df.group_by(:name)
=> #<Daru::DataFrame(5x1)>
                       visited
   Krishna         1     Delhi
                   3    Raipur
                   4  Banglore
       Ram         0 Hyderabad
                   2    Mumbai

df.group_by(:name).aggregate(visited: -> (vec){vec.to_a.join(',')})
=> #<Daru::DataFrame(2x1)>
               visited
    Krishna Delhi,Raipur,Banglore
        Ram Hyderabad,Mumbai


284
285
286
287
# File 'lib/daru/core/group_by.rb', line 284

# Aggregates the grouped frame by delegating to the #aggregate method of @df.
#
# The last level of the index of @df is removed first, and the result is
# assigned back to @df.index, so this call changes the state of the receiver.
# NOTE(review): calling this twice on the same object would strip a second
# level — confirm whether that is intended.
#
# @param options [Hash] passed through unchanged to @df.aggregate
# @return [Object] whatever @df.aggregate returns
def aggregate(options={})
  grouped_index = @df.index
  innermost     = grouped_index.levels.size - 1
  @df.index     = grouped_index.remove_layer(innermost)
  @df.aggregate(options)
end

#countObject

Count groups, excludes missing values.

Examples:

Using count

df = Daru::DataFrame.new({
  a: %w{foo bar foo bar   foo bar foo foo},
  b: %w{one one two three two two one three},
  c:   [1  ,2  ,3  ,1    ,3  ,6  ,3  ,8],
  d:   [11 ,22 ,33 ,44   ,55 ,66 ,77 ,88]
})
df.group_by([:a, :b]).count
# =>
# #<Daru::DataFrame:76900210 @name = 7b9cf55d-17f8-48c7-b03a-2586c6e5ec5a @size = 6>
#                           c          d
# ["bar", "one"]            1          1
# ["bar", "three"]          1          1
# ["bar", "two"]            1          1
# ["foo", "one"]            2          2
# ["foo", "three"]          1          1
# ["foo", "two"]            2          2


158
159
160
161
# File 'lib/daru/core/group_by.rb', line 158

# Builds a frame that repeats the per-group sizes once for every
# non-grouping vector.
#
# NOTE(review): the value comes straight from #size, so whether missing
# values are excluded depends on how the groups were built — confirm.
#
# @return [Daru::DataFrame] one column per non-group vector, each holding
#   the same result of #size
def count
  group_sizes = size
  # The same object is reused for every column, as in `[size] * width`.
  columns = Array.new(@non_group_vectors.size, group_sizes)
  Daru::DataFrame.new(columns, order: @non_group_vectors)
end

#each_groupObject

Iterate over each group created by group_by. A DataFrame is yielded in block.



8
9
10
11
12
# File 'lib/daru/core/group_by.rb', line 8

# Yields each group, fetched with #get_group, in the order of +groups.keys+.
#
# Without a block an Enumerator over the same sequence is returned instead
# of raising LocalJumpError on the first yield.
#
# @yieldparam group [Object] the result of get_group for one key
# @return [Array, Enumerator] the array of group keys when a block is
#   given, otherwise an Enumerator
def each_group
  return to_enum(:each_group) unless block_given?

  groups.keys.each do |key|
    yield get_group(key)
  end
end

#firstObject

Get the first row of each group (equivalent to head(1)).



54
55
56
# File 'lib/daru/core/group_by.rb', line 54

# Shorthand for +head(1)+.
#
# @return [Object] whatever #head returns for a quantity of 1
def first
  head 1
end

#get_group(group) ⇒ Object

Returns one of the selected groups as a DataFrame.

Examples:

Getting a group


df = Daru::DataFrame.new({
      a: %w{foo bar foo bar   foo bar foo foo},
      b: %w{one one two three two two one three},
      c:   [1  ,2  ,3  ,1    ,3  ,6  ,3  ,8],
      d:   [11 ,22 ,33 ,44   ,55 ,66 ,77 ,88]
    })
df.group_by([:a, :b]).get_group ['bar','two']
#=>
##<Daru::DataFrame:83258980 @name = 687ee3f6-8874-4899-97fa-9b31d84fa1d5 @size = 1>
#                    a          b          c          d
#         5        bar        two          6         66


195
196
197
198
199
200
201
202
203
204
# File 'lib/daru/core/group_by.rb', line 195

# Returns the rows of one group as a DataFrame.
#
# The entries stored for a group are row positions. They are translated to
# the labels of the context's index, as #reduce does, so that a frame with
# a non-default index keeps its own row labels.
#
# @param group [Object] a key of #groups (an Array of grouping values in
#   the examples)
# @return [Daru::DataFrame] rows of the group with all vectors of the
#   context, in context order
# @raise [NoMethodError] when +group+ is not a known key (the lookup
#   returns nil)
def get_group group
  positions = @groups[group]
  labels    = @context.index.to_a
  columns   = @context.each_vector.map(&:to_a)

  # Pick only the needed cells instead of transposing the whole frame.
  rows = positions.map { |pos| columns.map { |column| column[pos] } }

  Daru::DataFrame.rows(
    rows,
    index: positions.map { |pos| labels[pos] },
    order: @context.vectors
  )
end

#head(quantity = 5) ⇒ Object

Get the top 'n' rows of each group

Examples:

Usage of head

df = Daru::DataFrame.new({
  a: %w{foo bar foo bar   foo bar foo foo},
  b: %w{one one two three two two one three},
  c:   [1  ,2  ,3  ,1    ,3  ,6  ,3  ,8],
  d:   [11 ,22 ,33 ,44   ,55 ,66 ,77 ,88]
})
df.group_by([:a, :b]).head(1)
# =>
# #<Daru::DataFrame:82745170 @name = d7003f75-5eb9-4967-9303-c08dd9160224 @size = 6>
#                     a          b          c          d
#          1        bar        one          2         22
#          3        bar      three          1         44
#          5        bar        two          6         66
#          0        foo        one          1         11
#          7        foo      three          8         88
#          2        foo        two          3         33


82
83
84
# File 'lib/daru/core/group_by.rb', line 82

# Selects from the start of every group by delegating to
# select_groups_from with :first.
#
# @param quantity [Integer] how many entries to take per group (default 5)
# @return [Object] the result of select_groups_from
def head(quantity = 5)
  select_groups_from(:first, quantity)
end

#inspectObject



244
245
246
# File 'lib/daru/core/group_by.rb', line 244

# Human-readable representation; simply the #inspect output of @df.
#
# @return [Object] the result of @df.inspect (presumably a String)
def inspect
  @df.inspect
end

#lastObject

Get the last row of each group (equivalent to tail(1)).



59
60
61
# File 'lib/daru/core/group_by.rb', line 59

# Shorthand for +tail(1)+.
#
# @return [Object] whatever #tail returns for a quantity of 1
def last
  tail 1
end

#maxObject

Find the max element of each numeric vector group.



170
171
172
# File 'lib/daru/core/group_by.rb', line 170

# Per-group maximum; delegates to apply_method with :numeric and :max.
#
# @return [Object] the result of apply_method
def max
  apply_method(:numeric, :max)
end

#meanObject

Calculate mean of numeric groups, excluding missing values.

Examples:

Usage of mean

df = Daru::DataFrame.new({
  a: %w{foo bar foo bar   foo bar foo foo},
  b: %w{one one two three two two one three},
  c:   [1  ,2  ,3  ,1    ,3  ,6  ,3  ,8],
  d:   [11 ,22 ,33 ,44   ,55 ,66 ,77 ,88]
})
df.group_by([:a, :b]).mean
# =>
# #<Daru::DataFrame:81097450 @name = 0c32983f-3e06-451f-a9c9-051cadfe7371 @size = 6>
#                         c          d
# ["bar", "one"]          2         22
# ["bar", "three"]        1         44
# ["bar", "two"]          6         66
# ["foo", "one"]        2.0       44.0
# ["foo", "three"]        8         88
# ["foo", "two"]        3.0       44.0


126
127
128
# File 'lib/daru/core/group_by.rb', line 126

# Per-group mean; delegates to apply_method with :numeric and :mean.
#
# @return [Object] the result of apply_method
def mean
  apply_method(:numeric, :mean)
end

#medianObject

Calculate the median of numeric groups, excluding missing values.



131
132
133
# File 'lib/daru/core/group_by.rb', line 131

# Per-group median; delegates to apply_method with :numeric and :median.
#
# @return [Object] the result of apply_method
def median
  apply_method(:numeric, :median)
end

#minObject

Find the min element of each numeric vector group.



175
176
177
# File 'lib/daru/core/group_by.rb', line 175

# Per-group minimum; delegates to apply_method with :numeric and :min.
#
# @return [Object] the result of apply_method
def min
  apply_method(:numeric, :min)
end

#reduce(init = nil) {|block| ... } ⇒ Object

Iteratively applies a function to the values in a group and accumulates the result.

Examples:

Usage of reduce

df = Daru::DataFrame.new({
  a: ['a','b'] * 3,
  b: [1,2,3] * 2,
  c: 'A'..'F'
})
df.group_by([:a]).reduce('') { |result, row| result += row[:c]; result }
# =>
# #<Daru::Vector:70343147159900 @name = nil @size = 2 >
#     nil
#   a ACE
#   b BDF

Yield Parameters:

  • block (Proc)

    A proc or lambda that accepts two arguments. The first argument is the accumulated result. The second argument is a DataFrame row.



222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
# File 'lib/daru/core/group_by.rb', line 222

# Folds the rows of every group into a single value per group.
#
# @param init [Object] starting accumulator. The same object is used as
#   the seed for every group, so a block that mutates it in place will
#   share state across groups; return a new value from the block instead.
# @yieldparam result [Object] the accumulated value so far
# @yieldparam row [Object] the context row for the current index label
# @yieldreturn [Object] the new accumulated value
# @return [Daru::Vector] one accumulated value per group, indexed by the
#   group keys (a MultiIndex when multi_indexed_grouping? is true)
def reduce(init=nil)
  # Materialise the index labels once; doing it per row made the fold
  # quadratic in the number of rows.
  labels = @context.index.to_a

  result_hash = @groups.each_with_object({}) do |(group, positions), h|
    h[group] = positions.inject(init) do |accumulated, position|
      # Groups store positions; rows are looked up by index label.
      yield(accumulated, @context.row[labels[position]])
    end
  end

  index =
    if multi_indexed_grouping?
      Daru::MultiIndex.from_tuples result_hash.keys
    else
      Daru::Index.new result_hash.keys.flatten
    end

  Daru::Vector.new(result_hash.values, index: index)
end

#sizeObject

Get a Daru::Vector of the size of each group.



41
42
43
44
45
46
47
48
49
50
51
# File 'lib/daru/core/group_by.rb', line 41

# Number of entries recorded for each group.
#
# @return [Daru::Vector] vector named :size whose values are the sizes of
#   the entries of @groups, indexed by the group keys (a MultiIndex when
#   multi_indexed_grouping? is true, a flat Index otherwise)
def size
  keys   = @groups.keys
  counts = @groups.values.map { |members| members.size }

  index = multi_indexed_grouping? ? Daru::MultiIndex.from_tuples(keys) : Daru::Index.new(keys.flatten)

  Daru::Vector.new(counts, index: index, name: :size)
end

#stdObject

Calculate sample standard deviation of numeric vector groups, excluding missing values.



165
166
167
# File 'lib/daru/core/group_by.rb', line 165

# Per-group standard deviation; delegates to apply_method with :numeric
# and :std.
#
# @return [Object] the result of apply_method
def std
  apply_method(:numeric, :std)
end

#sumObject

Calculate sum of numeric groups, excluding missing values.



136
137
138
# File 'lib/daru/core/group_by.rb', line 136

# Per-group sum; delegates to apply_method with :numeric and :sum.
#
# @return [Object] the result of apply_method
def sum
  apply_method(:numeric, :sum)
end

#tail(quantity = 5) ⇒ Object

Get the bottom 'n' rows of each group

Examples:

Usage of tail

df = Daru::DataFrame.new({
  a: %w{foo bar foo bar   foo bar foo foo},
  b: %w{one one two three two two one three},
  c:   [1  ,2  ,3  ,1    ,3  ,6  ,3  ,8],
  d:   [11 ,22 ,33 ,44   ,55 ,66 ,77 ,88]
})
df.group_by([:a, :b]).tail(1)
# =>
# #<Daru::DataFrame:82378270 @name = 0623db46-5425-41bd-a843-99baac3d1d9a @size = 6>
#                     a          b          c          d
#          1        bar        one          2         22
#          3        bar      three          1         44
#          5        bar        two          6         66
#          6        foo        one          3         77
#          7        foo      three          8         88
#          4        foo        two          3         55


105
106
107
# File 'lib/daru/core/group_by.rb', line 105

# Selects from the end of every group by delegating to
# select_groups_from with :last.
#
# @param quantity [Integer] how many entries to take per group (default 5)
# @return [Object] the result of select_groups_from
def tail(quantity = 5)
  select_groups_from(:last, quantity)
end