Class: Daru::Core::GroupBy

Inherits:
Object show all
Defined in:
lib/daru/core/group_by.rb

Constant Summary collapse

# Comparator used to order group-key tuples.
#
# Two present tuples are compared element-wise after their nil entries are
# dropped. Array#<=> yields nil when elements cannot be compared with each
# other (e.g. an Integer against a String); a nil result would make
# Array#sort raise ArgumentError, so such tuples are treated as equal.
#
# A missing (nil) tuple orders before a present one, and two missing tuples
# compare as equal.
#
# Returns -1, 0 or 1.
TUPLE_SORTER =
lambda do |a, b|
  if a && b
    (a.compact <=> b.compact) || 0
  elsif a
    1
  elsif b
    -1
  else
    0
  end
end

Instance Attribute Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(context, names) ⇒ GroupBy

Returns a new instance of GroupBy.



22
23
24
25
26
27
28
29
30
31
32
33
34
# File 'lib/daru/core/group_by.rb', line 22

# Builds the grouping of +context+ by the vectors listed in +names+.
#
# Every row contributes one tuple made of its values in the +names+ vectors.
# The distinct tuples, ordered with TUPLE_SORTER, become the keys of
# +@groups+; each key maps to whatever +all_indices_for+ returns for that
# tuple. The resulting hash is frozen.
#
# @param context [Object] source data; must respond to +vectors+ and +[]+
#   (presumably a Daru::DataFrame - confirm against callers)
# @param names [Array] names of the vectors to group by
def initialize(context, names)
  @context = context
  @non_group_vectors = context.vectors.to_a - names
  @groups = {}

  # One array of values per grouping vector, zipped into per-row tuples.
  lead_column, *other_columns = names.map { |name| context[name].to_a }
  tuples = lead_column.zip(*other_columns)

  tuples.uniq.sort(&TUPLE_SORTER).each do |tuple|
    @groups[tuple] = all_indices_for(tuples, tuple)
  end

  @groups.freeze
end

Instance Attribute Details

#groups ⇒ Object (readonly)

Returns the value of attribute groups.



4
5
6
# File 'lib/daru/core/group_by.rb', line 4

# Reader for the grouping built by the constructor.
#
# @return [Hash] the frozen hash whose keys are the group-key tuples and
#   whose values are the row positions recorded for each tuple
def groups
  @groups
end

Instance Method Details

#count ⇒ Object

Count groups, excludes missing values.

Examples:

Using count

df = Daru::DataFrame.new({
  a: %w{foo bar foo bar   foo bar foo foo},
  b: %w{one one two three two two one three},
  c:   [1  ,2  ,3  ,1    ,3  ,6  ,3  ,8],
  d:   [11 ,22 ,33 ,44   ,55 ,66 ,77 ,88]
})
df.group_by([:a, :b]).count
# =>
# #<Daru::DataFrame:76900210 @name = 7b9cf55d-17f8-48c7-b03a-2586c6e5ec5a @size = 6>
#                           c          d
# ["bar", "one"]            1          1
# ["bar", "three"]          1          1
# ["bar", "two"]            1          1
# ["foo", "one"]            2          2
# ["foo", "three"]          1          1
# ["foo", "two"]            2          2


154
155
156
157
# File 'lib/daru/core/group_by.rb', line 154

# Builds a DataFrame with one column per non-grouping vector, where every
# column is the same per-group size vector returned by +size+.
#
# NOTE(review): the values are plain group sizes; nothing here filters out
# missing values - confirm whether that is intended.
#
# @return [Daru::DataFrame] group sizes repeated under each non-group vector
def count
  # The very same size vector is reused for every column.
  columns = Array.new(@non_group_vectors.size, size)
  Daru::DataFrame.new(columns, order: @non_group_vectors)
end

#each_group ⇒ Object

Iterate over each group created by group_by. Each group is yielded to the block as a DataFrame.



8
9
10
11
12
# File 'lib/daru/core/group_by.rb', line 8

# Yields every group to the block, one at a time, in the order of the keys
# of +groups+. Each yielded value is the result of +get_group+ for that key.
#
# @yield [group] the value returned by +get_group+ for the current key
# @return [Array] the list of group keys that was iterated
def each_group
  group_keys = groups.keys
  group_keys.each { |key| yield get_group(key) }
end

#first ⇒ Object

Get the first row of each group (equivalent to head(1)).



50
51
52
# File 'lib/daru/core/group_by.rb', line 50

# Shorthand for +head(1)+.
#
# @return [Object] whatever +head(1)+ returns
def first
  head(1)
end

#get_group(group) ⇒ Object

Returns one of the selected groups as a DataFrame.

Examples:

Getting a group


df = Daru::DataFrame.new({
      a: %w{foo bar foo bar   foo bar foo foo},
      b: %w{one one two three two two one three},
      c:   [1  ,2  ,3  ,1    ,3  ,6  ,3  ,8],
      d:   [11 ,22 ,33 ,44   ,55 ,66 ,77 ,88]
    })
df.group_by([:a, :b]).get_group ['bar','two']
#=>
##<Daru::DataFrame:83258980 @name = 687ee3f6-8874-4899-97fa-9b31d84fa1d5 @size = 1>
#                    a          b          c          d
#         5        bar        two          6         66

Parameters:

  • group (Array)

    The group that is to be selected from those grouped.



191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
# File 'lib/daru/core/group_by.rb', line 191

# Builds a DataFrame holding only the rows that belong to +group+.
#
# The row positions stored under +group+ in +@groups+ are used to pick rows
# out of the context's data. The new frame keeps the context's vector order.
# Its index is taken from the context's index at those positions; when that
# lookup raises IndexError the raw positions are used as the index instead.
#
# @param group [Array] key tuple of the group to extract
# @return [Daru::DataFrame] the rows of the requested group
def get_group(group)
  positions = @groups[group]

  # Column arrays flipped into row arrays so rows can be picked by position.
  row_table = @context.each_vector.map(&:to_a).transpose
  selected  = positions.map { |pos| row_table[pos] }

  labels = begin
    @context.index[positions]
  rescue IndexError
    positions
  end

  Daru::DataFrame.rows(selected, index: labels, order: @context.vectors)
end

#head(quantity = 5) ⇒ Object

Get the first ‘n’ rows of each group

Examples:

Usage of head

df = Daru::DataFrame.new({
  a: %w{foo bar foo bar   foo bar foo foo},
  b: %w{one one two three two two one three},
  c:   [1  ,2  ,3  ,1    ,3  ,6  ,3  ,8],
  d:   [11 ,22 ,33 ,44   ,55 ,66 ,77 ,88]
})
df.group_by([:a, :b]).head(1)
# =>
# #<Daru::DataFrame:82745170 @name = d7003f75-5eb9-4967-9303-c08dd9160224 @size = 6>
#                     a          b          c          d
#          1        bar        one          2         22
#          3        bar      three          1         44
#          5        bar        two          6         66
#          0        foo        one          1         11
#          7        foo      three          8         88
#          2        foo        two          3         33

Parameters:

  • quantity (Fixnum) (defaults to: 5)

    (5) The number of rows to take from the start of each group.



78
79
80
# File 'lib/daru/core/group_by.rb', line 78

# Delegates to +select_groups_from+ with +:first+ and the requested quantity.
#
# @param quantity [Integer] how many entries to take (defaults to 5)
# @return [Object] whatever +select_groups_from+ returns
def head(quantity = 5)
  select_groups_from(:first, quantity)
end

#last ⇒ Object

Get the last row of each group (equivalent to tail(1)).



55
56
57
# File 'lib/daru/core/group_by.rb', line 55

# Shorthand for +tail(1)+.
#
# @return [Object] whatever +tail(1)+ returns
def last
  tail(1)
end

#max ⇒ Object

Find the max element of each numeric vector group.



166
167
168
# File 'lib/daru/core/group_by.rb', line 166

# Delegates to +apply_method+ with +:numeric+ and +:max+.
# Presumably this computes the per-group maximum of each numeric vector -
# confirm against +apply_method+.
#
# @return [Object] whatever +apply_method+ returns
def max
  apply_method(:numeric, :max)
end

#mean ⇒ Object

Calculate mean of numeric groups, excluding missing values.

Examples:

Usage of mean

df = Daru::DataFrame.new({
  a: %w{foo bar foo bar   foo bar foo foo},
  b: %w{one one two three two two one three},
  c:   [1  ,2  ,3  ,1    ,3  ,6  ,3  ,8],
  d:   [11 ,22 ,33 ,44   ,55 ,66 ,77 ,88]
})
df.group_by([:a, :b]).mean
# =>
# #<Daru::DataFrame:81097450 @name = 0c32983f-3e06-451f-a9c9-051cadfe7371 @size = 6>
#                         c          d
# ["bar", "one"]          2         22
# ["bar", "three"]        1         44
# ["bar", "two"]          6         66
# ["foo", "one"]        2.0       44.0
# ["foo", "three"]        8         88
# ["foo", "two"]        3.0       44.0


122
123
124
# File 'lib/daru/core/group_by.rb', line 122

# Delegates to +apply_method+ with +:numeric+ and +:mean+.
# Presumably this computes the per-group mean of each numeric vector -
# confirm against +apply_method+.
#
# @return [Object] whatever +apply_method+ returns
def mean
  apply_method(:numeric, :mean)
end

#median ⇒ Object

Calculate the median of numeric groups, excluding missing values.



127
128
129
# File 'lib/daru/core/group_by.rb', line 127

# Delegates to +apply_method+ with +:numeric+ and +:median+.
# Presumably this computes the per-group median of each numeric vector -
# confirm against +apply_method+.
#
# @return [Object] whatever +apply_method+ returns
def median
  apply_method(:numeric, :median)
end

#min ⇒ Object

Find the min element of each numeric vector group.



171
172
173
# File 'lib/daru/core/group_by.rb', line 171

# Delegates to +apply_method+ with +:numeric+ and +:min+.
# Presumably this computes the per-group minimum of each numeric vector -
# confirm against +apply_method+.
#
# @return [Object] whatever +apply_method+ returns
def min
  apply_method(:numeric, :min)
end

#reduce(init = nil) ⇒ Object

Iteratively applies a function to the values in a group and accumulates the result.

Examples:

Usage of reduce

df = Daru::DataFrame.new({
  a: ['a','b'] * 3,
  b: [1,2,3] * 2,
  c: 'A'..'F'
})
df.group_by([:a]).reduce('') { |result, row| result += row[:c]; result }
# =>
# #<Daru::Vector:70343147159900 @name = nil @size = 2 >
#     nil
#   a ACE
#   b BDF

Parameters:

  • init (nil) (defaults to: nil)

    The initial value of the accumulator.

  • block (Proc)

    A proc or lambda that accepts two arguments. The first argument is the accumulated result. The second argument is a DataFrame row.



225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
# File 'lib/daru/core/group_by.rb', line 225

# Folds the rows of every group into a single value per group.
#
# For each group the accumulator starts at +init+ and the block is called
# once per row of that group, in the order the row positions are stored in
# +@groups+. The block's return value becomes the next accumulator.
#
# NOTE: the same +init+ object seeds every group, so a block that mutates
# the accumulator in place (e.g. with +<<+) carries state over between
# groups. Return a new object from the block instead.
#
# @param init [Object] initial accumulator value used for every group
# @yield [accumulator, row] the running result and the current row, looked
#   up through +@context.row+ by its index label
# @yieldreturn [Object] the new accumulator value
# @return [Daru::Vector] one reduced value per group, indexed by group key
#   (a MultiIndex when +multi_indexed_grouping?+ is true, otherwise a plain
#   Index built from the flattened keys)
def reduce(init=nil)
  # The position -> label table does not change while folding, so build it
  # once instead of calling index.to_a for every row of every group.
  index_labels = @context.index.to_a

  result_hash = @groups.each_with_object({}) do |(group, indices), h|
    h[group] = indices.inject(init) do |accumulated, position|
      yield(accumulated, @context.row[index_labels[position]])
    end
  end

  index =
    if multi_indexed_grouping?
      Daru::MultiIndex.from_tuples result_hash.keys
    else
      Daru::Index.new result_hash.keys.flatten
    end

  Daru::Vector.new(result_hash.values, index: index)
end

#size ⇒ Object

Get a Daru::Vector of the size of each group.



37
38
39
40
41
42
43
44
45
46
47
# File 'lib/daru/core/group_by.rb', line 37

# Reports how many row positions each group holds.
#
# The result is indexed by group key: a MultiIndex built from the key tuples
# when +multi_indexed_grouping?+ is true, otherwise a plain Index built from
# the flattened keys. The vector is named +:size+.
#
# @return [Daru::Vector] number of entries per group
def size
  group_keys = @groups.keys
  index = if multi_indexed_grouping?
            Daru::MultiIndex.from_tuples(group_keys)
          else
            Daru::Index.new(group_keys.flatten)
          end

  Daru::Vector.new(@groups.values.map(&:size), index: index, name: :size)
end

#std ⇒ Object

Calculate sample standard deviation of numeric vector groups, excluding missing values.



161
162
163
# File 'lib/daru/core/group_by.rb', line 161

# Delegates to +apply_method+ with +:numeric+ and +:std+.
# Presumably this computes the per-group standard deviation of each numeric
# vector - confirm against +apply_method+.
#
# @return [Object] whatever +apply_method+ returns
def std
  apply_method(:numeric, :std)
end

#sum ⇒ Object

Calculate sum of numeric groups, excluding missing values.



132
133
134
# File 'lib/daru/core/group_by.rb', line 132

# Delegates to +apply_method+ with +:numeric+ and +:sum+.
# Presumably this computes the per-group sum of each numeric vector -
# confirm against +apply_method+.
#
# @return [Object] whatever +apply_method+ returns
def sum
  apply_method(:numeric, :sum)
end

#tail(quantity = 5) ⇒ Object

Get the last ‘n’ rows of each group

Examples:

Usage of tail

df = Daru::DataFrame.new({
  a: %w{foo bar foo bar   foo bar foo foo},
  b: %w{one one two three two two one three},
  c:   [1  ,2  ,3  ,1    ,3  ,6  ,3  ,8],
  d:   [11 ,22 ,33 ,44   ,55 ,66 ,77 ,88]
})
df.group_by([:a, :b]).tail(1)
# =>
# #<Daru::DataFrame:82378270 @name = 0623db46-5425-41bd-a843-99baac3d1d9a @size = 6>
#                     a          b          c          d
#          1        bar        one          2         22
#          3        bar      three          1         44
#          5        bar        two          6         66
#          6        foo        one          3         77
#          7        foo      three          8         88
#          4        foo        two          3         55

Parameters:

  • quantity (Fixnum) (defaults to: 5)

    (5) The number of rows to take from the end of each group.



101
102
103
# File 'lib/daru/core/group_by.rb', line 101

# Delegates to +select_groups_from+ with +:last+ and the requested quantity.
#
# @param quantity [Integer] how many entries to take (defaults to 5)
# @return [Object] whatever +select_groups_from+ returns
def tail(quantity = 5)
  select_groups_from(:last, quantity)
end