Class: Daru::Core::GroupBy

Inherits:
Object show all
Defined in:
lib/daru/core/group_by.rb

Constant Summary collapse

# Comparator used to order group-key tuples.
#
# When both operands are present they are compared element-wise after their
# nil entries are removed. Otherwise the result is 1 if only the left operand
# is present and -1 if the left operand is missing.
TUPLE_SORTER =
lambda do |left, right|
  next left.compact <=> right.compact if left && right

  left ? 1 : -1
end

Instance Attribute Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(context, names) ⇒ GroupBy

Returns a new instance of GroupBy.



22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
# File 'lib/daru/core/group_by.rb', line 22

# Builds the group table for +context+, keyed by the values found in the
# vectors listed in +names+.
#
# @param context the object being grouped; it must respond to +vectors+ and
#   +[]+ (presumably a Daru::DataFrame - confirm against callers)
# @param names [Array] names of the vectors to group on
def initialize(context, names)
  @groups = {}
  @context = context
  @non_group_vectors = context.vectors.to_a - names

  # One array of values per grouping vector, zipped row-wise so that every
  # row is represented by the tuple of its grouping values.
  key_columns = names.map { |name| context[name].to_a }
  lead_column, *other_columns = key_columns
  row_keys = lead_column.zip(*other_columns)

  # FIXME: It feels like we don't want to sort here. Ruby's #group_by
  # never sorts:
  #
  #   ['test', 'me', 'please'].group_by(&:size)
  #   #  => {4=>["test"], 2=>["me"], 6=>["please"]}
  #
  # - zverok, 2016-09-12
  row_keys.uniq.sort(&TUPLE_SORTER).each do |key|
    @groups[key] = all_indices_for(row_keys, key)
  end

  # The table is read-only once construction is finished.
  @groups.freeze
end

Instance Attribute Details

#groups ⇒ Object (readonly)

Returns the value of attribute groups.



4
5
6
# File 'lib/daru/core/group_by.rb', line 4

# Returns the group table built by the constructor: a frozen Hash whose keys
# are the unique grouping tuples in sorted order.
# NOTE(review): the values come from all_indices_for, which is not visible
# here - presumably the row positions belonging to each key; confirm.
def groups
  @groups
end

Instance Method Details

#count ⇒ Object

Count groups, excludes missing values.

Examples:

Using count

df = Daru::DataFrame.new({
  a: %w{foo bar foo bar   foo bar foo foo},
  b: %w{one one two three two two one three},
  c:   [1  ,2  ,3  ,1    ,3  ,6  ,3  ,8],
  d:   [11 ,22 ,33 ,44   ,55 ,66 ,77 ,88]
})
df.group_by([:a, :b]).count
# =>
# #<Daru::DataFrame:76900210 @name = 7b9cf55d-17f8-48c7-b03a-2586c6e5ec5a @size = 6>
#                           c          d
# ["bar", "one"]            1          1
# ["bar", "three"]          1          1
# ["bar", "two"]            1          1
# ["foo", "one"]            2          2
# ["foo", "three"]          1          1
# ["foo", "two"]            2          2


161
162
163
164
# File 'lib/daru/core/group_by.rb', line 161

# Builds a DataFrame with one column per non-grouping vector, every column
# being the same per-group size vector returned by #size.
# NOTE(review): the numbers come straight from #size; no filtering of missing
# values is visible in this method.
def count
  group_sizes = size
  # The same size vector object is repeated once per non-grouping vector.
  columns = Array.new(@non_group_vectors.size, group_sizes)
  Daru::DataFrame.new(columns, order: @non_group_vectors)
end

#each_group ⇒ Object

Iterate over each group created by group_by. Each group is yielded to the block as a DataFrame.



8
9
10
11
12
# File 'lib/daru/core/group_by.rb', line 8

# Yields the result of #get_group for every group key, in the order the keys
# are stored in #groups. Returns the array of group keys.
def each_group
  group_keys = groups.keys
  group_keys.each { |key| yield get_group(key) }
end

#first ⇒ Object

Get the first row of each group.



57
58
59
# File 'lib/daru/core/group_by.rb', line 57

# Shorthand for #head with a quantity of 1.
def first
  head 1
end

#get_group(group) ⇒ Object

Returns one of the selected groups as a DataFrame.

Examples:

Getting a group


df = Daru::DataFrame.new({
      a: %w{foo bar foo bar   foo bar foo foo},
      b: %w{one one two three two two one three},
      c:   [1  ,2  ,3  ,1    ,3  ,6  ,3  ,8],
      d:   [11 ,22 ,33 ,44   ,55 ,66 ,77 ,88]
    })
df.group_by([:a, :b]).get_group ['bar','two']
#=>
##<Daru::DataFrame:83258980 @name = 687ee3f6-8874-4899-97fa-9b31d84fa1d5 @size = 1>
#                    a          b          c          d
#         5        bar        two          6         66

Parameters:

  • group (Array)

    The group that is to be selected from those grouped.



198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
# File 'lib/daru/core/group_by.rb', line 198

# Returns the rows that belong to one group as a DataFrame.
#
# @param group [Array] the key tuple of the group to select
# @return the value of Daru::DataFrame.rows built from the group's rows, with
#   the vectors in the same order as the grouped context
# @raise [KeyError] if +group+ is not one of the grouped keys
def get_group group
  # fetch makes an unknown key fail immediately with a descriptive error
  # instead of surfacing later as a NoMethodError on nil.
  indexes = @groups.fetch(group)
  columns = @context.each_vector.map(&:to_a)

  # Build only the requested rows; transposing the whole frame on every call
  # is wasted work when a group covers a handful of positions.
  rows = indexes.map { |idx| columns.map { |column| column[idx] } }

  new_index =
    begin
      @context.index[indexes]
    rescue IndexError
      # Fall back to the raw positions when the index lookup fails.
      indexes
    end

  Daru::DataFrame.rows(
    rows, index: new_index, order: @context.vectors
  )
end

#head(quantity = 5) ⇒ Object

Get the first ‘n’ rows of each group.

Examples:

Usage of head

df = Daru::DataFrame.new({
  a: %w{foo bar foo bar   foo bar foo foo},
  b: %w{one one two three two two one three},
  c:   [1  ,2  ,3  ,1    ,3  ,6  ,3  ,8],
  d:   [11 ,22 ,33 ,44   ,55 ,66 ,77 ,88]
})
df.group_by([:a, :b]).head(1)
# =>
# #<Daru::DataFrame:82745170 @name = d7003f75-5eb9-4967-9303-c08dd9160224 @size = 6>
#                     a          b          c          d
#          1        bar        one          2         22
#          3        bar      three          1         44
#          5        bar        two          6         66
#          0        foo        one          1         11
#          7        foo      three          8         88
#          2        foo        two          3         33

Parameters:

  • quantity (Fixnum) (defaults to: 5)

    (5) The number of rows to take from each group.



85
86
87
# File 'lib/daru/core/group_by.rb', line 85

# Delegates to select_groups_from with the :first selector.
#
# @param quantity [Fixnum] passed through unchanged (defaults to 5)
def head(quantity = 5)
  select_groups_from(:first, quantity)
end

#last ⇒ Object

Get the last row of each group.



62
63
64
# File 'lib/daru/core/group_by.rb', line 62

# Shorthand for #tail with a quantity of 1.
def last
  tail 1
end

#max ⇒ Object

Find the max element of each numeric vector group.



173
174
175
# File 'lib/daru/core/group_by.rb', line 173

# Delegates to apply_method with the :numeric selector and the :max operation.
# NOTE(review): apply_method is defined outside this view; presumably it runs
# the operation on each numeric vector of every group - confirm.
def max
  apply_method(:numeric, :max)
end

#mean ⇒ Object

Calculate mean of numeric groups, excluding missing values.

Examples:

Usage of mean

df = Daru::DataFrame.new({
  a: %w{foo bar foo bar   foo bar foo foo},
  b: %w{one one two three two two one three},
  c:   [1  ,2  ,3  ,1    ,3  ,6  ,3  ,8],
  d:   [11 ,22 ,33 ,44   ,55 ,66 ,77 ,88]
})
df.group_by([:a, :b]).mean
# =>
# #<Daru::DataFrame:81097450 @name = 0c32983f-3e06-451f-a9c9-051cadfe7371 @size = 6>
#                         c          d
# ["bar", "one"]          2         22
# ["bar", "three"]        1         44
# ["bar", "two"]          6         66
# ["foo", "one"]        2.0       44.0
# ["foo", "three"]        8         88
# ["foo", "two"]        3.0       44.0


129
130
131
# File 'lib/daru/core/group_by.rb', line 129

# Delegates to apply_method with the :numeric selector and the :mean operation.
# NOTE(review): apply_method is defined outside this view; presumably it runs
# the operation on each numeric vector of every group - confirm.
def mean
  apply_method(:numeric, :mean)
end

#median ⇒ Object

Calculate the median of numeric groups, excluding missing values.



134
135
136
# File 'lib/daru/core/group_by.rb', line 134

# Delegates to apply_method with the :numeric selector and the :median
# operation.
# NOTE(review): apply_method is defined outside this view; presumably it runs
# the operation on each numeric vector of every group - confirm.
def median
  apply_method(:numeric, :median)
end

#min ⇒ Object

Find the min element of each numeric vector group.



178
179
180
# File 'lib/daru/core/group_by.rb', line 178

# Delegates to apply_method with the :numeric selector and the :min operation.
# NOTE(review): apply_method is defined outside this view; presumably it runs
# the operation on each numeric vector of every group - confirm.
def min
  apply_method(:numeric, :min)
end

#reduce(init = nil) ⇒ Object

Iteratively applies a function to the values in a group and accumulates the result.

Examples:

Usage of reduce

df = Daru::DataFrame.new({
  a: ['a','b'] * 3,
  b: [1,2,3] * 2,
  c: 'A'..'F'
})
df.group_by([:a]).reduce('') { |result, row| result += row[:c]; result }
# =>
# #<Daru::Vector:70343147159900 @name = nil @size = 2 >
#     nil
#   a ACE
#   b BDF

Parameters:

  • init (nil) (defaults to: nil)

    The initial value of the accumulator.

  • block (Proc)

    A proc or lambda that accepts two arguments. The first argument is the accumulated result. The second argument is a DataFrame row.



232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
# File 'lib/daru/core/group_by.rb', line 232

# Folds the rows of every group into a single value per group.
#
# The block is called once per row with the accumulated value and the row
# looked up through @context.row; its return value becomes the accumulator
# for the next row of the same group.
#
# Every group starts from the very same +init+ object, so the block should
# return a new value rather than mutate the accumulator in place.
#
# @param init the starting accumulator for each group (defaults to nil)
# @yield [accumulator, row] the accumulated value and the current row
# @return [Daru::Vector] one reduced value per group, indexed by group key
def reduce(init=nil)
  # Materialise the index labels once; converting the index inside the row
  # loop repeats the same conversion for every single row.
  index_labels = @context.index.to_a

  result_hash = @groups.each_with_object({}) do |(group, indices), h|
    grouped_result = init
    indices.each do |position|
      grouped_result = yield(grouped_result, @context.row[index_labels[position]])
    end

    h[group] = grouped_result
  end

  index =
    if multi_indexed_grouping?
      Daru::MultiIndex.from_tuples result_hash.keys
    else
      # Single-vector keys are one-element tuples; flatten them to plain labels.
      Daru::Index.new result_hash.keys.flatten
    end

  Daru::Vector.new(result_hash.values, index: index)
end

#size ⇒ Object

Get a Daru::Vector of the size of each group.



44
45
46
47
48
49
50
51
52
53
54
# File 'lib/daru/core/group_by.rb', line 44

# Builds a Daru::Vector named :size that holds, for every group key, the
# number of entries stored for that key in the group table.
#
# The vector is indexed by a MultiIndex made from the key tuples when
# multi_indexed_grouping? is true, and by a plain Index of the flattened keys
# otherwise.
def size
  group_keys = @groups.keys
  index = if multi_indexed_grouping?
            Daru::MultiIndex.from_tuples(group_keys)
          else
            Daru::Index.new(group_keys.flatten)
          end

  member_counts = @groups.values.map(&:size)
  Daru::Vector.new(member_counts, index: index, name: :size)
end

#std ⇒ Object

Calculate sample standard deviation of numeric vector groups, excluding missing values.



168
169
170
# File 'lib/daru/core/group_by.rb', line 168

# Delegates to apply_method with the :numeric selector and the :std operation.
# NOTE(review): apply_method is defined outside this view; presumably it runs
# the operation on each numeric vector of every group - confirm.
def std
  apply_method(:numeric, :std)
end

#sum ⇒ Object

Calculate sum of numeric groups, excluding missing values.



139
140
141
# File 'lib/daru/core/group_by.rb', line 139

# Delegates to apply_method with the :numeric selector and the :sum operation.
# NOTE(review): apply_method is defined outside this view; presumably it runs
# the operation on each numeric vector of every group - confirm.
def sum
  apply_method(:numeric, :sum)
end

#tail(quantity = 5) ⇒ Object

Get the last ‘n’ rows of each group.

Examples:

Usage of tail

df = Daru::DataFrame.new({
  a: %w{foo bar foo bar   foo bar foo foo},
  b: %w{one one two three two two one three},
  c:   [1  ,2  ,3  ,1    ,3  ,6  ,3  ,8],
  d:   [11 ,22 ,33 ,44   ,55 ,66 ,77 ,88]
})
df.group_by([:a, :b]).tail(1)
# =>
# #<Daru::DataFrame:82378270 @name = 0623db46-5425-41bd-a843-99baac3d1d9a @size = 6>
#                     a          b          c          d
#          1        bar        one          2         22
#          3        bar      three          1         44
#          5        bar        two          6         66
#          6        foo        one          3         77
#          7        foo      three          8         88
#          4        foo        two          3         55

Parameters:

  • quantity (Fixnum) (defaults to: 5)

    (5) The number of rows to take from each group.



108
109
110
# File 'lib/daru/core/group_by.rb', line 108

# Delegates to select_groups_from with the :last selector.
#
# @param quantity [Fixnum] passed through unchanged (defaults to 5)
def tail(quantity = 5)
  select_groups_from(:last, quantity)
end