Class: Daru::Core::GroupBy
Constant Summary collapse
# Comparator for ordering group keys (arrays of grouping values).
#
# Returns -1, 0 or 1:
# * a missing (nil/false) +right+ sorts after anything: returns -1;
# * otherwise a missing +left+ returns 1;
# * nils are dropped from both tuples before comparing;
# * tuples of equal (compacted) length are compared with Array#<=>,
#   falling back to 0 when the elements are not comparable;
# * tuples of different length are ordered by length, shorter first.
TUPLE_SORTER = lambda do |left, right|
  return -1 unless right
  return 1 unless left

  left = left.compact
  right = right.compact

  if left.length == right.length
    # Array#<=> yields nil for incomparable elements; treat those as equal.
    (left <=> right) || 0
  else
    left.length <=> right.length
  end
end
Class Method Summary collapse
- .df_from_group_map(df, group_map, remaining_vectors, from_position: true) ⇒ Object
- .get_positions_group_for_aggregation(multi_index, level = -1) ⇒ Object
- .get_positions_group_map_for_df(df, group_by_keys, sort: true) ⇒ Object
- .get_positions_group_map_on(indexes_with_positions, sort: false) ⇒ Object
- .group_map_from_positions_to_indexes(positions_group_map, index) ⇒ Object
Instance Method Summary collapse
-
#aggregate(options = {}) ⇒ Daru::DataFrame
Function to use for aggregating the data.
-
#count ⇒ Object
Count groups, excluding missing values.
-
#df ⇒ Object
(also: #grouped_df)
lazy accessor/attr_reader for the attribute df.
-
#each_group ⇒ Object
Iterate over each group created by group_by.
-
#first ⇒ Object
Get the first group.
-
#get_group(group) ⇒ Object
Returns one of the selected groups as a DataFrame.
-
#groups ⇒ Object
(also: #groups_by_idx)
lazy accessor/attr_reader for the attribute groups.
-
#head(quantity = 5) ⇒ Object
Get the top ‘n’ groups.
-
#initialize(context, names) ⇒ GroupBy
constructor
A new instance of GroupBy.
- #inspect ⇒ Object
-
#last ⇒ Object
Get the last group.
-
#max ⇒ Object
Find the max element of each numeric vector group.
-
#mean ⇒ Object
Calculate mean of numeric groups, excluding missing values.
-
#median ⇒ Object
Calculate the median of numeric groups, excluding missing values.
-
#min ⇒ Object
Find the min element of each numeric vector group.
-
#reduce(init = nil) {|block| ... } ⇒ Object
Iteratively applies a function to the values in a group and accumulates the result.
-
#size ⇒ Object
Get a Daru::Vector of the size of each group.
-
#std ⇒ Object
Calculate sample standard deviation of numeric vector groups, excluding missing values.
-
#sum ⇒ Object
Calculate sum of numeric groups, excluding missing values.
-
#tail(quantity = 5) ⇒ Object
Get the bottom ‘n’ groups.
Constructor Details
#initialize(context, names) ⇒ GroupBy
Returns a new instance of GroupBy.
92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 |
# File 'lib/daru/core/group_by.rb', line 92

# Builds the grouping state for +context+ grouped by the vectors +names+.
#
# Stores the grouping vector names, the remaining (non-group) vector names,
# the source object itself, and a map of group key => row positions
# computed by GroupBy.get_positions_group_map_for_df with sorting enabled.
#
# @param context the object being grouped; it must respond to +vectors+
#   (presumably a Daru::DataFrame — confirm against callers)
# @param names [Array] names of the vectors to group by
def initialize(context, names)
  @group_vectors     = names
  @non_group_vectors = context.vectors.to_a - names

  @context = context # TODO: maybe rename in @original_df or @grouped_db

  # FIXME: It feels like we don't want to sort here. Ruby's #group_by
  # never sorts:
  #
  #   ['test', 'me', 'please'].group_by(&:size)
  #   # => {4=>["test"], 2=>["me"], 6=>["please"]}
  #
  # - zverok, 2016-09-12
  @groups_by_pos = GroupBy.get_positions_group_map_for_df(@context, @group_vectors, sort: true)
end
Class Method Details
.df_from_group_map(df, group_map, remaining_vectors, from_position: true) ⇒ Object
44 45 46 47 48 49 50 51 52 53 54 55 56 57 |
# File 'lib/daru/core/group_by.rb', line 44

# Builds a DataFrame of +remaining_vectors+ whose rows follow the order of
# +group_map+ and whose index is a Daru::MultiIndex made of each group key
# extended with every value listed for that group.
#
# @param df the source frame
# @param group_map [Hash] group key (Array) => Array of row references
# @param remaining_vectors [Array] vectors to keep in the result
# @param from_position [Boolean] forwarded to +get_sub_dataframe+ as
#   +by_position+
# @return [Daru::DataFrame, nil] nil when +group_map+ is empty; an empty
#   frame carrying only the new index when +remaining_vectors+ is empty
def df_from_group_map(df, group_map, remaining_vectors, from_position: true)
  return nil if group_map == {}

  tuples = group_map.flat_map do |group, values|
    values.map { |val| group + [val] }
  end
  grouped_index = Daru::MultiIndex.from_tuples(tuples)

  return Daru::DataFrame.new({}, index: grouped_index) if remaining_vectors == []

  row_order = group_map.values.flatten
  result = df[*remaining_vectors].to_df.get_sub_dataframe(row_order, by_position: from_position)
  result.index = grouped_index
  result
end
.get_positions_group_for_aggregation(multi_index, level = -1) ⇒ Object
22 23 24 25 26 27 28 29 |
# File 'lib/daru/core/group_by.rb', line 22

# Groups the positions of +multi_index+ entries after dropping one layer.
#
# Works on a copy of +multi_index+, removes the layer at +level+ from it and
# passes the remaining (entry, position) pairs to
# +get_positions_group_map_on+.
#
# @param multi_index [Daru::MultiIndex]
# @param level [Integer] layer to remove (defaults to -1)
# @return whatever +get_positions_group_map_on+ returns for the reduced index
# @raise [RuntimeError] when +multi_index+ is not a Daru::MultiIndex
def get_positions_group_for_aggregation(multi_index, level=-1)
  unless multi_index.is_a?(Daru::MultiIndex)
    raise "expected a Daru::MultiIndex, got #{multi_index.class}"
  end

  new_index = multi_index.dup
  new_index.remove_layer(level) # TODO: recheck code of Daru::MultiIndex#remove_layer

  get_positions_group_map_on(new_index.each_with_index)
end
.get_positions_group_map_for_df(df, group_by_keys, sort: true) ⇒ Object
32 33 34 35 36 |
# File 'lib/daru/core/group_by.rb', line 32

# Computes the group key => positions map for +df+ grouped by
# +group_by_keys+.
#
# Each row of the selected key vectors is converted to an Array and paired
# with its position; the pairs are handed to +get_positions_group_map_on+.
#
# @param df the frame to group
# @param group_by_keys [Array] vector names used as grouping keys
# @param sort [Boolean] forwarded to +get_positions_group_map_on+
def get_positions_group_map_for_df(df, group_by_keys, sort: true)
  key_rows = df[*group_by_keys].to_df.each_row.map(&:to_a)

  get_positions_group_map_on(key_rows.each_with_index, sort: sort)
end
.get_positions_group_map_on(indexes_with_positions, sort: false) ⇒ Object
6 7 8 9 10 11 12 13 14 15 16 17 18 19 |
# File 'lib/daru/core/group_by.rb', line 6

# Groups positions by key.
#
# @param indexes_with_positions [#each] yields (key, position) pairs
# @param sort [Boolean] when true, the keys of the result are ordered with
#   Daru::Core::GroupBy::TUPLE_SORTER; otherwise keys keep first-seen order
# @return [Hash] key => Array of positions, in the order they were yielded
def get_positions_group_map_on(indexes_with_positions, sort: false)
  group_map = indexes_with_positions.each_with_object({}) do |(idx, position), map|
    (map[idx] ||= []) << position
  end
  return group_map unless sort

  # TODO: maybe add a more "stable" sorting option?
  sorted_keys = group_map.keys.sort(&Daru::Core::GroupBy::TUPLE_SORTER)
  sorted_keys.each_with_object({}) { |key, sorted| sorted[key] = group_map[key] }
end
.group_map_from_positions_to_indexes(positions_group_map, index) ⇒ Object
39 40 41 |
# File 'lib/daru/core/group_by.rb', line 39

# Translates a group key => positions map into a group key => index-entries
# map by looking every position up with +index.at+.
#
# @param positions_group_map [Hash] group key => Array of positions
# @param index [#at] object resolving a position to an index entry
# @return [Hash] same keys, in the same order, with positions replaced by
#   the result of +index.at(position)+
def group_map_from_positions_to_indexes(positions_group_map, index)
  # Fill the result hash directly rather than building an array of pairs
  # only to convert it with #to_h.
  positions_group_map.each_with_object({}) do |(group, positions), by_index|
    by_index[group] = positions.map { |pos| index.at(pos) }
  end
end
Instance Method Details
#aggregate(options = {}) ⇒ Daru::DataFrame
Function to use for aggregating the data. `group_by` uses Daru::DataFrame#aggregate.
342 343 344 345 346 |
# File 'lib/daru/core/group_by.rb', line 342

# Aggregates the grouped data by delegating to +@context.aggregate+.
#
# The block handed to +@context.aggregate+ returns a two-element Array: the
# per-group position lists and the grouped index from +get_grouped_index+.
#
# @param options [Hash] forwarded unchanged to +@context.aggregate+
# @return [Daru::DataFrame] the value returned by +@context.aggregate+
def aggregate(options={})
  new_index = get_grouped_index

  @context.aggregate(options) { [@groups_by_pos.values, new_index] }
end
#count ⇒ Object
Count groups, excluding missing values.
221 222 223 224 |
# File 'lib/daru/core/group_by.rb', line 221

# Builds a DataFrame with one column per non-group vector, every column
# being the vector returned by +size+.
#
# NOTE(review): the summary says missing values are excluded, but this
# method reuses the plain group sizes for every column — confirm intent.
def count
  column_count = @non_group_vectors.size

  Daru::DataFrame.new(Array.new(column_count, size), order: @non_group_vectors)
end
#df ⇒ Object Also known as: grouped_df
lazy accessor/attr_reader for the attribute df
67 68 69 |
# File 'lib/daru/core/group_by.rb', line 67

# Lazy reader for the grouped frame: built on first use by
# GroupBy.df_from_group_map from the source object, the positions map and
# the non-group vectors, then cached in +@df+.
def df
  return @df if @df

  @df = GroupBy.df_from_group_map(@context, @groups_by_pos, @non_group_vectors)
end
#each_group ⇒ Object
Iterate over each group created by group_by. A DataFrame is yielded in block.
74 75 76 77 78 79 80 |
# File 'lib/daru/core/group_by.rb', line 74

# Yields the result of +get_group+ for every key of +groups+, in key order.
# Without a block, returns an Enumerator over the same sequence.
def each_group
  if block_given?
    groups.keys.each { |group_key| yield get_group(group_key) }
  else
    to_enum(:each_group)
  end
end
#first ⇒ Object
Get the first group
117 118 119 |
# File 'lib/daru/core/group_by.rb', line 117

# Get the first group; shorthand for +head(1)+.
def first
  head(1)
end
#get_group(group) ⇒ Object
Returns one of the selected groups as a DataFrame.
258 259 260 261 262 263 264 265 266 267 |
# File 'lib/daru/core/group_by.rb', line 258

# Returns one of the selected groups as a DataFrame.
#
# Rows are picked by their position in the source object (+@groups_by_pos+),
# while the resulting frame is indexed with the matching index entries from
# +groups_by_idx+. Index entries are not used as row offsets, since they
# only coincide with positions for a default 0..n-1 index.
#
# @param group the group key, as found in the keys of +groups+
# @return [Daru::DataFrame] the rows of that group, with all vectors of the
#   source object in their original order
def get_group(group)
  positions = @groups_by_pos[group]
  indexes   = groups_by_idx[group]
  columns   = @context.each_vector.map(&:to_a)
  all_rows  = columns.transpose
  rows      = positions.map { |pos| all_rows[pos] }

  Daru::DataFrame.rows(
    rows, index: indexes, order: @context.vectors
  )
end
#groups ⇒ Object Also known as: groups_by_idx
lazy accessor/attr_reader for the attribute groups
61 62 63 |
# File 'lib/daru/core/group_by.rb', line 61

# Lazy reader for the group key => index-entries map: derived on first use
# from the positions map and the source object's index via
# GroupBy.group_map_from_positions_to_indexes, then cached in +@groups+.
def groups
  return @groups if @groups

  @groups = GroupBy.group_map_from_positions_to_indexes(@groups_by_pos, @context.index)
end
#head(quantity = 5) ⇒ Object
Get the top ‘n’ groups
145 146 147 |
# File 'lib/daru/core/group_by.rb', line 145

# Get the top 'n' groups; delegates to +select_groups_from+ with +:first+.
#
# @param quantity [Integer] passed through to +select_groups_from+
#   (defaults to 5)
def head(quantity=5)
  select_groups_from(:first, quantity)
end
#inspect ⇒ Object
302 303 304 |
# File 'lib/daru/core/group_by.rb', line 302

# Returns the +inspect+ output of the grouped frame (+grouped_df+).
def inspect
  grouped = grouped_df
  grouped.inspect
end
#last ⇒ Object
Get the last group
122 123 124 |
# File 'lib/daru/core/group_by.rb', line 122

# Get the last group; shorthand for +tail(1)+.
def last
  tail(1)
end
#max ⇒ Object
Find the max element of each numeric vector group.
233 234 235 |
# File 'lib/daru/core/group_by.rb', line 233

# Find the max element of each numeric vector group.
# Delegates to +apply_method+ with +:numeric+ and +:max+.
def max
  apply_method(:numeric, :max)
end
#mean ⇒ Object
Calculate mean of numeric groups, excluding missing values.
189 190 191 |
# File 'lib/daru/core/group_by.rb', line 189

# Calculate mean of numeric groups, excluding missing values.
# Delegates to +apply_method+ with +:numeric+ and +:mean+.
def mean
  apply_method(:numeric, :mean)
end
#median ⇒ Object
Calculate the median of numeric groups, excluding missing values.
194 195 196 |
# File 'lib/daru/core/group_by.rb', line 194

# Calculate the median of numeric groups, excluding missing values.
# Delegates to +apply_method+ with +:numeric+ and +:median+.
def median
  apply_method(:numeric, :median)
end
#min ⇒ Object
Find the min element of each numeric vector group.
238 239 240 |
# File 'lib/daru/core/group_by.rb', line 238

# Find the min element of each numeric vector group.
# Delegates to +apply_method+ with +:numeric+ and +:min+.
def min
  apply_method(:numeric, :min)
end
#reduce(init = nil) {|block| ... } ⇒ Object
Iteratively applies a function to the values in a group and accumulates the result.
285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 |
# File 'lib/daru/core/group_by.rb', line 285

# Iteratively applies a function to the rows of each group and accumulates
# the result.
#
# For every group, the block is called once per row with the running
# accumulator and the row (looked up through +@context.row+ by the index
# entry found at the row's position); the block's return value becomes the
# next accumulator.
#
# Rows are walked by position (+@groups_by_pos+) and each position is
# resolved to its index entry once, so the lookup is also correct when the
# index is not the default 0..n-1 range.
#
# @param init initial accumulator for every group (defaults to nil). The
#   same object is handed to each group, so mutating it in the block leaks
#   state between groups.
# @yield [accumulator, row]
# @return [Daru::Vector] one accumulated value per group, indexed by
#   +get_grouped_index+ of the group keys
def reduce(init=nil)
  # Resolve the index once instead of once per visited row.
  index_labels = @context.index.to_a

  result_hash = @groups_by_pos.each_with_object({}) do |(group, positions), h|
    h[group] = positions.reduce(init) do |acc, pos|
      yield(acc, @context.row[index_labels[pos]])
    end
  end

  index = get_grouped_index(result_hash.keys)
  Daru::Vector.new(result_hash.values, index: index)
end
#size ⇒ Object
Get a Daru::Vector of the size of each group.
109 110 111 112 113 114 |
# File 'lib/daru/core/group_by.rb', line 109

# Get a Daru::Vector of the size of each group.
#
# The values are the lengths of the per-group position lists; the vector is
# named +:size+ and indexed by +get_grouped_index+.
def size
  grouped_index = get_grouped_index
  group_sizes = @groups_by_pos.values.map { |positions| positions.size }

  Daru::Vector.new(group_sizes, index: grouped_index, name: :size)
end
#std ⇒ Object
Calculate sample standard deviation of numeric vector groups, excluding missing values.
228 229 230 |
# File 'lib/daru/core/group_by.rb', line 228

# Calculate sample standard deviation of numeric vector groups, excluding
# missing values. Delegates to +apply_method+ with +:numeric+ and +:std+.
def std
  apply_method(:numeric, :std)
end
#sum ⇒ Object
Calculate sum of numeric groups, excluding missing values.
199 200 201 |
# File 'lib/daru/core/group_by.rb', line 199

# Calculate sum of numeric groups, excluding missing values.
# Delegates to +apply_method+ with +:numeric+ and +:sum+.
def sum
  apply_method(:numeric, :sum)
end
#tail(quantity = 5) ⇒ Object
Get the bottom ‘n’ groups
168 169 170 |
# File 'lib/daru/core/group_by.rb', line 168

# Get the bottom 'n' groups; delegates to +select_groups_from+ with +:last+.
#
# @param quantity [Integer] passed through to +select_groups_from+
#   (defaults to 5)
def tail(quantity=5)
  select_groups_from(:last, quantity)
end