Module: Daru::Core::MergeHelper
- Defined in:
- lib/daru/core/merge.rb
Class Method Summary collapse
- .arrayify(df) ⇒ Object
- .arrayify_with_sort_keys(size, df_hash, on) ⇒ Object
- .hashify(df) ⇒ Object
- .replace_keys_if_duplicates(hash, matcher) ⇒ Object
- .resolve_duplicates(df_hash1, df_hash2, on) ⇒ Object
- .verify_dataframes(df_hash1, df_hash2, on) ⇒ Object
Class Method Details
.arrayify(df) ⇒ Object
37 38 39 40 41 42 43 |
# File 'lib/daru/core/merge.rb', line 37

# Flattens a DataFrame into a [column_names, row_values] pair.
#
# Relies on df.to_a returning an array whose first element is an
# array of row hashes ({column => value, ...}), as Daru::DataFrame#to_a does.
#
# @param df [Daru::DataFrame] frame to convert
# @return [Array(Array, Array<Array>)] column names (from the first row's
#   keys) and one values-array per row
def arrayify(df)
  rows = df.to_a[0]
  header = rows[0].keys
  data = rows.map(&:values)
  [header, data]
end
.arrayify_with_sort_keys(size, df_hash, on) ⇒ Object
45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 |
# File 'lib/daru/core/merge.rb', line 45

# For each row index in 0...size, builds a pair:
#   [values of the join columns in +on+, values of every column].
#
# @param size [Integer] number of rows in the frame
# @param df_hash [Hash{Symbol => Array}] column name => column values
# @param on [Array<Symbol>] join-key column names (must be keys of df_hash)
# @return [Array<Array(Array, Array)>] one [key_values, row_values] pair per row
def arrayify_with_sort_keys(size, df_hash, on)
  # Converting to a hash and then to an array is more complex
  # than using df.to_a or df.map(:row). However, it's
  # substantially faster this way.
  #
  # NOTE: the original also computed
  #   idx_keys = on.map { |key| df_hash.keys.index(key) }
  # but never used it — dead code removed.
  (0...size).map do |idx|
    key_values = on.map { |col| df_hash[col][idx] }
    row_values = df_hash.map { |_col, val| val[idx] }
    [key_values, row_values]
  end

  # Conceptually simpler and does the same thing, but slows down the
  # total merge algorithm by 2x. Would be nice to improve the performance
  # of df.map(:row)
  #
  # df.map(:row) do |row|
  #   key_values = on.map { |key| row[key] }
  #   [key_values, row.to_a]
  # end
end
.hashify(df) ⇒ Object
31 32 33 34 35 |
# File 'lib/daru/core/merge.rb', line 31

# Converts a DataFrame into a plain Hash of column name => Array of values.
#
# @param df [Daru::DataFrame] frame to convert
# @return [Hash{Symbol => Array}] each column's values coerced via #to_a
def hashify(df)
  df.to_h.transform_values(&:to_a)
end
.replace_keys_if_duplicates(hash, matcher) ⇒ Object
5 6 7 8 9 10 11 12 13 14 15 16 17 18 |
# File 'lib/daru/core/merge.rb', line 5

# Renames, in place, the first key of +hash+ whose name +matcher+ matches,
# replacing it with +matcher+ itself (used to apply de-duplicated column
# names like :id_1 back onto the original :id column).
#
# NOTE(review): the match is done by building a Regexp from each existing
# key and testing it against the matcher symbol — keys containing regexp
# metacharacters would misbehave; preserved as-is to keep behavior identical.
#
# @param hash [Hash] column hash mutated in place
# @param matcher [Symbol] recoded (suffixed) column name
# @return [Object, nil] the deleted key's value, or nil if nothing matched
def replace_keys_if_duplicates(hash, matcher)
  matched = hash.keys.find { |key| matcher.match(Regexp.new(key.to_s)) }
  return unless matched

  hash[matcher] = hash[matched]
  hash.delete matched
end
.resolve_duplicates(df_hash1, df_hash2, on) ⇒ Object
20 21 22 23 24 25 26 27 28 29 |
# File 'lib/daru/core/merge.rb', line 20

# Resolves column-name collisions between the two frames being merged.
# Non-join columns sharing a name are recoded (suffixed) by
# Array#recode_repeated (a daru core extension); the new names are then
# applied back, in sorted pairs, one to each side.
#
# NOTE(review): assumes recode_repeated yields exactly one new name per
# side for each clash, so sorted pairs line up left/right — TODO confirm.
#
# @param df_hash1 [Hash] left frame's column hash (mutated in place)
# @param df_hash2 [Hash] right frame's column hash (mutated in place)
# @param on [Array<Symbol>] join-key columns, exempt from renaming
# @return [Array] the processed name pairs (side effect is what matters)
def resolve_duplicates(df_hash1, df_hash2, on)
  shared = df_hash1.keys + df_hash2.keys - on
  renamed = shared.recode_repeated.map(&:to_sym)

  (renamed - shared).sort.each_slice(2) do |left, right|
    replace_keys_if_duplicates df_hash1, left
    replace_keys_if_duplicates df_hash2, right
  end
end
.verify_dataframes(df_hash1, df_hash2, on) ⇒ Object
68 69 70 71 72 73 |
# File 'lib/daru/core/merge.rb', line 68

# Validates that every join key in +on+ resolves to a truthy column in
# both frames' column hashes.
#
# @param df_hash1 [Hash] left frame's column hash
# @param df_hash2 [Hash] right frame's column hash
# @param on [Array<Symbol>] join-key column names
# @raise [ArgumentError] if any key in +on+ is missing from either side
# @return [nil]
def verify_dataframes(df_hash1, df_hash2, on)
  missing_self = on.reject { |key| df_hash1[key] }
  raise ArgumentError, 'All fields in :on must be present in self' unless missing_self.empty?

  missing_other = on.reject { |key| df_hash2[key] }
  raise ArgumentError, 'All fields in :on must be present in other DF' unless missing_other.empty?
end