Module: Daru::Core::MergeHelper

Defined in:
lib/daru/core/merge.rb

Class Method Summary collapse

Class Method Details

.arrayify(df) ⇒ Object



37
38
39
40
41
42
43
# File 'lib/daru/core/merge.rb', line 37

# Splits the row list produced by +df.to_a+ into column names and row values.
#
# Only the first element of +df.to_a+ is read; it is expected to be a
# non-empty list of hash-like rows (each responding to +keys+ / +values+).
#
# @param df [#to_a] object whose +to_a+ yields the row list as first element
# @return [Array] two-element array: [column names, array of row value arrays]
def arrayify df
  rows, = df.to_a

  # Column names are taken from the first row only
  [rows.first.keys, rows.map(&:values)]
end

.arrayify_with_sort_keys(size, df_hash, on) ⇒ Object



45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
# File 'lib/daru/core/merge.rb', line 45

# Builds, for every row position, a pair holding that row's key values and
# its full list of values, reading directly from per-column arrays.
#
# Converting to a hash and then to an array is more complex than using
# df.to_a or df.map(:row).  However, it's substantially faster this way.
# The conceptually simpler form below does the same thing but slows down
# the total merge algorithm by 2x; it would be nice to improve the
# performance of df.map(:row):
#
#   df.map(:row) do |row|
#     key_values = on.map { |key| row[key] }
#     [key_values, row.to_a]
#   end
#
# @param size [Integer] number of row positions to emit (0...size)
# @param df_hash [Hash] column name => indexable collection of column values
# @param on [Array] column names whose values make up each row's key
# @return [Array<Array>] one [key_values, row_values] pair per row position
def arrayify_with_sort_keys(size, df_hash, on)
  # Column lookups do not depend on the row position, so resolve them once
  # instead of once per row.
  key_columns = on.map { |col| df_hash[col] }
  all_columns = df_hash.values

  (0...size).map do |idx|
    key_values = key_columns.map { |column| column[idx] }
    row_values = all_columns.map { |column| column[idx] }
    [key_values, row_values]
  end
end

.hashify(df) ⇒ Object



31
32
33
34
35
# File 'lib/daru/core/merge.rb', line 31

# Converts +df+ to a hash via +to_h+ and replaces every value with the
# result of calling +to_a+ on it.
#
# The hash returned by +df.to_h+ is updated in place and returned.
#
# @param df [#to_h] object convertible to a hash of array-convertible values
# @return [Hash] the same hash object, with each value converted by +to_a+
def hashify df
  df.to_h.tap do |columns|
    # Only values of existing keys are reassigned, which is safe mid-iteration
    columns.each_key { |name| columns[name] = columns[name].to_a }
  end
end

.replace_keys_if_duplicates(hash, matcher) ⇒ Object



5
6
7
8
9
10
11
12
13
14
15
16
17
18
# File 'lib/daru/core/merge.rb', line 5

# Renames the first key of +hash+ whose text occurs inside +matcher+ so that
# the value is stored under +matcher+ instead. Mutates +hash+ in place; the
# renamed entry ends up last in the hash's insertion order.
#
# @param hash [Hash] hash whose key may be renamed
# @param matcher [Symbol, String] replacement key; a key matches when its
#   +to_s+ text is contained in +matcher.to_s+
# @return [Object, nil] the value that was moved, or nil when no key matched
def replace_keys_if_duplicates hash, matcher
  target = matcher.to_s

  # Compare the key text literally. Building a Regexp from the key made
  # names containing metacharacters ("a+b", "a.b") match incorrectly and
  # names such as "a(b" raise RegexpError.
  # NOTE(review): this remains a substring test, so a short key like :a also
  # matches :ba_1 — confirm whether a stricter (prefix) match is wanted.
  matched = hash.keys.find { |key| target.include?(key.to_s) }

  return unless matched

  # Delete before assigning so the value is not lost when the matched key
  # is already equal to +matcher+.
  hash[matcher] = hash.delete(matched)
end

.resolve_duplicates(df_hash1, df_hash2, on) ⇒ Object



20
21
22
23
24
25
26
27
28
29
# File 'lib/daru/core/merge.rb', line 20

# Renames column keys that appear in both hashes (ignoring the +on+ keys) so
# the two hashes no longer share them. Both hashes are modified in place via
# replace_keys_if_duplicates.
#
# The replacement names come from +recode_repeated+ on the combined key
# list; after sorting, they are consumed in pairs: the first name of each
# pair is applied to +df_hash1+, the second to +df_hash2+.
#
# @param df_hash1 [Hash] first hash of columns
# @param df_hash2 [Hash] second hash of columns
# @param on [Array] keys excluded from duplicate resolution
def resolve_duplicates df_hash1, df_hash2, on
  original_keys = (df_hash1.keys + df_hash2.keys) - on
  # Names that only exist after recoding are the replacements to apply
  renamed_keys = original_keys.recode_repeated.map(&:to_sym) - original_keys

  renamed_keys.sort.each_slice(2) do |first_name, second_name|
    replace_keys_if_duplicates df_hash1, first_name
    replace_keys_if_duplicates df_hash2, second_name
  end
end

.verify_dataframes(df_hash1, df_hash2, on) ⇒ Object

Raises:

  • (ArgumentError)


68
69
70
71
72
73
# File 'lib/daru/core/merge.rb', line 68

# Checks that every field named in +on+ has a truthy entry in both hashes.
#
# @param df_hash1 [Hash] columns of the receiving data frame ("self")
# @param df_hash2 [Hash] columns of the other data frame
# @param on [Array] field names that must be present in both hashes
# @raise [ArgumentError] when a field is missing from +df_hash1+, or
#   otherwise from +df_hash2+ (checked in that order)
# @return [nil]
def verify_dataframes df_hash1, df_hash2, on
  if on.any? { |field| !df_hash1[field] }
    raise ArgumentError, 'All fields in :on must be present in self'
  end

  if on.any? { |field| !df_hash2[field] }
    raise ArgumentError, 'All fields in :on must be present in other DF'
  end
end