Class: Daru::Core::MergeFrame

Inherits:
Object
  • Object
show all
Defined in:
lib/daru/core/merge.rb

Instance Method Summary

Constructor Details

#initialize(df1, df2, on: nil) ⇒ MergeFrame

Returns a new instance of MergeFrame.



78
79
80
81
82
# File 'lib/daru/core/merge.rb', line 78

def initialize(df1, df2, on: nil)
  @df1 = df1
  @df2 = df2
  @on = on
end

Instance Method Details

#inner(_opts) ⇒ Object



84
85
86
# File 'lib/daru/core/merge.rb', line 84

def inner _opts
  merge_join(left: false, right: false)
end

#left(_opts) ⇒ Object



88
89
90
# File 'lib/daru/core/merge.rb', line 88

def left _opts
  merge_join(left: true, right: false)
end

#merge_join(left: true, right: true) ⇒ Object



100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
# File 'lib/daru/core/merge.rb', line 100

def merge_join(left: true, right: true)
  MergeHelper.verify_dataframes df1_hash, df2_hash, @on
  MergeHelper.resolve_duplicates df1_hash, df2_hash, @on

  # TODO: Use native dataframe sorting.
  #  It would be ideal to reuse sorting functionality that is native
  #  to dataframes.  Unfortunately, native dataframe sort introduces
  #  an overhead that reduces join performance by a factor of 4!  Until
  #  that aspect is improved, we resort to a simpler array sort.
  df1_array.sort_by! { |row| [row[0].nil? ? 0 : 1, row[0]] }
  df2_array.sort_by! { |row| [row[0].nil? ? 0 : 1, row[0]] }

  idx1 = 0
  idx2 = 0

  while idx1 < @df1.size || idx2 < @df2.size

    key1 = df1_array[idx1][0] if idx1 < @df1.size
    key2 = df2_array[idx2][0] if idx2 < @df2.size

    if key1 == key2 && idx1 < @df1.size && idx2 < @df2.size
      idx2_start = idx2

      while (idx2 < @df2.size) && (df1_array[idx1][0] == df2_array[idx2][0])
        add_merge_row_to_hash([df1_array[idx1], df2_array[idx2]], joined_hash)
        idx2 += 1
      end

      idx2 = idx2_start if idx1+1 < @df1.size && df1_array[idx1][0] == df1_array[idx1+1][0]
      idx1 += 1
    elsif ((key2.nil? || [key1,key2].sort == [key1,key2]) && idx1 < @df1.size) || idx2 == @df2.size
      add_merge_row_to_hash([df1_array[idx1], nil], joined_hash) if left
      idx1 += 1
    elsif idx2 < @df2.size || idx1 == @df1.size
      add_merge_row_to_hash([nil, df2_array[idx2]], joined_hash) if right
      idx2 += 1
    else
      raise 'Unexpected condition met during merge'
    end
  end

  Daru::DataFrame.new(joined_hash, order: joined_hash.keys)
end

#outer(_opts) ⇒ Object



96
97
98
# File 'lib/daru/core/merge.rb', line 96

def outer _opts
  merge_join(left: true, right: true)
end

#right(_opts) ⇒ Object



92
93
94
# File 'lib/daru/core/merge.rb', line 92

def right _opts
  merge_join(left: false, right: true)
end