Module: CSVDiff::Algorithm

Included in:
CSVDiff
Defined in:
lib/csv-diff/algorithm.rb

Overview

Implements the CSV diff algorithm.

Defined Under Namespace

Classes: Diff

Instance Method Summary collapse

Instance Method Details

#diff_row(left_row, right_row, fields) ⇒ Hash<String, Array>

Identifies the fields that are different between two versions of the same row.

Parameters:

  • left_row (Hash)

    The version of the CSV row from the left/from file.

  • right_row (Hash)

    The version of the CSV row from the right/to file.

  • fields (Array<String>)

    An array of field names to compare.

Returns:

  • (Hash<String, Array>)

    A Hash whose keys are the fields that contain differences, and whose values are a two-element array of [left/from, right/to] values.



192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
# File 'lib/csv-diff/algorithm.rb', line 192

# Identifies the fields that are different between two versions of the same
# row.
#
# Empty strings are normalised to nil before comparison, so a blank value and
# a missing value are treated as equal. Each field is compared using, in order
# of preference:
#   1. the Proc registered for that field in @equality_procs (if any);
#   2. strict equality when @case_sensitive is truthy;
#   3. equality of the upcased string forms otherwise.
#
# @param left_row [Hash] The version of the CSV row from the left/from file.
# @param right_row [Hash] The version of the CSV row from the right/to file.
# @param fields [Array<String>] An array of field names to compare.
# @return [Hash<String, Array>, nil] A Hash whose keys are the fields that
#   contain differences, and whose values are two-element arrays of
#   [left/from, right/to] values; nil when no field differs.
def diff_row(left_row, right_row, fields)
    # @equality_procs is only assigned by diff_sources; fall back to an empty
    # table so this method can also be called on its own.
    equality_procs = @equality_procs || {}

    diffs = {}
    fields.each do |attr|
        left_val = left_row[attr]
        left_val = nil if left_val == ""
        right_val = right_row[attr]
        right_val = nil if right_val == ""

        eq_proc = equality_procs[attr]
        same =
            if eq_proc
                eq_proc.call(left_val, right_val)
            elsif @case_sensitive
                left_val == right_val
            else
                left_val.to_s.upcase == right_val.to_s.upcase
            end

        diffs[attr] = [left_val, right_val] unless same
    end

    # Callers rely on a falsy result meaning "no changes"
    diffs unless diffs.empty?
end

#diff_sources(left, right, key_fields, diff_fields, options = {}) ⇒ Object

Diffs two CSVSource structures.

Parameters:

  • left (CSVSource)

    A CSVSource object containing the contents of the left/from input.

  • right (CSVSource)

    A CSVSource object containing the contents of the right/to input.

  • key_fields (Array)

    An array containing the names of the field(s) that uniquely identify each row.

  • diff_fields (Array)

    An array containing the names of the fields to be diff-ed.

  • options (Hash) (defaults to: {})

    An options hash.

Options Hash (options):

  • :ignore_adds (Boolean)

    If set to true, we ignore any new items that appear only in right.

  • :ignore_moves (Boolean)

    If set to true, we ignore any changes in sibling order.

  • :ignore_updates (Boolean)

    If set to true, we ignore any items that exist in both left and right.

  • :ignore_deletes (Boolean)

    If set to true, we ignore any items that appear only in left (i.e. deletions).

  • :equality_procs (Hash<Object,Proc>)

    A Hash mapping fields to a 2-arg Proc that should be used to compare values in that field for equality.



77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
# File 'lib/csv-diff/algorithm.rb', line 77

# Diffs two CSVSource structures.
#
# Row keys are '~'-joined strings whose first N components are the parent
# field values (N = number of parent fields) and whose remainder is the child
# key. Deletions are identified first; each deleted row is remembered under
# its child key so that a matching "add" under a different parent can be
# reported as a single update of the parent fields rather than a
# delete + add pair.
#
# As a side effect this sets @case_sensitive and @equality_procs, which
# diff_row reads.
#
# @param left [CSVSource] A CSVSource object containing the contents of the
#   left/from input.
# @param right [CSVSource] A CSVSource object containing the contents of the
#   right/to input.
# @param key_fields [Array] The names of the field(s) that uniquely identify
#   each row.
# @param diff_fields [Array] The names of the fields to be diff-ed; any key
#   fields listed here are removed before comparing.
# @param options [Hash] An options hash.
# @option options [Boolean] :ignore_adds If true, ignore items that appear
#   only in right.
# @option options [Boolean] :ignore_moves If true, ignore changes in sibling
#   order.
# @option options [Boolean] :ignore_updates If true, ignore changes to items
#   that exist in both left and right.
# @option options [Boolean] :ignore_deletes If true, ignore items that appear
#   only in left.
# @option options [Hash<Object,Proc>] :equality_procs Maps fields to a 2-arg
#   Proc used to compare values in that field for equality.
# @raise [ArgumentError] if left and right differ in case-sensitivity or in
#   their number of parent fields.
# @raise [RuntimeError] if a left-only key cannot be located among the
#   siblings of its parent.
# @return [Hash] A Hash of row key => Diff for every difference found.
def diff_sources(left, right, key_fields, diff_fields, options = {})
    unless left.case_sensitive? == right.case_sensitive?
        raise ArgumentError, "Left and right must have same settings for case-sensitivity"
    end
    unless left.parent_fields.length == right.parent_fields.length
        raise ArgumentError, "Left and right must have same settings for parent/child fields"
    end

    # Ensure key fields are not also in the diff_fields
    diff_fields = diff_fields - key_fields

    left_index = left.index
    left_values = left.lines
    left_keys = left_values.keys
    right_index = right.index
    right_values = right.lines
    right_keys = right_values.keys
    parent_field_count = left.parent_fields.length

    include_adds = !options[:ignore_adds]
    include_moves = !options[:ignore_moves]
    include_updates = !options[:ignore_updates]
    include_deletes = !options[:ignore_deletes]

    @case_sensitive = left.case_sensitive?
    @equality_procs = options.fetch(:equality_procs, {})

    diffs = {}
    # Deleted keys grouped by child key; candidates for "moved to new parent"
    potential_moves = Hash.new { |h, k| h[k] = [] }

    # First identify deletions
    if include_deletes
        # Row position of every left key, computed once rather than via a
        # linear Array#index scan for each deleted key.
        left_row_ids = {}
        left_keys.each_with_index { |k, i| left_row_ids[k] = i }

        (left_keys - right_keys).each do |key|
            key_vals = key.split('~', -1)
            parent = key_vals[0...parent_field_count].join('~')
            child = key_vals[parent_field_count..-1].join('~')
            left_parent = left_index[parent]
            # Guard against a missing parent bucket so that the descriptive
            # error below is raised instead of a NoMethodError on nil.
            sib_idx = left_parent && left_parent.index(key)
            raise "Can't locate key #{key} in parent #{parent}" unless sib_idx
            diffs[key] = Diff.new(:delete, left_values[key], left_row_ids[key], sib_idx)
            potential_moves[child] << key
        end
    end

    # Now identify adds/updates
    right_keys.each_with_index do |key, right_row_id|
        key_vals = key.split('~', -1)
        parent = key_vals[0...parent_field_count].join('~')
        left_parent = left_index[parent]
        right_parent = right_index[parent]
        left_value = left_values[key]
        right_value = right_values[key]
        left_idx = left_parent && left_parent.index(key)
        right_idx = right_parent && right_parent.index(key)

        if left_idx && right_idx
            # Present on both sides: may be an update, a move, or both
            if include_updates && (changes = diff_row(left_value, right_value, diff_fields))
                id = id_fields(key_fields, right_value)
                diffs[key] = Diff.new(:update, id.merge!(changes), right_row_id, right_idx)
            end
            if include_moves
                # Compare positions among only the siblings common to both
                # sides, so adds/deletes elsewhere do not register as moves.
                left_common = left_parent & right_parent
                right_common = right_parent & left_parent
                if left_common.index(key) != right_common.index(key)
                    if (existing = diffs[key])
                        # Already reported as an update; just record the move
                        existing.sibling_position = [left_idx, right_idx]
                    else
                        id = id_fields(key_fields, right_value)
                        diffs[key] = Diff.new(:move, id, right_row_id, [left_idx, right_idx])
                    end
                end
            end
        elsif right_idx
            # Only on the right: an add, unless the same child key was deleted
            # from another parent, in which case it is a change of parent.
            child = key_vals[parent_field_count..-1].join('~')
            if potential_moves.key?(child) && (old_key = potential_moves[child].pop)
                diffs.delete(old_key)
                if include_updates
                    left_value = left_values[old_key]
                    id = id_fields(right.child_fields, right_value)
                    # diff_row returns nil when it finds no differences (e.g.
                    # an equality proc treats the parent values as equal);
                    # merge! would raise TypeError on nil.
                    changes = diff_row(left_value, right_value, left.parent_fields + diff_fields) || {}
                    diffs[key] = Diff.new(:update, id.merge!(changes), right_row_id, right_idx)
                end
            elsif include_adds
                diffs[key] = Diff.new(:add, right_value, right_row_id, right_idx)
            end
        end
    end

    diffs
end