Module: DeduplicateData

Included in:
IndexManager
Defined in:
lib/deduplicate_data.rb

Instance Method Summary collapse

Instance Method Details

#bothNotNil?(second_item) ⇒ Boolean

Check whether the second item is not nil. Called when the first item is nil, so a true result means the two values do not match.

Returns:

  • (Boolean)


88
89
90
# File 'lib/deduplicate_data.rb', line 88

# Report whether +second_item+ is anything other than nil.
#
# The field comparison calls this once the first value is known to be nil:
# a true result there means only one side is nil, i.e. the fields differ.
#
# @param second_item [Object, nil] value to test
# @return [Boolean] true when second_item is not nil
def bothNotNil?(second_item)
  # `!second_item == nil` parses as `(!second_item) == nil`, which is always
  # false, so test for nil directly.
  !second_item.nil?
end

#deduplicate(item, dataspec, doc_class) ⇒ Object

Deduplicate Items



3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
# File 'lib/deduplicate_data.rb', line 3

# Decide whether +item+ is new relative to the entries already stored.
#
# Searches +doc_class+ for entries whose id field equals the item's, then
# compares the item against every candidate with the ignore fields removed.
#
# @param item [Hash] the incoming item; read by dataspec.id_field and
#   dataspec.dedup_prioritize
# @param dataspec [#id_field, #dedup_prioritize, #dedup_ignore] field settings
# @param doc_class [#search] object whose search(...).results lists candidates
# @return [Boolean] false when some stored entry is an exact match that is at
#   least as recent as the item; true otherwise
def deduplicate(item, dataspec, doc_class)
  # Check if any item from same profile has been added
  begin
    potential_dups = doc_class.search(query: { match: { dataspec.id_field => item[dataspec.id_field] } }).results
  rescue StandardError
    # A failed lookup is treated as "nothing stored yet" instead of leaving
    # potential_dups nil and crashing on the emptiness check below.
    potential_dups = []
  end

  return true if potential_dups.nil? || potential_dups.empty? # No other entries for item

  # The item side of the comparison does not change per candidate.
  item_fields = removeIgnore(item, dataspec).symbolize_keys

  # Look at every candidate, not just the first one returned by the search.
  potential_dups.each do |dup_i|
    # A different entry for the same item never blocks saving
    next unless exactMatch?(item_fields, removeIgnore(dup_i.to_hash, dataspec))

    # Exact match: only keep going if the new item was scraped after the saved one
    # TODO: Delete the old item and create a new one instead
    item_date = Date.parse(item[dataspec.dedup_prioritize])
    saved_date = Date.parse(dup_i[dataspec.dedup_prioritize].to_s)
    return false unless item_date > saved_date # Existing item is more recent
  end

  true
end

#exactMatch?(first_item, second_item) ⇒ Boolean

See if all the fields in the first item match all the fields in the second item

Returns:

  • (Boolean)


41
42
43
44
45
46
47
48
49
50
# File 'lib/deduplicate_data.rb', line 41

# Compare two items field by field.
#
# Walks the keys of +first_item+; keys whose value in +second_item+ is nil or
# false are skipped. The first comparison that comes back falsy ends the walk
# and that falsy value is handed back unchanged.
#
# @param first_item [Hash] item whose keys drive the comparison
# @param second_item [Hash] item looked up by the same keys
# @return [Boolean, nil] true when no compared field disagreed; otherwise the
#   falsy result of the failing field comparison
def exactMatch?(first_item, second_item)
  first_item.each do |field, left|
    right = second_item[field]
    next unless right

    outcome = fieldValsMatch?(left, right, field)
    return outcome unless outcome
  end

  true
end

#fieldValsMatch?(first_val, second_val, key) ⇒ Boolean

Return false if the two fields don't match

Returns:

  • (Boolean)


53
54
55
56
57
58
59
60
61
62
63
64
65
# File 'lib/deduplicate_data.rb', line 53

# Decide whether two values stored under the same key count as matching.
#
# Rules, applied in order:
# * date fields whose value is longer than a bare year are not compared and
#   count as matching
# * a nil first value is judged by bothNotNil? on the second value
# * numeric first values are judged by matchAsInt?
# * empty first values match only empty second values
# * everything else is judged by simplyDoesntMatch?
#
# NOTE(review): `dataspec` is not a parameter here; this presumably relies on
# the including class exposing a `dataspec` method - confirm.
#
# @param first_val [Object, nil] value from the first item
# @param second_val [Object] value from the second item
# @param key [String, Symbol] field name, used to detect date fields
# @return [Boolean] true when the values count as matching, false otherwise
def fieldValsMatch?(first_val, second_val, key)
  # respond_to? keeps nil and numeric date values (e.g. an Integer year) out
  # of the length check instead of raising NoMethodError on them.
  if isDate?(key.to_s, dataspec) && first_val.respond_to?(:length) && first_val.length > 4
    true
  elsif first_val.nil? # Only one side nil means a mismatch
    !bothNotNil?(second_val)
  elsif isNonIntNum?(first_val) # Check if they match when converted to int
    matchAsInt?(first_val, second_val)
  elsif isEmpty?(first_val) # Check if both are empty
    isEmpty?(second_val)
  else
    # key is passed through because simplyDoesntMatch? declares it.
    !simplyDoesntMatch?(first_val, second_val, key)
  end
end

#isEmpty?(value) ⇒ Boolean

Checks if value is empty

Returns:

  • (Boolean)


78
79
80
# File 'lib/deduplicate_data.rb', line 78

# Report whether +value+ is an empty container or string.
#
# Values that have no notion of emptiness (nil, Integers, Floats, booleans,
# ...) are reported as not empty rather than raising NoMethodError.
#
# @param value [Object, nil] value to test
# @return [Boolean] true only when value responds to empty? and is empty
def isEmpty?(value)
  value.respond_to?(:empty?) && value.empty?
end

#isNonIntNum?(value) ⇒ Boolean

Checks if val is an int or float

Returns:

  • (Boolean)


73
74
75
# File 'lib/deduplicate_data.rb', line 73

# Report whether +value+ is an Integer or a Float.
#
# @param value [Object, nil] value to classify
# @return [Boolean] true for Integer and Float instances, false otherwise
def isNonIntNum?(value)
  case value
  when Integer, Float then true
  else false
  end
end

#matchAsInt?(value, second_item) ⇒ Boolean

Checks if num values match after being converted to same type

Returns:

  • (Boolean)


83
84
85
# File 'lib/deduplicate_data.rb', line 83

# Report whether two values are equal once both are converted with to_i.
#
# @param value [#to_i] first value
# @param second_item [#to_i] second value
# @return [Boolean] true when the integer forms are equal
def matchAsInt?(value, second_item)
  # The comparison was inverted (`!=`), reporting a match exactly when the
  # numbers differed.
  value.to_i == second_item.to_i
end

#removeIgnore(item, dataspec) ⇒ Object

Remove ignore fields for comparison



31
32
33
34
35
36
37
38
# File 'lib/deduplicate_data.rb', line 31

# Build a copy of +item+ without the fields listed in dataspec.dedup_ignore.
#
# Each listed field is dropped under both the name as given and its to_sym
# form. The item passed in is left untouched.
#
# @param item [Hash] item to strip
# @param dataspec [#dedup_ignore] supplies the list of field names to drop
# @return [Hash] copy of item without the ignored fields
def removeIgnore(item, dataspec)
  dataspec.dedup_ignore.reduce(item.dup) do |remaining, field|
    remaining.except(field, field.to_sym)
  end
end

#simplyDoesntMatch?(value, second_item, key) ⇒ Boolean

Returns true if it doesn't match

Returns:

  • (Boolean)


68
69
70
# File 'lib/deduplicate_data.rb', line 68

# Report whether two values differ under plain `!=` comparison.
#
# @param value [Object] first value
# @param second_item [Object] second value
# @param key [Object, nil] field name; not used by the comparison and
#   optional, so both two- and three-argument calls work
# @return [Boolean] true when the values are not equal
def simplyDoesntMatch?(value, second_item, key = nil)
  second_item != value
end