Class: Linkage::Comparators::Strcompare

Inherits:
Linkage::Comparator show all
Defined in:
lib/linkage/comparators/strcompare.rb

Overview

Strcompare is a string comparison comparator. It uses the specified operation to compare string-type fields. Score ranges from 0 to 1.

To use Strcompare, you must specify one field for each record to use in the comparison, along with an operator. Valid operators are: :jarowinkler, :reverse_jarowinkler, and :damerau_levenshtein.

Consider the following example, using a Linkage::Configuration as part of Dataset#link_with:

config.strcompare(:foo, :bar, :jarowinkler)

For each record, the values of the foo and bar fields are compared using the Jaro-Winkler distance algorithm.

Damerau-Levenshtein is a modified Levenshtein distance that allows for transpositions. It has additionally been modified so that additions and deletions cost only 0.5.

Constant Summary collapse

VALID_OPERATIONS =
[:jarowinkler, :reverse_jarowinkler, :damerau_levenshtein]

Instance Attribute Summary

Attributes inherited from Linkage::Comparator

#weight

Instance Method Summary collapse

Methods inherited from Linkage::Comparator

klass_for, register, #score_and_notify, #score_dataset, #score_datasets, #type, #weigh

Constructor Details

#initialize(field_1, field_2, operation) ⇒ Strcompare

Returns a new instance of Strcompare.



27
28
29
30
31
32
33
34
35
36
37
38
# File 'lib/linkage/comparators/strcompare.rb', line 27

# Builds a comparator for two string-typed fields.
#
# @param field_1 field from the first dataset; must respond to +ruby_type+
#   (a hash-like object with a +:type+ entry) and +name+
# @param field_2 field from the second dataset; same expectations as field_1
# @param operation [Symbol] one of VALID_OPERATIONS
# @raise [RuntimeError] if either field's +ruby_type[:type]+ is not String,
#   or if +operation+ is not listed in VALID_OPERATIONS
def initialize(field_1, field_2, operation)
  unless field_1.ruby_type[:type] == String && field_2.ruby_type[:type] == String
    raise "fields must be string types"
  end
  raise "#{operation.inspect} is not a valid operation" unless VALID_OPERATIONS.include?(operation)

  # Only the field names are retained; #score uses them to look up values.
  @name_1 = field_1.name
  @name_2 = field_2.name
  @operation = operation
end

Instance Method Details

#damerau_levenshtein(w1, w2) ⇒ Object



102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
# File 'lib/linkage/comparators/strcompare.rb', line 102

# Case-insensitive similarity based on a modified Damerau-Levenshtein
# distance: additions and deletions cost 0.5, substitutions and adjacent
# transpositions cost 1. The distance is divided by the length of the longer
# string and subtracted from 1.
#
# NOTE(review): the first row and column are seeded with whole-number costs
# (1, 2, 3, ...) rather than multiples of 0.5, so a leading insertion is
# charged more than a trailing one ("b" vs "ab" scores 0.5, "a" vs "ab"
# scores 0.75). Confirm whether that is intended.
#
# @param w1 [String] first value
# @param w2 [String] second value
# @return [Numeric] Integer 0 when both strings are empty, otherwise a Float
#   rounded to 3 decimal places
def damerau_levenshtein(w1, w2)
  source = w1.downcase.chars
  target = w2.downcase.chars
  source_len = source.length
  target_len = target.length
  longest = [source_len, target_len].max
  return 0 if longest.zero?

  # Each row carries one extra trailing cell; indexing with -1 (y - 1 when
  # y == 0) therefore reads the "column before the first" boundary value.
  previous = nil
  current = (1..target_len).to_a << 0
  source.each_with_index do |source_char, x|
    before_previous = previous
    previous = current
    current = Array.new(target_len, 0) << (x + 1)

    target.each_with_index do |target_char, y|
      if source_char == target_char
        # Matching characters carry the diagonal cost forward unchanged.
        current[y] = previous[y - 1]
        next
      end

      deletion = previous[y] + 0.5
      addition = current[y - 1] + 0.5
      substitution = previous[y - 1] + 1
      best = [deletion, addition, substitution].min

      # Adjacent transposition; dropping this check yields plain Levenshtein.
      if x > 0 && y > 0 && source_char == target[y - 1] && source[x - 1] == target_char
        best = [best, before_previous[y - 2] + 1].min
      end
      current[y] = best
    end
  end

  (1 - current[target_len - 1] / longest.to_f).round(3)
end

#jarowinkler(w1, w2) ⇒ Object



54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
# File 'lib/linkage/comparators/strcompare.rb', line 54

# Case-insensitive Jaro-Winkler style similarity.
#
# Characters are matched inside a sliding window, transpositions among the
# matched characters are counted, and the resulting Jaro value is boosted by
# 0.1 per shared leading character (at most 4).
#
# NOTE(review): the window start is clamped to the last index of the second
# string, so the result can depend on argument order (for example
# "abcdefghx" vs "x" scores 0.704 while "x" vs "abcdefghx" scores 0).
# Confirm whether that is intended.
#
# @param w1 [String] first value
# @param w2 [String] second value
# @return [Numeric] Integer 0 when either string is empty or no characters
#   match, otherwise a Float rounded to 3 decimal places
def jarowinkler(w1, w2)
  left_chars = w1.downcase.chars
  right_chars = w2.downcase.chars
  left_len = left_chars.length
  right_len = right_chars.length
  return 0 if left_len.zero? || right_len.zero?

  # Length of the common prefix, capped at 4 characters.
  prefix_cap = [left_len, right_len, 4].min
  prefix = 0
  prefix += 1 while prefix < prefix_cap && left_chars[prefix] == right_chars[prefix]

  # Match window; never narrower than 1 position on either side.
  window = [[left_len, right_len].max / 2 - 1, 1].max
  last_right = right_len - 1
  matched_left = []
  matched_right = []
  left_chars.each_with_index do |char, i|
    first = [[i - window, 0].max, last_right].min
    last = [i + window, last_right].min
    hit = (first..last).find { |j| char == right_chars[j] && !matched_right.include?(j) }
    next if hit.nil?

    matched_left << i
    matched_right << hit
  end

  # Keep only matched characters, each side in its own original order.
  kept_left = left_chars.values_at(*matched_left)
  kept_right = right_chars.values_at(*matched_right.sort)
  nm = kept_left.length
  return 0 if nm.zero?

  # Positions where the matched sequences disagree; halved in the formula.
  nt = kept_left.zip(kept_right).count { |left_char, right_char| left_char != right_char }

  jaro = (nm / left_len.to_f + nm / right_len.to_f + (nm - nt / 2.0) / nm.to_f) / 3.0
  (jaro + prefix * 0.1 * (1 - jaro)).round(3)
end

#reverse_jarowinkler(w1, w2) ⇒ Object



98
99
100
# File 'lib/linkage/comparators/strcompare.rb', line 98

# Runs #jarowinkler on both strings reversed, so the prefix bonus applies to
# shared endings rather than shared beginnings.
#
# @param w1 [String] first value
# @param w2 [String] second value
# @return [Object] whatever #jarowinkler returns for the reversed strings
def reverse_jarowinkler(w1, w2)
  reversed_1 = w1.reverse
  reversed_2 = w2.reverse
  jarowinkler(reversed_1, reversed_2)
end

#score(record_1, record_2) ⇒ Object



40
41
42
43
44
45
46
47
48
49
50
51
52
# File 'lib/linkage/comparators/strcompare.rb', line 40

# Scores a pair of records by comparing the configured field of each with the
# configured operation.
#
# @param record_1 record from the first dataset; indexed with the first field name
# @param record_2 record from the second dataset; indexed with the second field name
# @return [Object, nil] the result of the selected comparison method, or nil
#   when @operation is not one of the three known operations
def score(record_1, record_2)
  known = [:jarowinkler, :reverse_jarowinkler, :damerau_levenshtein]
  # Records are only read once the operation is known to be supported.
  return unless known.include?(@operation)

  __send__(@operation, record_1[@name_1], record_2[@name_2])
end