Class: FamilyReunion::TaxamatchPreprocessor

Inherits:
Object
  • Object
  • FamilyReunion::TaxamatchPreprocessor
Defined in:
lib/family-reunion/taxamatch_preprocessor.rb

Instance Method Summary

Constructor Details

#initialize(cache) ⇒ TaxamatchPreprocessor

Returns a new instance of TaxamatchPreprocessor.



4
5
6
# File 'lib/family-reunion/taxamatch_preprocessor.rb', line 4

# Builds a preprocessor around the given cache object.
#
# @param cache [Object] kept as-is in @cache. Other methods of this class
#   read and write cache.word_letters[...] and cache.similar_words[...],
#   so the object must respond to both.
def initialize(cache)
  @cache = cache
end

Instance Method Details

#get_letters(word) ⇒ Object



93
94
95
96
97
98
99
100
# File 'lib/family-reunion/taxamatch_preprocessor.rb', line 93

# Returns the distinct characters of +word+ in order of first appearance.
# The result is memoized in @cache.word_letters, keyed by the word itself.
#
# @param word [String] word to break into letters
# @return [Array<String>] unique single-character strings
def get_letters(word)
  known = @cache.word_letters[word]
  return known unless known.nil?

  # First time this word is seen: compute the letters and remember them.
  @cache.word_letters[word] = word.split('').uniq
end

#get_match_candidates(list1, list2) ⇒ Object



8
9
10
11
12
13
14
15
16
17
# File 'lib/family-reunion/taxamatch_preprocessor.rb', line 8

# Collects match candidates between two lists of canonical names.
#
# Both lists are split by word count with partition_canonicals, then each of
# the :uninomials, :binomials and :trinomials buckets is handed to the
# matching process_<bucket> method. The :multinomials bucket is not processed.
#
# @param list1 [Array<String>] canonical names to find candidates for
# @param list2 [Array<String>] canonical names to pick candidates from
# @return [Hash] with keys :uninomials, :binomials and :trinomials, each
#   holding a fresh hash with the entries returned by the process_* method
def get_match_candidates(list1, list2)
  left = partition_canonicals(list1)
  right = partition_canonicals(list2)
  [:uninomials, :binomials, :trinomials].each_with_object({}) do |rank, found|
    # send dispatches to process_uninomials / process_binomials / process_trinomials.
    found[rank] = {}.merge!(send("process_#{rank}", left[rank], right[rank]))
  end
end

#partition_canonicals(canonicals) ⇒ Object



19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
# File 'lib/family-reunion/taxamatch_preprocessor.rb', line 19

# Groups canonical names by the number of space-separated words they contain.
#
# @param canonicals [Array<String>] canonical name strings
# @return [Hash] with keys :uninomials (1 word), :binomials (2), :trinomials (3)
#   and :multinomials (any other count, including 0); every value is an array
#   of [name, words] pairs where words is name.split(' ')
def partition_canonicals(canonicals)
  bucket_by_size = { 1 => :uninomials, 2 => :binomials, 3 => :trinomials }
  partitions = { :uninomials => [], :binomials => [], :trinomials => [], :multinomials => [] }
  canonicals.each do |name|
    words = name.split(' ')
    # Any word count without an explicit bucket falls into :multinomials.
    bucket = bucket_by_size.fetch(words.size, :multinomials)
    partitions[bucket] << [name, words]
  end
  partitions
end

#process_binomials(names1, names2) ⇒ Object



49
50
51
52
53
54
55
56
57
58
# File 'lib/family-reunion/taxamatch_preprocessor.rb', line 49

# Finds, for every two-word name in +names1+, the names in +names2+ whose
# first and second words are both judged similar by similar_words?.
#
# @param names1 [Array<Array>] [name, words] pairs to find candidates for
# @param names2 [Array<Array>] [name, words] pairs to pick candidates from
# @return [Hash] name => { :words => words, :candidates => [pairs from names2] };
#   names without any candidate are left out
def process_binomials(names1, names2)
  names1.each_with_object({}) do |(name, words), matches|
    names2.each do |candidate|
      other_words = candidate[1]
      # all? stops at the first dissimilar word, like the chained && checks.
      next unless (0..1).all? { |i| similar_words?(words[i], other_words[i]) }

      entry = matches[name] ||= { :words => words, :candidates => [] }
      entry[:candidates] << candidate
    end
  end
end

#process_trinomials(names1, names2) ⇒ Object



60
61
62
63
64
65
66
67
68
69
# File 'lib/family-reunion/taxamatch_preprocessor.rb', line 60

# Finds, for every three-word name in +names1+, the names in +names2+ whose
# first, second and third words are all judged similar by similar_words?.
#
# @param names1 [Array<Array>] [name, words] pairs to find candidates for
# @param names2 [Array<Array>] [name, words] pairs to pick candidates from
# @return [Hash] name => { :words => words, :candidates => [pairs from names2] };
#   names without any candidate are left out
def process_trinomials(names1, names2)
  names1.each_with_object({}) do |(name, words), matches|
    names2.each do |candidate|
      other_words = candidate[1]
      # all? stops at the first dissimilar word, like the chained && checks.
      next unless (0..2).all? { |i| similar_words?(words[i], other_words[i]) }

      entry = matches[name] ||= { :words => words, :candidates => [] }
      entry[:candidates] << candidate
    end
  end
end

#process_uninomials(names1, names2) ⇒ Object



38
39
40
41
42
43
44
45
46
47
# File 'lib/family-reunion/taxamatch_preprocessor.rb', line 38

# Finds, for every one-word name in +names1+, the names in +names2+ whose
# single word is judged similar by similar_words?.
#
# @param names1 [Array<Array>] [name, words] pairs to find candidates for
# @param names2 [Array<Array>] [name, words] pairs to pick candidates from
# @return [Hash] name => { :words => words, :candidates => [pairs from names2] };
#   names without any candidate are left out
def process_uninomials(names1, names2)
  names1.each_with_object({}) do |(name, words), matches|
    names2.each do |candidate|
      next unless similar_words?(words[0], candidate[1][0])

      entry = matches[name] ||= { :words => words, :candidates => [] }
      entry[:candidates] << candidate
    end
  end
end

#similar_words?(word1, word2) ⇒ Boolean

Returns:

  • (Boolean)

Raises:

  • (RuntimeError) — if either argument is not a String


71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
# File 'lib/family-reunion/taxamatch_preprocessor.rb', line 71

# Cheap pre-filter deciding whether two words are close enough to be kept as
# a match candidate. Identical words are always similar; otherwise both of
# the following must hold:
#
# * the letters present in only one of the words number at most 30% of
#   letters1.size + letters2.size (distinct letters, as given by get_letters)
# * the word lengths differ by at most 20% of the longer word's length
#
# The verdict is memoized in @cache.similar_words under a key that does not
# depend on argument order, so the test itself must be symmetric. Dividing the
# length difference by word1.size alone would make the cached answer depend on
# which order a pair happened to be seen in first; the longer word's length is
# used instead.
#
# @param word1 [String]
# @param word2 [String]
# @return [Boolean] true when the words are considered similar
# @raise [RuntimeError] if either argument is not a String
def similar_words?(word1, word2)
  unless word1.is_a?(String) && word2.is_a?(String)
    raise RuntimeError, "similar_words? expects two Strings, got #{word1.class} and #{word2.class}"
  end

  # Sorting makes (a, b) and (b, a) share a single cache entry.
  key = [word1, word2].sort.join(':')
  cached = @cache.similar_words[key]
  # false is a legitimate cached verdict, so only nil counts as a miss.
  return cached unless cached.nil?

  are_similar =
    if word1 == word2
      true
    else
      letters1 = get_letters(word1)
      letters2 = get_letters(word2)
      symmetric_difference = (letters1 - letters2) + (letters2 - letters1)
      similar_letters = symmetric_difference.size.to_f / (letters1.size + letters2.size) <= 0.3
      # The words differ here, so at least one is non-empty and longest > 0.
      longest = [word1.size, word2.size].max
      similar_length = (word1.size - word2.size).abs.to_f / longest <= 0.2
      similar_letters && similar_length
    end
  @cache.similar_words[key] = are_similar
end