Module: Eco::Data::FuzzyMatch::Pairing
- Included in:
- ClassMethods
- Defined in:
- lib/eco/data/fuzzy_match/pairing.rb
Instance Method Summary collapse
-
#paired_words(str1, str2, normalized: false) {|needle, item| ... } ⇒ Hash
Pair words using some algorithm.
Instance Method Details
#paired_words(str1, str2, normalized: false) {|needle, item| ... } ⇒ Hash
Pair words using some algorithm. It does the following:
- It splits both strings into words.
- Pairs all words by using `block` to score the best match.
- Gives `0` score to those words of `str2` that lost their pair (a word of `str1` cannot be paired twice).
- Merges the `Score` of all the paired words of `str2` against their `str1` word pair.
20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 |
# File 'lib/eco/data/fuzzy_match/pairing.rb', line 20
#
# Pairs the words of `str1` (the needles) with the words of `str2` (the
# haystack items), using the given block to score every needle/item combination.
#
# Steps:
#   1. Both strings are normalized (unless `normalized: true`) and split into
#      words via `get_words`.
#   2. Every needle is scored against every item. Only combinations whose score
#      ratio is above 0.05 are considered candidates.
#   3. Each item keeps its best scoring needle that has not been paired yet, so
#      a needle is never paired twice.
#   4. Needles still unpaired may take an item that nobody has taken yet.
#   5. Any remaining needle is paired with `nil` and `Score.new(0, needle.length)`.
#
# @param str1 [String, nil] the text whose words are the needles.
# @param str2 [String, nil] the text whose words are the haystack items.
# @param normalized [Boolean] pass `true` when both strings are already
#   normalized, which skips the call to `normalize_string`.
# @yield [needle, item] scores one needle/item combination.
# @yieldparam needle a word of `str1` (as returned by `get_words`).
# @yieldparam item a word of `str2` (as returned by `get_words`).
# @yieldreturn [Eco::Data::FuzzyMatch::Score] must respond to `ratio`.
# @return [Hash] `needle => [item_or_nil, score]`. When either string is
#   missing the result is `{nil => [nil, Score.new(0, 0)]}`; when either string
#   is shorter than 2 characters it is `{str1 => [nil, Score.new(0, 0)]}`.
# @raise [RuntimeError] when a selected pair carries a score that is not an
#   `Eco::Data::FuzzyMatch::Score`.
def paired_words(str1, str2, normalized: false)
  str1, str2 = normalize_string([str1, str2]) unless normalized
  return {nil => [nil, Score.new(0, 0)]} if !str2 || !str1
  # Fixed: the second operand used to test `str1` again, so a too-short `str2` slipped through.
  return {str1 => [nil, Score.new(0, 0)]} if str1.length < 2 || str2.length < 2

  needles  = get_words(str1, normalized: true)
  haystack = get_words(str2, normalized: true)

  # ranking: item   => candidate needles (score ratio above 0.05)
  # faceted: needle => every item with its score, best score first
  ranking = {}
  faceted = needles.each_with_object({}) do |needle, by_needle|
    scored = haystack.map do |item|
      {pair: item, score: yield(needle, item)}.tap do |result|
        ranking[item] ||= []
        ranking[item] << {needle: needle, score: result[:score]} if result[:score].ratio > 0.05
      end
    end
    by_needle[needle] = scored.sort_by { |result| result[:score].ratio }.reverse
  end

  # First pass: items (in the order they were first scored) pick their best free needle.
  paired = {}
  ranking.each do |item, candidates|
    free = candidates.reject { |candidate| paired.key?(candidate[:needle]) }
    best = free.sort_by { |candidate| candidate[:score].ratio }.last
    next unless best

    unless best[:score].is_a?(Eco::Data::FuzzyMatch::Score)
      raise "Pairing ('#{str1}' vs '#{str2}') -> Something got sour with needle '#{best[:needle]}' and item #{item}"
    end

    paired[best[:needle]] = {pair: item, score: best[:score]}
  end

  # Second pass: unpaired needles may only take items nobody has taken yet.
  # Fixed: `paired.values` holds `{pair:, score:}` hashes, so the items must be
  # extracted before subtracting; otherwise no item was ever excluded.
  pending_items = haystack - paired.values.map { |match| match[:pair] }
  faceted.each do |needle, results|
    next if paired.key?(needle)

    # `results` is sorted best first, so the first acceptable entry is the best one.
    best = results.find do |result|
      pending_items.include?(result[:pair]) && result[:score].ratio > 0.05
    end
    next unless best

    unless best[:score].is_a?(Eco::Data::FuzzyMatch::Score)
      raise "Pairing ('#{str1}' vs '#{str2}') -> Something got sour with needle '#{needle}' and item #{best[:pair]}"
    end

    paired[needle] = best
    pending_items.delete(best[:pair])
  end

  # Needles that found no partner score 0 over their own length.
  (needles - paired.keys).each do |needle|
    paired[needle] = {pair: nil, score: Score.new(0, needle.length)}
  end

  paired.each_with_object({}) do |(needle, data), out|
    out[needle] = data.values_at(:pair, :score)
  end
end