Module: Eco::Data::FuzzyMatch::InstanceMethods

Includes:
StopWords
Defined in:
lib/eco/data/fuzzy_match.rb

Constant Summary collapse

# Lists of option keys, grouped by name after the component they relate to.
# NOTE(review): the code that consumes these lists is not visible in this
# section — confirm how each list is used before relying on these comments.
# Every list is frozen so that a shared constant cannot be mutated by accident.

# Presumably the keys handed over to ::FuzzyMatch.new — TODO confirm.
FUZZY_MATCH_OPTIONS = %i[
  identities groupings stop_words read
  must_match_grouping must_match_at_least_one_word
  gather_last_result threshold
].freeze
# Presumably the keys understood by the jaro scorer — TODO confirm.
JARO_OPTIONS = %i[ignore_case weight].freeze
# Presumably the keys understood by the n-grams scorers — TODO confirm.
NGRAMS_OPTIONS = %i[range].freeze
# Presumably the keys understood by the position scorer — TODO confirm.
POSITION_OPTIONS = %i[max_distance].freeze
# Same keys that this module applies to a Results object (order / threshold).
RESULTS_OPTIONS = %i[order threshold].freeze

Constants included from StopWords

StopWords::ARTICLES, StopWords::PREPOSITIONS, StopWords::PRONOUNS

Instance Attribute Summary collapse

Instance Method Summary collapse

Instance Attribute Details

#fuzzy_options ⇒ Object

Returns the value of attribute fuzzy_options.



53
54
55
# File 'lib/eco/data/fuzzy_match.rb', line 53

# Reader for the options hash that #fuzzy_match stored the last time it
# (re)built the matcher.
#
# @return [Hash, nil] the stored options; nil when nothing has been stored yet.
def fuzzy_options
  instance_variable_get(:@fuzzy_options)
end

Instance Method Details

#find_all_with_score(needle, needle_str: nil, haystack: nil, **options) ⇒ Eco::Data::FuzzyMatch::Results

Note:
  • When the haystack elements are non-String objects, it excludes the needle itself from the results.

TODO: integrate options[:unique_words] => to ensure that repeated words do not bring the score down and cause results to be cut by the threshold.

Parameters:

  • needle (String, Object)

    an Object is allowed when fuzzy_options includes the read: key.

  • needle_str (String, nil) (defaults to: nil)

    the actual value of needle_str to be used.

  • haystack (Enumerable) (defaults to: nil)

    the items to find needle among.

Returns:



79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
# File 'lib/eco/data/fuzzy_match.rb', line 79

# Scores the haystack items against +needle+ and wraps the scores in a Results object.
#
# The matcher obtained from #fuzzy_match supplies each candidate together with
# its dice and levenshtein scores; jaro, ngram, words-ngram and position scores
# are added here through the corresponding helpers.
#
# @param needle [String, Object] what to look for; items equal (==) to it are skipped.
# @param needle_str [String, nil] string to use for the needle; when nil it is
#   obtained with item_string(needle) (after the matcher has been queried).
# @param haystack [Enumerable, nil] handed over to #fuzzy_match as is.
# @param options [Hash] handed over to #fuzzy_match as is.
# @return the value of Results#relevant_results, after applying the :order and
#   :threshold entries of #fuzzy_options when they are set.
def find_all_with_score(needle, needle_str: nil, haystack: nil, **options)
  matcher    = fuzzy_match(haystack, **options)
  candidates = matcher.find_all_with_score(needle_str || needle)
  needle_str = item_string(needle) unless needle_str

  scored = candidates.each_with_object([]) do |candidate, acc|
    item, dice, lev = candidate
    next if item == needle

    text = item_string(item)

    if text.to_s.strip.empty? || needle_str.to_s.strip.empty?
      # A blank side cannot be compared: every score (including the matcher's own) is forced to 0.
      dice = lev = jaro_score = ngram_score = words_score = position_score = 0
    else
      jaro_score     = jaro(needle_str, text)
      ngram_score    = ngram(needle_str, text)
      words_score    = words_ngram(needle_str, text)
      position_score = position(needle_str, text)
    end

    acc << Result.new(item, text, needle_str, dice, lev, jaro_score, ngram_score, words_score, position_score)
  end

  outcome   = Results.new(needle, needle_str, scored)
  order     = fuzzy_options[:order]
  threshold = fuzzy_options[:threshold]
  outcome.order     = order     if order
  outcome.threshold = threshold if threshold
  outcome.relevant_results
end

#fuzzy_match(haystack_data = nil, **options) ⇒ Object



59
60
61
62
63
64
65
66
67
68
69
70
# File 'lib/eco/data/fuzzy_match.rb', line 59

# Returns the ::FuzzyMatch matcher, building a new one when required.
#
# The memoized matcher is reused only when no +haystack_data+ is given, a
# matcher already exists, and the current fuzzy_match_options equal
# fuzzy_match_options(options). In any other case +options+ are stored in
# @fuzzy_options and a new matcher is built and memoized.
#
# NOTE(review): on the reuse path @fuzzy_options is left untouched, so the
# previously stored options stay in effect — confirm this is intended.
#
# @param haystack_data [Object, nil] passed to #haystack to obtain the items to search in.
# @param options [Hash] options to store; the subset given to ::FuzzyMatch.new comes from fuzzy_match_options.
# @return [::FuzzyMatch] the memoized or newly built matcher.
def fuzzy_match(haystack_data = nil, **options)
  reusable = !haystack_data && instance_variable_defined?(:@fuzzy_match)
  return @fuzzy_match if reusable && fuzzy_match_options == fuzzy_match_options(options)

  @fuzzy_options = options

  # make it run with a native C extension (for better performance: ~130 % increase of performance)
  require 'fuzzy_match'
  require 'amatch'
  ::FuzzyMatch.engine = :amatch

  source       = haystack(haystack_data)
  @fuzzy_match = ::FuzzyMatch.new(source, fuzzy_match_options)
end

#recalculate_results(results, needle_str: nil, **options) ⇒ Object



106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
# File 'lib/eco/data/fuzzy_match.rb', line 106

# Re-scores every result of +results+ using the strings returned by the block.
#
# For each result the block receives (needle_str, item_str, needle, item) and
# must return the pair [needle_string, item_string] to compare:
#   * a blank item string forces every score to 1;
#   * a blank needle string replaces the item string with the +needle_str+
#     argument and, when that is nil, forces every score to 0;
#   * otherwise dice/levenshtein come from ::FuzzyMatch.score_class and the
#     remaining scores from the jaro / ngram / words_ngram / position helpers.
# The rebuilt Result keeps the original result's :match, :value and :needle_str.
#
# @param results the results to recalculate (its #value and #needle are read).
# @param needle_str [String, nil] overrides results.value as the needle string given to the block.
# @param options [Hash] only :order and :threshold are read; they are applied to the new Results.
# @yield [needle_str, item_str, needle, item]
# @yieldreturn [Array] the pair [needle_string, item_string] to score.
# @raise [RuntimeError] when no block is given.
# @return the value of Results#relevant_results for the recalculated scores.
def recalculate_results(results, needle_str: nil, **options)
  raise "You should provide a block |needle_str, item_str, needle, item|" unless block_given?

  # Lazy-loaded once per call (previously this ran again for every single result).
  require 'fuzzy_match'
  require 'amatch'

  rescored = results.each_with_object([]) do |result, acc|
    nstr, istr = yield(needle_str || results.value, result.value, results.needle, result.match)

    if istr.to_s.strip.empty?
      dice = lev = jaro_res = ngram_res = wngram_res = pos_res = 1
    elsif nstr.to_s.strip.empty?
      # NOTE(review): the original condition was `unless istr = needle_str`, i.e. an
      # assignment, not a comparison. That behaviour is kept, just made explicit —
      # confirm whether `==` was actually intended.
      istr = needle_str
      dice = lev = jaro_res = ngram_res = wngram_res = pos_res = 0 unless istr
    end

    # The scorer is only needed when dice / lev were not forced above.
    res = ::FuzzyMatch.score_class.new(nstr, istr) unless dice && lev

    dice       ||= res&.dices_coefficient_similar || 0
    lev        ||= res&.levenshtein_similar       || 0
    jaro_res   ||= jaro(nstr, istr)
    ngram_res  ||= ngram(nstr, istr)
    wngram_res ||= words_ngram(nstr, istr)
    pos_res    ||= position(nstr, istr)

    acc << Result.new(*result.values_at(:match, :value, :needle_str), dice, lev, jaro_res, ngram_res, wngram_res, pos_res)
  end

  Results.new(results.needle, results.value, rescored).tap do |res|
    res.order     = options[:order]     if options[:order]
    res.threshold = options[:threshold] if options[:threshold]
  end.relevant_results
end