Module: Eco::Data::FuzzyMatch::InstanceMethods

Includes:
StopWords
Defined in:
lib/eco/data/fuzzy_match.rb

Constant Summary collapse

FUZZY_MATCH_OPTIONS =
[
  :identities, :groupings, :stop_words, :read,
  :must_match_grouping, :must_match_at_least_one_word,
  :gather_last_result, :threshold
]
JARO_OPTIONS =
[:ignore_case, :weight]
NGRAMS_OPTIONS =
[:range]
POSITION_OPTIONS =
[:max_distance]
RESULTS_OPTIONS =
[:order, :threshold]

Constants included from StopWords

StopWords::ARTICLES, StopWords::PREPOSITIONS, StopWords::PRONOUNS

Instance Attribute Summary collapse

Instance Method Summary collapse

Instance Attribute Details

#fuzzy_options ⇒ Object

Returns the value of attribute fuzzy_options.



55
56
57
# File 'lib/eco/data/fuzzy_match.rb', line 55

# Reader for the options hash stored by the last #fuzzy_match call that
# (re)built the matcher.
#
# @return [Object, nil] the current value of `@fuzzy_options`; `nil` while unset.
def fuzzy_options
  instance_variable_get(:@fuzzy_options)
end

Instance Method Details

#find_all_with_score(needle, needle_str: nil, haystack: nil, **options) ⇒ Eco::Data::FuzzyMatch::Results

Note:
  • When the haystack elements are non-String objects, the needle itself is excluded from the results.

TODO: integrate options[:unique_words] => to ensure that repeated words do not bring down the score and get cut off by the threshold

Parameters:

  • needle (String, Object)

    An object is allowed when fuzzy_options includes the read: key.

  • needle_str (String, nil) (defaults to: nil)

    the actual value of needle_str to be used.

  • haystack (Enumerable) (defaults to: nil)

    the items to find needle among.

Returns:

  • (Eco::Data::FuzzyMatch::Results)



78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
# File 'lib/eco/data/fuzzy_match.rb', line 78

# Scores every candidate returned by the underlying matcher against `needle`.
#
# Candidates that are `==` to `needle` are skipped. When either the candidate
# string or the needle string is blank, all six scores of that candidate are 0
# (the dice/levenshtein values reported by the matcher are discarded as well).
#
# @param needle [String, Object] what to look for.
# @param needle_str [String, nil] string used for matching and scoring;
#   derived with `item_string(needle)` when not given.
# @param haystack [Enumerable, nil] forwarded to `#fuzzy_match`.
# @param options [Hash] forwarded to `#fuzzy_match`.
# @return the `relevant_results` of a `Results` built from the scored
#   candidates, after applying `fuzzy_options[:order]` / `[:threshold]` if set.
def find_all_with_score(needle, needle_str: nil, haystack: nil, **options)
  matcher      = fuzzy_match(haystack, **options)
  # The matcher is queried before needle_str gets its fallback value.
  raw_matches  = matcher.find_all_with_score(needle_str || needle)
  needle_str ||= item_string(needle)

  scored = raw_matches.each_with_object([]) do |(item, dice, lev), acc|
    next if item == needle

    item_str = item_string(item)

    if item_str.to_s.strip.empty? || needle_str.to_s.strip.empty?
      # Nothing to compare: force every score to zero.
      dice = lev = jaro_res = ngram_res = wngram_res = pos_res = 0
    else
      jaro_res   = jaro(needle_str, item_str)
      ngram_res  = ngram(needle_str, item_str)
      wngram_res = words_ngram(needle_str, item_str)
      pos_res    = position(needle_str, item_str)
    end

    acc << Result.new(item, item_str, needle_str, dice, lev, jaro_res, ngram_res, wngram_res, pos_res)
  end

  outcome           = Results.new(needle, needle_str, scored)
  outcome.order     = fuzzy_options[:order]     if fuzzy_options[:order]
  outcome.threshold = fuzzy_options[:threshold] if fuzzy_options[:threshold]
  outcome.relevant_results
end

#fuzzy_match(haystack_data = nil, **options) ⇒ Object



61
62
63
64
65
66
67
68
69
# File 'lib/eco/data/fuzzy_match.rb', line 61

# Returns the `::FuzzyMatch` instance to search with, building it when needed.
#
# The memoized instance is reused only when no `haystack_data` is passed and
# `fuzzy_match_options` (current) equals `fuzzy_match_options(options)`.
# On such a cache hit `@fuzzy_options` is left as it was; it is only replaced
# when a new matcher is built.
#
# @param haystack_data [Object, nil] passed through `haystack(...)` to the new matcher.
# @param options [Hash] stored as `@fuzzy_options` when the matcher is (re)built.
# @return [::FuzzyMatch] the memoized or freshly built matcher.
def fuzzy_match(haystack_data = nil, **options)
  reusable = instance_variable_defined?(:@fuzzy_match) && !haystack_data
  return @fuzzy_match if reusable && fuzzy_match_options == fuzzy_match_options(options)

  @fuzzy_options = options
  # make it run with a native C extension (for better performance: ~130 % increase of performance)
  ::FuzzyMatch.engine = :amatch
  @fuzzy_match = ::FuzzyMatch.new(haystack(haystack_data), fuzzy_match_options)
end

#recalculate_results(results, needle_str: nil, **options) ⇒ Object



105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
# File 'lib/eco/data/fuzzy_match.rb', line 105

# Re-scores an existing set of results using strings supplied by the block.
#
# For each result the block is called with
# `(needle_str || results.value, result.value, results.needle, result.match)`
# and must return the pair `[needle_string, item_string]` to score with.
#
# * blank item string   => every score is 1
# * blank needle string => the item string is replaced by `needle_str`; when
#   that is nil/false every score is 0, otherwise scoring proceeds normally.
#
# The rebuilt `Result` keeps the original `:match`, `:value` and `:needle_str`.
#
# @param results [#each_with_object, #needle, #value] results to re-score.
# @param needle_str [String, nil] overrides `results.value` as the needle string.
# @param options [Hash] `:order` and `:threshold` are applied to the new results.
# @raise [RuntimeError] when no block is given.
# @return the `relevant_results` of the rebuilt `Results`.
def recalculate_results(results, needle_str: nil, **options)
  raise "You should provide a block |needle_str, item_str, needle, item|" unless block_given?

  rescored = results.each_with_object([]) do |result, acc|
    nstr, istr = yield(needle_str || results.value, result.value, results.needle, result.match)

    forced_score =
      if istr.to_s.strip.empty?
        1
      elsif nstr.to_s.strip.empty?
        # NOTE(review): the original condition was `unless istr = needle_str`
        # (assignment, not comparison). Kept as-is: istr is overwritten here.
        # Confirm whether `==` or `nstr = needle_str` was intended.
        istr = needle_str
        istr ? nil : 0
      end

    if forced_score
      dice = lev = jaro_res = ngram_res = wngram_res = pos_res = forced_score
    else
      scorer     = ::FuzzyMatch.score_class.new(nstr, istr)
      dice       = scorer&.dices_coefficient_similar || 0
      lev        = scorer&.levenshtein_similar       || 0
      jaro_res   = jaro(nstr, istr)
      ngram_res  = ngram(nstr, istr)
      wngram_res = words_ngram(nstr, istr)
      pos_res    = position(nstr, istr)
    end

    kept = result.values_at(:match, :value, :needle_str)
    acc << Result.new(*kept, dice, lev, jaro_res, ngram_res, wngram_res, pos_res)
  end

  outcome           = Results.new(results.needle, results.value, rescored)
  outcome.order     = options[:order]     if options[:order]
  outcome.threshold = options[:threshold] if options[:threshold]
  outcome.relevant_results
end