Module: Eco::Data::FuzzyMatch::InstanceMethods

Includes:
StopWords
Defined in:
lib/eco/data/fuzzy_match.rb

Constant Summary collapse

# Option keys, grouped by name after the component they appear to configure.
# NOTE(review): the helpers that consume these lists are not visible here;
# presumably each list is used to pick its subset out of a combined options
# hash — confirm against the rest of the module.

# Keys related to the underlying fuzzy matcher.
FUZZY_MATCH_OPTIONS = %i[
  identities
  groupings
  stop_words
  read
  must_match_grouping
  must_match_at_least_one_word
  gather_last_result
  threshold
].freeze

# Keys related to the jaro score.
JARO_OPTIONS = %i[ignore_case weight].freeze

# Keys related to the ngram scores.
NGRAMS_OPTIONS = %i[range].freeze

# Keys related to the position score.
POSITION_OPTIONS = %i[max_distance].freeze

# Keys related to the results object (note that :threshold is also listed
# in FUZZY_MATCH_OPTIONS).
RESULTS_OPTIONS = %i[order threshold].freeze

Constants included from StopWords

StopWords::ARTICLES, StopWords::PREPOSITIONS, StopWords::PRONOUNS

Instance Attribute Summary collapse

Instance Method Summary collapse

Instance Attribute Details

#fuzzy_options ⇒ Object



57
58
59
# File 'lib/eco/data/fuzzy_match.rb', line 57

# Options hash kept in `@fuzzy_options`.
# Starts out as an empty Hash the first time it is read while unset.
# @return [Hash] the stored options (the same object on every call).
def fuzzy_options
  @fuzzy_options = {} unless @fuzzy_options
  @fuzzy_options
end

Instance Method Details

#find_all_with_score(needle, needle_str: nil, haystack: nil, **options) ⇒ Eco::Data::FuzzyMatch::Results

TODO:

Integrate options[:unique_words], to ensure that repeated words do not bring the score down and cause results to be cut by the threshold.

Note:
  • When the haystack elements are non String objects, it excludes the needle itself from the results

Parameters:

  • needle (String, Object)

    An object is allowed when fuzzy_options includes the read: key.

  • needle_str (String, nil) (defaults to: nil)

    the actual value of needle_str to be used.

  • haystack (Enumerable) (defaults to: nil)

    the items to find needle among.

Returns:



84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
# File 'lib/eco/data/fuzzy_match.rb', line 84

# Scores the candidates returned by the underlying matcher against `needle`.
#
# Each candidate arrives from `fuzzy_match(...).find_all_with_score` together
# with two scores (named dice and lev here); jaro, ngram, words-ngram and
# position scores are then computed for it and everything is wrapped in a
# `Result`.
# @note candidates that are `==` to `needle` are left out.
# @param needle [String, Object] what to look for.
# @param needle_str [String, nil] string to use for `needle`; when omitted,
#   `needle` itself is handed to the matcher and `item_string(needle)` is used
#   for the remaining scores.
# @param haystack [Enumerable, nil] forwarded to `fuzzy_match`.
# @param options [Hash] forwarded to `fuzzy_match`.
# @return the `relevant_results` of the `Results` built from the scored candidates.
def find_all_with_score(needle, needle_str: nil, haystack: nil, **options) # rubocop:disable Metrics/AbcSize
  matcher      = fuzzy_match(haystack, **options)
  candidates   = matcher.find_all_with_score(needle_str || needle)
  needle_str ||= item_string(needle)

  scored = candidates.each_with_object([]) do |(item, dice, lev), acc|
    next if item == needle

    item_str = item_string(item)
    blank    = [item_str, needle_str].any? { |str| str.to_s.strip.empty? }

    scores =
      if blank
        # Nothing to compare against: every score (dice and lev included) is 0.
        [0] * 6
      else
        [
          dice,
          lev,
          jaro(needle_str, item_str),
          ngram(needle_str, item_str),
          words_ngram(needle_str, item_str),
          position(needle_str, item_str)
        ]
      end

    acc << Result.new(item, item_str, needle_str, *scores)
  end

  outcome = Results.new(needle, needle_str, scored)

  # Only override order / threshold when a value is present in fuzzy_options.
  order = fuzzy_options[:order]
  outcome.order = order if order
  threshold = fuzzy_options[:threshold]
  outcome.threshold = threshold if threshold

  outcome.relevant_results
end

#fuzzy_match(haystack_data = nil, **options) ⇒ Object



61
62
63
64
65
66
67
68
69
70
71
72
73
74
# File 'lib/eco/data/fuzzy_match.rb', line 61

# Returns the `::FuzzyMatch` instance to search with, building it when needed.
#
# The instance stored in `@fuzzy_match` is reused only when one already exists,
# no `haystack_data` is given, and the current `fuzzy_match_options` equal the
# ones derived from `options`. Otherwise `options` replace `@fuzzy_options`
# and a new matcher is built and stored.
# @param haystack_data [Object, nil] passed through `haystack(...)` to build the matcher.
# @param options [Hash] stored as `@fuzzy_options` when a new matcher is built.
# @return [::FuzzyMatch] the reused or newly built matcher.
def fuzzy_match(haystack_data = nil, **options)
  reusable = instance_variable_defined?(:@fuzzy_match) &&
             !haystack_data &&
             fuzzy_match_options == fuzzy_match_options(options)
  return @fuzzy_match if reusable

  @fuzzy_options = options

  # Loaded on demand. The amatch engine is a native C extension
  # (for better performance: ~130 % increase of performance).
  require 'fuzzy_match'
  require 'amatch'

  ::FuzzyMatch.engine = :amatch

  source = haystack(haystack_data)
  @fuzzy_match = ::FuzzyMatch.new(source, fuzzy_match_options)
end

#recalculate_results(results, needle_str: nil, **options) ⇒ Object

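Recomputes the scores of every result in results, using the needle and item strings returned by the given block (which receives needle_str, item_str, needle, item). Raises an error when no block is given.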



112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
# File 'lib/eco/data/fuzzy_match.rb', line 112

# Re-scores every result of `results`, using the strings returned by the block.
#
# For each result the block is called with
# `(needle_str || results.value, result.value, results.needle, result.match)`
# and must return the pair `[needle_string, item_string]` to compare.
#   - When the returned item string is blank, all the scores are set to 1.
#   - When the returned needle string is blank and no `needle_str:` was given,
#     all the scores are set to 0.
#   - Otherwise dice / levenshtein come from `::FuzzyMatch.score_class`, and
#     jaro, ngram, words-ngram and position from the helpers of the same name.
# Each new `Result` keeps the original `match`, `value` and `needle_str`.
# @param results [Results] the results to re-score.
# @param needle_str [String, nil] used in place of `results.value` as the
#   needle string handed to the block.
# @param options [Hash] `:order` and `:threshold`, when present, are set on
#   the new `Results`.
# @yield [needle_str, item_str, needle, item]
# @yieldreturn [Array] `[needle_string, item_string]`.
# @raise [RuntimeError] when no block is given.
# @return the `relevant_results` of the new `Results`.
def recalculate_results(results, needle_str: nil, **options) # rubocop:disable Metrics/AbcSize
  msg = "You should provide a block |needle_str, item_str, needle, item|"
  raise msg unless block_given?

  # Loaded on demand, but once per call rather than once per result.
  require 'fuzzy_match'
  require 'amatch'

  rescored = results.each_with_object([]) do |result, acc|
    nstr, istr = yield(
      needle_str || results.value,
      result.value,
      results.needle,
      result.match
    )

    if istr.to_s.strip.empty?
      dice = lev = jaro_res = ngram_res = wngram_res = pos_res = 1
    elsif nstr.to_s.strip.empty?
      # NOTE(review): this overwrites the item string with the `needle_str:`
      # argument (possibly `nstr` was meant) — confirm intent; kept as is.
      unless (istr = needle_str)
        dice = lev = jaro_res = ngram_res = wngram_res = pos_res = 0
      end
    end

    # Only build the scorer when dice / lev were not forced above.
    res = ::FuzzyMatch.score_class.new(nstr, istr) unless dice && lev

    dice       ||= res&.dices_coefficient_similar || 0
    lev        ||= res&.levenshtein_similar       || 0
    jaro_res   ||= jaro(nstr, istr)
    ngram_res  ||= ngram(nstr, istr)
    wngram_res ||= words_ngram(nstr, istr)
    pos_res    ||= position(nstr, istr)

    acc << Result.new(
      *result.values_at(:match, :value, :needle_str),
      dice, lev, jaro_res, ngram_res, wngram_res, pos_res
    )
  end

  Results.new(results.needle, results.value, rescored).tap do |res|
    res.order     = options[:order]     if options[:order]
    res.threshold = options[:threshold] if options[:threshold]
  end.relevant_results
end