Module: Eco::Data::FuzzyMatch::InstanceMethods
- Includes:
- StopWords
- Defined in:
- lib/eco/data/fuzzy_match.rb
Constant Summary collapse
- FUZZY_MATCH_OPTIONS =
%i[ identities groupings stop_words read must_match_grouping must_match_at_least_one_word gather_last_result threshold ].freeze
- JARO_OPTIONS =
%i[ignore_case weight].freeze
- NGRAMS_OPTIONS =
%i[range].freeze
- POSITION_OPTIONS =
%i[max_distance].freeze
- RESULTS_OPTIONS =
%i[order threshold].freeze
Constants included from StopWords
StopWords::ARTICLES, StopWords::PREPOSITIONS, StopWords::PRONOUNS
Instance Attribute Summary collapse
Instance Method Summary collapse
- #find_all_with_score(needle, needle_str: nil, haystack: nil, **options) ⇒ Eco::Data::FuzzyMatch::Results
- #fuzzy_match(haystack_data = nil, **options) ⇒ Object
-
#recalculate_results(results, needle_str: nil, **options) ⇒ Object
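Recomputes the scores of an existing result set from the strings returned by the given block. (The generated summary for this method was only the stray linter directive `rubocop:disable Metrics/AbcSize`.)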
Instance Attribute Details
#fuzzy_options ⇒ Object
57 58 59 |
# File 'lib/eco/data/fuzzy_match.rb', line 57
#
# Reader for `@fuzzy_options`. When the instance variable is unset (or
# falsy) it is initialised to an empty Hash, so the same Hash is returned
# on subsequent reads.
#
# @return [Object] the stored value; an empty Hash when nothing was set yet.
def fuzzy_options
  @fuzzy_options ||= {}
end
Instance Method Details
#find_all_with_score(needle, needle_str: nil, haystack: nil, **options) ⇒ Eco::Data::FuzzyMatch::Results
TODO:
Integrate `options[:unique_words]` to ensure that repeated words do not bring the score down and cause the result to be cut by the threshold.
Note:
- When the `haystack` elements are non-`String` objects, it excludes the needle itself from the results.
84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 |
# File 'lib/eco/data/fuzzy_match.rb', line 84
#
# Asks the underlying matcher (see `fuzzy_match`) for every candidate with
# its dice/levenshtein scores, adds the scores produced by the `jaro`,
# `ngram`, `words_ngram` and `position` helpers, and wraps everything in a
# `Results` collection.
#
# @note When the `haystack` elements are non-`String` objects, the needle
#   itself is excluded from the results (any item `== needle` is skipped).
# @todo Integrate `options[:unique_words]` so that repeated words do not
#   bring the score down and get the result cut by the threshold.
# @param needle [Object] the object (or string) to look for.
# @param needle_str [String, nil] string form of `needle`; when omitted it is
#   obtained through `item_string(needle)`.
# @param haystack [Object, nil] forwarded as-is to `fuzzy_match`.
# @param options [Hash] forwarded to `fuzzy_match`; `:order` and `:threshold`
#   are additionally applied to the resulting `Results` when present.
# @return [Eco::Data::FuzzyMatch::Results] the value of `Results#relevant_results`.
def find_all_with_score(needle, needle_str: nil, haystack: nil, **options) # rubocop:disable Metrics/AbcSize
  base_match    = fuzzy_match(haystack, **options)
  # The matcher is queried with the explicit string when given, else the needle itself.
  match_results = base_match.find_all_with_score(needle_str || needle)
  needle_str  ||= item_string(needle)
  needle_blank  = needle_str.to_s.strip.empty?

  scored = match_results.each_with_object([]) do |(item, dice, lev), acc|
    next if item == needle

    item_str = item_string(item)
    # A blank string on either side cannot match anything: force every score to 0.
    if item_str.to_s.strip.empty? || needle_blank
      dice = lev = jaro_res = ngram_res = wngram_res = pos_res = 0
    end

    jaro_res   ||= jaro(needle_str, item_str)
    ngram_res  ||= ngram(needle_str, item_str)
    wngram_res ||= words_ngram(needle_str, item_str)
    pos_res    ||= position(needle_str, item_str)

    acc << Result.new(item, item_str, needle_str, dice, lev, jaro_res, ngram_res, wngram_res, pos_res)
  end

  Results.new(needle, needle_str, scored).tap do |res|
    res.order     = options[:order]     if options[:order]
    res.threshold = options[:threshold] if options[:threshold]
  end.relevant_results
end
#fuzzy_match(haystack_data = nil, **options) ⇒ Object
61 62 63 64 65 66 67 68 69 70 71 72 73 74 |
# Builds (and memoizes in `@fuzzy_match`) a `::FuzzyMatch` instance over
# `haystack(haystack_data)`, after selecting the `:amatch` engine.
# When `@fuzzy_match` already exists and no `haystack_data` is passed, the
# memoized instance is returned if an equality check passes.
# NOTE(review): this listing lost several identifiers during extraction: the
# two operands of that equality check (`if == ()`), the value assigned to
# `@fuzzy_options`, the name after `**` in the signature (the method heading
# shows `**options`) and the second argument of `::FuzzyMatch.new`.
# They cannot be recovered from this page; confirm against
# lib/eco/data/fuzzy_match.rb before relying on this text.
# File 'lib/eco/data/fuzzy_match.rb', line 61 def fuzzy_match(haystack_data = nil, **) if instance_variable_defined?(:@fuzzy_match) && !haystack_data return @fuzzy_match if == () # rubocop:disable Style/SoleNestedConditional end @fuzzy_options = # make it run with a native C extension (for better performance: ~130 % increase of performance) require 'fuzzy_match' require 'amatch' ::FuzzyMatch.engine = :amatch @fuzzy_match = ::FuzzyMatch.new(haystack(haystack_data), ) end |
#recalculate_results(results, needle_str: nil, **options) ⇒ Object
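Re-scores each entry of `results` using the needle/item strings returned by the given block; raises when no block is given. (The generated description was only the stray linter directive `rubocop:disable Metrics/AbcSize`.)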
112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 |
# File 'lib/eco/data/fuzzy_match.rb', line 112
#
# Re-scores every entry of an existing result set. For each entry the block
# is asked for the pair of strings to compare, and a new `Result` is built
# that keeps the entry's `match`, `value` and `needle_str` but carries the
# freshly computed scores.
#
# @param results [Eco::Data::FuzzyMatch::Results] the result set to re-score;
#   must be enumerable and respond to `needle` and `value`.
# @param needle_str [String, nil] needle string passed to the block; falls
#   back to `results.value` when omitted.
# @param options [Hash] `:order` and `:threshold` are applied to the new
#   `Results` when present.
# @yield [needle_str, item_str, needle, item] called once per result.
# @yieldreturn [Array] a two-element array: the needle string and the item
#   string that should be compared.
# @raise [RuntimeError] when no block is given.
# @return [Object] the value of `Results#relevant_results` on the new set.
def recalculate_results(results, needle_str: nil, **options) # rubocop:disable Metrics/AbcSize
  msg = "You should provide a block |needle_str, item_str, needle, item|"
  raise msg unless block_given?

  # Loaded once up front instead of on every iteration of the loop below.
  require 'fuzzy_match'
  require 'amatch'

  rescored = results.each_with_object([]) do |result, acc|
    nstr, istr = yield(
      needle_str || results.value,
      result.value,
      results.needle,
      result.match
    )

    if istr.to_s.strip.empty?
      # A blank item string gets every score forced to 1.
      dice = lev = jaro_res = ngram_res = wngram_res = pos_res = 1
    elsif nstr.to_s.strip.empty?
      # NOTE(review): the original assigns the `needle_str` argument to the
      # *item* string here; kept as-is, but confirm this is intended.
      istr = needle_str
      dice = lev = jaro_res = ngram_res = wngram_res = pos_res = 0 unless istr
    end

    # Only build the scorer when dice/levenshtein were not forced above.
    scorer = ::FuzzyMatch.score_class.new(nstr, istr) unless dice && lev

    dice       ||= scorer&.dices_coefficient_similar || 0
    lev        ||= scorer&.levenshtein_similar || 0
    jaro_res   ||= jaro(nstr, istr)
    ngram_res  ||= ngram(nstr, istr)
    wngram_res ||= words_ngram(nstr, istr)
    pos_res    ||= position(nstr, istr)

    acc << Result.new(
      *result.values_at(:match, :value, :needle_str),
      dice, lev, jaro_res, ngram_res, wngram_res, pos_res
    )
  end

  Results.new(results.needle, results.value, rescored).tap do |res|
    res.order     = options[:order]     if options[:order]
    res.threshold = options[:threshold] if options[:threshold]
  end.relevant_results
end