Class: Eco::API::Organization::People::Similarity

Inherits:
Eco::API::Organization::People show all
Includes:
Data::FuzzyMatch
Defined in:
lib/eco/api/organization/people/similarity.rb

Overview

Class to find out duplicates in the People Manager

Constant Summary

Constants inherited from Language::Models::Collection

Language::Models::Collection::BASIC_METHODS, Language::Models::Collection::EXTENDED_METHODS

Config collapse

Config collapse

Searchers collapse

Analysis starters collapse

Results Treatment collapse

Reporting Helpers collapse

Methods included from Data::FuzzyMatch::ClassMethods

#jaro_winkler

Methods included from Data::FuzzyMatch::NGramsScore

#ngrams_score, #words_ngrams_score

Methods included from Data::FuzzyMatch::CharsPositionScore

#chars_position_score

Methods included from Data::FuzzyMatch::Pairing

#paired_words

Methods included from Data::FuzzyMatch::StringHelpers

#get_words, #no_blanks, #normalize_string, #remove_matching_words, #string_combinations, #string_ngrams, #string_permutations, #word_ngrams

Methods included from Data::FuzzyMatch::ArrayHelpers

#combinations, #facet, #ngrams, #permutations

Methods inherited from Eco::API::Organization::People

#[], #contacts, #email_id_maps, #exclude, #exclude!, #exclude_people, #external_id, #filter_tags_all, #filter_tags_any, #find, #group_by_schema, #group_by_supervisor, #id, #initialize, #merge, #missing_supervisors_ids, #non_users, #person, #policy_group_ids_all, #policy_group_ids_any, #similarity, #supervisors, #to_h, #to_json, #uniq, #updated_or_created, #users

Methods inherited from Language::Models::Collection

#<, #<<, #attr, #attr?, attr_collection, attr_presence, #attrs, attrs_create_method, #contains, #delete!, #each, #empty, #empty?, #exclude, #group_by, #initialize, #length, #merge, #new, #present, #present_all?, #present_some?, #remove, #to_c, #to_h, #unique_attrs, #update

Constructor Details

This class inherits a constructor from Eco::API::Organization::People

Instance Attribute Details

#attributeObject



15
16
17
# File 'lib/eco/api/organization/people/similarity.rb', line 15

# Attribute used for the analysis.
# Falls back to `:name` (and stores it in `@attribute`) when nothing truthy has been assigned.
# @return [Object] the assigned attribute, or `:name` by default.
def attribute
  return @attribute if @attribute

  @attribute = :name
end

#orderObject



35
36
37
# File 'lib/eco/api/organization/people/similarity.rb', line 35

# Ordering criteria used for the analysis.
# Falls back to `[:words_ngrams, :dice]` (stored in `@order`) when nothing truthy has been assigned.
# @return [Object] the assigned order, or the default array of symbols.
def order
  return @order if @order

  @order = %i[words_ngrams dice]
end

#thresholdObject



43
44
45
# File 'lib/eco/api/organization/people/similarity.rb', line 43

# Threshold used for the analysis.
# Falls back to `0.15` (stored in `@threshold`) when nothing truthy has been assigned.
# @return [Object] the assigned threshold, or `0.15` by default.
def threshold
  return @threshold if @threshold

  @threshold = 0.15
end

Instance Method Details

#analyse(needle_read: nil, keep_empty: false, **options) ⇒ Hash

Analyses People based on options

Parameters:

  • needle_read (Proc, Symbol) (defaults to: nil)

    used when the value to read from the needle object differs from :read (attribute). This allows, for example, faceting needle.name (needle_read) against haystack_item.details[alt_id] (read).

  • keep_empty (Boolean) (defaults to: false)

    whether to keep people with no results (based on threshold); when false, those people are removed from the returned Hash

Returns:

  • (Hash)

    where the keys are the people ids and the values the Eco::Data::FuzzyMatch::Results



123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
# File 'lib/eco/api/organization/people/similarity.rb', line 123

# Runs `find_all_with_score` for every person of this collection.
# @param needle_read [Proc, Symbol, nil] when given, the needle string is obtained via
#   `item_string(person, needle_read)`; otherwise `nil` is passed as `needle_str`.
# @param keep_empty [Boolean] when `false`, the outcome is passed through `clean_empty`.
# @param options [Hash] forwarded to `find_all_with_score`; `:read` defaults to `attribute`.
# @return [Hash] the outcome of `find_all_with_score`, keyed by `person.id`.
def analyse(needle_read: nil, keep_empty: false, **options)
  opts     = { read: attribute }.merge(options)
  total    = count
  analysed = {}

  each_with_index do |person, idx|
    needle_str = item_string(person, needle_read) if needle_read
    analysed[person.id] = find_all_with_score(person, needle_str: needle_str, **opts)

    # progress is reported 1-based, once the person has been analysed
    print_progress('Analysed', total, idx + 1)
  end

  keep_empty ? analysed : clean_empty(analysed)
end

#attribute_presentEco::API::Organization::People::Similarity

It returns all the entries with attribute not empty



102
103
104
105
106
107
108
# File 'lib/eco/api/organization/people/similarity.rb', line 102

# Keeps the people whose `item_value`, once stringified and stripped,
# has at least 2 characters, and wraps them with `newFrom`.
# @return [Object] whatever `newFrom` returns for the kept people.
def attribute_present
  kept = reject { |person| item_value(person).to_s.strip.length < 2 }
  newFrom(kept)
end

#blank_attributeEco::API::Organization::People::Similarity

It returns all the entries with attribute empty



92
93
94
95
96
97
98
# File 'lib/eco/api/organization/people/similarity.rb', line 92

# Keeps the people whose `item_value`, once stringified and stripped,
# has fewer than 2 characters, and wraps them with `newFrom`.
# @return [Object] whatever `newFrom` returns for the kept people.
def blank_attribute
  blanks = select { |person| item_value(person).to_s.strip.length < 2 }
  newFrom(blanks)
end

#clean_empty(analysed) ⇒ Object

Removes from results those that do not have similar entries



166
167
168
169
170
# File 'lib/eco/api/organization/people/similarity.rb', line 166

# Drops the entries whose results answer `true` to `empty?`.
# The received Hash is not modified.
# @param analysed [Hash] results keyed by id.
# @return [Hash] a new Hash holding only the entries with non-empty results.
def clean_empty(analysed)
  analysed.reject { |_key, found| found.empty? }
end

#ignore_matching_words(analysed, **options) ⇒ Object

Reanalyses by ignoring matching words between the needle and those found in results



208
209
210
211
212
213
214
# File 'lib/eco/api/organization/people/similarity.rb', line 208

# Delegates to `reanalyse`, giving it a block that passes the needle and item
# strings through the class-level `remove_matching_words`.
# @param analysed [Hash] forwarded to `reanalyse`.
# @param options [Hash] forwarded to `reanalyse`.
# @return [Object] whatever `reanalyse` returns.
def ignore_matching_words(analysed, **options)
  reanalyse(
    analysed,
    msg: 'Reanalysing by ignoring matching words',
    **options
  ) do |needle_text, item_text, _needle, _item|
    self.class.remove_matching_words(needle_text, item_text)
  end
end

#ignore_matching_words_old(analysed, **options) ⇒ Object

Reanalyses by ignoring matching words between the needle and those found in results



217
218
219
220
221
222
223
224
225
226
227
# File 'lib/eco/api/organization/people/similarity.rb', line 217

# Walks `analysed` through `with_analysed`, replacing each entry's results by
# the outcome of `ignore_same_words_score`.
# @param analysed [Hash] results keyed by id.
# @param options [Hash] forwarded to `ignore_same_words_score`; `:read` defaults to `attribute`.
# @return [Hash] whatever `with_analysed` returns.
def ignore_matching_words_old(analysed, **options)
  opts  = { read: attribute }.merge(options)
  total = analysed.count
  step  = 0

  with_analysed(analysed) do |_person, results|
    step += 1
    # progress is reported before the entry is recalculated
    print_progress('Reanalysing by ignoring matching words', total, step)
    ignore_same_words_score(results, **opts)
  end
end

#item_value(item) ⇒ Object

Returns the target value to analyse

Parameters:

  • item (Ecoportal::API::V1::Person)


21
22
23
24
25
26
# File 'lib/eco/api/organization/people/similarity.rb', line 21

# Reads the value to analyse from `item`, as dictated by `attribute`.
# @param item [Object] the object to read from.
# @return [Object, nil] the result of calling `attribute` with `item` when it is a Proc;
#   otherwise the result of sending `attribute` (as a Symbol) to `item`,
#   or `nil` when `item` does not respond to it.
def item_value(item)
  source = attribute
  return source.call(item) if source.is_a?(Proc)

  reader = source.to_sym
  return unless item.respond_to?(reader)

  item.send(reader)
end

#namedEco::API::Organization::People::Similarity

It returns all people with a name (at least 2 characters once stripped)



82
83
84
85
86
87
88
# File 'lib/eco/api/organization/people/similarity.rb', line 82

# Keeps the people whose `name`, once stringified and stripped,
# has at least 2 characters, and wraps them with `newFrom`.
# @return [Object] whatever `newFrom` returns for the kept people.
def named
  with_name = reject { |person| person.name.to_s.strip.length < 2 }
  newFrom(with_name)
end

#newFrom(data) ⇒ Eco::API::Organization::People::Similarity

Generates a new object with same config but different base data.



49
50
51
52
53
54
55
# File 'lib/eco/api/organization/people/similarity.rb', line 49

# Builds a new object through the parent implementation and copies this
# object's `threshold`, `order` and `attribute` onto it.
# @param data [Object] passed untouched to the parent implementation.
# @return [Object] the object built by the parent implementation, once configured.
def newFrom(data) # rubocop:disable Naming/MethodName
  fresh = super
  fresh.threshold = threshold
  fresh.order     = order
  fresh.attribute = attribute
  fresh
end

#newSimilarity(analysed) ⇒ Eco::API::Organization::People::Similarity

Gets a new instance object of this class, with only people in results

Parameters:

  • analysed (Hash)

    where the keys are the people ids and values the Eco::Data::FuzzyMatch::Results

Returns:



148
149
150
# File 'lib/eco/api/organization/people/similarity.rb', line 148

# Builds a new object (via `newFrom`) out of the people returned by `people_in_results`.
# @param analysed [Hash] forwarded to `people_in_results`.
# @return [Object] whatever `newFrom` returns.
def newSimilarity(analysed) # rubocop:disable Naming/MethodName
  involved = people_in_results(analysed)
  newFrom(involved)
end

#people_in_results(analysed) ⇒ Object



152
153
154
155
156
157
158
159
160
161
162
163
# File 'lib/eco/api/organization/people/similarity.rb', line 152

# Collects, without repetition and in order of appearance, the person behind
# each id of `analysed` (looked up with `self[id]`) followed by the `match`
# of each of its results.
# @param analysed [Hash] results keyed by id.
# @return [Array] the collected people.
def people_in_results(analysed)
  collected = []

  analysed.each do |id, results|
    # the needle person goes first, then every match found for it
    related = results.each_with_object([self[id]]) { |result, acc| acc << result.match }
    related.each { |person| collected << person unless collected.include?(person) }
  end

  collected
end
#print_analysis(**options) ⇒ Hash

Note:
  1. Unless :analysed is provided, it launches an analysis cutting with Jaro-Winkler min 0.5
  2. It then re-sorts and cuts based on options

Returns a Hash where the keys are the people ids and the values the Eco::Data::FuzzyMatch::Results.

Returns:

  • (Hash)

    where the keys are the people ids and the values the Eco::Data::FuzzyMatch::Results



249
250
251
252
253
254
255
256
# File 'lib/eco/api/organization/people/similarity.rb', line 249

# Prints the report of an analysis and returns the analysed results.
#
# Previously the full report was printed once per analysed entry and an
# always-empty Hash was returned; the report is now printed a single time
# (nothing is printed when there are no entries) and the results are returned.
# @param options [Hash] `:analysed` may carry already computed results; when it is
#   missing, the results come from `results_with_false_positives.analyse(**options)`.
# @return [Hash] the analysed results that were reported.
def print_analysis(**options)
  analysed = options[:analysed] || results_with_false_positives.analyse(**options)

  puts report(analysed) unless analysed.empty?
  analysed
end

#reanalyse(analysed, msg: 'Reanalysing', **options, &block) ⇒ Object

Reanalyses by using a block to treat the needle and item values



195
196
197
198
199
200
201
202
203
204
205
# File 'lib/eco/api/organization/people/similarity.rb', line 195

# Walks `analysed` through `with_analysed`, replacing each entry's results by
# the outcome of `recalculate_results(results, &block)`.
# @param analysed [Hash] results keyed by id.
# @param msg [String] label handed to `print_progress`.
# @param options [Hash] accepted for interface compatibility; not used by this method
#   (the former `{ read: attribute }` merge was a dead assignment and has been removed).
# @yield forwarded untouched to `recalculate_results`.
# @return [Hash] whatever `with_analysed` returns.
def reanalyse(analysed, msg: 'Reanalysing', **options, &block)
  total = analysed.count
  done  = 0

  with_analysed(analysed) do |_person, results|
    done += 1
    # progress is reported before the entry is recalculated
    print_progress(msg, total, done)
    recalculate_results(results, &block)
  end
end

#rearrange(analysed, **options) ⇒ Object

Launches a reanalysis on analysed based on options

Parameters:

  • analysed (Hash)

    where the keys are the people ids and the values the Eco::Data::FuzzyMatch::Results



188
189
190
191
192
# File 'lib/eco/api/organization/people/similarity.rb', line 188

# Walks `analysed` through `with_analysed`, replacing each entry's results by
# the outcome of their `relevant_results(**options)`.
# @param analysed [Hash] results keyed by id.
# @param options [Hash] forwarded to `relevant_results`.
# @return [Hash] whatever `with_analysed` returns.
def rearrange(analysed, **options)
  with_analysed(analysed) { |_person, found| found.relevant_results(**options) }
end

#repeated_emailsHash

It gathers those that have the same email

Returns:

  • (Hash)

    where keys are emails and values an Array<Person>



63
64
65
66
67
68
# File 'lib/eco/api/organization/people/similarity.rb', line 63

# Gathers the entries of `@by_email` that hold more than one person.
#
# The block's first parameter was missing (`|, people|`), which is a syntax
# error; it is now named `_email`.
# @return [Hash] the entries of `@by_email` whose value counts more than 1 element.
def repeated_emails
  init_caches # called before `@by_email` is read
  @by_email.select { |_email, people| people.count > 1 }
end

#report(analysed, format: :txt) ⇒ String

Returns well structured text.

Returns:

  • (String)

    well structured text



234
235
236
237
238
239
240
241
242
243
# File 'lib/eco/api/organization/people/similarity.rb', line 234

# Builds a plain-text report of the analysis: one header line per entry
# (`self[id].identify`) followed by the `print` of each of its results,
# indented by two spaces.
#
# The text is assembled with `map`/`join` instead of appending to a `''`
# literal, so it also works when string literals are frozen.
# @param analysed [Hash] results keyed by id; each value must answer to `results`.
# @param format [Symbol] only `:txt` is handled.
# @return [String, nil] the report, or `nil` for any format other than `:txt`.
def report(analysed, format: :txt)
  case format
  when :txt
    analysed.map do |id, results|
      header = self[id].identify
      lines  = results.results.map(&:print).join("\n  ")
      "#{header}:\n  #{lines}\n"
    end.join
  end
end

#unnamedEco::API::Organization::People::Similarity

It returns all people with no name



72
73
74
75
76
77
78
# File 'lib/eco/api/organization/people/similarity.rb', line 72

# Keeps the people whose `name`, once stringified and stripped,
# has fewer than 2 characters, and wraps them with `newFrom`.
# @return [Object] whatever `newFrom` returns for the kept people.
def unnamed
  nameless = select { |person| person.name.to_s.strip.length < 2 }
  newFrom(nameless)
end

#with_analysed(analysed, keep_empty: false) ⇒ Hash

Helper to do some treatment of the results

Parameters:

  • analysed (Hash)

    where the keys are the people ids and values the Eco::Data::FuzzyMatch::Results

Returns:

  • (Hash)

    where the keys are the people ids and values the Eco::Data::FuzzyMatch::Results



175
176
177
178
179
180
181
182
183
184
# File 'lib/eco/api/organization/people/similarity.rb', line 175

# Rebuilds `analysed`, replacing each entry's results by what the given block returns.
# It prints the number of returned records to standard output.
# @param analysed [Hash] results keyed by id.
# @param keep_empty [Boolean] when `false`, the outcome is passed through `clean_empty`.
# @yield [person, results] the person looked up with `self[id]` and its current results.
# @yieldreturn [Object] the new results for that id.
# @return [Hash] the block's outcome keyed by id.
def with_analysed(analysed, keep_empty: false)
  reanalysed = {}

  analysed.each do |id, results|
    reanalysed[id] = yield(self[id], results)
  end

  reanalysed = clean_empty(reanalysed) unless keep_empty
  puts "with_analysed... returns #{reanalysed.count} records"
  reanalysed
end