Class: Eco::API::Organization::PeopleSimilarity

Inherits:
People show all
Includes:
Data::FuzzyMatch
Defined in:
lib/eco/api/organization/people_similarity.rb

Overview

Class to find out duplicates in the People Manager

Constant Summary

Constants inherited from Language::Models::Collection

Language::Models::Collection::BASIC_METHODS, Language::Models::Collection::EXTENDED_METHODS

Config collapse

Config collapse

Searchers collapse

Analysis starters collapse

Results Treatment collapse

Reporting Helpers collapse

Methods included from Data::FuzzyMatch::ClassMethods

#jaro_winkler

Methods included from Data::FuzzyMatch::NGramsScore

#ngrams_score, #words_ngrams_score

Methods included from Data::FuzzyMatch::CharsPositionScore

#chars_position_score

Methods included from Data::FuzzyMatch::Pairing

#paired_words

Methods included from Data::FuzzyMatch::StringHelpers

#get_words, #no_blanks, #normalize_string, #remove_matching_words, #string_combinations, #string_ngrams, #string_permutations, #word_ngrams

Methods included from Data::FuzzyMatch::ArrayHelpers

#combinations, #facet, #ngrams, #permutations

Methods inherited from People

#[], #contacts, #email_id_maps, #exclude, #exclude!, #exclude_people, #external_id, #filter_tags_all, #filter_tags_any, #find, #group_by_schema, #group_by_supervisor, #id, #initialize, #merge, #missing_supervisors_ids, #non_users, #person, #policy_group_ids_all, #policy_group_ids_any, #similarity, #supervisors, #to_h, #to_json, #uniq, #updated_or_created, #users

Methods inherited from Language::Models::Collection

#<, #<<, #attr, #attr?, attr_collection, attr_presence, #attrs, attrs_create_method, #contains, #delete!, #each, #empty, #empty?, #exclude, #group_by, #initialize, #length, #merge, #new, #present, #present_all?, #present_some?, #remove, #to_c, #to_h, #unique_attrs, #update

Constructor Details

This class inherits a constructor from Eco::API::Organization::People

Instance Attribute Details

#attribute ⇒ Object

Returns the value of attribute attribute.



11
12
13
# File 'lib/eco/api/organization/people_similarity.rb', line 11

# Reader for the @attribute instance variable.
#
# @return [Object] the current value of @attribute (nil when it has not been set)
def attribute
  @attribute
end

Instance Method Details

#analyse(needle_read: nil, keep_empty: false, **options) ⇒ Hash

Analyses People based on options

Parameters:

  • needle_read (Proc, Symbol) (defaults to: nil)

    used when the value to read from the needle object differs from :read (the attribute). This allows, for example, faceting needle.name (needle_read) against haystack_item.details[alt_id] (read).

  • keep_empty (Boolean) (defaults to: false)

    to indicate whether people with no results (based on threshold) should be kept; when false they are removed from the output

Returns:

  • (Hash)

    where the keys are the people ids and the values the Eco::Data::FuzzyMatch::Results



124
125
126
127
128
129
130
131
132
133
134
135
136
137
# File 'lib/eco/api/organization/people_similarity.rb', line 124

# Runs find_all_with_score for every person of this collection and gathers
# the outcome per person id, reporting progress along the way.
#
# @param needle_read [Proc, Symbol, nil] when given, the needle string is
#   obtained through item_string(person, needle_read); otherwise nil is passed
#   as needle_str.
# @param keep_empty [Boolean] when false the outcome goes through clean_empty
#   before being returned.
# @param options [Hash] forwarded to find_all_with_score; :read defaults to
#   the configured attribute.
# @return [Hash] person id => value returned by find_all_with_score.
def analyse(needle_read: nil, keep_empty: false, **options)
  opts     = { read: attribute }.merge(options)
  total    = count
  progress = 0

  analysed = each_with_object({}) do |person, results|
    needle_str = item_string(person, needle_read) if needle_read
    results[person.id] = find_all_with_score(person, needle_str: needle_str, **opts)
    print_progress("Analysed", total, progress += 1)
  end

  keep_empty ? analysed : clean_empty(analysed)
end

#attribute_present ⇒ Eco::API::Organization::PeopleSimilarity

It returns all the entries with attribute not empty



107
108
109
110
111
112
113
# File 'lib/eco/api/organization/people_similarity.rb', line 107

# Builds a new collection (via newFrom) without the people whose target value,
# once converted to String and stripped, is shorter than 2 characters.
#
# @return [Object] whatever newFrom returns for the remaining people.
def attribute_present
  with_value = reject { |person| item_value(person).to_s.strip.length < 2 }
  newFrom(with_value)
end

#blank_attribute ⇒ Eco::API::Organization::PeopleSimilarity

It returns all the entries with attribute empty



97
98
99
100
101
102
103
# File 'lib/eco/api/organization/people_similarity.rb', line 97

# Builds a new collection (via newFrom) with only the people whose target
# value, once converted to String and stripped, is shorter than 2 characters.
#
# @return [Object] whatever newFrom returns for the selected people.
def blank_attribute
  without_value = select { |person| item_value(person).to_s.strip.length < 2 }
  newFrom(without_value)
end

#clean_empty(analysed) ⇒ Object

Removes from results those that do not have similar entries



160
161
162
163
164
# File 'lib/eco/api/organization/people_similarity.rb', line 160

# Drops the entries whose results answer true to #empty?.
#
# @param analysed [Hash] person id => results.
# @return [Hash] only the entries with non-empty results.
def clean_empty(analysed)
  analysed.reject { |_id, results| results.empty? }
end

#ignore_matching_words(analysed, **options) ⇒ Object

Reanalyses by ignoring matching words between the needle and those found in results



198
199
200
201
202
203
# File 'lib/eco/api/organization/people_similarity.rb', line 198

# Delegates to reanalyse, treating each needle/item string pair with the
# class-level remove_matching_words helper.
#
# @param analysed [Hash] person id => results.
# @param options [Hash] forwarded to reanalyse.
# @return [Object] whatever reanalyse returns.
def ignore_matching_words(analysed, **options)
  reanalyse(analysed, msg: "Reanalysing by ignoring matching words", **options) do |needle_str, item_str, _needle, _item|
    # only the two strings are needed; the needle and item objects are ignored
    self.class.remove_matching_words(needle_str, item_str)
  end
end

#ignore_matching_words_old(analysed, **options) ⇒ Object

Reanalyses by ignoring matching words between the needle and those found in results



206
207
208
209
210
211
212
213
214
# File 'lib/eco/api/organization/people_similarity.rb', line 206

# Older variant: walks the analysed entries through with_analysed and
# recomputes each one with ignore_same_words_score, reporting progress.
#
# @param analysed [Hash] person id => results.
# @param options [Hash] forwarded to ignore_same_words_score; :read defaults
#   to the configured attribute.
# @return [Object] whatever with_analysed returns.
def ignore_matching_words_old(analysed, **options)
  opts  = { read: attribute }.merge(options)
  total = analysed.count
  step  = 0

  with_analysed(analysed) do |_person, results|
    print_progress("Reanalysing by ignoring matching words", total, step += 1)
    ignore_same_words_score(results, **opts)
  end
end

#item_value(person) ⇒ Object

Returns the target value to analyse

Parameters:

  • person (Ecoportal::API::V1::Person)


25
26
27
28
29
# File 'lib/eco/api/organization/people_similarity.rb', line 25

# Reads from person the value that the configured attribute points at.
#
# The previous body referred to an undefined `item` and called `attr` before
# that local was assigned; it now uses the `person` parameter and `attribute`.
#
# @param person [Ecoportal::API::V1::Person] the object to read the value from.
# @return [Object, nil] the result of calling attribute with person when
#   attribute is a Proc; otherwise the result of sending the attribute name to
#   person, or nil when person does not respond to it.
def item_value(person)
  return attribute.call(person) if attribute.is_a?(Proc)

  attr = attribute.to_sym
  person.send(attr) if person.respond_to?(attr)
end

#named ⇒ Eco::API::Organization::PeopleSimilarity

It returns all people that have a name (at least 2 non-blank characters)



87
88
89
90
91
92
93
# File 'lib/eco/api/organization/people_similarity.rb', line 87

# Builds a new collection (via newFrom) without the people whose name, once
# converted to String and stripped, is shorter than 2 characters.
#
# @return [Object] whatever newFrom returns for the remaining people.
def named
  with_name = reject { |person| person.name.to_s.strip.length < 2 }
  newFrom(with_name)
end

#newFrom(data) ⇒ Eco::API::Organization::PeopleSimilarity

Generates a new object with same config but different base data.



54
55
56
57
58
59
60
# File 'lib/eco/api/organization/people_similarity.rb', line 54

# Lets the parent implementation build the new object and then copies this
# object's threshold, order and attribute onto it.
#
# @param data [Object] base data handed over to the parent newFrom.
# @return [Object] the object built by the parent, carrying the same config.
def newFrom(data)
  copy = super(data)
  copy.threshold = threshold
  copy.order     = order
  copy.attribute = attribute
  copy
end

#newSimilarity(analysed) ⇒ Eco::API::Organization::PeopleSimilarity

Gets a new instance object of this class, with only people in results

Parameters:

  • analysed (Hash)

    where the keys are the people ids and values the Eco::Data::FuzzyMatch::Results

Returns:



146
147
148
# File 'lib/eco/api/organization/people_similarity.rb', line 146

# Builds a new object (via newFrom) out of the people that people_in_results
# extracts from the analysed results.
#
# @param analysed [Hash] person id => results.
# @return [Object] whatever newFrom returns.
def newSimilarity(analysed)
  involved = people_in_results(analysed)
  newFrom(involved)
end

#order ⇒ Object



38
39
40
# File 'lib/eco/api/organization/people_similarity.rb', line 38

# Memoised reader for @order.
#
# @return [Array<Symbol>] the stored order; [:words_ngrams, :dice] is stored
#   and returned when nothing was set before.
def order
  @order ||= %i[words_ngrams dice]
end

#order=(values) ⇒ Object

Defines the order of relevance of the per-user matches

Parameters:

  • values (Array<Symbol>)

    the algorithms' results it should be ordered by

    • Possible values: :dice, :levenshtein, :jaro_winkler, :ngrams, :words_ngrams, :chars_position


34
35
36
# File 'lib/eco/api/organization/people_similarity.rb', line 34

# Stores the given values in @order, which #order returns afterwards.
#
# @param values [Array<Symbol>] the algorithms' results to order by.
def order=(values)
  @order = values
end

#people_in_results(analysed) ⇒ Object



150
151
152
153
154
155
156
157
# File 'lib/eco/api/organization/people_similarity.rb', line 150

# Collects, without repetitions, every person involved in the analysed
# results: the person looked up by each id (self[id]) followed by the
# `match` of each of its results, in order of first appearance.
#
# @param analysed [Hash] person id => results (each result answers #match).
# @return [Array] the people involved.
def people_in_results(analysed)
  people = []

  analysed.each do |id, results|
    group = results.each_with_object([self[id]]) { |result, acc| acc << result.match }

    group.each do |person|
      next if people.include?(person)

      people << person
    end
  end

  people
end
#print_analysis(**options) ⇒ Hash

Note:
  1. Unless :analysed is provided, it launches an analysis cutting with Jaro Winkler min 0.5
  2. It then re-sorts and cuts based on options

Returns a Hash where the keys are the people ids and the values the Eco::Data::FuzzyMatch::Results.

Returns:

  • (Hash)

    where the keys are the people ids and the values the Eco::Data::FuzzyMatch::Results



235
236
237
238
239
240
# File 'lib/eco/api/organization/people_similarity.rb', line 235

# Prints the report of an analysis and returns that analysis.
#
# The previous body printed the complete report once per analysed entry and
# returned an empty Hash; the report is now printed a single time and the
# analysed Hash is returned.
#
# @param options [Hash] when options[:analysed] is truthy it is used as is;
#   otherwise the options are forwarded to
#   results_with_false_positives.analyse to obtain the analysis.
# @return [Hash] the analysed results that were reported.
def print_analysis(**options)
  analysed = options[:analysed] || results_with_false_positives.analyse(**options)
  puts report(analysed)
  analysed
end

#reanalyse(analysed, msg: "Reanalysing", **options, &block) ⇒ Object

Reanalyses by using a block to treat the needle and item values



187
188
189
190
191
192
193
194
195
# File 'lib/eco/api/organization/people_similarity.rb', line 187

# Walks the analysed entries through with_analysed and recomputes each one
# with recalculate_results, handing it the given block and reporting progress.
#
# @param analysed [Hash] person id => results.
# @param msg [String] label handed to print_progress.
# @param options [Hash] merged over { read: attribute }.
# @return [Object] whatever with_analysed returns.
def reanalyse(analysed, msg: "Reanalysing", **options, &block)
  # NOTE(review): the merged options are never forwarded to
  # recalculate_results — confirm whether that is intended.
  options = { read: attribute }.merge(options)
  total   = analysed.count
  step    = 0

  with_analysed(analysed) do |_person, results|
    print_progress(msg, total, step += 1)
    recalculate_results(results, &block)
  end
end

#rearrange(analysed, **options) ⇒ Object

Launches a reanalysis on analysed based on options

Parameters:

  • analysed (Hash)

    where the keys are the people ids and the values the Eco::Data::FuzzyMatch::Results



180
181
182
183
184
# File 'lib/eco/api/organization/people_similarity.rb', line 180

# Replaces the results of every analysed entry with what their
# relevant_results(**options) returns, going through with_analysed.
#
# @param analysed [Hash] person id => Eco::Data::FuzzyMatch::Results.
# @param options [Hash] forwarded to relevant_results.
# @return [Object] whatever with_analysed returns.
def rearrange(analysed, **options)
  with_analysed(analysed) { |_person, results| results.relevant_results(**options) }
end

#repeated_emails ⇒ Hash

It gathers those that have the same email

Returns:

  • (Hash)

    where keys are emails and values an Array<Person>



68
69
70
71
72
73
# File 'lib/eco/api/organization/people_similarity.rb', line 68

# After calling init_caches, picks from @by_email the entries that hold more
# than one person.
#
# @return [Hash] email => Array<Person>, only for emails shared by 2+ people.
def repeated_emails
  init_caches
  @by_email.reject { |_email, people| people.count <= 1 }
end

#report(analysed, format: :txt) ⇒ String

Returns well structured text.

Returns:

  • (String)

    well structured text



221
222
223
224
225
226
227
228
229
# File 'lib/eco/api/organization/people_similarity.rb', line 221

# Renders the analysed results as text: one section per person, headed by
# self[id].identify and followed by the #print of each of its results.
#
# @param analysed [Hash] person id => results (answering #results).
# @param format [Symbol] only :txt is handled.
# @return [String, nil] the text for :txt; nil for any other format.
def report(analysed, format: :txt)
  return unless format == :txt

  sections = analysed.map do |id, results|
    lines = results.results.map { |result| result.print }.join("\n  ")
    "#{self[id].identify}:\n  #{lines}\n"
  end
  sections.join
end

#threshold ⇒ Object



48
49
50
# File 'lib/eco/api/organization/people_similarity.rb', line 48

# Memoised reader for @threshold.
#
# @return [Float] the stored threshold; 0.15 is stored and returned when
#   nothing was set before.
def threshold
  @threshold = 0.15 unless @threshold
  @threshold
end

#threshold=(value) ⇒ Object

Defines the threshold that the matches should comply with

Parameters:

  • value (Float)

    the threshold that all of the algorithms should comply with



44
45
46
# File 'lib/eco/api/organization/people_similarity.rb', line 44

# Stores the given value in @threshold, which #threshold returns afterwards.
#
# @param value [Float] the threshold that all of the algorithms should comply with.
def threshold=(value)
  @threshold = value
end

#unnamed ⇒ Eco::API::Organization::PeopleSimilarity

It returns all people with no name



77
78
79
80
81
82
83
# File 'lib/eco/api/organization/people_similarity.rb', line 77

# Builds a new collection (via newFrom) with only the people whose name, once
# converted to String and stripped, is shorter than 2 characters.
#
# @return [Object] whatever newFrom returns for the selected people.
def unnamed
  nameless = select { |person| person.name.to_s.strip.length < 2 }
  newFrom(nameless)
end

#with_analysed(analysed, keep_empty: false) ⇒ Hash

Helper to do some treatment of the results

Parameters:

  • analysed (Hash)

    where the keys are the people ids and values the Eco::Data::FuzzyMatch::Results

Returns:

  • (Hash)

    where the keys are the people ids and values the Eco::Data::FuzzyMatch::Results



169
170
171
172
173
174
175
176
# File 'lib/eco/api/organization/people_similarity.rb', line 169

# Rebuilds the analysed Hash by yielding every entry to the given block and
# storing the block's return value under the same id.
#
# The trailing `tap` of the previous version only built a String that was
# thrown away (a leftover of a debug trace), so it has been removed.
#
# @param analysed [Hash] person id => results.
# @param keep_empty [Boolean] when false the rebuilt Hash goes through
#   clean_empty before being returned.
# @yield [person, results] the person looked up by id (self[id]) and its results.
# @yieldreturn [Object] the new results to store for that id.
# @return [Hash] person id => value returned by the block.
def with_analysed(analysed, keep_empty: false)
  reanalysed = analysed.each_with_object({}) do |(id, results), out|
    out[id] = yield(self[id], results)
  end

  keep_empty ? reanalysed : clean_empty(reanalysed)
end