Class: Taxamatch::Authmatch

Inherits:
Object
  • Object
show all
Defined in:
lib/taxamatch_rb/authmatch.rb

Class Method Summary collapse

Class Method Details

.authmatch(authors1, authors2, years1, years2) ⇒ Object



6
7
8
9
10
# File 'lib/taxamatch_rb/authmatch.rb', line 6

# Scores how well two authorship strings agree.
#
# Authors that have a counterpart on the other side are stripped out first,
# the years are compared, and both results are handed to get_score.
#
# @param authors1 [Array<String>] author names of the first authorship
# @param authors2 [Array<String>] author names of the second authorship
# @param years1 [Array] years of the first authorship
# @param years2 [Array] years of the second authorship
# @return [Integer] the value returned by get_score (0, or a score above 50)
def self.authmatch(authors1, authors2, years1, years2)
  leftover1, leftover2 = remove_duplicate_authors(authors1, authors2)
  # compare_years yields nil when the years cannot be compared one-to-one.
  get_score(authors1, leftover1, authors2, leftover2, compare_years(years1, years2))
end

.compare_years(years1, years2) ⇒ Object



83
84
85
86
87
# File 'lib/taxamatch_rb/authmatch.rb', line 83

# Computes the distance in years between two authorships.
#
# @param years1 [Array] years of the first authorship
# @param years2 [Array] years of the second authorship
# @return [Integer, nil] 0 when both lists are empty, the absolute difference
#   of the two years (converted with to_i) when each list holds exactly one
#   entry, and nil in every other case
def self.compare_years(years1, years2)
  neither_has_year = [years1, years2].all? { |years| years == [] }
  return 0 if neither_has_year
  # Only a one-to-one comparison is meaningful; anything else is undecided.
  return nil unless years1.size == 1 && years2.size == 1

  first_year = years1[0].to_i
  second_year = years2[0].to_i
  (first_year - second_year).abs
end

.fuzzy_match_authors(author1, author2) ⇒ Object



75
76
77
78
79
80
81
# File 'lib/taxamatch_rb/authmatch.rb', line 75

# Decides whether two author names are close enough to count as the same
# author despite spelling differences.
#
# The names match when all of the following hold:
# * the modified Damerau-Levenshtein distance is at most 3,
# * the shorter name is longer than twice that distance,
# * the distance is below 2, or both names start with the same character.
#
# @param author1 [String] first author name
# @param author2 [String] second author name
# @return [Boolean] true when the names are considered a fuzzy match
def self.fuzzy_match_authors(author1, author2)
  lengths = [author1.size, author2.size]
  # The trailing 2 and 3 are passed explicitly to get around a bug in the C
  # code; that bug really has to be fixed there.
  # NOTE(review): presumably block size and maximum distance -- confirm
  # against DamerauLevenshteinMod#distance.
  distance = Taxamatch::DamerauLevenshteinMod.new.distance(author1, author2, 2, 3)

  return false unless distance <= 3
  return false unless lengths.min > distance * 2

  distance < 2 || author1[0] == author2[0]
end

.get_score(authors1, unique_authors1, authors2, unique_authors2, year_diff) ⇒ Object



12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
# File 'lib/taxamatch_rb/authmatch.rb', line 12

# Converts the result of author de-duplication and year comparison into a
# match score.
#
# Three situations are distinguished:
# * no unmatched authors remain on either side:
#   100 (same year), 54 (one year apart), 94 (years not comparable);
# * unmatched authors remain on one side only:
#   91 (same year), 51 (one year apart), 90 (years not comparable);
# * unmatched authors remain on both sides: the percentage of authors that
#   were matched, kept only when the years are identical or not comparable.
# Any other year difference gives 0, and scores of 50 or less are reported
# as 0.
#
# @param authors1 [Array] all authors of the first authorship
# @param unique_authors1 [Array] authors of the first authorship left unmatched
# @param authors2 [Array] all authors of the second authorship
# @param unique_authors2 [Array] authors of the second authorship left unmatched
# @param year_diff [Integer, nil] year distance, nil when not comparable
# @return [Integer] 0, or a score greater than 50
def self.get_score(authors1, unique_authors1, authors2, unique_authors2, year_diff)
  total = authors1.size + authors2.size
  unmatched = unique_authors1.size + unique_authors2.size

  score =
    if unmatched == 0
      case year_diff
      when nil then 94
      when 0 then 100
      when 1 then 54
      else 0
      end
    elsif unique_authors1.size == 0 || unique_authors2.size == 0
      case year_diff
      when nil then 90
      when 0 then 91
      when 1 then 51
      else 0
      end
    else
      # The percentage is computed before the year check on purpose, so the
      # arithmetic runs regardless of the year outcome.
      matched_percent = ((1 - unmatched.to_f / total.to_f) * 100).round
      years_agree = year_diff.nil? || year_diff == 0
      years_agree ? matched_percent : 0
    end

  # Anything at or below 50 is not considered a match at all.
  score > 50 ? score : 0
end

.remove_duplicate_authors(authors1, authors2) ⇒ Object



43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
# File 'lib/taxamatch_rb/authmatch.rb', line 43

# Removes from both author lists the authors that have a counterpart in the
# other list, and returns what is left over.
#
# Every author of the first list is compared with every author of the second:
# * identical names are removed from both lists;
# * when one name is a leading part of the other, the shorter name is
#   removed, and the longer one is removed as well if the shorter name has at
#   least 3 characters;
# * otherwise, names longer than one character are removed from both lists
#   when fuzzy_match_authors accepts the pair.
# The input arrays themselves are not modified; copies are edited.
#
# @param authors1 [Array<String>] author names of the first authorship
# @param authors2 [Array<String>] author names of the second authorship
# @return [Array(Array<String>, Array<String>)] unmatched authors of the
#   first and of the second list
def self.remove_duplicate_authors(authors1, authors2)
  remaining1 = authors1.dup
  remaining2 = authors2.dup

  authors1.each do |left|
    authors2.each do |right|
      relation =
        if left == right
          :identical
        elsif left == right[0...left.size]
          :left_is_prefix
        elsif left[0...right.size] == right
          :right_is_prefix
        else
          :unrelated
        end

      case relation
      when :identical
        remaining1.delete left
        remaining2.delete right
      when :left_is_prefix
        remaining1.delete left
        # A prefix of 3+ characters is trusted enough to clear the full name too.
        remaining2.delete right if left.size >= 3
      when :right_is_prefix
        remaining2.delete right
        remaining1.delete left if right.size >= 3
      else
        # TODO: the size guard masks a bug in the Damerau-Levenshtein mod that
        # appears when comparing a 1-letter string to a longer string.
        if left.size > 1 && right.size > 1 && fuzzy_match_authors(left, right)
          remaining1.delete left
          remaining2.delete right
        end
      end
    end
  end

  [remaining1, remaining2]
end