Class: Matlock

Inherits:
Object
  • Object
show all
Defined in:
lib/matlock.rb,
lib/matlock/data.rb,
lib/matlock/version.rb

Defined Under Namespace

Classes: Data

Constant Summary collapse

VERSION =
"0.1.2"

Instance Attribute Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize ⇒ Matlock

Creates a new matlock object.



15
16
17
# File 'lib/matlock.rb', line 15

# Creates a new matlock object.
#
# The stop word list starts out empty, so no words are excluded from name
# matching until the caller adds some.
def initialize
  @stopwords = []
end

Instance Attribute Details

#stopwords ⇒ Object

A list of stop words to ignore when matching names.



27
28
29
# File 'lib/matlock.rb', line 27

# A list of stop words to ignore when matching names.
#
# NOTE(review): extract_names looks up the *upcased* form of each candidate
# word in this list, so entries presumably need to be stored in upper case
# to have any effect -- confirm against callers.
#
# @return [Object] the current value of @stopwords (an empty Array right
#   after construction).
def stopwords
  @stopwords
end

Instance Method Details

#extract_names(content) ⇒ Object

Extracts a list of names from a string.



46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
# File 'lib/matlock.rb', line 46

# Extracts a list of names from a string.
#
# The content is split into words and every adjacent pair of words is
# treated as a candidate "first name + surname". A pair is reported when both
# words are title cased, neither is a stop word, and at least one of them is
# recognized by Matlock::Data as a common first name or surname.
#
# @param content [String, nil] the text to scan; nil is treated as empty text.
# @return [Array<String>] the distinct full names found, in order of first
#   appearance.
def extract_names(content)
  names = []

  # Split content into words. Tokens containing digits or underscores are
  # dropped entirely; only tokens made of letters and hyphens are kept.
  words = content.to_s.split(/[^-_a-z0-9]+/i).grep(/\A[-a-z]+\z/i)

  # Loop over each bigram and check if the words are title cased and if at
  # least one of the words is a first or last name.
  words.each_with_index do |first_name, index|
    surname = full_surname = words[index + 1] || ''

    # A lowercase particle ("van"/"von") is part of the surname: validate the
    # word that follows it, but keep the particle in the reported name.
    if %w[van von].include?(surname)
      surname = words[index + 2] || ''
      full_surname = "#{full_surname} #{surname}"
    end

    # Only look at two words that are titlecase and neither one is a stopword.
    # Stop words are matched against the upcased form of each word.
    next unless first_name.titlecase? && surname.titlecase?
    next if stopwords.include?(first_name.upcase) || stopwords.include?(surname.upcase)

    # Check if either the first name or last name is a recognized common name.
    if Matlock::Data.first_name?(first_name) || Matlock::Data.surname?(surname)
      names << "#{first_name} #{full_surname}"
    end
  end

  # Report each name once, keeping the position of its first occurrence.
  names.uniq
end