Class: Matlock

Inherits:
Object
  • Object
show all
Defined in:
lib/matlock.rb,
lib/matlock/data.rb,
lib/matlock/version.rb

Defined Under Namespace

Classes: Data

Constant Summary collapse

VERSION =
"0.1.2"

Instance Attribute Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize ⇒ Matlock

Creates a new matlock object.



15
16
17
# File 'lib/matlock.rb', line 15

# Creates a new matlock object whose stop-word list starts out empty.
def initialize
  @stopwords = []
end

Instance Attribute Details

#stopwords ⇒ Object

A list of stop words to ignore when matching names.



27
28
29
# File 'lib/matlock.rb', line 27

# Returns the list of stop words to ignore when matching names.
#
# This is a plain reader: it hands back the @stopwords instance variable
# itself, not a copy.
def stopwords
  @stopwords
end

Instance Method Details

#extract_names(content) ⇒ Object

Extracts a list of names from a string.

Parameters:

  • content (String)

    the string that names should be extracted from.



46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
# File 'lib/matlock.rb', line 46

# Extracts a list of names from a string.
#
# The text is split into words and every adjacent pair of words is treated as
# a candidate "first name + surname". A pair is reported when both words are
# title cased, neither is a stop word, and Matlock::Data recognizes the first
# word as a first name or the second as a surname.
#
# A lowercase "van" or "von" directly after the first name is treated as a
# surname prefix: the word after it becomes the surname that is checked, and
# the prefix is kept in the reported name (e.g. "Ludwig van Beethoven").
#
# Stop words are compared case-insensitively, so both "the" and "THE" in the
# stop-word list suppress "The".
#
# @param content [String, nil] the string that names should be extracted
#   from; nil is treated as an empty string.
# @return [Array<String>] the unique names found, in order of first appearance.
def extract_names(content)
  # Split on anything that is not a letter, digit, hyphen or underscore, then
  # keep only the tokens made up solely of letters and hyphens.
  words = content.to_s.split(/[^-_a-z0-9]+/i).select { |word| word =~ /\A[-a-z]+\z/i }

  # Normalize the stop words once so the comparison below does not depend on
  # how the caller happened to capitalize them.
  ignored = Array(stopwords).map { |word| word.to_s.upcase }

  names = []

  # Walk every bigram; the last word is paired with '' and therefore only
  # survives if String#titlecase? (defined outside this method) accepts ''.
  words.each_with_index do |first_name, index|
    surname = full_surname = words[index + 1] || ''

    # A surname prefix shifts the surname one word further along while the
    # prefix itself stays part of the reported name.
    if %w[van von].include?(surname)
      surname = words[index + 2] || ''
      full_surname = "#{full_surname} #{surname}"
    end

    # Only look at two title-cased words, neither of which is a stop word.
    next unless first_name.titlecase? && surname.titlecase?
    next if ignored.include?(first_name.upcase) || ignored.include?(surname.upcase)

    # At least one half of the pair must be a recognized common name.
    next unless Matlock::Data.first_name?(first_name) || Matlock::Data.surname?(surname)

    full_name = "#{first_name} #{full_surname}"
    names << full_name unless names.include?(full_name)
  end

  names
end