Class: Entifier

Inherits:
Object
  • Object
show all
Defined in:
lib/entifier.rb

Constant Summary collapse

# Day names; extract discards a candidate entity that is exactly one of these
# and strips "on <day>" fragments from longer candidates.
DAY_NAMES = %w[Monday Tuesday Wednesday Thursday Friday Saturday Sunday].freeze

# Month names; extract discards a candidate entity that is exactly one of these.
MONTH_NAMES = %w[
  January February March April May June
  July August September October November December
].freeze

# Capitalised words that extract discards when they are all that is left of a
# candidate, e.g. after the possessive/contracted "'s" of "Here's" is removed.
INDEXICALS_PRECEDING_APOSTROPHE_S = %w[Here There He She It Now Who].freeze

Class Method Summary collapse

Class Method Details

.extract(string, options = {}) ⇒ Object



11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
# File 'lib/entifier.rb', line 11

# Extracts candidate named entities (capitalised words and phrases) from text.
#
# A capitalised word at the start of a sentence is ambiguous, so a
# sentence-initial match is only kept when it is a multi-word phrase, ends in
# "'s", or is written entirely in capitals. Matches found mid-sentence are kept
# as they are. Candidates that are exactly a day name, a month name or one of
# INDEXICALS_PRECEDING_APOSTROPHE_S are dropped, and date fragments such as
# "on Monday" or "February 2010" are removed from the remaining candidates.
#
# @param string [String] text to scan; it is left unmodified
# @param options [Hash] only normalised with +nested_stringify_keys!+ (a Hash
#   extension defined outside this method); no option is read here
# @return [Array<String>] unique entities, in order of first appearance
def self.extract(string, options = {})
  options.nested_stringify_keys!

  # Collapse runs of blanks so multi-word entities come out single-spaced.
  # Non-destructive gsub on purpose: the caller's string must not pick up the
  # spacing hacks applied here, and frozen strings must be accepted.
  text = string.gsub(/[[:blank:]]+/, "\s")

  # HACK! Double the space before "(". The mid-sentence pattern needs one
  # character plus a blank in front of an entity, and a preceding entity has
  # already consumed everything up to that space.
  text = text.gsub(/\s\(/, "  (")

  capitalised_word = /[ÄÅÖA-Z](?:[a-zA-ZÄÅÖÜàâæçéèêëîïôøöûùüÿñ\-\d\&]+|\.(?:[A-Z]\.)*)/
  capitalised_word_phrase = %r{
    (?:\d{4}\s|Dr\.\s)?
    #{capitalised_word}
    (?:
      (?:
        (?:\s+(?:of|for|on|of\sthe|\&|d\'|du|de)|\'s)
      )?
      \s+#{capitalised_word})*
    (?:\s\d+)?
  }x

  # Capture 1 is a sentence-initial candidate, capture 2 a mid-sentence one;
  # exactly one of them is set for every match.
  regex = %r{
    (?:
      (?:
      (?:\A|[\.\?\!\:][\"\']?\s+|\n)             # At start of string, or starting a new sentence...
        (?:[\"\'\(])?                       # ...optionally opened by a quote mark or bracket.
        )
        (
          (?:In\s(?:\d{4}\s)?)?
          #{capitalised_word_phrase}(?:\'s)?
        )
      |                                   # --- OR ---

      [^\.\n\?\!\:\"][[:blank:]][\"\'\(]?           # After any non-sentence-ending character and a blank...

        (#{capitalised_word_phrase})
    )
  }x

  # Build the date patterns from the constants rather than repeating the
  # names inline, so the lists cannot drift apart or be misspelt.
  month_names = Regexp.union(MONTH_NAMES)
  day_names   = Regexp.union(DAY_NAMES)
  in_month    = /(?:I|i)n\s#{month_names}/
  month_year  = /#{month_names}\s(?:\d{4}|\d{2})/
  on_day      = /(?:O|\so)n\s#{day_names}/

  # Leading words that merely open a sentence. "(?:One|Two)\sof" has to be
  # tried before the bare "Two", otherwise "Two of X" is left as "of X".
  sentence_opener = /\A(?:In(?:\s\d{4})?|The|If|But|(?:One|Two)\sof|Two)\s/

  entities = []

  text.scan(regex) do |sentence_initial, mid_sentence|
    entity =
      if sentence_initial.nil?
        mid_sentence
      elsif sentence_initial.split(" ").size > 1
        sentence_initial.gsub(sentence_opener, "").gsub(/\'s\Z/, "")
      elsif sentence_initial[-2, 2] == "'s"
        sentence_initial.gsub(/\'s\Z/, "")
      elsif sentence_initial =~ /\A[A-Z]+\Z/
        sentence_initial
      end

    next if entity.nil?
    next if DAY_NAMES.include?(entity)
    next if MONTH_NAMES.include?(entity)
    next if INDEXICALS_PRECEDING_APOSTROPHE_S.include?(entity)

    entity = entity.gsub(in_month, "")
    entity = entity.gsub(month_year, "")
    entity = entity.gsub(on_day, "")
    entity = entity.gsub(/\A\d+\Z/, "") # A candidate that is only digits is not an entity.

    entities << entity unless entity.empty?
  end

  entities.uniq
end