11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
|
# File 'lib/entifier.rb', line 11
def self.(string, options = {})
options.nested_stringify_keys!
entities = []
string.gsub!(/[[:blank:]]+/, "\s")
string.gsub!(/\s\(/, " (")
capitalised_word = /[ÄÅÖA-Z](?:[a-zA-ZÄÅÖÜàâæçéèêëîïôøöûùüÿñ\-\d\&]+|\.(?:[A-Z]\.)*)/
capitalised_word_phrase = %r{
(?:\d{4}\s|Dr\.\s)?
#{capitalised_word}
(?:
(?:
(?:\s+(?:of|for|on|of\sthe|\&|d\'|du|de)|\'s)
)?
\s+#{capitalised_word})*
(?:\s\d+)?
}x
regex = %r{
(?:
(?:
(?:\A|[\.\?\!\:][\"\']?\s+|\n) # At start of string, or starting new sentence...
(?:[\"\'\(])? # ...optionally started with quote marks.
)
(
(?:In\s(?:\d{4}\s)?)?
#{capitalised_word_phrase}(?:\'s)?
)
| # --- OR ---
[^\.\n\?\!\:\"][[:blank:]][\"\'\(]? # After any non-full-stop followed by a space...
(#{capitalised_word_phrase})
)
}x
string.scan(regex) do |match|
if match[0]
word_count = match[0].split(" ").size
if word_count > 1
entity = match[0].gsub(/\A(In(?:\s\d{4})?|The|If|But|Two|(?:One|Two)\sof)\s/, "").gsub(/\'s\Z/, "")
elsif match[0][-2,2] == "'s"
entity = match[0].gsub(/\'s\Z/, "")
elsif match[0] =~ /\A[A-Z]+\Z/
entity = match[0]
else
entity = nil
end
else
entity = match[1]
end
if entity
end
entity = nil if DAY_NAMES.include?(entity)
entity = nil if MONTH_NAMES.include?(entity)
entity = nil if INDEXICALS_PRECEDING_APOSTROPHE_S.include?(entity)
if entity
entity.gsub!( /((I|i)n\s)(January|Feburary|March|April|May|June|July|August|September|October|November|December)/, "")
entity.gsub!( /(January|Feburary|March|April|May|June|July|August|September|October|November|December)\s(\d{4}|\d{2})/, "")
entity.gsub!(/(O|\so)n\s(Monday|Tuesday|Wednesday|Thursday|Friday|Saturday|Sunday)/, "")
entity.gsub!(/\A\d+\Z/, "") entity = nil if entity == ""
end
entities << entity unless entity.nil? end
entities.uniq!
return entities
end
|