20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
|
# File 'lib/rbbt/ner/oscar3.rb', line 20
# Finds named-entity mentions in +text+ using the Java objects held in this
# scope's class variables (@@ProcessingDocumentFactory, @@TextToSciXML,
# @@MEMM, @@DFA), which +self.init+ is called to prepare first.
#
# The text is converted with @@TextToSciXML.textToSciXML and tokenised into
# token sequences. For each sequence the reported offset is re-aligned against
# +text+ (the sequence's source string is searched for in +text+ starting at
# the reported offset), entities are requested for the sequence, and every
# entity key is parsed from its string form "[NE:<type>:<start>:<end>:<text>]".
#
# @param text [String] the text to scan.
# @param type [String, Array<String>, nil] entity type or types to keep;
#   +nil+ keeps every type. A non-Array value is wrapped in an Array.
# @param memm [Boolean] when true, entities come from @@MEMM.findNEs and each
#   mention gets the float parsed from the value stored under its key;
#   otherwise entities come from @@DFA.getNEs and the score is +nil+.
# @return [Array] the mentions, each passed through NamedEntity.setup, with
#   at most one entry per distinct mention string (the first one found wins).
def self.match(text, type = nil, memm = false)
  self.init

  doc = @@ProcessingDocumentFactory.getInstance().makeTokenisedDocument(@@TextToSciXML.textToSciXML(text), true, false, false)
  type = [type] unless type.nil? || Array === type

  mentions = []
  # Mention strings already collected; gives constant-time duplicate checks
  # instead of re-stringifying the whole result list for every candidate.
  seen = {}

  it = doc.getTokenSequences().iterator
  while it.hasNext
    sequence = it.next

    # Fix sequence offset: the offset reported by the tokeniser may lag behind
    # the real position of the sequence in +text+, so look for the sequence's
    # source string at or after the reported offset and remember the shift.
    # NOTE(review): assumes sequence.offset is never negative - confirm.
    sequence_str = sequence.getSourceString.to_s
    sequence_offset = sequence.offset.to_i
    found_at = text.index(sequence_str, sequence_offset)
    next if found_at.nil? # the sequence cannot be located in +text+; skip it
    offset = found_at - sequence_offset

    if memm
      entities = @@MEMM.findNEs(sequence, text)
      keys = entities.keySet.iterator
    else
      entities = @@DFA.getNEs(sequence)
      keys = entities.iterator
    end

    while keys.hasNext
      key = keys.next

      # The type and the two offsets cannot contain ':' but the mention text
      # may, so only the last field is allowed to swallow colons. A fully
      # greedy pattern would shift every field when the mention has a ':'.
      mention_type, rstart, _rend, mention =
        key.to_string.match(/\[NE:([^:]*):([^:]*):([^:]*):(.*)\]/).values_at(1, 2, 3, 4)
      next unless type.nil? || type.include?(mention_type)

      score = memm ? entities.get(key).to_string.to_f : nil

      NamedEntity.setup mention, rstart.to_i + offset, mention_type, nil, score

      mention_str = mention.to_s
      next if seen.key?(mention_str)

      seen[mention_str] = true
      mentions << mention
    end
  end

  mentions
end
|