Class: RegExpNER

Inherits:
NER
  • Object
show all
Includes:
SimpleDSL
Defined in:
lib/rbbt/ner/regexpNER.rb

Instance Attribute Summary collapse

Class Method Summary collapse

Instance Method Summary collapse

Methods inherited from NER

#entities

Constructor Details

#initialize(regexps = {}) ⇒ RegExpNER

Returns a new instance of RegExpNER.



70
71
72
# File 'lib/rbbt/ner/regexpNER.rb', line 70

# Creates the recognizer and stores its initial patterns.
#
# regexps - enumerable of entries (default: {}). Every entry yielded by the
#           enumeration is copied, in order, into a fresh Array kept in
#           @regexps; a Hash therefore ends up as a list of [key, value] pairs.
def initialize(regexps = {})
  @regexps = regexps.each_with_object([]) { |entry, stored| stored << entry }
end

Instance Attribute Details

#regexps ⇒ Object

Returns the value of attribute regexps.



69
70
71
# File 'lib/rbbt/ner/regexpNER.rb', line 69

# Reader for the @regexps instance variable: the collection assigned in
# #initialize and appended to by #add_regexp and #__define_regexp_hook.
def regexps
  @regexps
end

Class Method Details

.match_regexp(text, regexp, type = nil) ⇒ Object



8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
# File 'lib/rbbt/ner/regexpNER.rb', line 8

# Scans +text+ for successive occurrences of +regexp+ and returns them as
# strings annotated through NamedEntity.setup.
#
# When the pattern has capture groups, the reported entity is the first group
# that took part in the match (not the whole match), and scanning resumes
# right after that group.
#
# text   - String to scan.
# regexp - pattern handed to String#match.
# type   - passed through as the third argument of NamedEntity.setup
#          (default: nil).
#
# Returns an Array of the matched strings in order of appearance. Each one has
# been handed to NamedEntity.setup together with its character offset
# relative to the beginning of the original +text+.
def self.match_regexp(text, regexp, type = nil)
  matches = []
  start = 0 # characters of the original text consumed so far

  while (matchdata = text.match(regexp))
    # Group 0 is the whole match. Prefer the first capture that actually
    # participated; an optional group that did not match is nil and skipped.
    capture_index = matchdata.captures.index { |capture| !capture.nil? }
    group = capture_index ? capture_index + 1 : 0

    match = matchdata[group]
    # MatchData offsets locate the group exactly, even when its text contains
    # regexp metacharacters or occurs more than once inside the whole match.
    match_begin = matchdata.begin(group)
    consumed = matchdata.end(group)

    if match.empty?
      # A zero-length match consumes nothing: step over one character so the
      # loop always makes progress, and stop once the text is exhausted.
      break if consumed >= text.length

      consumed += 1
    else
      NamedEntity.setup(match, start + match_begin, type)
      matches << match
    end

    start += consumed
    text = text[consumed..-1]
  end

  matches
end

.match_regexp_hash(text, regexp_hash) ⇒ Object



51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
# File 'lib/rbbt/ner/regexpNER.rb', line 51

# Applies every type => pattern(s) entry of +regexp_hash+ to +text+.
#
# For each entry the text is first cut with Segment.split, passing the
# entities collected so far, and each resulting chunk is searched with
# match_regexp_list. Offsets reported for a chunk are shifted by the chunk's
# own offset before the entity is collected.
#
# text        - text to search.
# regexp_hash - enumerable yielding [type, patterns]; a value that is not an
#               Array is wrapped into a one-element Array.
#
# Returns the Array of all collected entities.
def self.match_regexp_hash(text, regexp_hash)
  found = []

  regexp_hash.each do |type, patterns|
    patterns = [patterns] unless patterns.is_a?(Array)

    Segment.split(text, found).each do |piece|
      base = piece.offset

      match_regexp_list(piece, patterns, type).each do |entity|
        entity.offset = entity.offset + base
        found << entity
      end
    end
  end

  found
end

.match_regexp_list(text, regexp_list, type = nil) ⇒ Object



37
38
39
40
41
42
43
44
45
46
47
48
49
# File 'lib/rbbt/ner/regexpNER.rb', line 37

# Runs each pattern of +regexp_list+ over +text+, in order.
#
# Before a pattern is applied, the text is cut with Segment.split, passing the
# entities collected so far; every resulting chunk is searched with
# match_regexp and the offsets it reports are shifted by the chunk's offset.
#
# text        - text to search.
# regexp_list - list of patterns.
# type        - forwarded to match_regexp (default: nil).
#
# Returns the Array of all collected entities.
def self.match_regexp_list(text, regexp_list, type = nil)
  regexp_list.each_with_object([]) do |pattern, found|
    Segment.split(text, found).each do |piece|
      match_regexp(piece, pattern, type).each do |entity|
        entity.offset += piece.offset
        found << entity
      end
    end
  end
end

Instance Method Details

#__define_regexp_hook(name, regexp, *args) ⇒ Object



78
79
80
# File 'lib/rbbt/ner/regexpNER.rb', line 78

# Registers one pattern by appending the pair [name, regexp] to @regexps.
# This is the method name that #define_regexp hands to load_config.
#
# name   - first element of the stored pair.
# regexp - second element of the stored pair.
# args   - accepted but not used.
#
# Returns @regexps.
def __define_regexp_hook(name, regexp, *args)
  @regexps.push([name, regexp])
end

#add_regexp(list = {}) ⇒ Object



86
87
88
# File 'lib/rbbt/ner/regexpNER.rb', line 86

# Appends the entries of +list+ to @regexps.
#
# list - Hash (appended as [key, value] pairs) or Array of entries
#        (default: {}).
#
# Returns @regexps.
def add_regexp(list = {})
  # A block-less #collect returns an Enumerator, which Array#concat rejects
  # with a TypeError; #to_a yields the Array that concat needs.
  @regexps.concat(list.to_a)
end

#define_regexp(*args, &block) ⇒ Object



82
83
84
# File 'lib/rbbt/ner/regexpNER.rb', line 82

# Hands the given arguments and block to load_config, naming
# "__define_regexp_hook" as the first argument.
# NOTE(review): load_config is not defined in this class listing; presumably
# it comes from the included SimpleDSL module - confirm.
#
# Returns whatever load_config returns.
def define_regexp(*args, &block)
  hook_name = "__define_regexp_hook"
  load_config(hook_name, *args, &block)
end

#match(text) ⇒ Object



90
91
92
93
94
95
# File 'lib/rbbt/ner/regexpNER.rb', line 90

# Finds entities in +text+ using the patterns stored in @regexps.
#
# The search is done by RegExpNER.match_regexp_hash; every entity it returns
# is then passed to NamedEntity.setup with its own offset and type, and with
# the entity itself as :code.
#
# Returns an Array with the result of NamedEntity.setup for each entity.
def match(text)
  RegExpNER.match_regexp_hash(text, @regexps).map do |entity|
    NamedEntity.setup(entity, offset: entity.offset, type: entity.type, code: entity)
  end
end

#token_score(*args) ⇒ Object



74
75
76
# File 'lib/rbbt/ner/regexpNER.rb', line 74

# Returns the constant score 1; any arguments are accepted and ignored.
def token_score(*args)
  1
end