Class: NERFeatures

Inherits:
SimpleDSL show all
Defined in:
lib/rbbt/ner/rner.rb

Instance Attribute Summary collapse

Class Method Summary collapse

Instance Method Summary collapse

Methods inherited from SimpleDSL

#parse

Constructor Details

#initialize(file = nil, reverse = false, &block) ⇒ NERFeatures

Returns a new instance of NERFeatures.



32
33
34
35
36
37
38
39
40
41
# File 'lib/rbbt/ner/rner.rb', line 32

# Creates a feature-definition set and hands the definition source to the
# parent constructor under the +:define+ method name.
#
# file    - path of a definition file; when neither +file+ nor a block is
#           given, 'ner/config/default.rb' under Rbbt.datadir is used.
# reverse - initial value for the +@reverse+ flag.
# block   - optional block forwarded to the parent constructor in place of
#           (or in addition to) +file+.
def initialize(file = nil, reverse = false, &block)
  @reverse = reverse
  @context = []
  @order   = []
  @types   = {}

  # Only fall back to the bundled default when the caller supplied nothing.
  file = File.join(Rbbt.datadir, 'ner/config/default.rb') unless file || block

  super(:define, file, &block)
end

Instance Attribute Details

#reverse ⇒ Object

Returns the value of attribute reverse.



31
32
33
# File 'lib/rbbt/ner/rner.rb', line 31

# Reader for the +@reverse+ flag. The constructor initialises it from its
# +reverse+ argument and #direction sets it to true when asked for :reverse.
# #text_features and #tagged_features consult it to decide token/chunk order.
def reverse
  @reverse
end

Class Method Details

.reverse(text) ⇒ Object



17
18
19
# File 'lib/rbbt/ner/rner.rb', line 17

# Tokenises +text+ with .tokens and returns the tokens in reverse order,
# joined by single spaces.
def self.reverse(text)
  reversed_tokens = tokens(text).reverse
  reversed_tokens.join(" ")
end

.tokens(text) ⇒ Object



7
8
9
10
11
12
13
14
15
# File 'lib/rbbt/ner/rner.rb', line 7

# Splits +text+ into an array of token strings.
#
# The pattern's alternatives are tried in this order at each position:
#   1. a token containing a number (optional word-character prefix and
#      hyphen, digits with an optional '.' or ',' decimal part, optional
#      word-character suffix),
#   2. a single word character followed by a hyphen and any word characters,
#   3. a word, a hyphen and one capital letter not followed by a word
#      character,
#   4. a plain run of word characters,
#   5. one punctuation character out of . , ( ) / [ ] { } ' " + -
# Characters matched by none of these (e.g. whitespace) are dropped.
def self.tokens(text)
  token_pattern = /
    \w*-?(?:\d*\d[.,]\d\d*|\d+)\w*|
    \w-\w*|
    \w+-[A-Z](?!\w)|
    \w+|
    [.,()\/\[\]{}'"+-]
  /x

  text.scan(token_pattern)
end

Instance Method Details

#config ⇒ Object



43
44
45
# File 'lib/rbbt/ner/rner.rb', line 43

# Returns the entry stored under +:define+ in +@config+. #train writes this
# value to the model's '.config' file.
# NOTE(review): +@config+ is not assigned by any method visible here;
# presumably the SimpleDSL parent fills it while parsing the definition that
# the constructor passes along under the :define name - confirm.
def config
  @config[:define]
end

#context(name, &block) ⇒ Object



51
52
53
54
55
56
57
58
59
60
61
62
63
# File 'lib/rbbt/ner/rner.rb', line 51

# Marks one or more features as context features, i.e. features for which
# #template also emits entries at the neighbouring window positions.
#
# name  - a single feature name, or an Array of names.
# block - when a single name is given together with a block, the block is
#         stored as that feature's action in +@types+.
#
# Returns the updated context list for an Array argument; otherwise the
# stored block, or nil when no block was given.
def context(name, &block)
  return (@context += name) if name.is_a?(Array)

  @context.push(name)

  # A block written after the feature definition may end up attached to this
  # call rather than to the definition itself, so register it here.
  @types[name] = block if block
end

#define(name, *args, &block) ⇒ Object



21
22
23
24
25
26
27
28
29
# File 'lib/rbbt/ner/rner.rb', line 21

# Registers a feature under +name+.
#
# name  - feature name; stored and returned as a String.
# args  - optional; the first element, if present, is used as the action.
# block - used as the action when no positional action is given.
#
# When neither an argument nor a block is supplied, the action defaults to a
# case-insensitive Regexp built from the name with an optional trailing "s".
#
# Returns the feature name as a String.
# Raises RuntimeError ("Wrong format") unless the action is a Proc or Regexp.
def define(name, *args, &block)
  key = name.to_s

  # No splat on the right-hand side: on Ruby >= 1.9 `x = *value` wraps the
  # value in an Array, which made the type check below fail for every call.
  action = args.first || block || /#{key}s?/i
  raise "Wrong format" unless action.is_a?(Proc) || action.is_a?(Regexp)

  @types[key] = action
  @order.push key

  key
end

#direction(dir) ⇒ Object



65
66
67
68
69
# File 'lib/rbbt/ner/rner.rb', line 65

# Switches the instance to reverse processing when +dir+ converts (via
# +to_sym+) to :reverse; any other value leaves +@reverse+ untouched.
#
# Returns true when the flag was set, nil otherwise.
def direction(dir)
  @reverse = true if dir.to_sym == :reverse
end

#features(word) ⇒ Object



71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
# File 'lib/rbbt/ner/rner.rb', line 71

# Computes the feature row for a single token.
#
# The row starts with +word+ itself, followed by one value per defined
# feature, in definition order:
#   * Proc action   - the result of calling it with +word+;
#   * Regexp action - the first capture group when the pattern matches and
#                     that group has a value, true when it matches without
#                     one, false when it does not match.
#
# Returns the row as an Array.
def features(word)
  computed = @order.map do |feature_name|
    action = @types[feature_name]
    next action.call(word) if action.is_a?(Proc)

    match = action.match(word)
    match ? (match[1] || true) : false
  end

  [word] + computed
end

#tagged_features(text, mentions) ⇒ Object



129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
# File 'lib/rbbt/ner/rner.rb', line 129

# Builds labelled feature rows for +text+, marking the tokens that belong to
# any of the given +mentions+.
#
# text     - the String to process.
# mentions - Array of mention strings (nil or empty means "no mentions").
#            Whitespace inside a mention matches any run of whitespace in
#            +text+.
#
# The text is split on the mentions; chunks alternate between non-mention
# and mention, and each chunk is passed to #text_features with the matching
# positive flag. In reverse mode the chunks are accumulated back to front.
#
# Returns the concatenated Array of feature rows.
def tagged_features(text, mentions)
  mentions ||= []
  # A pattern that is never expected to occur keeps the split below uniform.
  mentions = ['IMPOSSIBLE_MATCH'] if mentions.empty?

  alternatives = mentions.collect do |mention|
    quoted = Regexp.quote(mention.gsub(/\s+/, ' '))
    # Regexp.quote escapes a space as backslash-space; turn every such
    # escaped space into \s+ so the mention tolerates any whitespace run.
    # (The block form keeps the replacement text literal.)
    quoted.gsub('\\ ') { '\s+' }
  end
  mention_re = /(#{alternatives.join('|')})/

  positive = false
  features = []
  # The capture group makes split keep the matched mentions, so chunks
  # alternate: outside, mention, outside, ...
  text.split(mention_re).each do |chunk|
    chunk_features = text_features(chunk, positive)
    positive = !positive
    features = @reverse ? chunk_features + features : features + chunk_features
  end
  features
end

#template(window = nil) ⇒ Object



94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
# File 'lib/rbbt/ner/rner.rb', line 94

# Renders the feature template text, one "U..." line per entry and a final
# "B" line.
#
# window - Array of relative positions used for context features; defaults
#          to the value stored by #window, or [1, -1] when none was stored.
#
# Every defined feature gets an entry for the current position (offset 0).
# Features listed in +@context+ additionally get one entry per window
# position. Columns are numbered from 1 in definition order.
#
# Returns the template as a String.
def template(window = nil)
  positions = window || @window || [1, -1]
  lines = []

  @order.each.with_index(1) do |feat, column|
    lines << "U#{feat}: %x[0,#{column}]\n"
    next unless @context.include?(feat)

    positions.each do |offset|
      lines << "U#{feat}##{offset}: %x[#{offset},#{column}]\n"
    end
  end

  lines << "B\n"
  lines.join
end

#text_features(text, positive = nil) ⇒ Object



116
117
118
119
120
121
122
123
124
125
126
127
# File 'lib/rbbt/ner/rner.rb', line 116

# Tokenises +text+ and returns one feature row (see #features) per token.
#
# text     - the String to process; its tokens are reversed first when the
#            instance is in reverse mode.
# positive - nil leaves the rows unlabelled. Otherwise a label is appended
#            to every row: 0 when +positive+ is false; when it is true, 1
#            for the first token processed and 2 for every following token.
#
# Returns an Array of feature rows.
def text_features(text, positive = nil)
  source = @reverse ? self.class.reverse(text) : text
  first_token = true

  self.class.tokens(source).map do |token|
    row = features(token)
    unless positive.nil?
      label =
        if !positive
          0
        elsif first_token
          1
        else
          2
        end
      row << label
      first_token = false
    end
    row
  end
end

#train(features, model) ⇒ Object



151
152
153
154
155
156
157
158
159
# File 'lib/rbbt/ner/rner.rb', line 151

# Trains a model by running the bundled crf_learn binary.
#
# features - path of the training-features file handed to crf_learn.
# model    - path where crf_learn writes the model; the current #config is
#            saved next to it as "<model>.config".
#
# The template produced by #template is written to a temporary file for the
# duration of the run. The exit status of crf_learn is not checked.
#
# Returns the result of removing the temporary template file.
def train(features, model)
  tmp_template = TmpFile.tmp_file("template-")
  removed = nil

  begin
    Open.write(tmp_template, template)

    crf_learn = File.join(Rbbt.datadir, 'third_party/crf++/bin/crf_learn')
    # Pass the arguments as a list so no shell is involved: paths containing
    # quotes, spaces or other shell metacharacters are handed over verbatim.
    system(crf_learn, tmp_template.to_s, features.to_s, model.to_s)

    Open.write(model + '.config', config)
  ensure
    # Always clean up the temporary template, even if a step above raised;
    # rm_f so a template that was never created does not mask that error.
    removed = FileUtils.rm_f(tmp_template)
  end

  removed
end

#window(positions) ⇒ Object



47
48
49
# File 'lib/rbbt/ner/rner.rb', line 47

# Stores +positions+ in +@window+. #template uses this value when it is
# called without an explicit window, and falls back to [1, -1] when nothing
# has been stored.
#
# Returns the assigned +positions+.
def window(positions)
  @window = positions
end