Class: NERFeatures

Inherits:
Object
  • Object
show all
Includes:
SimpleDSL
Defined in:
lib/rbbt/ner/rner.rb

Instance Attribute Summary collapse

Class Method Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(file = nil, reverse = false, &block) ⇒ NERFeatures

Returns a new instance of NERFeatures.



34
35
36
37
38
39
40
41
42
43
# File 'lib/rbbt/ner/rner.rb', line 34

# Builds a feature definition set, loading the definitions from a file or
# from the given block.
#
# file    - definitions file handed to +parse+. When neither a file nor a
#           block is supplied, the 'config.rb' entry found through
#           Rbbt.share.ner is used instead.
# reverse - initial value of the @reverse flag.
# block   - optional block handed to +parse+ together with +file+.
def initialize(file = nil, reverse = false, &block)
  @reverse = reverse
  @context = []
  @order   = []
  @types   = {}

  # Fall back to the shared configuration only when the caller gave us
  # nothing at all to parse.
  file = Rbbt.share.ner['config.rb'].find unless file || block

  parse(:define, file, &block)
end

Instance Attribute Details

#reverse ⇒ Object

Returns the value of attribute reverse.



33
34
35
# File 'lib/rbbt/ner/rner.rb', line 33

# Reader for the @reverse flag, which is assigned by the constructor and
# switched on by #direction.
def reverse
  @reverse
end

Class Method Details

.reverse(text) ⇒ Object



19
20
21
# File 'lib/rbbt/ner/rner.rb', line 19

# Splits +text+ with .tokens and returns the tokens in reverse order,
# joined by single spaces.
def self.reverse(text)
  reversed_tokens = tokens(text).reverse
  reversed_tokens.join(" ")
end

.tokens(text) ⇒ Object



9
10
11
12
13
14
15
16
17
# File 'lib/rbbt/ner/rner.rb', line 9

# Splits +text+ into an array of token strings; characters that match none
# of the alternatives (whitespace, for instance) are dropped.
#
# The alternatives are tried in this order at every position:
# 1. a token containing a number, optionally with a '.' or ',' decimal
#    part, an optional hyphen before the number and word characters on
#    either side ("IL-2", "p53", "1.5");
# 2. a single word character followed by a hyphen and more word characters;
# 3. word characters, a hyphen and one capital letter that is not followed
#    by a word character ("abc-D");
# 4. a plain run of word characters;
# 5. one punctuation character from the listed set.
def self.tokens(text)
  token_pattern = /
            \w*-?(?:\d*\d[.,]\d\d*|\d+)\w*|
            \w-\w*|
            \w+-[A-Z](?!\w)|
            \w+|
            [.,()\/\[\]{}'"+-]
            /x
  text.scan(token_pattern)
end

Instance Method Details

#config ⇒ Object



45
46
47
# File 'lib/rbbt/ner/rner.rb', line 45

# Returns the value stored under the :define key of @config.
# NOTE(review): @config is not assigned by any method shown for this class;
# presumably the included SimpleDSL populates it during parse -- confirm.
def config
  @config[:define]
end

#context(name, &block) ⇒ Object



53
54
55
56
57
58
59
60
61
62
63
64
65
# File 'lib/rbbt/ner/rner.rb', line 53

# Marks one or more features as context features, i.e. features for which
# #template also emits entries for the neighbouring window positions.
#
# Names are normalised with to_s because #define stores its keys (in @types
# and @order) as strings and #template looks them up as strings; a symbol
# stored here as-is would never match.
#
# name  - a feature name (String or Symbol) or an Array of feature names.
# block - optional feature implementation, only used when +name+ is a
#         single name (see the note in the body).
def context(name, &block)
  if name.is_a? Array
    @context += name.map(&:to_s)
  else
    key = name.to_s
    @context.push key

    # The block might be wrongly assigned to this function
    # instead of the actual definition, fix that.
    if block
      @types[key] = block
    end
  end
end

#define(name, *args, &block) ⇒ Object



23
24
25
26
27
28
29
30
31
# File 'lib/rbbt/ner/rner.rb', line 23

# Registers a feature under +name+ (stored as a string) and appends it to
# the evaluation order.
#
# The action is the first extra argument if given, otherwise the block,
# otherwise a case-insensitive regexp built from the name with an optional
# trailing "s".
#
# Raises RuntimeError ("Wrong format") when the action is neither a Proc
# nor a Regexp; nothing is registered in that case.
#
# Returns the feature name as a string.
def define(name, *args, &block)
  label  = name.to_s
  action = args.first || block || /#{name.to_s}s?/i

  case action
  when Proc, Regexp
    @types[label] = action
    @order << label
  else
    raise "Wrong format"
  end

  label
end

#direction(dir) ⇒ Object



67
68
69
70
71
# File 'lib/rbbt/ner/rner.rb', line 67

# Switches the @reverse flag on when +dir+ (anything responding to to_sym)
# is :reverse. Any other value leaves the flag as it is; the flag is never
# switched off here.
def direction(dir)
  @reverse = true if dir.to_sym == :reverse
end

#features(word) ⇒ Object



73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
# File 'lib/rbbt/ner/rner.rb', line 73

# Computes the feature row for a single token.
#
# The row starts with +word+ itself, followed by one value per registered
# feature, in registration order (@order):
# * Proc features contribute whatever the proc returns for +word+;
# * Regexp features contribute the first capture group when the match has
#   one, true when the regexp matches without it, and false when it does
#   not match.
def features(word)
  computed = @order.map do |feature_name|
    action = @types[feature_name]

    if action.is_a?(Proc)
      action.call(word)
    elsif (match = action.match(word))
      match[1] || true
    else
      false
    end
  end

  [word] + computed
end

#tagged_features(text, mentions) ⇒ Object



131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
# File 'lib/rbbt/ner/rner.rb', line 131

# Computes feature rows for +text+, labelling the tokens that belong to one
# of the +mentions+ as positive and all other tokens as negative.
#
# The text is split around the mentions (the separators are kept), so the
# resulting chunks alternate between plain text and mention text, starting
# with plain text. Each chunk is handed to #text_features together with its
# positive/negative flag. When @reverse is set the chunks are emitted in
# reverse order.
#
# Whitespace inside a mention matches any run of whitespace in the text, so
# "tumor necrosis" is found in "tumor  necrosis" or across a line break.
#
# text     - String to process.
# mentions - Array of mention strings; nil or empty means "no mentions".
#
# Returns the concatenated rows produced by #text_features.
def tagged_features(text, mentions)
  # A pattern that is not expected to occur keeps the split below uniform.
  mentions = ['IMPOSSIBLE_MATCH'] if !mentions || mentions.empty?

  # Quote every whitespace-separated part on its own and glue the parts
  # with \s+. Replacing the space after quoting is fragile because
  # Regexp.quote escapes a space as backslash-space.
  pattern = mentions.map { |mention|
    mention.split(/\s+/, -1).map { |part| Regexp.quote(part) }.join('\s+')
  }.join("|")

  positive = false
  per_chunk = text.split(/(#{pattern})/).map { |chunk|
    chunk_features = text_features(chunk, positive)
    positive = !positive
    chunk_features
  }

  per_chunk.reverse! if @reverse
  per_chunk.flatten(1)
end

#template(window = nil) ⇒ Object



96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
# File 'lib/rbbt/ner/rner.rb', line 96

# Builds the template text used for training (see #train).
#
# Every registered feature gets a unigram line for the current token; its
# column number is its 1-based position in @order. Features listed in
# @context additionally get one line per window offset. The template ends
# with a "B" line.
#
# window - Array of relative token offsets for context features; defaults
#          to @window and then to [1, -1].
#
# Returns the template as a String.
def template(window = nil)
  offsets = window || @window || [1, -1]
  lines = []

  @order.each_with_index do |feat, index|
    column = index + 1
    lines << "U#{ feat }: %x[0,#{ column }]\n"

    next unless @context.include?(feat)

    offsets.each do |offset|
      lines << "U#{ feat }##{ offset }: %x[#{ offset },#{ column }]\n"
    end
  end

  lines << "B\n"
  lines.join
end

#text_features(text, positive = nil) ⇒ Object



118
119
120
121
122
123
124
125
126
127
128
129
# File 'lib/rbbt/ner/rner.rb', line 118

# Tokenizes +text+ and returns one feature row (see #features) per token.
# When @reverse is set the token order is reversed first.
#
# positive - nil (default) leaves the rows unlabelled. Otherwise a label is
#            appended to every row: 0 when +positive+ is false, 1 for the
#            first processed token and 2 for the remaining tokens when it
#            is true.
def text_features(text, positive = nil)
  source = @reverse ? self.class.reverse(text) : text

  self.class.tokens(source).each_with_index.map do |token, position|
    row = features(token)

    unless positive.nil?
      label = if !positive
                0
              elsif position.zero?
                1
              else
                2
              end
      row << label
    end

    row
  end
end

#train(features, model) ⇒ Object



153
154
155
156
157
158
159
160
161
# File 'lib/rbbt/ner/rner.rb', line 153

# Trains a model by running the bundled crf_learn binary.
#
# The template produced by #template is written to a temporary file, then
# crf_learn is run with the template, the +features+ file and the +model+
# path, and finally the value of #config is written to "<model>.config".
#
# features - path of the training feature file.
# model    - path where the model is written.
#
# The exit status of crf_learn is not checked. The temporary template file
# is removed even when one of the steps raises.
def train(features, model)
  tmp_template = TmpFile.tmp_file("template-")
  Open.write(tmp_template, template)

  # Pass the arguments as a list so no shell is involved and paths that
  # contain quotes or other shell metacharacters are handed over verbatim.
  crf_learn = File.join(Rbbt.datadir, 'third_party/crf++/bin/crf_learn')
  system(crf_learn, tmp_template.to_s, features.to_s, model.to_s)

  Open.write(model + '.config', config)
ensure
  # rm_f so a missing file cannot mask the exception that brought us here.
  FileUtils.rm_f tmp_template if tmp_template
end

#window(positions) ⇒ Object



49
50
51
# File 'lib/rbbt/ner/rner.rb', line 49

# Stores +positions+ in @window. #template falls back to this value when it
# is called without an explicit window.
def window(positions)
  @window = positions
end