Class: Tokenizer
- Inherits:
-
Object
show all
- Includes:
- SimpleDSL
- Defined in:
- lib/rbbt/ner/rnorm/tokens.rb
Defined Under Namespace
Classes: Custom, Operation, Transform
Constant Summary
collapse
- GREEK_RE =
"(?:" + $greek.keys.select{|w| w.length > 3}.collect{|w| w.downcase}.join("|") + ")"
- @@ignore_case =
{{{ Classes for Comparisons
true
Class Method Summary
collapse
Instance Method Summary
collapse
Constructor Details
#initialize(file = nil, &block) ⇒ Tokenizer
161
162
163
164
165
166
167
168
169
170
|
# File 'lib/rbbt/ner/rnorm/tokens.rb', line 161
# Sets up empty token-type and comparison registries, then loads the
# tokenizer configuration through load_config.
#
# When neither +file+ nor a block is given, the configuration source is taken
# from Rbbt.share.rnorm.tokens_default.produce. A +file+ that responds to
# +find+ is replaced by the result of calling +find+ on it before loading.
#
# @param file [Object, nil] configuration source handed to load_config
# @param block [Proc] optional block forwarded to load_config
def initialize(file=nil, &block)
  @types      = {}
  @order      = []
  @operations = []
  @transforms = []

  # Only fall back to the shared default when the caller supplied nothing.
  file = Rbbt.share.rnorm.tokens_default.produce unless file || block
  file = file.find if file.respond_to?(:find)

  load_config(:main, file, &block)
end
|
Class Method Details
.ignore_case(ignore = nil) ⇒ Object
13
14
15
16
17
18
19
|
# File 'lib/rbbt/ner/rnorm/tokens.rb', line 13
# Combined reader/writer for the class-wide @@ignore_case setting.
#
# @param ignore [Object, nil] new value to store; nil means "just read"
# @return [Object] the current setting when +ignore+ is nil, otherwise the
#   value that was just stored
def self.ignore_case(ignore = nil)
  return @@ignore_case if ignore.nil?

  @@ignore_case = ignore
end
|
Instance Method Details
#define_comparisons(name, *args, &block) ⇒ Object
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
|
# File 'lib/rbbt/ner/rnorm/tokens.rb', line 140
# Creates a comparison object for +name+ and registers it.
#
# * :transform -> a new Transform, appended to @transforms
# * :compare   -> a new Custom, appended to @operations
# * otherwise  -> a new Operation built from +name+, appended to @operations
#
# +args+ and +block+ are accepted but not used here.
#
# @param name [#to_sym] kind of comparison to define
# @return [Object] the object that was created and registered
def define_comparisons(name, *args, &block)
  kind = name.to_sym

  if kind == :transform
    handler = Transform.new
    @transforms << handler
  else
    # Everything that is not a transform is kept in the operations list.
    handler = kind == :compare ? Custom.new : Operation.new(name)
    @operations << handler
  end

  handler
end
|
#define_tokens(name, *args, &block) ⇒ Object
Metaprogramming hooks
126
127
128
129
130
131
132
133
134
135
136
137
138
|
# File 'lib/rbbt/ner/rnorm/tokens.rb', line 126
# Registers a token type called +name+.
#
# The matcher is the first extra argument if present, otherwise the block,
# otherwise a case-insensitive regexp built from the name followed by an
# optional "s". The type is stored in @types and appended to @order.
#
# @param name [#to_sym, #to_s] name of the token type
# @param args [Array] optional matcher as first element (Proc or Regexp)
# @param block [Proc] optional matcher used when no argument is given
# @return [Symbol] the registered type name
# @raise [RuntimeError] "Wrong format" when the matcher is neither a Proc
#   nor a Regexp
def define_tokens(name, *args, &block)
  matcher = args.first || block || /#{name.to_s}s?/i

  case matcher
  when Proc, Regexp
    key = name.to_sym
    @types[key] = matcher
    @order << key
    key
  else
    raise "Wrong format"
  end
end
|
#evaluate(mention, name) ⇒ Object
214
215
216
217
218
219
220
221
222
223
|
# File 'lib/rbbt/ner/rnorm/tokens.rb', line 214
# Compares +mention+ against +name+.
#
# Both strings are turned into typed tokens with token_types, every typed
# token is passed through each registered transform in order, and the two
# resulting lists are handed to evaluate_tokens.
#
# @param mention [String] first text to compare
# @param name [String] second text to compare
# @return [Object] whatever evaluate_tokens returns for the two token lists
def evaluate(mention, name)
  prepared = [mention, name].map do |text|
    token_types(text).map do |typed_token|
      # Each transform receives the output of the previous one.
      @transforms.reduce(typed_token) { |current, step| step.transform(current) }
    end
  end

  evaluate_tokens(prepared.first, prepared.last)
end
|
#evaluate_tokens(list1, list2) ⇒ Object
208
209
210
211
212
|
# File 'lib/rbbt/ner/rnorm/tokens.rb', line 208
# Adds up the result of every registered operation for the two token lists.
#
# @param list1 [Array] first token list, passed to each operation's eval
# @param list2 [Array] second token list, passed to each operation's eval
# @return [Numeric] 0 plus the eval result of each operation in @operations
def evaluate_tokens(list1, list2)
  total = 0
  @operations.each do |operation|
    total += operation.eval(list1, list2)
  end
  total
end
|
#main(name, *args, &block) ⇒ Object
156
157
158
|
# File 'lib/rbbt/ner/rnorm/tokens.rb', line 156
# Forwards a configuration entry to parse, naming the handler
# "define_<name>" and passing the block along as a plain argument.
# Extra positional arguments are accepted but not used.
#
# @param name [#to_s] suffix of the define_* method to target
# @param block [Proc] block handed to parse as its second argument
# @return [Object] whatever parse returns
def main(name, *args, &block)
  handler_name = "define_#{name}"
  parse(handler_name, block)
end
|
#token_types(word) ⇒ Object
200
201
202
203
204
|
# File 'lib/rbbt/ner/rnorm/tokens.rb', line 200
# Splits +word+ with tokenize and pairs every token with its type.
#
# @param word [String] text to tokenize
# @return [Array<Array>] one [token, type] pair per token, in token order
def token_types(word)
  tokens = tokenize(word)
  kinds  = tokens.map { |piece| type(piece) }
  tokens.zip(kinds)
end
|
#tokenize(word) ⇒ Object
175
176
177
178
179
180
181
182
183
184
185
|
# File 'lib/rbbt/ner/rnorm/tokens.rb', line 175
# Breaks +word+ into tokens.
#
# Separator characters are first inserted around the pieces of interest,
# then the string is split on every run of characters that are neither word
# characters nor ".", and empty pieces are dropped.
#
# @param word [String] text to split
# @return [Array<String>] the non-empty tokens, in order of appearance
def tokenize(word)
  # A final "I" preceded by anything other than I, V or X becomes its own piece.
  marked = word.gsub(/([^IVX])I$/, '\1|I|')
  # Numbers (digits, optionally with a single "," or "." inside) are isolated.
  marked = marked.gsub(/(\d+[,.]?\d+|\d+)/, '|\1|')
  # Lowercase letter followed by an uppercase letter: break between them.
  marked = marked.gsub(/([a-z])([A-Z])/, '\1-\2')
  # Run of two or more uppercase letters followed by a lowercase letter: break after the run.
  marked = marked.gsub(/([A-Z]{2,})([a-z])/, '\1-\2')
  # A GREEK_RE match at the start or at the end is detached from the rest.
  marked = marked.gsub(/^(#{GREEK_RE})/, '\1-')
  marked = marked.gsub(/(#{GREEK_RE})$/, '-\1')

  marked.split(/[^\w.]+/).reject { |piece| piece.empty? }
end
|
#type(token) ⇒ Object
188
189
190
191
192
193
194
195
196
197
198
|
# File 'lib/rbbt/ner/rnorm/tokens.rb', line 188
# Determines the type of +token+.
#
# Registered types are tried in the order stored in @order; a Proc matcher is
# called with the token, any other matcher is asked to +match+ it. The first
# type whose matcher gives a truthy result wins.
#
# @param token [String] token to classify
# @return [Symbol] the first matching type name, or :unknown when none match
def type(token)
  winner = @order.find do |candidate|
    rule = @types[candidate]
    rule.is_a?(Proc) ? rule.call(token) : rule.match(token)
  end

  winner || :unknown
end
|