Class: Tokenizer

Inherits:
Object
  • Object
show all
Includes:
SimpleDSL
Defined in:
lib/rbbt/ner/rnorm/tokens.rb

Defined Under Namespace

Classes: Custom, Operation, Transform

Constant Summary collapse

GREEK_RE =

{{{ Token Types

"(?:" + $greek.keys.select{|w| w.length > 3}.collect{|w| w.downcase}.join("|") + ")"
@@ignore_case =

{{{ Classes for Comparisons

true

Class Method Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(file = nil, &block) ⇒ Tokenizer

{{{ Initialize



161
162
163
164
165
166
167
168
169
170
# File 'lib/rbbt/ner/rnorm/tokens.rb', line 161

# Sets up empty token/comparison registries and loads the configuration.
#
# When neither +file+ nor a block is given, the file is obtained from
# Rbbt.share.rnorm.tokens_default.produce. If the resulting object responds
# to +find+, it is replaced by the value +find+ returns. The configuration
# is then handed to load_config under the :main key, together with the block.
#
# @param file [Object, nil] configuration source (optional)
# @param block [Proc] optional configuration block forwarded to load_config
def initialize(file = nil, &block)
  @types, @order = {}, []
  @operations, @transforms = [], []

  # Fall back to the default definition only when nothing at all was supplied.
  file = Rbbt.share.rnorm.tokens_default.produce unless file || block
  file = file.find if file.respond_to?(:find)

  load_config(:main, file, &block)
end

Class Method Details

.ignore_case(ignore = nil) ⇒ Object



13
14
15
16
17
18
19
# File 'lib/rbbt/ner/rnorm/tokens.rb', line 13

# Combined reader/writer for the class-wide @@ignore_case flag.
#
# @param ignore [Object, nil] new value for the flag; nil (the default)
#   leaves the flag untouched and just reads it
# @return [Object] the current flag when +ignore+ is nil, otherwise the
#   value that was just stored
def self.ignore_case(ignore = nil)
  return @@ignore_case if ignore.nil?

  @@ignore_case = ignore
end

Instance Method Details

#define_comparisons(name, *args, &block) ⇒ Object



140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
# File 'lib/rbbt/ner/rnorm/tokens.rb', line 140

# Creates the comparison object that corresponds to +name+ and registers it.
#
# * :transform -> a Transform, appended to @transforms
# * :compare   -> a Custom, appended to @operations
# * otherwise  -> an Operation built from +name+, appended to @operations
#
# +args+ and +block+ are accepted but not used by this method.
#
# @param name [#to_sym] kind of comparison being defined
# @return [Object] the newly created Custom, Transform or Operation
def define_comparisons(name, *args, &block)
  kind = name.to_sym

  if kind == :transform
    step = Transform.new
    @transforms << step
  else
    step = kind == :compare ? Custom.new : Operation.new(name)
    @operations << step
  end

  step
end

#define_tokens(name, *args, &block) ⇒ Object

{{{ Metaprogramming hooks



126
127
128
129
130
131
132
133
134
135
136
137
138
# File 'lib/rbbt/ner/rnorm/tokens.rb', line 126

# Registers a token type called +name+.
#
# The matcher is the first extra argument if present, else the given block,
# else a case-insensitive regexp built from the name with an optional
# trailing "s". The type is stored in @types and appended to @order.
#
# @param name [#to_sym, #to_s] name of the token type
# @param args [Array] only the first element is considered
# @param block [Proc] matcher used when no extra argument is given
# @return [Symbol] the registered type name
# @raise [RuntimeError] "Wrong format" when the matcher is neither a Proc
#   nor a Regexp
def define_tokens(name, *args, &block)
  matcher = args.first || block || /#{name}s?/i

  # HACK: mysterious error where *args[0] returns an array such as
  # [/regexp/i]; the workaround below is kept disabled.
  # matcher = matcher.first if Array === matcher
  case matcher
  when Proc, Regexp
    # accepted matcher kinds
  else
    raise "Wrong format"
  end

  key = name.to_sym
  @types[key] = matcher
  @order << key

  key
end

#evaluate(mention, name) ⇒ Object



214
215
216
217
218
219
220
221
222
223
# File 'lib/rbbt/ner/rnorm/tokens.rb', line 214

# Scores a mention against a name.
#
# Both strings go through token_types; every resulting [token, type] pair is
# then passed through each object in @transforms, in order, via #transform.
# The two transformed lists are finally handed to evaluate_tokens.
#
# @param mention [String] text to be tokenized and compared
# @param name [String] text it is compared against
# @return [Object] whatever evaluate_tokens returns for the two lists
def evaluate(mention, name)
  transformed = [mention, name].map do |text|
    token_types(text).map do |typed_token|
      # Thread the pair through the transforms, each one feeding the next.
      @transforms.reduce(typed_token) { |current, step| step.transform(current) }
    end
  end

  mention_tokens, name_tokens = transformed
  evaluate_tokens(mention_tokens, name_tokens)
end

#evaluate_tokens(list1, list2) ⇒ Object

{{{ Comparisons



208
209
210
211
212
# File 'lib/rbbt/ner/rnorm/tokens.rb', line 208

# Adds up the result of every registered operation for two token lists.
#
# Each object in @operations is asked to +eval+ the two lists; the results
# are accumulated with + starting from 0.
#
# @param list1 [Array] first token list
# @param list2 [Array] second token list
# @return [Object] the accumulated total (0 when there are no operations)
def evaluate_tokens(list1, list2)
  total = 0
  @operations.each do |operation|
    total += operation.eval(list1, list2)
  end
  total
end

#main(name, *args, &block) ⇒ Object



156
157
158
# File 'lib/rbbt/ner/rnorm/tokens.rb', line 156

# Entry hook for the :main configuration section: forwards the block to
# +parse+ together with the handler name "define_<name>".
#
# +args+ is accepted but not used.
#
# @param name [#to_s] section name appended to the "define_" prefix
# @param block [Proc] block handed to parse as its second argument
# @return [Object] whatever parse returns
def main(name, *args, &block)
  handler = "define_#{name}"
  parse(handler, block)
end

#token_types(word) ⇒ Object



200
201
202
203
204
# File 'lib/rbbt/ner/rnorm/tokens.rb', line 200

# Splits +word+ with tokenize and pairs every token with its type.
#
# @param word [String] text to tokenize
# @return [Array<Array>] one [token, type] pair per token, in token order
def token_types(word)
  tokenize(word).map do |piece|
    [piece, type(piece)]
  end
end

#tokenize(word) ⇒ Object



175
176
177
178
179
180
181
182
183
184
185
# File 'lib/rbbt/ner/rnorm/tokens.rb', line 175

# Breaks +word+ into tokens.
#
# Separators are inserted step by step, then the text is split on every run
# of characters that are neither word characters nor dots. Empty pieces are
# discarded.
#
# @param word [String] text to split
# @return [Array<String>] the non-empty tokens, in order
def tokenize(word)
  # Separate a trailing roman numeral "I" that does not follow I, V or X.
  text = word.gsub(/([^IVX])I$/, '\1|I|')
  # Separate numbers (optionally containing one "," or "." between digits).
  text = text.gsub(/(\d+[,.]?\d+|\d+)/, '|\1|')
  # Break at a lowercase letter followed by an uppercase one.
  text = text.gsub(/([a-z])([A-Z])/, '\1-\2')
  # Break between a run of two or more uppercase letters and a lowercase one.
  text = text.gsub(/([A-Z]{2,})([a-z])/, '\1-\2')
  # Detach a GREEK_RE match at the start or at the end of a line.
  text = text.gsub(/^(#{GREEK_RE})/, '\1-')
  text = text.gsub(/(#{GREEK_RE})$/, '-\1')

  # Split by separator characters and drop empty pieces.
  text.split(/[^\w.]+/).reject(&:empty?)
end

#type(token) ⇒ Object



188
189
190
191
192
193
194
195
196
197
198
# File 'lib/rbbt/ner/rnorm/tokens.rb', line 188

# Determines the type of +token+.
#
# Registered types are tried in the order kept in @order. A Proc matcher is
# called with the token; any other matcher is asked to +match+ it. The first
# type whose matcher gives a truthy result wins.
#
# @param token [String] token to classify
# @return [Symbol] the first matching type name, or :unknown when none match
def type(token)
  matched = @order.find do |kind|
    matcher = @types[kind]
    matcher.is_a?(Proc) ? matcher.call(token) : matcher.match(token)
  end

  matched || :unknown
end