Class: NGramPrefixDictionary

Inherits:
NER
  • Object
show all
Defined in:
lib/rbbt/ner/ngram_prefix_dictionary.rb

Overview

This code was adapted from Ashish Tendulkar (ASK MARTIN)

Constant Summary collapse

# Punctuation tokens treated as delimiters between candidate terms.
# NOTE(review): inside %w(...) the `\'` entry is presumably kept as the
# two-character string backslash + apostrophe, so its first character (used
# below) would be a backslash rather than an apostrophe -- confirm intent.
STOP_LETTERS =
%w(\' " ( ) { } [ ] - ? ! < ; : > . ,)
# First character of every stop token plus newline, carriage return and space.
# .match uses this list to skip over a delimiter sitting at the current offset.
STOP_LETTER_CHAR_VALUES =
STOP_LETTERS.collect{|l| l[0]} + ["\n", "\r", " "].collect{|l| l[0]}
# Character class built from every character in STOP_LETTERS plus "\n", "\r"
# and " ". .match uses it to find the next delimiter after a failed lookup.
LETTER_REGEXP =
Regexp.compile(/[#{Regexp.quote((STOP_LETTERS + ["\n", "\r", " "]) * "")}]/)

Instance Attribute Summary collapse

Class Method Summary collapse

Instance Method Summary collapse

Methods inherited from NER

#entities

Constructor Details

#initialize(file, type = nil, case_insensitive = false) ⇒ NGramPrefixDictionary

Returns a new instance of NGramPrefixDictionary.



128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
# File 'lib/rbbt/ner/ngram_prefix_dictionary.rb', line 128

# Builds the ngram index from a lexicon.
#
# file             - lexicon source; one of: a TSV or Hash, a Path, a value
#                    accepted by Misc.is_filename?, or a StreamIO.
# type             - stored in @type (handed to NamedEntity.setup by #match).
# case_insensitive - when true, lexicon names are downcased while indexing.
#
# Raises RuntimeError when +file+ is none of the supported kinds.
def initialize(file, type = nil, case_insensitive = false)
  @type = type
  @case_insensitive = case_insensitive
  case
  when (TSV === file || Hash === file)
    Log.debug("Ngram Prefix Dictionary. Loading of lexicon hash started.")
    @index = NGramPrefixDictionary.process_hash(file, case_insensitive)
  when Path === file
    Log.debug("Ngram Prefix Dictionary. Loading of lexicon file started: #{ file }.")
    @index = NGramPrefixDictionary.process_stream(file.open, case_insensitive)
  when Misc.is_filename?(file)
    Log.debug("Ngram Prefix Dictionary. Loading of lexicon file started: #{ file }.")
    # The flag must be forwarded here as in every other branch; without it a
    # lexicon loaded by filename is indexed with its original casing while
    # #match downcases the text, so mixed-case names can never be found.
    @index = NGramPrefixDictionary.process_stream(Open.open(file), case_insensitive)
  when StreamIO === file
    Log.debug("Ngram Prefix Dictionary. Loading of lexicon stream started.")
    @index = NGramPrefixDictionary.process_stream(file, case_insensitive)
  else
    raise "Format of lexicon not understood: #{file.inspect}"
  end

  Log.debug("Ngram Prefix Dictionary. Loading done.")
end

Instance Attribute Details

#case_insensitive ⇒ Object

Returns the value of attribute case_insensitive.



127
128
129
# File 'lib/rbbt/ner/ngram_prefix_dictionary.rb', line 127

# Reader for @case_insensitive, the flag assigned from the constructor's
# third argument.
def case_insensitive
  @case_insensitive
end

#index ⇒ Object

Returns the value of attribute index.



127
128
129
# File 'lib/rbbt/ner/ngram_prefix_dictionary.rb', line 127

# Reader for @index, the ngram lookup table built in the constructor by
# process_hash or process_stream.
def index
  @index
end

#type ⇒ Object

Returns the value of attribute type.



127
128
129
# File 'lib/rbbt/ner/ngram_prefix_dictionary.rb', line 127

# Reader for @type, the value assigned from the constructor's second argument.
def type
  @type
end

Class Method Details

.match(index, text) ⇒ Object



84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
# File 'lib/rbbt/ner/ngram_prefix_dictionary.rb', line 84

# Scans +text+ from left to right and collects the dictionary names found.
#
# At every offset that does not hold a stop character, the next three
# characters (stripped) are looked up in +index+. Candidates are tried in
# their stored order and the first one the text starts with at that offset
# wins. After a hit, scanning resumes right behind the matched name; after a
# miss, it resumes one character past the next LETTER_REGEXP delimiter.
#
# index - maps an ngram String to an Array of [name, code] pairs.
# text  - String to scan; nil or empty produces no matches.
#
# Returns an Array of [name, code, offset] triples, where offset is the
# character offset of the match within +text+.
def self.match(index, text)
  return [] if text.nil? || text.empty?

  matches = []
  text_length = text.length
  text_offset = 0

  while text_offset && text_offset < text_length
    if STOP_LETTER_CHAR_VALUES.include?(text[text_offset])
      text_offset += 1
      next
    end

    ngram = text.slice(text_offset, 3).strip
    found = nil

    if index.include?(ngram)
      # Only computed when there are candidates: the prefix slice is linear in
      # the offset. fast_start_with presumably expects a byte offset -- the
      # value passed is the byte size of everything before the current char.
      text_byte_offset = text[0, text_offset].bytesize
      remaining = text_length - text_offset

      index[ngram].each do |name, code|
        # A zero-length name would "match" without advancing the offset and
        # spin this loop forever.
        next if name.empty?
        next if name.length > remaining
        next unless fast_start_with(text, name, text_byte_offset)

        found = [name.dup, code, text_offset]
        break
      end
    end

    if found
      matches << found
      text_offset += found.first.length
    else
      # No delimiter left means index returns nil, which ends the scan.
      text_offset = text.index(LETTER_REGEXP, text_offset)
      text_offset += 1 if text_offset
    end
  end

  matches
end

.process_hash(hash, case_insensitive = false) ⇒ Object



64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
# File 'lib/rbbt/ner/ngram_prefix_dictionary.rb', line 64

# Builds the ngram index from a hash-like lexicon.
#
# hash             - maps a code to its name(s). Iterated with #through when
#                    the object responds to it, otherwise with #each. The
#                    monitor/unnamed switches are turned on when available.
# case_insensitive - when true, names are downcased before being indexed.
#
# Returns a Hash mapping the stripped first three characters of each name to
# an Array of [name, code] pairs.
def self.process_hash(hash, case_insensitive = false)
  index = {}

  hash.monitor = true if hash.respond_to? :monitor
  hash.unnamed = true if hash.respond_to? :unnamed
  iterator = hash.respond_to?(:through) ? :through : :each

  hash.send(iterator) do |code, names|
    # Array() tolerates a nil or single-String value for a code.
    Array(names).each do |name|
      # Same filtering as process_stream: blank names carry no information and
      # would be filed under the "" ngram as zero-length match candidates.
      next if name.nil? || name.empty?

      name = name.downcase if case_insensitive
      (index[name[0..2].strip] ||= []) << [name, code]
    end
  end

  index
end

.process_stream(stream, case_insensitive = false) ⇒ Object



46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
# File 'lib/rbbt/ner/ngram_prefix_dictionary.rb', line 46

# Builds the ngram index from a line-oriented lexicon stream.
#
# Each line is split on tabs and "|"; the first non-empty field is the code
# and the remaining non-empty fields are its names.
#
# stream           - any object responding to #gets (returning nil at EOF).
# case_insensitive - when true, names are downcased before being indexed.
#
# Returns a Hash mapping the stripped first three characters of each name to
# an Array of [name, code] pairs.
def self.process_stream(stream, case_insensitive = false)
  index = {}

  while (line = stream.gets)
    # Drop the line terminator first; otherwise the last name of every line
    # is stored with a trailing "\n" and only matches text that happens to be
    # followed by a newline.
    code, *names = line.chomp.split(/\t|\|/).reject { |field| field.empty? }

    names.each do |name|
      name = name.downcase if case_insensitive
      (index[name[0..2].strip] ||= []) << [name, code]
    end
  end

  index
end

Instance Method Details

#match(text) ⇒ Object



151
152
153
154
155
156
157
158
159
160
161
162
# File 'lib/rbbt/ner/ngram_prefix_dictionary.rb', line 151

# Finds dictionary entries in +text+ and wraps each hit as a named entity.
#
# text - String to scan; nil or empty yields an empty Array (the same answer
#        the class-level matcher gives), instead of failing on text.downcase
#        when case_insensitive is set.
#
# Returns an Array with one NamedEntity.setup result per hit, built from the
# matched name, its offset, this dictionary's type and the entry's code.
def match(text)
  return [] if text.nil? || text.empty?

  haystack = case_insensitive ? text.downcase : text
  matches = NGramPrefixDictionary.match(index, haystack).collect do |name, code, offset|
    NamedEntity.setup(name, offset, type, code)
  end

  # The scan ran on the downcased text, so swap each hit's content for the
  # corresponding slice of the original text to restore its casing.
  matches.each { |m| m.replace(text[m.range]) } if case_insensitive

  matches
end