Class: PosTagger::Tester

Inherits:
Object
  • Object
show all
Defined in:
lib/pos-tagger.rb

Instance Attribute Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(path = "#{File.dirname(__FILE__)}/treebank3_sect2.txt") ⇒ Tester

Returns a new instance of Tester.



159
160
161
162
163
164
# File 'lib/pos-tagger.rb', line 159

# Creates a tester and immediately reads the corpus found at +path+.
#
# @param path [String] corpus file to read; defaults to
#   treebank3_sect2.txt in the same directory as this source file
def initialize(path = "#{File.dirname(__FILE__)}/treebank3_sect2.txt")
  # Each sentence is an array of word/tag hashes:
  #   [{:word => w1, :tag => t1}, ..., {:word => wn, :tag => tn}]
  @sentences = []

  # Explicit receiver: this is the instance's own #load, not Kernel#load.
  self.load(path)
end

Instance Attribute Details

#sentences ⇒ Object

Returns the value of attribute sentences.



157
158
159
# File 'lib/pos-tagger.rb', line 157

# Reader for the @sentences instance variable.
#
# @return [Object] whatever is currently stored in @sentences
def sentences
  @sentences
end

Instance Method Details

#create_tagger ⇒ Object



188
189
190
# File 'lib/pos-tagger.rb', line 188

# Builds a Tagger from everything currently held in @sentences.
#
# @return [Tagger] the object returned by Tagger.new
def create_tagger
  Tagger.new(@sentences)
end

#load(path) ⇒ Object



166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
# File 'lib/pos-tagger.rb', line 166

# Reads a tagged corpus from +path+ and appends its sentences to @sentences.
#
# Every line is split on whitespace into tokens. A token of the form
# word/TAG is split at the last "/" that still leaves a non-empty tag, so
# the word part may itself contain slashes. Tokens that do not have that
# shape are skipped. A token whose tag is "." closes the current sentence
# (that token itself is not stored); any other token is stored as
# {:word => <downcased word>, :tag => <tag>}. Empty sentences are never
# appended.
#
# @param path [String] corpus file to read
# @return [nil]
def load path
  # Anchored, with one quantifier per group. It selects the same split point
  # as the previous /(.+)+(\/){1}(.+)+/ but without nested quantifiers,
  # which could backtrack exponentially on a token that cannot be split
  # (for example a long token ending in "/").
  token_pattern = %r{\A(.+)/(.+)\z}
  sentence = []

  File.open(path, "r") do |file|
    file.each_line do |line|
      line.split(' ').each do |part|
        md = token_pattern.match(part)
        next unless md

        word = md[1]
        tag = md[2]
        if tag == "."
          @sentences << sentence unless sentence.empty?
          sentence = []
        else
          sentence << {:word => word.downcase, :tag => tag}
        end
      end
    end
  end

  # A file that does not end with a "."-tagged token would otherwise lose
  # its final sentence.
  @sentences << sentence unless sentence.empty?
  nil
end

#test_tagger(k = 10, forget = nil) ⇒ Object



192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
# File 'lib/pos-tagger.rb', line 192

# Runs k-fold cross-validation over @sentences and prints the accuracy of
# each fold followed by the average over all folds.
#
# The sentences are shuffled first, so repeated calls test on different
# splits. Each fold holds floor(total / k) sentences; when the total is not
# a multiple of k, the leftover sentences are only ever used for training.
#
# @param k [Numeric] number of folds; must be positive (default 10)
# @param forget [Object, nil] forwarded as the second argument of
#   Tagger.new. When truthy, the tagger is built from ALL of @sentences
#   instead of the fold's training split. Per the original author's note it
#   is a fraction (e.g. 0.1) of words the tagger should forget; Tagger is
#   not visible here, so confirm against its implementation.
# @return [Object] the mean of the per-fold accuracies, as returned by
#   Array#mean (not core Ruby; presumably defined elsewhere in the project)
# @raise [ArgumentError] if k is not positive
def test_tagger(k = 10, forget = nil)
  raise ArgumentError, "k must be positive" unless k > 0

  sentences = @sentences.shuffle
  total = sentences.length

  # Size of one fold. The previous total * k / 100 only equals total / k
  # when k is 10; for any other k the folds had the wrong size.
  offset = (total / k.to_f).floor

  # For each fold, rotate the list by one fold width so a fresh slice sits
  # at the front, then take that slice as the test set and the remainder as
  # the training set.
  results = (0...k).map do |i|
    print "Starting fold #{i+1}..."
    sentences = sentences.rotate offset
    test = sentences[0...offset]
    train = sentences[offset...total]

    # With forget set, the tagger sees every sentence and forget is passed
    # along; otherwise it is built from the training split only.
    c = Tagger.new(forget ? @sentences : train, forget)

    # Accuracy of each test sentence, averaged into one figure per fold.
    accuracy = test.map { |s| test_tagger_with_sentence c, s }.mean

    # "%.2f" formats the percentage to two decimal places.
    puts "done"
    puts "Fold #{i+1} accuracy: #{"%.2f" % (accuracy * 100)}%"
    accuracy
  end

  overall = results.mean
  puts "Avg. #{k}-fold accuracy: #{"%.2f" % (overall * 100)}%"
  overall
end

#test_tagger_with_sentence(tagger, sentence) ⇒ Object



243
244
245
246
247
# File 'lib/pos-tagger.rb', line 243

# Scores one sentence: the fraction of positions where the tag returned by
# the tagger equals the tag stored in the sentence.
#
# @param tagger [#classify] called with the sentence's words, in order; its
#   result is indexed with [:tag] at each position
# @param sentence [Array<Hash>] hashes read via :word and :tag
# @return [Float] matching tags divided by the sentence length
def test_tagger_with_sentence tagger, sentence
  words = sentence.map { |token| token[:word] }
  predictions = tagger.classify(words)

  # Pair each prediction with the expected token at the same position.
  hits = predictions.zip(sentence).count do |guess, gold|
    guess[:tag] == gold[:tag]
  end

  hits.to_f / sentence.length
end