Class: PosTagger::Tester

Inherits:

Object

Object
PosTagger::Tester

show all

Defined in: lib/pos-tagger.rb

Instance Attribute Summary collapse

#sentences ⇒ Object

Returns the value of attribute sentences.

Instance Method Summary collapse

#create_tagger ⇒ Object
#initialize(path = "#{File.dirname(__FILE__)}/treebank3_sect2.txt") ⇒ Tester constructor

A new instance of Tester.
#load(path) ⇒ Object
#test_tagger(k = 10, forget = nil) ⇒ Object
#test_tagger_with_sentence(tagger, sentence) ⇒ Object

Constructor Details

#initialize(path = "#{File.dirname(__FILE__)}/treebank3_sect2.txt") ⇒ `Tester`

Returns a new instance of Tester.

# File 'lib/pos-tagger.rb', line 159

# Creates a tester and immediately loads the tagged corpus found at +path+.
#
# Sentences are kept in @sentences as arrays of word-tag pairs, so each
# sentence looks like [{:word => w1, :tag => t1},...,{:word => wn, :tag => tn}].
#
# @param path [String] corpus file to read; by default treebank3_sect2.txt
#   located in the same directory as this source file
def initialize(path = "#{File.dirname(__FILE__)}/treebank3_sect2.txt")
  @sentences = []
  load(path)
end

Instance Attribute Details

#sentences ⇒ `Object`

Returns the value of attribute sentences.



157
158
159

# File 'lib/pos-tagger.rb', line 157

# Reader for the sentences currently held by this tester.
#
# @return [Object] the current value of @sentences
def sentences
  @sentences
end

Instance Method Details

#create_tagger ⇒ `Object`



188
189
190

# File 'lib/pos-tagger.rb', line 188

# Builds a new Tagger from every sentence currently held in @sentences.
#
# @return [Tagger] the freshly constructed tagger
def create_tagger
  Tagger.new(@sentences)
end

#load(path) ⇒ `Object`

# File 'lib/pos-tagger.rb', line 166

# Reads a tagged corpus from +path+ and appends its sentences to @sentences.
#
# The file is read line by line and each line is split on whitespace into
# "word/TAG" tokens. A token is divided at its last slash that still has at
# least one character after it, so the word part may itself contain slashes.
# Tokens without that shape (no slash, or nothing on one side of it) are
# skipped. Words are downcased; tags are stored as written.
#
# A token whose tag is "." closes the current sentence. That closing token is
# not stored, and empty sentences are never added. Tokens that follow the last
# "." in the file are kept as a final sentence instead of being discarded.
#
# @param path [String] location of the corpus file
# @return [nil]
def load(path)
  # Anchored, single-quantifier pattern: it selects the same split point as
  # the greedy original but avoids nested quantifiers, which can backtrack
  # badly on malformed tokens.
  token_pattern = %r{\A(?<word>.+)/(?<tag>.+)\z}
  sentence = []

  File.foreach(path) do |line|
    line.split(' ').each do |part|
      md = token_pattern.match(part)
      next unless md

      if md[:tag] == "."
        @sentences << sentence unless sentence.empty?
        sentence = []
      else
        sentence << {:word => md[:word].downcase, :tag => md[:tag]}
      end
    end
  end

  # Flush whatever was collected after the final full stop.
  @sentences << sentence unless sentence.empty?
  nil
end

#test_tagger(k = 10, forget = nil) ⇒ `Object`

# File 'lib/pos-tagger.rb', line 192

# Performs k-fold cross-validation of the tagger over @sentences, printing
# the accuracy of each fold and the overall average.
#
# The sentences are shuffled first so that repeated runs do not always test
# on the same partitions. The shuffled list is then cut into folds of
# total / k sentences (rounded down). For each fold the list is rotated by
# one fold size; the first fold-size sentences become the test set and the
# rest the training set.
#
# When +forget+ is given, the tagger is instead built from ALL of @sentences
# and +forget+ is passed through to Tagger.new; per the original notes this
# makes the tagger forget that fraction of its words at random while still
# computing bigrams from 100% of the data. It is off by default to allow
# proper k-fold validation.
#
# NOTE(review): relies on Array#mean, which is not core Ruby and is presumably
# defined elsewhere in this project - confirm.
#
# @param k [Integer] number of folds (default 10)
# @param forget [Float, nil] fraction of words to forget, e.g. 0.1, or nil
# @return the mean of the per-fold accuracies
# @raise [ArgumentError] if k is not positive
def test_tagger(k = 10, forget = nil)
  raise ArgumentError, "k must be a positive number of folds" unless k > 0

  sentences = @sentences.shuffle
  total = sentences.length
  # Each fold tests 1/k of the data. The previous formula (total * k / 100)
  # only matched this for k = 10.
  fold_size = (total / k).floor

  results = (0...k).map do |i|
    print "Starting fold #{i+1}..."
    sentences = sentences.rotate(fold_size)
    test = sentences[0...fold_size]
    train = sentences[fold_size...total]

    # With forget set, every sentence is used for training and the tagger is
    # told which fraction of its words to forget.
    training_set = forget ? @sentences : train
    c = Tagger.new(training_set, forget)

    # Per-sentence accuracies for this fold, averaged into one figure.
    percentage = test.map { |s| test_tagger_with_sentence(c, s) }
    fold_accuracy = percentage.mean

    # "%.2f" formats the accuracy percentage to two decimal places.
    puts "done"
    puts "Fold #{i+1} accuracy: #{"%.2f" % (fold_accuracy * 100)}%"
    fold_accuracy
  end

  overall = results.mean
  puts "Avg. #{k}-fold accuracy: #{"%.2f" % (overall * 100)}%"
  overall
end

#test_tagger_with_sentence(tagger, sentence) ⇒ `Object`

# File 'lib/pos-tagger.rb', line 243

# Scores a tagger on a single tagged sentence.
#
# The words of +sentence+ are handed to tagger.classify; each returned entry
# is paired positionally with the original word-tag hash and their :tag
# values are compared.
#
# @param tagger [#classify] object whose classify takes an array of words
# @param sentence [Array<Hash>] hashes carrying :word and :tag
# @return [Float] matching tags divided by the sentence length
def test_tagger_with_sentence(tagger, sentence)
  words = sentence.map { |token| token[:word] }
  predictions = tagger.classify(words)
  hits = predictions.zip(sentence).count { |guess, gold| guess[:tag] == gold[:tag] }
  hits.to_f / sentence.length
end

Class: PosTagger::Tester

Instance Attribute Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(path = "#{File.dirname(__FILE__)}/treebank3_sect2.txt") ⇒ Tester

Instance Attribute Details

#sentences ⇒ Object

Instance Method Details

#create_tagger ⇒ Object

#load(path) ⇒ Object

#test_tagger(k = 10, forget = nil) ⇒ Object

#test_tagger_with_sentence(tagger, sentence) ⇒ Object

#initialize(path = "#{File.dirname(__FILE__)}/treebank3_sect2.txt") ⇒ `Tester`

#sentences ⇒ `Object`

#create_tagger ⇒ `Object`

#load(path) ⇒ `Object`

#test_tagger(k = 10, forget = nil) ⇒ `Object`

#test_tagger_with_sentence(tagger, sentence) ⇒ `Object`