Class: EngTagger

Inherits:

Object

Object
EngTagger

show all

Extended by:: BoundedSpaceMemoizable

Defined in:: lib/engtagger.rb,
lib/engtagger/version.rb

Overview

English part-of-speech tagger class

Constant Summary collapse

DEFAULT_LEXPATH = File paths

File.join(File.dirname(__FILE__), 'engtagger')

DEFAULT_WORDPATH =

File.join(DEFAULT_LEXPATH, "pos_words.hash")

DEFAULT_TAGPATH =

File.join(DEFAULT_LEXPATH, "pos_tags.hash")

NUM = Regexps to match XML-style part-of-speech tags

get_ext('cd')

GER =

get_ext('vbg')

ADJ =

get_ext('jj[rs]*')

NN =

get_ext('nn[sp]*')

NNP =

get_ext('nnp')

PREP =

get_ext('in')

DET =

get_ext('det')

PAREN =

get_ext('[lr]rb')

QUOT =

get_ext('ppr')

SEN =

get_ext('pp')

WORD =

get_ext('\w+')

VB =

get_ext('vb')

VBG =

get_ext('vbg')

VBD =

get_ext('vbd')

PART =

get_ext('vbn')

VBP =

get_ext('vbp')

VBZ =

get_ext('vbz')

JJ =

get_ext('jj')

JJR =

get_ext('jjr')

JJS =

get_ext('jjs')

RB =

get_ext('rb')

RBR =

get_ext('rbr')

RBS =

get_ext('rbs')

RP =

get_ext('rp')

WRB =

get_ext('wrb')

WDT =

get_ext('wdt')

WP =

get_ext('wp')

WPS =

get_ext('wps')

CC =

get_ext('cc')

IN =

get_ext('in')

TAGS =

VERSION =

"0.3.0"

Instance Attribute Summary collapse

#conf ⇒ Object
Hash storing config values:.

Class Method Summary collapse

.explain_tag(tag) ⇒ String
Convert a Treebank-style, abbreviated tag into verbose definitions.
.get_ext(tag = nil) ⇒ Object
Return a regexp from a string argument that matches an XML-style pos tag.
.hmm ⇒ Hash
Return a class variable that holds probability data.
.lexicon ⇒ Hash
Return a class variable that holds lexical data.

Instance Method Summary collapse

#add_tags(text, verbose = false) ⇒ String
Examine the string provided and return it fully tagged in XML style.
#get_adjectives(tagged) ⇒ Hash
The hash of matches.
#get_adverbs(tagged) ⇒ Hash
The hash of matches.
#get_base_present_verbs(tagged) ⇒ Hash
The hash of matches.
#get_comparative_adjectives(tagged) ⇒ Hash
The hash of matches.
#get_conjunctions(tagged) ⇒ Hash
Returns all types of conjunctions and does not discriminate between the various kinds.
#get_gerund_verbs(tagged) ⇒ Hash
The hash of matches.
#get_infinitive_verbs(tagged) ⇒ Hash
The hash of matches.
#get_interrogatives(tagged) ⇒ Hash (also: #get_question_parts)
The hash of matches.
#get_max_noun_phrases(tagged) ⇒ Hash
Given a POS-tagged text, this method returns only the maximal noun phrases.
#get_noun_phrases(tagged) ⇒ Hash
Similar to get_words, but requires a POS-tagged text as an argument.
#get_nouns(tagged) ⇒ Hash
Given a POS-tagged text, this method returns all nouns and their occurrence frequencies.
#get_passive_verbs(tagged) ⇒ Hash
The hash of matches.
#get_past_tense_verbs(tagged) ⇒ Hash
The hash of matches.
#get_present_verbs(tagged) ⇒ Hash
The hash of matches.
#get_proper_nouns(tagged) ⇒ Object
Given a POS-tagged text, this method returns a hash of all proper nouns and their occurrence frequencies.
#get_readable(text, verbose = false) ⇒ Object
Return an easy-on-the-eyes tagged version of a text string.
#get_sentences(text) ⇒ Object
Return an array of sentences (without POS tags) from a text.
#get_superlative_adjectives(tagged) ⇒ Hash
The hash of matches.
#get_verbs(tagged) ⇒ Hash
Returns all types of verbs and does not discriminate between the various kinds.
#get_words(text) ⇒ Object
Given a text string, return as many nouns and noun phrases as possible.
#initialize(params = {}) ⇒ EngTagger constructor
Take a hash of parameters that override default values.
#install ⇒ Object
Reads some included corpus data and saves it in a stored hash on the local file system.
#tag_pairs(text) ⇒ Array
Return an array of pairs of the form ["word", :tag].

Methods included from BoundedSpaceMemoizable

memoize

Constructor Details

#initialize(params = {}) ⇒ `EngTagger`

Take a hash of parameters that override default values. See above for details.

# File 'lib/engtagger.rb', line 193

# Build a tagger: start from the default settings, merge caller overrides on
# top, then load the marshalled lexicon and tag data when both files exist.
# When either file is missing, the class-level tables start out empty.
#
# @param params [Hash, nil] settings that override the defaults; nil is ignored
def initialize(params = {})
  @conf = {
    unknown_word_tag: '',
    stem: false,
    weight_noun_phrases: false,
    longest_noun_phrase: 5,
    relax: false,
    tag_lex: 'tags.yml',
    word_lex: 'words.yml',
    unknown_lex: 'unknown.yml',
    word_path: DEFAULT_WORDPATH,
    tag_path: DEFAULT_TAGPATH,
    debug: false,
    # assume that analysis starts at the beginning of a new sentence
    current_tag: 'pp'
  }
  @conf.merge!(params) if params

  if File.exist?(@conf[:word_path]) && File.exist?(@conf[:tag_path])
    # The block form closes each file even when Marshal.load raises.
    # NOTE(review): Marshal.load can instantiate arbitrary objects; only point
    # :word_path / :tag_path at files you trust.
    @@lexicon = File.open(@conf[:word_path], 'r') { |file| Marshal.load(file) }
    @@hmm = File.open(@conf[:tag_path], 'r') { |file| Marshal.load(file) }
  else
    print "Couldn't locate POS lexicon, creating a new one" if @conf[:debug]
    @@hmm = Hash.new
    @@lexicon = Hash.new
  end
  @@mnp = get_max_noun_regex
end

Instance Attribute Details

#conf ⇒ `Object`

Hash storing config values:

:unknown_word_tag => (String) Tag to assign to unknown words
:stem => (Boolean) Stem single words using Porter module
:weight_noun_phrases => (Boolean) When returning occurrence counts for a noun phrase, multiply the value by the number of words in the NP.
:longest_noun_phrase => (Integer) Will ignore noun phrases longer than this threshold. This affects only the get_words() and get_nouns() methods.
:relax => (Boolean) Relax the Hidden Markov Model: this may improve accuracy for uncommon words, particularly words used polysemously
:tag_lex => (String) Name of the YAML file containing a hash of adjacent part of speech tags and the probability of each
:word_lex => (String) Name of the YAML file containing a hash of words and corresponding parts of speech
:unknown_lex => (String) Name of the YAML file containing a hash of tags for unknown words and corresponding parts of speech
:tag_path => (String) Directory path of tag_lex
:word_path => (String) Directory path of word_lex and unknown_lex
:debug => (Boolean) Print debug messages



185
186
187

# File 'lib/engtagger.rb', line 185

# Reader for the tagger's configuration.
#
# @return [Object] whatever is currently stored in @conf
def conf
  @conf
end

Class Method Details

.explain_tag(tag) ⇒ `String`

Convert a Treebank-style, abbreviated tag into verbose definitions

Parameters:

tag (#to_s) —
the tag in question

Returns:

(String) —
the definition, if available

# File 'lib/engtagger.rb', line 94

# Translate an abbreviated tag into its verbose definition.
#
# @param tag [#to_s] the tag in question; it is stringified and downcased first
# @return [String] the TAGS entry for the tag, or the downcased tag itself
#   when TAGS has no entry for it
def self.explain_tag(tag)
  normalized = tag.to_s.downcase
  TAGS[normalized] || normalized
end

.get_ext(tag = nil) ⇒ `Object`

Return a regexp from a string argument that matches an XML-style pos tag

# File 'lib/engtagger.rb', line 52

# Build a regexp that matches one XML-style POS element, <tag>text</tag>,
# together with any whitespace that follows it.
#
# The pattern is written as a regexp literal on purpose: inside a
# double-quoted string "\s" is the escape for a single space character, so a
# string-built pattern would only swallow trailing spaces, not tabs/newlines.
#
# @param tag [String, nil] tag name or regexp fragment (e.g. 'nn[sp]*')
# @return [Regexp, nil] the matcher, or nil when no tag is given
def self.get_ext(tag = nil)
  return nil unless tag

  %r{<#{tag}>[^<]+</#{tag}>\s*}
end

.hmm ⇒ `Hash`

Return a class variable that holds probability data.

Returns:

(Hash) —
the probability data



39
40
41

# File 'lib/engtagger.rb', line 39

# Class-level reader for the probability data.
#
# @return [Hash] the object held in the @@hmm class variable
def self.hmm
  @@hmm
end

.lexicon ⇒ `Hash`

Return a class variable that holds lexical data.

Returns:

(Hash) —
the lexicon



47
48
49

# File 'lib/engtagger.rb', line 47

# Class-level reader for the lexical data.
#
# @return [Hash] the object held in the @@lexicon class variable
def self.lexicon
  @@lexicon
end

Instance Method Details

#add_tags(text, verbose = false) ⇒ `String`

Examine the string provided and return it fully tagged in XML style.

Examine the string provided and return it fully tagged in XML style

Parameters:

text (String) —
the input text
verbose (false, true) (defaults to: false) —
whether to use verbose tags

Returns:

(String) —
the marked-up string

# File 'lib/engtagger.rb', line 256

# Tag every token of the text and return the result as XML-style markup,
# one <tag>word</tag> element per token, joined by single spaces.
#
# @param text [String] the input text
# @param verbose [Boolean] when true, tag names are expanded through
#   EngTagger.explain_tag
# @return [String, nil] the marked-up string, or nil when valid_text rejects
#   the input
def add_tags(text, verbose = false)
  return nil unless valid_text(text)

  tagged = clean_text(text).map do |word|
    cleaned_word = clean_word(word)
    tag = assign_tag(@conf[:current_tag], cleaned_word)
    # Fall back to 'nn' when no tag was assigned.
    tag = 'nn' unless tag && tag != ''
    # The short tag (not the verbose name) is carried over to the next token.
    @conf[:current_tag] = tag
    tag = EngTagger.explain_tag(tag) if verbose
    "<#{tag}>#{word}</#{tag}>"
  end
  reset
  tagged.join(' ')
end

#get_adjectives(tagged) ⇒ `Hash`

Returns the hash of matches.

Parameters:

tagged (String) —
the tagged text

Returns:

(Hash) —
the hash of matches

# File 'lib/engtagger.rb', line 441

# Collect the JJ-tagged matches from a tagged text.
#
# @param tagged [String] the tagged text
# @return [Hash, nil] the result of build_matches_hash, or nil when
#   valid_text rejects the input
def get_adjectives(tagged)
  return unless valid_text(tagged)

  trimmed = build_trimmed(tagged, [JJ])
  build_matches_hash(trimmed)
end

#get_adverbs(tagged) ⇒ `Hash`

Returns the hash of matches.

Parameters:

tagged (String) —
the tagged text

Returns:

(Hash) —
the hash of matches

# File 'lib/engtagger.rb', line 471

# Collect the RB-, RBR-, RBS- and RP-tagged matches from a tagged text.
#
# @param tagged [String] the tagged text
# @return [Hash, nil] the result of build_matches_hash, or nil when
#   valid_text rejects the input
def get_adverbs(tagged)
  return unless valid_text(tagged)

  trimmed = build_trimmed(tagged, [RB, RBR, RBS, RP])
  build_matches_hash(trimmed)
end

#get_base_present_verbs(tagged) ⇒ `Hash`

Returns the hash of matches.

Parameters:

tagged (String) —
the tagged text

Returns:

(Hash) —
the hash of matches

# File 'lib/engtagger.rb', line 421

# Collect the VBP-tagged matches from a tagged text.
#
# @param tagged [String] the tagged text
# @return [Hash, nil] the result of build_matches_hash, or nil when
#   valid_text rejects the input
def get_base_present_verbs(tagged)
  return unless valid_text(tagged)

  trimmed = build_trimmed(tagged, [VBP])
  build_matches_hash(trimmed)
end

#get_comparative_adjectives(tagged) ⇒ `Hash`

Returns the hash of matches.

Parameters:

tagged (String) —
the tagged text

Returns:

(Hash) —
the hash of matches

# File 'lib/engtagger.rb', line 451

# Collect the JJR-tagged matches from a tagged text.
#
# @param tagged [String] the tagged text
# @return [Hash, nil] the result of build_matches_hash, or nil when
#   valid_text rejects the input
def get_comparative_adjectives(tagged)
  return unless valid_text(tagged)

  trimmed = build_trimmed(tagged, [JJR])
  build_matches_hash(trimmed)
end

#get_conjunctions(tagged) ⇒ `Hash`

Returns all types of conjunctions and does not discriminate between the various kinds. E.g. coordinating, subordinating, correlative...

Parameters:

tagged (String) —
the tagged text

Returns:

(Hash) —
the hash of matches

# File 'lib/engtagger.rb', line 498

# Collect the CC- and IN-tagged matches from a tagged text, without
# distinguishing between the two.
#
# @param tagged [String] the tagged text
# @return [Hash, nil] the result of build_matches_hash, or nil when
#   valid_text rejects the input
def get_conjunctions(tagged)
  return unless valid_text(tagged)

  trimmed = build_trimmed(tagged, [CC, IN])
  build_matches_hash(trimmed)
end

#get_gerund_verbs(tagged) ⇒ `Hash`

Returns the hash of matches.

Parameters:

tagged (String) —
the tagged text

Returns:

(Hash) —
the hash of matches

# File 'lib/engtagger.rb', line 401

# Collect the VBG-tagged matches from a tagged text.
#
# @param tagged [String] the tagged text
# @return [Hash, nil] the result of build_matches_hash, or nil when
#   valid_text rejects the input
def get_gerund_verbs(tagged)
  return unless valid_text(tagged)

  trimmed = build_trimmed(tagged, [VBG])
  build_matches_hash(trimmed)
end

#get_infinitive_verbs(tagged) ⇒ `Hash`

Returns the hash of matches.

Parameters:

tagged (String) —
the tagged text

Returns:

(Hash) —
the hash of matches

# File 'lib/engtagger.rb', line 381

# Collect the VB-tagged matches from a tagged text.
#
# @param tagged [String] the tagged text
# @return [Hash, nil] the result of build_matches_hash, or nil when
#   valid_text rejects the input
def get_infinitive_verbs(tagged)
  return unless valid_text(tagged)

  trimmed = build_trimmed(tagged, [VB])
  build_matches_hash(trimmed)
end

#get_interrogatives(tagged) ⇒ `Hash` Also known as: get_question_parts

Returns the hash of matches.

Parameters:

tagged (String) —
the tagged text

Returns:

(Hash) —
the hash of matches

# File 'lib/engtagger.rb', line 481

# Collect the WRB-, WDT-, WP- and WPS-tagged matches from a tagged text.
#
# @param tagged [String] the tagged text
# @return [Hash, nil] the result of build_matches_hash, or nil when
#   valid_text rejects the input
def get_interrogatives(tagged)
  return unless valid_text(tagged)

  trimmed = build_trimmed(tagged, [WRB, WDT, WP, WPS])
  build_matches_hash(trimmed)
end

#get_max_noun_phrases(tagged) ⇒ `Hash`

Given a POS-tagged text, this method returns only the maximal noun phrases. May be called directly, but is also used by get_noun_phrases.

Parameters:

tagged (String) —
the tagged text

Returns:

(Hash) —
the hash of matches

# File 'lib/engtagger.rb', line 510

# Count the maximal noun phrases (matches of the @@mnp pattern) in a tagged
# text. Phrases without whitespace are passed through stem before counting,
# and blank phrases are not counted.
#
# @param tagged [String] the tagged text
# @return [Hash, nil] phrase => occurrence count (missing keys read as 0),
#   or nil when valid_text rejects the input
def get_max_noun_phrases(tagged)
  return unless valid_text(tagged)

  counts = Hash.new(0)
  build_trimmed(tagged, [@@mnp]).each do |phrase|
    single_word = !(phrase =~ /\s/)
    phrase = stem(phrase) if single_word
    next if phrase =~ /\A\s*\z/

    counts[phrase] += 1
  end
  counts
end

#get_noun_phrases(tagged) ⇒ `Hash`

Similar to get_words, but requires a POS-tagged text as an argument.

Parameters:

tagged (String) —
the tagged text

Returns:

(Hash) —
the hash of matches

# File 'lib/engtagger.rb', line 527

# Count noun phrases in a POS-tagged text. Every maximal noun phrase (match
# of @@mnp) is broken down into its shorter sub-phrases and single nouns, and
# the tallies are keyed by the tag-stripped text.
#
# @param tagged [String] the tagged text
# @return [Hash, nil] phrase => weighted count (missing keys read as 0), or
#   nil when valid_text rejects the input
def get_noun_phrases(tagged)
  return unless valid_text(tagged)

  tally = Hash.new(0)
  phrase_ext = /(?:#{PREP}|#{DET}|#{NUM})+/xo

  # Find the maximal noun phrases. A phrase that is extended by a
  # (?:PREP|DET|NUM) run is recorded whole and then split at those runs.
  fragments = tagged.scan(@@mnp).flat_map do |phrase|
    tally[phrase] += 1 if phrase_ext =~ phrase
    phrase.split(phrase_ext)
  end

  # Walk each fragment from the left, dropping the leading word every round:
  # record the remaining multi-word phrase, then the dropped word if it is a noun.
  fragments.each do |fragment|
    remaining = fragment.split
    until remaining.empty?
      tally[remaining.join(' ')] += 1 if remaining.length > 1
      head = remaining.shift
      tally[head] += 1 if head =~ /#{NN}/
    end
  end

  tally.each_with_object(Hash.new(0)) do |(phrase, seen), result|
    key = strip_tags(phrase)
    # Word count is derived from the number of whitespace runs in the key.
    word_count = key.scan(/\s+/).length + 1
    # Throttle phrases that exceed the configured maximum length.
    next if word_count > @conf[:longest_noun_phrase]

    key = stem(key) if word_count <= 1 # stem single words
    # Optionally weight by word count to favor long noun phrases.
    weight = @conf[:weight_noun_phrases] ? word_count : 1
    result[key] += weight * seen
  end
end

#get_nouns(tagged) ⇒ `Hash`

Given a POS-tagged text, this method returns all nouns and their occurrence frequencies.

Parameters:

tagged (String) —
the tagged text

Returns:

(Hash) —
the hash of matches

# File 'lib/engtagger.rb', line 357

# Collect the NN-pattern matches from a tagged text.
#
# @param tagged [String] the tagged text
# @return [Hash, nil] the result of build_matches_hash, or nil when
#   valid_text rejects the input
def get_nouns(tagged)
  return unless valid_text(tagged)

  trimmed = build_trimmed(tagged, [NN])
  build_matches_hash(trimmed)
end

#get_passive_verbs(tagged) ⇒ `Hash`

Returns the hash of matches.

Parameters:

tagged (String) —
the tagged text

Returns:

(Hash) —
the hash of matches

# File 'lib/engtagger.rb', line 411

# Collect the PART-pattern matches from a tagged text.
#
# @param tagged [String] the tagged text
# @return [Hash, nil] the result of build_matches_hash, or nil when
#   valid_text rejects the input
def get_passive_verbs(tagged)
  return unless valid_text(tagged)

  trimmed = build_trimmed(tagged, [PART])
  build_matches_hash(trimmed)
end

#get_past_tense_verbs(tagged) ⇒ `Hash`

Returns the hash of matches.

Parameters:

tagged (String) —
the tagged text

Returns:

(Hash) —
the hash of matches

# File 'lib/engtagger.rb', line 391

# Collect the VBD-tagged matches from a tagged text.
#
# @param tagged [String] the tagged text
# @return [Hash, nil] the result of build_matches_hash, or nil when
#   valid_text rejects the input
def get_past_tense_verbs(tagged)
  return unless valid_text(tagged)

  trimmed = build_trimmed(tagged, [VBD])
  build_matches_hash(trimmed)
end

#get_present_verbs(tagged) ⇒ `Hash`

Returns the hash of matches.

Parameters:

tagged (String) —
the tagged text

Returns:

(Hash) —
the hash of matches

# File 'lib/engtagger.rb', line 431

# Collect the VBZ-tagged matches from a tagged text.
#
# @param tagged [String] the tagged text
# @return [Hash, nil] the result of build_matches_hash, or nil when
#   valid_text rejects the input
def get_present_verbs(tagged)
  return unless valid_text(tagged)

  trimmed = build_trimmed(tagged, [VBZ])
  build_matches_hash(trimmed)
end

#get_proper_nouns(tagged) ⇒ `Object`

Given a POS-tagged text, this method returns a hash of all proper nouns and their occurrence frequencies. The method is greedy and will return multi-word phrases, if possible, so it would find ``Linguistic Data Consortium'' as a single unit, rather than as three individual proper nouns. This method does not stem the found words.

# File 'lib/engtagger.rb', line 323

# Count the NNP-tagged matches in a tagged text, then fold acronyms into the
# full names they abbreviate: for every name of three or more words, a naive
# acronym is built from the first letter of each purely alphabetic word, and
# if that acronym is itself a key its count is added to the full name and the
# acronym entry is removed.
#
# The first-letter pattern is case-insensitive so that capitalized names
# ("Linguistic Data Consortium") produce a usable acronym ("LDC").
#
# @param tagged [String] the tagged text
# @return [Hash, nil] proper noun => occurrence count, or nil when
#   valid_text rejects the input
def get_proper_nouns(tagged)
  return nil unless valid_text(tagged)

  nnp = build_matches_hash(build_trimmed(tagged, [NNP]))
  # Iterate over a snapshot of the keys because entries are deleted below.
  nnp.keys.each do |key|
    words = key.split(/\s/)
    # Only names with at least three words are treated as acronym candidates.
    next unless words.length > 2

    # Words that are not purely alphabetic contribute nothing to the acronym.
    acronym = words.map { |word| word[/\A([a-z])[a-z]*\z/i, 1] }.join
    next unless nnp.key?(acronym)

    nnp[key] += nnp[acronym]
    nnp.delete(acronym)
  end
  nnp
end

#get_readable(text, verbose = false) ⇒ `Object`

Return an easy-on-the-eyes tagged version of a text string. Applies add_tags and reformats to be easier to read.

# File 'lib/engtagger.rb', line 291

# Tag the text with add_tags and rewrite each <tag>word</tag> element as
# word/TAG, with the tag name (taken from the closing tag) upcased.
#
# @param text [String] the input text
# @param verbose [Boolean] passed straight through to add_tags
# @return [String, nil] the reformatted string, or nil when valid_text
#   rejects the input
def get_readable(text, verbose = false)
  return unless valid_text(text)

  # An earlier, stricter pattern was /<\w+>([^<]+)<\/(\w+)>/; the one in use
  # also accepts element bodies made up of '<', '>' and word characters.
  add_tags(text, verbose).gsub(/<\w+>([^<]+|[<\w>]+)<\/(\w+)>/o) do
    element = Regexp.last_match
    element[1] + '/' + element[2].upcase
  end
end

#get_sentences(text) ⇒ `Object`

Return an array of sentences (without POS tags) from a text.

# File 'lib/engtagger.rb', line 301

# Split a text into sentences (without POS tags). The text is tagged with
# add_tags, cut after every closing </pp> tag, stripped of tags, and then
# punctuation that tokenization separated from its word is re-attached.
#
# Each substitution feeds the next one, and the patterns are regexp literals
# so that \W keeps its "non-word character" meaning (inside a double-quoted
# string it would collapse to the letter W).
#
# @param text [String] the input text
# @return [Array<String>, nil] the sentences, or nil when valid_text rejects
#   the input
def get_sentences(text)
  return nil unless valid_text(text)

  add_tags(text).split(%r{</pp>}).map do |line|
    sentence = strip_tags(line)
    sentence = sentence.gsub(/ ('s?) /, '\1 ')  # "John 's dog" -> "John's dog"
    sentence = sentence.gsub(/ (\W+) /, '\1 ')  # "dog , really" -> "dog, really"
    sentence = sentence.gsub(/ (`+) /, ' \1')   # backticks attach to the following word
    sentence = sentence.gsub(/ (\W+)$/, '\1')   # "really ." -> "really."
    sentence.gsub(/^(`+) /, '\1')               # leading backticks attach to the first word
  end
end

#get_superlative_adjectives(tagged) ⇒ `Hash`

Returns the hash of matches.

Parameters:

tagged (String) —
the tagged text

Returns:

(Hash) —
the hash of matches

# File 'lib/engtagger.rb', line 461

# Collect the JJS-tagged matches from a tagged text.
#
# @param tagged [String] the tagged text
# @return [Hash, nil] the result of build_matches_hash, or nil when
#   valid_text rejects the input
def get_superlative_adjectives(tagged)
  return unless valid_text(tagged)

  trimmed = build_trimmed(tagged, [JJS])
  build_matches_hash(trimmed)
end

#get_verbs(tagged) ⇒ `Hash`

Returns all types of verbs and does not discriminate between the various kinds. Combines all other verb methods listed in this class.

Parameters:

tagged (String) —
the tagged text

Returns:

(Hash) —
the hash of matches

# File 'lib/engtagger.rb', line 370

# Collect every verb match (VB, VBD, VBG, PART, VBP and VBZ patterns) from a
# tagged text, without distinguishing between the kinds.
#
# @param tagged [String] the tagged text
# @return [Hash, nil] the result of build_matches_hash, or nil when
#   valid_text rejects the input
def get_verbs(tagged)
  return unless valid_text(tagged)

  trimmed = build_trimmed(tagged, [VB, VBD, VBG, PART, VBP, VBZ])
  build_matches_hash(trimmed)
end

#get_words(text) ⇒ `Object`

Given a text string, return as many nouns and noun phrases as possible. Applies add_tags and involves three stages:

Tag the text
Extract all the maximal noun phrases
Recursively extract all noun phrases from the MNPs

# File 'lib/engtagger.rb', line 279

# Tag a raw text and extract nouns or noun phrases from it. When
# :longest_noun_phrase is 1 or less only single nouns are collected
# (get_nouns); otherwise full noun phrases are (get_noun_phrases).
#
# @param text [String] the untagged input text
# @return [Hash, false] the extraction result, or false (not nil) when
#   valid_text rejects the input
def get_words(text)
  return false unless valid_text(text)

  tagged = add_tags(text)
  return get_nouns(tagged) if @conf[:longest_noun_phrase] <= 1

  get_noun_phrases(tagged)
end

#install ⇒ `Object`

Reads some included corpus data and saves it in a stored hash on the local file system. This is called automatically if the tagger can't find the stored lexicon.

# File 'lib/engtagger.rb', line 570

# Rebuild the stored lexicon: run load_tags on :tag_lex and load_words on
# :word_lex and :unknown_lex, then marshal @@lexicon to :word_path and @@hmm
# to :tag_path.
# NOTE(review): presumably the load_* helpers populate @@hmm and @@lexicon;
# they are defined elsewhere, so confirm there.
#
# @return [Object] the value of the last File.open block
def install
  puts "Creating part-of-speech lexicon" if @conf[:debug]
  load_tags(@conf[:tag_lex])
  [@conf[:word_lex], @conf[:unknown_lex]].each { |lex| load_words(lex) }

  File.open(@conf[:word_path], 'w') { |out| Marshal.dump(@@lexicon, out) }
  File.open(@conf[:tag_path], 'w') { |out| Marshal.dump(@@hmm, out) }
end

#tag_pairs(text) ⇒ `Array`

Return an array of pairs of the form ["word", :tag].

Parameters:

text (String) —
the input text

Returns:

(Array) —
the tagged words

# File 'lib/engtagger.rb', line 233

# Tag every token of the text and return the tokens paired with their tags.
#
# @param text [String] the input text
# @return [Array<Array>] pairs of the form ["word", :tag]; empty when
#   valid_text rejects the input
def tag_pairs(text)
  return [] unless valid_text(text)

  pairs = []
  clean_text(text).each do |token|
    normalized = clean_word(token)
    guess = assign_tag(@conf[:current_tag], normalized)
    # Fall back to 'nn' when no tag was assigned.
    guess = 'nn' unless guess && !guess.empty?
    @conf[:current_tag] = guess
    pairs << [token, guess.to_sym]
  end

  # reset the tagger state
  reset
  pairs
end

Class: EngTagger

Overview

Constant Summary collapse

Instance Attribute Summary collapse

Class Method Summary collapse

Instance Method Summary collapse

Methods included from BoundedSpaceMemoizable

Constructor Details

#initialize(params = {}) ⇒ EngTagger

Instance Attribute Details

#conf ⇒ Object

Class Method Details

.explain_tag(tag) ⇒ String

.get_ext(tag = nil) ⇒ Object

.hmm ⇒ Hash

.lexicon ⇒ Hash

Instance Method Details

#add_tags(text, verbose = false) ⇒ String

#get_adjectives(tagged) ⇒ Hash

#get_adverbs(tagged) ⇒ Hash

#get_base_present_verbs(tagged) ⇒ Hash

#get_comparative_adjectives(tagged) ⇒ Hash

#get_conjunctions(tagged) ⇒ Hash

#get_gerund_verbs(tagged) ⇒ Hash

#get_infinitive_verbs(tagged) ⇒ Hash

#get_interrogatives(tagged) ⇒ Hash Also known as: get_question_parts

#get_max_noun_phrases(tagged) ⇒ Hash

#get_noun_phrases(tagged) ⇒ Hash

#get_nouns(tagged) ⇒ Hash

#get_passive_verbs(tagged) ⇒ Hash

#get_past_tense_verbs(tagged) ⇒ Hash

#get_present_verbs(tagged) ⇒ Hash

#get_proper_nouns(tagged) ⇒ Object

#get_readable(text, verbose = false) ⇒ Object

#get_sentences(text) ⇒ Object

#get_superlative_adjectives(tagged) ⇒ Hash

#get_verbs(tagged) ⇒ Hash

#get_words(text) ⇒ Object

#install ⇒ Object

#tag_pairs(text) ⇒ Array

#initialize(params = {}) ⇒ `EngTagger`

#conf ⇒ `Object`

.explain_tag(tag) ⇒ `String`

.get_ext(tag = nil) ⇒ `Object`

.hmm ⇒ `Hash`

.lexicon ⇒ `Hash`

#add_tags(text, verbose = false) ⇒ `String`

#get_adjectives(tagged) ⇒ `Hash`

#get_adverbs(tagged) ⇒ `Hash`

#get_base_present_verbs(tagged) ⇒ `Hash`

#get_comparative_adjectives(tagged) ⇒ `Hash`

#get_conjunctions(tagged) ⇒ `Hash`

#get_gerund_verbs(tagged) ⇒ `Hash`

#get_infinitive_verbs(tagged) ⇒ `Hash`

#get_interrogatives(tagged) ⇒ `Hash` Also known as: get_question_parts

#get_max_noun_phrases(tagged) ⇒ `Hash`

#get_noun_phrases(tagged) ⇒ `Hash`

#get_nouns(tagged) ⇒ `Hash`

#get_passive_verbs(tagged) ⇒ `Hash`

#get_past_tense_verbs(tagged) ⇒ `Hash`

#get_present_verbs(tagged) ⇒ `Hash`

#get_proper_nouns(tagged) ⇒ `Object`

#get_readable(text, verbose = false) ⇒ `Object`

#get_sentences(text) ⇒ `Object`

#get_superlative_adjectives(tagged) ⇒ `Hash`

#get_verbs(tagged) ⇒ `Hash`

#get_words(text) ⇒ `Object`

#install ⇒ `Object`

#tag_pairs(text) ⇒ `Array`