Class: Anystyle::Parser::Parser

Inherits:

Object

Object
Anystyle::Parser::Parser

show all

Defined in: lib/anystyle/parser/parser.rb

Class Attribute Summary collapse

.defaults ⇒ Object readonly

Returns the value of attribute defaults.
.feature ⇒ Object readonly

Returns the value of attribute feature.
.features ⇒ Object readonly

Returns the value of attribute features.
.formats ⇒ Object readonly

Returns the value of attribute formats.
.models ⇒ Object readonly

Returns the value of attribute models.

Instance Attribute Summary collapse

#model ⇒ Object

Returns the value of attribute model.
#normalizer ⇒ Object

Returns the value of attribute normalizer.
#options ⇒ Object readonly

Returns the value of attribute options.

Class Method Summary collapse

.instance ⇒ Object

Returns a default parser instance.
.load(path) ⇒ Object

Instance Method Summary collapse

#classify(hash) ⇒ Object
#expand(token, sequence = [], offset = 0, label = nil) ⇒ Object

Expands the passed-in token string by appending a space separated list of all features for the token.
#initialize(options = {}) ⇒ Parser constructor

A new instance of Parser.
#label(input, labelled = false) ⇒ Object

Returns an array of label/segment pairs for each line in the passed-in string.
#normalize(hash) ⇒ Object
#parse(input, format = options[:format]) ⇒ Object
#prepare(input, tagged = false) ⇒ Object

Prepares the passed-in string for processing by a CRF tagger.
#test(input) ⇒ Object
#tokenize(string, tagged = false) ⇒ Object

Returns an array of tokens for each line of input.
#train(input, truncate = false) ⇒ Object

Constructor Details

#initialize(options = {}) ⇒ `Parser`

Returns a new instance of Parser.

# File 'lib/anystyle/parser/parser.rb', line 46

# Builds a parser from the class-level defaults overridden by +options+.
#
# The +:model+ entry of the merged options selects an entry of
# Parser.models, which is handed to Wapiti.load; the normalizer is taken
# from Normalizer.instance.
#
# @param options [Hash] overrides merged on top of Parser.defaults
def initialize(options = {})
	@options = Parser.defaults.merge(options)

	model_path = Parser.models[@options[:model]]
	@model = Wapiti.load(model_path)

	@normalizer = Normalizer.instance
end

Class Attribute Details

.defaults ⇒ `Object` (readonly)

Returns the value of attribute defaults.



27
28
29

# File 'lib/anystyle/parser/parser.rb', line 27

# Reader for the class-level +defaults+ attribute: returns @defaults as-is.
# The constructor merges caller-supplied options on top of this value.
def defaults
  @defaults
end

.feature ⇒ `Object` (readonly)

Returns the value of attribute feature.



27
28
29

# File 'lib/anystyle/parser/parser.rb', line 27

# Reader for the class-level +feature+ attribute: returns @feature as-is.
def feature
  @feature
end

.features ⇒ `Object` (readonly)

Returns the value of attribute features.



27
28
29

# File 'lib/anystyle/parser/parser.rb', line 27

# Reader for the class-level +features+ attribute: returns @features as-is.
def features
  @features
end

.formats ⇒ `Object` (readonly)

Returns the value of attribute formats.



27
28
29

# File 'lib/anystyle/parser/parser.rb', line 27

# Reader for the class-level +formats+ attribute: returns @formats as-is.
def formats
  @formats
end

.models ⇒ `Object` (readonly)

Returns the value of attribute models.



27
28
29

# File 'lib/anystyle/parser/parser.rb', line 27

# Reader for the class-level +models+ attribute: returns @models as-is.
# Elsewhere in this class it is indexed with the +:model+ option to obtain
# the value passed to Wapiti.load and assigned as the trained model's path.
def models
  @models
end

Instance Attribute Details

#model ⇒ `Object`

Returns the value of attribute model.



44
45
46

# File 'lib/anystyle/parser/parser.rb', line 44

# Reader for the parser's +model+ attribute: returns @model as-is.
# The constructor sets it from Wapiti.load; #train may replace it.
def model
  @model
end

#normalizer ⇒ `Object`

Returns the value of attribute normalizer.



44
45
46

# File 'lib/anystyle/parser/parser.rb', line 44

# Reader for the parser's +normalizer+ attribute: returns @normalizer as-is.
# The constructor initialises it to Normalizer.instance.
def normalizer
  @normalizer
end

#options ⇒ `Object` (readonly)

Returns the value of attribute options.



42
43
44

# File 'lib/anystyle/parser/parser.rb', line 42

# Reader for the parser's +options+ attribute: returns @options as-is.
# The constructor sets it to Parser.defaults merged with the caller's options.
def options
  @options
end

Class Method Details

.instance ⇒ `Object`

Returns a default parser instance



36
37
38

# File 'lib/anystyle/parser/parser.rb', line 36

# Returns a default parser instance. The parser is built with +new+ (no
# arguments) the first time this is called and cached in @instance; later
# calls return the cached object as long as it is truthy.
def instance
	@instance ||= new
end

.load(path) ⇒ `Object`

# File 'lib/anystyle/parser/parser.rb', line 29

# Builds a parser with +new+ and then replaces its model with the one that
# Wapiti.load returns for +path+.
#
# @param path the model location handed to Wapiti.load
# @return the newly created parser
def load(path)
	new.tap { |parser| parser.model = Wapiti.load(path) }
end

Instance Method Details

#classify(hash) ⇒ `Object`

# File 'lib/anystyle/parser/parser.rb', line 155

# Adds a +:type+ entry to +hash+ based on which keys are present and on the
# concatenated text of all values. A hash that already has +:type+ is
# returned untouched.
#
# Rules are tried in order; the first match wins:
# +:journal+ key, "proceedings" in the text, +:booktitle+ or +:container+
# key, +:publisher+ key, +:institution+ key, +:school+ key, "unpublished"
# in the text, otherwise +:misc+.
#
# @param hash [Hash] the reference fields; modified in place
# @return [Hash] the same hash object
def classify(hash)
	return hash if hash.key?(:type)

	# All values flattened and joined; used for the keyword-based rules.
	text = hash.values.flatten.join

	hash[:type] =
		if hash.key?(:journal)
			:article
		elsif text =~ /proceedings/i
			:inproceedings
		elsif hash.key?(:booktitle) || hash.key?(:container)
			:incollection
		elsif hash.key?(:publisher)
			:book
		elsif hash.key?(:institution)
			:techreport
		elsif hash.key?(:school)
			:mastersthesis
		elsif text =~ /unpublished/i
			:unpublished
		else
			:misc
		end

	hash
end

#expand(token, sequence = [], offset = 0, label = nil) ⇒ `Object`

Expands the passed-in token string by appending a space separated list of all features for the token.

# File 'lib/anystyle/parser/parser.rb', line 126

# Expands +token+ into a single space-separated line: the token itself,
# followed by everything +features_for+ returns for it, followed by +label+
# when one is given.
#
# @param token [String] the token to expand
# @param sequence [Array] the sequence the token belongs to
# @param offset [Integer] position of the token within +sequence+
# @param label the label to append; omitted when nil
# @return [String]
def expand(token, sequence = [], offset = 0, label = nil)
	# The array returned by features_for is extended in place.
	columns = features_for(token, strip(token), sequence, offset).unshift(token)
	columns.push(label) unless label.nil?
	columns.join(' ')
end

#label(input, labelled = false) ⇒ `Object`

Returns an array of label/segment pairs for each line in the passed-in string.

# File 'lib/anystyle/parser/parser.rb', line 60

# Labels +input+ with the model and merges neighbouring tokens that share a
# label into one segment.
#
# For every sequence returned by the model, each token is cut down to its
# leading run of non-whitespace characters and its label is converted to a
# symbol. Consecutive tokens with the same label are joined with a single
# space.
#
# @param input the text to label (converted with +input_to_s+)
# @param labelled [Boolean] passed through to #prepare
# @return [Array<Array<Array>>] one array of [label, segment] pairs per sequence
def label(input, labelled = false)
	text = input_to_s(input)

	model.label(prepare(text, labelled)).map! do |sequence|
		sequence.each_with_object([]) do |(token, tag), segments|
			word = token[/^\S+/]
			tag = tag.to_sym
			current = segments.last

			if current && current[0] == tag
				# Same label as the previous token: extend that segment.
				current[1] << ' ' << word
			else
				segments << [tag, word]
			end
		end
	end
end

#normalize(hash) ⇒ `Object`

# File 'lib/anystyle/parser/parser.rb', line 148

# Runs the normalizer over every key of +hash+ and then classifies it.
#
# For each key present when the method starts, +normalize_<key>+ is sent to
# the normalizer with the whole hash as argument. The result of #classify on
# the hash is returned.
#
# @param hash [Hash] the labelled reference fields
# @return the value returned by #classify
def normalize(hash)
	# Iterate over a snapshot of the keys taken before any normalizer runs.
	labels = hash.keys
	labels.each { |name| normalizer.send("normalize_#{name}", hash) }

	classify(hash)
end

#parse(input, format = options[:format]) ⇒ `Object`

# File 'lib/anystyle/parser/parser.rb', line 52

def parse(input, format = options[:format])
	formatter = "format_#{format}".to_sym
	send(formatter, label(input))
rescue NoMethodError
	raise ArgumentError, "format not supported: #{formatter}"
end

#prepare(input, tagged = false) ⇒ `Object`

Prepares the passed-in string for processing by a CRF tagger. The string is split into separate lines; each line is tokenized and expanded. Returns an array of sequence arrays that can be labelled by the CRF model.

If the string is marked as being tagged by passing true as the second argument, training labels will be extracted from the string and appended after feature expansion. The returned sequence arrays can be used for training or testing the CRF model.

# File 'lib/anystyle/parser/parser.rb', line 118

# Turns +input+ into sequences of expanded tokens.
#
# The input is converted with +input_to_s+ and tokenized; every token of
# every resulting sequence is then passed to #expand together with its
# sequence, its index in that sequence and its label. When an element is a
# plain token rather than a [token, label] pair, the label passed is nil.
#
# @param input the text to prepare
# @param tagged [Boolean] passed through to #tokenize
# @return [Array<Array>] one array of expanded tokens per sequence
def prepare(input, tagged = false)
	sequences = tokenize(input_to_s(input), tagged)

	sequences.map do |sequence|
		sequence.each_with_index.map do |(token, label), position|
			expand(token, sequence, position, label)
		end
	end
end

#test(input) ⇒ `Object`

# File 'lib/anystyle/parser/parser.rb', line 142

# Labels tagged +input+ with the current model.
#
# The input is converted with +input_to_s+, +check!+ is called on the
# model's options, and the input, prepared as tagged data, is then handed to
# the model's +label+ method.
#
# @param input the tagged text to evaluate
# @return whatever the model's +label+ returns
def test(input)
	text = input_to_s(input)

	tagger = model
	tagger.options.check!
	tagger.label(prepare(text, true))
end

#tokenize(string, tagged = false) ⇒ `Object`

Returns an array of tokens for each line of input.

If the passed-in string is marked as being tagged, extracts labels from the string and returns an array of token/label pairs for each line of input.

# File 'lib/anystyle/parser/parser.rb', line 82

# Splits +string+ into lines (on runs of newline / carriage-return
# characters) and each line into tokens.
#
# Untagged input: every line is split on +options[:separator]+ and the
# result is an array of token arrays.
#
# Tagged input: every line is split on +options[:tagged_separator]+. Pieces
# of the form +<name>+ open a tag, pieces of the form +</name>+ close the
# most recently opened tag, empty pieces are ignored, and every other piece
# becomes a [token, label] pair whose label is the innermost open tag as a
# symbol (+:unknown+ when no tag is open).
#
# @param string [String] the text to tokenize
# @param tagged [Boolean] whether the text contains training tags
# @return [Array<Array>] tokens, or [token, label] pairs, per line
# @raise [ArgumentError] if a closing tag does not match the innermost open
#   tag; the line number in the message is zero-based
def tokenize(string, tagged = false)
	lines = string.split(/[\n\r]+/)

	return lines.map { |line| line.split(options[:separator]) } unless tagged

	lines.each_with_index.map do |line, index|
		pieces = line.split(options[:tagged_separator])
		pairs = []
		open_tags = []

		pieces.each do |piece|
			# Empty pieces carry no information.
			next if piece =~ /^$/

			if piece =~ /^<([^\/>][^>]*)>$/
				open_tags.push($1)
			elsif piece =~ /^<\/([^>]+)>$/
				closing = $1
				current = open_tags.pop
				unless current == closing
					raise ArgumentError, "mismatched tags on line #{index}: #{closing.inspect} (current tag was #{current.inspect})"
				end
			else
				pairs << [piece, (open_tags.last || :unknown).to_sym]
			end
		end

		pairs
	end
end

#train(input, truncate = false) ⇒ `Object`

# File 'lib/anystyle/parser/parser.rb', line 133

# Trains the parser's model on tagged +input+.
#
# When +truncate+ is truthy the current model is first replaced by a new
# Wapiti::Model built with +options[:pattern]+; otherwise the existing model
# is trained further. After training, the model is compacted and its path is
# set to the Parser.models entry selected by +options[:model]+.
#
# @param input the tagged training text (converted with +input_to_s+)
# @param truncate [Boolean] start from a fresh model instead of the current one
# @return the trained model
def train(input, truncate = false)
	training_text = input_to_s(input)

	if truncate
		@model = Wapiti::Model.new(:pattern => options[:pattern])
	end

	@model.train(prepare(training_text, true))
	@model.compact
	@model.path = Parser.models[options[:model]]

	@model
end

Class: Anystyle::Parser::Parser

Class Attribute Summary collapse

Instance Attribute Summary collapse

Class Method Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(options = {}) ⇒ Parser

Class Attribute Details

.defaults ⇒ Object (readonly)

.feature ⇒ Object (readonly)

.features ⇒ Object (readonly)

.formats ⇒ Object (readonly)

.models ⇒ Object (readonly)

Instance Attribute Details

#model ⇒ Object

#normalizer ⇒ Object

#options ⇒ Object (readonly)

Class Method Details

.instance ⇒ Object

.load(path) ⇒ Object