Class: Anystyle::Parser::Parser

Inherits:
Object
  • Object
show all
Defined in:
lib/anystyle/parser/parser.rb

Class Attribute Summary collapse

Instance Attribute Summary collapse

Class Method Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(options = {}) ⇒ Parser

Returns a new instance of Parser.



46
47
48
49
50
# File 'lib/anystyle/parser/parser.rb', line 46

# Sets up a new parser.
#
# The given options are merged over Parser.defaults; the entry of
# Parser.models selected by the resulting :model option is handed to
# Wapiti.load, and the normalizer is taken from Normalizer.instance.
def initialize(options = {})
	settings = Parser.defaults.merge(options)

	@options = settings
	@model = Wapiti.load(Parser.models[settings[:model]])
	@normalizer = Normalizer.instance
end

Class Attribute Details

.defaults ⇒ Object (readonly)

Returns the value of attribute defaults.



27
28
29
# File 'lib/anystyle/parser/parser.rb', line 27

# Read-only class attribute: returns the current value of @defaults.
# Parser#initialize merges the caller's options over this value.
def defaults
  @defaults
end

.feature ⇒ Object (readonly)

Returns the value of attribute feature.



27
28
29
# File 'lib/anystyle/parser/parser.rb', line 27

# Read-only class attribute: returns the current value of @feature.
def feature
  @feature
end

.features ⇒ Object (readonly)

Returns the value of attribute features.



27
28
29
# File 'lib/anystyle/parser/parser.rb', line 27

# Read-only class attribute: returns the current value of @features.
def features
  @features
end

.formats ⇒ Object (readonly)

Returns the value of attribute formats.



27
28
29
# File 'lib/anystyle/parser/parser.rb', line 27

# Read-only class attribute: returns the current value of @formats.
def formats
  @formats
end

.models ⇒ Object (readonly)

Returns the value of attribute models.



27
28
29
# File 'lib/anystyle/parser/parser.rb', line 27

# Read-only class attribute: returns the current value of @models.
# Parser instances index it with their :model option to find the model
# to load.
def models
  @models
end

Instance Attribute Details

#model ⇒ Object

Returns the value of attribute model.



44
45
46
# File 'lib/anystyle/parser/parser.rb', line 44

# Returns the value of @model, the object this parser calls #label and
# #train on.
def model
  @model
end

#normalizer ⇒ Object

Returns the value of attribute normalizer.



44
45
46
# File 'lib/anystyle/parser/parser.rb', line 44

# Returns the value of @normalizer, the object that receives the
# normalize_<label> calls made by #normalize.
def normalizer
  @normalizer
end

#options ⇒ Object (readonly)

Returns the value of attribute options.



42
43
44
# File 'lib/anystyle/parser/parser.rb', line 42

# Read-only attribute: returns the value of @options, i.e. the options
# passed to the constructor merged over Parser.defaults.
def options
  @options
end

Class Method Details

.instance ⇒ Object

Returns a default parser instance



36
37
38
# File 'lib/anystyle/parser/parser.rb', line 36

# Returns a default parser instance. The instance is created with `new`
# on first use and cached in @instance for subsequent calls.
def instance
	return @instance if @instance

	@instance = new
end

.load(path) ⇒ Object



29
30
31
32
33
# File 'lib/anystyle/parser/parser.rb', line 29

# Creates a new parser and replaces its model with the result of
# Wapiti.load(path). Returns the parser.
def load(path)
	new.tap { |parser| parser.model = Wapiti.load(path) }
end

Instance Method Details

#classify(hash) ⇒ Object



155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
# File 'lib/anystyle/parser/parser.rb', line 155

# Assigns a :type to the passed-in hash unless it already has one, and
# returns the (mutated) hash.
#
# The type is picked by the first rule that applies:
# a :journal key, the word "proceedings" anywhere in the values, a
# :booktitle or :container key, then :publisher, :institution, :school,
# the word "unpublished" in the values, and finally :misc as fallback.
def classify(hash)
	return hash if hash.key?(:type)

	# All values concatenated; used for the keyword-based rules.
	content = hash.values.flatten.join

	hash[:type] =
		if hash.key?(:journal)
			:article
		elsif content =~ /proceedings/i
			:inproceedings
		elsif hash.key?(:booktitle) || hash.key?(:container)
			:incollection
		elsif hash.key?(:publisher)
			:book
		elsif hash.key?(:institution)
			:techreport
		elsif hash.key?(:school)
			:mastersthesis
		elsif content =~ /unpublished/i
			:unpublished
		else
			:misc
		end

	hash
end

#expand(token, sequence = [], offset = 0, label = nil) ⇒ Object

Expands the passed-in token string by appending a space separated list of all features for the token.



126
127
128
129
130
131
# File 'lib/anystyle/parser/parser.rb', line 126

# Expands the passed-in token into a single space-separated string:
# the token itself, followed by everything features_for returns for it,
# followed by the label when one is given.
def expand(token, sequence = [], offset = 0, label = nil)
	columns = features_for(token, strip(token), sequence, offset)

	columns.insert(0, token)
	columns << label unless label.nil?

	columns.join(' ')
end

#label(input, labelled = false) ⇒ Object

Returns an array of label/segment pairs for each line in the passed-in string.



60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
# File 'lib/anystyle/parser/parser.rb', line 60

# Returns an array of label/segment pairs for each line in the passed-in
# input.
#
# The input is prepared and handed to model.label; for every returned
# sequence only the leading non-whitespace part of each token is kept, and
# consecutive tokens carrying the same label are joined (with a single
# space) into one segment.
def label(input, labelled = false)
	text = input_to_s(input)

	model.label(prepare(text, labelled)).map! do |sequence|
		sequence.each_with_object([]) do |(token, tag), segments|
			word = token[/^\S+/]
			tag = tag.to_sym
			last = segments.last

			if last && last[0] == tag
				# Same label as the previous segment: extend it in place.
				last[1] << ' ' << word
			else
				segments << [tag, word]
			end
		end
	end
end

#normalize(hash) ⇒ Object



148
149
150
151
152
153
# File 'lib/anystyle/parser/parser.rb', line 148

# Runs the normalizer over the passed-in hash and classifies the result.
#
# For every key present when the method is entered, the normalizer's
# normalize_<key> method is invoked with the whole hash; the hash is then
# passed to #classify, whose result is returned.
def normalize(hash)
	# Iterate over a snapshot of the keys so that the normalizer is free to
	# add or remove entries while we go.
	hash.keys.each { |field| normalizer.__send__(:"normalize_#{field}", hash) }

	classify(hash)
end

#parse(input, format = options[:format]) ⇒ Object



52
53
54
55
56
57
# File 'lib/anystyle/parser/parser.rb', line 52

# Labels the input and returns the result in the requested format.
#
# The format name is mapped to a method called format_<format>, which is
# invoked with the label/segment pairs returned by #label.
#
# input  - the references to parse; passed to #label unchanged.
# format - the name of the output format; defaults to options[:format].
#
# Returns whatever the selected formatter returns.
# Raises ArgumentError if no formatter exists for the given format.
def parse(input, format = options[:format])
	formatter = "format_#{format}".to_sym

	# Check for the formatter up front instead of rescuing NoMethodError
	# around the whole body: a blanket rescue also swallows genuine
	# NoMethodErrors raised while labelling or formatting and misreports
	# them as an unsupported format.
	unless respond_to?(formatter, true)
		raise ArgumentError, "format not supported: #{formatter}"
	end

	send(formatter, label(input))
end

#prepare(input, tagged = false) ⇒ Object

Prepares the passed-in string for processing by a CRF tagger. The string is split into separate lines; each line is tokenized and expanded. Returns an array of sequence arrays that can be labelled by the CRF model.

If the string is marked as being tagged by passing true as the second argument, training labels will be extracted from the string and appended after feature expansion. The returned sequence arrays can be used for training or testing the CRF model.



118
119
120
121
# File 'lib/anystyle/parser/parser.rb', line 118

# Prepares the passed-in input for processing by the tagger.
#
# The input is converted with input_to_s and tokenized; every token of
# every resulting sequence is then run through #expand together with its
# sequence, its position in the sequence and (if present) its label.
# Returns one array of expanded tokens per sequence.
def prepare(input, tagged = false)
	text = input_to_s(input)

	tokenize(text, tagged).map do |sequence|
		sequence.each_with_index.map do |(token, tag), position|
			expand(token, sequence, position, tag)
		end
	end
end

#test(input) ⇒ Object



142
143
144
145
146
# File 'lib/anystyle/parser/parser.rb', line 142

# Labels tagged input with the current model.
#
# Calls check! on the model's options first, then prepares the input as
# tagged data and returns the result of model.label.
def test(input)
	text = input_to_s(input)
	model.options.check!

	sequences = prepare(text, true)
	model.label(sequences)
end

#tokenize(string, tagged = false) ⇒ Object

Returns an array of tokens for each line of input.

If the passed-in string is marked as being tagged, extracts labels from the string and returns an array of token/label pairs for each line of input.



82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
# File 'lib/anystyle/parser/parser.rb', line 82

# Returns an array of tokens for each line of input.
#
# Lines are split on runs of newline / carriage-return characters. For
# untagged input every line is split on options[:separator].
#
# For tagged input every line is split on options[:tagged_separator];
# <tag> and </tag> pieces open and close labels, empty pieces are dropped,
# and every other piece becomes a [token, label] pair carrying the
# innermost open tag (or :unknown when no tag is open).
#
# Raises ArgumentError when a closing tag does not match the tag that is
# currently open; the message reports the zero-based line index.
def tokenize(string, tagged = false)
	lines = string.split(/[\n\r]+/)

	return lines.map { |line| line.split(options[:separator]) } unless tagged

	lines.each_with_index.map do |line, index|
		open_tags = []

		line.split(options[:tagged_separator]).each_with_object([]) do |piece, pairs|
			case piece
			when /^$/
				next
			when /^<([^\/>][^>]*)>$/
				open_tags << Regexp.last_match(1)
			when /^<\/([^>]+)>$/
				closing = Regexp.last_match(1)
				current = open_tags.pop

				unless current == closing
					raise ArgumentError, "mismatched tags on line #{index}: #{closing.inspect} (current tag was #{current.inspect})"
				end
			else
				pairs << [piece, (open_tags.last || :unknown).to_sym]
			end
		end
	end
end

#train(input, truncate = false) ⇒ Object



133
134
135
136
137
138
139
140
# File 'lib/anystyle/parser/parser.rb', line 133

# Trains the parser's model on the passed-in tagged input and returns the
# model.
#
# When truncate is true, @model is first replaced by a new Wapiti::Model
# built with options[:pattern]; otherwise the existing model is trained
# further. After training, compact is called on the model and its path is
# set to the Parser.models entry selected by options[:model].
def train(input, truncate = false)
	text = input_to_s(input)
	@model = Wapiti::Model.new(:pattern => options[:pattern]) if truncate

	@model.tap do |crf|
		crf.train(prepare(text, true))
		crf.compact
		crf.path = Parser.models[options[:model]]
	end
end