Class: Excite::CRFParser
Constant Summary
collapse
- DIR =
File.dirname(__FILE__)
- TAGGED_REFERENCES =
"#{DIR}/resources/trainingdata/tagged_references.txt"
- TAGGED_HTML_REFERENCES =
"#{DIR}/resources/trainingdata/tagged_html_references.txt"
- TRAINING_DATA =
"#{DIR}/resources/trainingdata/training_data.txt"
- MODEL_FILE =
"#{DIR}/resources/model"
- HTML_MODEL_FILE =
"#{DIR}/resources/html_model"
- TEMPLATE_FILE =
"#{DIR}/resources/parsCit.template"
- HTML_TEMPLATE_FILE =
"#{DIR}/resources/html.template"
- CONFIG_FILE =
"#{DIR}/../../config/parscit_features.yml"
Preprocessor::CLEANUP_RULES_FILE, Preprocessor::MARKER_TYPES
TokenFeatures::DICT, TokenFeatures::NODE_TYPES_BY_NAME
Instance Attribute Summary collapse
Class Method Summary
collapse
Instance Method Summary
collapse
-
#default_model_file ⇒ Object
-
#default_tagged_references ⇒ Object
-
#default_template_file ⇒ Object
-
#eval_crfpp(feat_seq, model) ⇒ Object
-
#html_str_2_tokens(str) ⇒ Object
-
#html_text_node_2_tokens(node) ⇒ Object
-
#initialize(mode = :string) ⇒ CRFParser
constructor
Feature functions must be performed in alphabetical order, since later functions may depend on earlier ones.
-
#model ⇒ Object
-
#normalize_input_author(str) ⇒ Object
-
#parse(str, presumed_author = nil) ⇒ Object
-
#prepare_token_data(raw_string, training = false) ⇒ Object
-
#recognized_labels ⇒ Object
-
#str_2_features(raw_string, training = false, presumed_author = nil) ⇒ Object
Calculates features on the full citation string.
-
#str_2_tokens(str) ⇒ Object
-
#tagged_string_2_tags(str) ⇒ Object
-
#tagger ⇒ Object
-
#text_str_2_tokens(text) ⇒ Object
-
#train(tagged_refs = nil, model = nil, template = nil, training_data = nil) ⇒ Object
-
#write_training_file(tagged_refs = nil, training_data = TRAINING_DATA) ⇒ Object
#join_multi_word_names, #method_missing, #normalize, #normalize_author, #normalize_author_name, #normalize_date, #normalize_fields, #normalize_pages, #normalize_title, #normalize_volume, #pairable_quote_chars, #repair_and_tokenize_author_text
#cleanup_rules, #normalize_citation, #normalize_cite_text, #segment_citations, #split_citations_by_marker
#capitalization, #clear, #dict_status, #firstName, #first_1_char, #first_2_chars, #first_3_chars, #first_4_chars, #first_5_chars, #is_in, #lastName, #last_1_char, #last_2_chars, #last_3_chars, #last_4_chars, #last_char, #location, #location_in_node, #monthName, #numbers, #part_of_speech, #placeName, #possible_chapter, #possible_editor, #possible_volume, #publisherName, #punct, #tag_name, #toklcnp
Constructor Details
#initialize(mode = :string) ⇒ CRFParser
Feature functions must be run in alphabetical order, since later functions may depend on earlier ones. TODO: This is confusing and depends on the current set of features.
32
33
34
35
36
37
38
39
|
# File 'lib/excite/crfparser.rb', line 32
# Builds a parser for the given mode and loads that mode's feature
# configuration from CONFIG_FILE.
#
# @param mode [Symbol] key (via #to_s) into the top-level YAML mapping
# @raise [RuntimeError] when the config file has no section for +mode+
def initialize(mode=:string)
  @mode = mode
  # Block form of File.open closes the handle even if YAML parsing raises.
  config = File.open(CONFIG_FILE, 'r') { |f| YAML::load(f) }
  settings = config[mode.to_s]
  raise "No feature configuration for mode: #{mode}" if settings.nil?

  names = settings["feature_order"]
  # Order in which feature values are emitted into each feature row.
  @feature_order = names.map(&:to_sym)
  # Order in which the feature functions are invoked: alphabetical, because
  # later functions may depend on earlier ones.
  @token_features = names.sort.map(&:to_sym)
end
|
Dynamic Method Handling
This class handles dynamic methods through the method_missing method
in the class Excite::Postprocessor
Instance Attribute Details
#feature_order ⇒ Object
Returns the value of attribute feature_order.
12
13
14
|
# File 'lib/excite/crfparser.rb', line 12
# Feature names (symbols) in the order their values are written into each
# feature row; populated by the constructor from the "feature_order" config.
def feature_order
  @feature_order
end
|
#token_features ⇒ Object
Returns the value of attribute token_features.
13
14
15
|
# File 'lib/excite/crfparser.rb', line 13
# Feature function names (symbols) in the order they are invoked; the
# constructor stores the alphabetically sorted "feature_order" list here.
def token_features
  @token_features
end
|
Class Method Details
.strip_punct(str) ⇒ Object
77
78
79
|
# File 'lib/excite/crfparser.rb', line 77
# Returns a copy of +str+ with every non-alphanumeric character removed.
#
# @param str [String]
# @return [String]
def self.strip_punct(str)
  non_alnum_run = /[^[:alnum:]]+/
  str.gsub(non_alnum_run, '')
end
|
Instance Method Details
#default_model_file ⇒ Object
261
262
263
264
265
266
267
268
269
|
# File 'lib/excite/crfparser.rb', line 261
# Path of the trained model file for the current mode.
#
# @return [String] MODEL_FILE for :string mode, HTML_MODEL_FILE for :html
# @raise [RuntimeError] for any other mode
def default_model_file
  case @mode
  when :string then MODEL_FILE
  when :html then HTML_MODEL_FILE
  else raise "Unknown mode: #{@mode}"
  end
end
|
#default_tagged_references ⇒ Object
251
252
253
254
255
256
257
258
259
|
# File 'lib/excite/crfparser.rb', line 251
# Path of the tagged reference file used as training input for the current mode.
#
# @return [String] TAGGED_REFERENCES for :string mode,
#   TAGGED_HTML_REFERENCES for :html
# @raise [RuntimeError] for any other mode
def default_tagged_references
  case @mode
  when :string then TAGGED_REFERENCES
  when :html then TAGGED_HTML_REFERENCES
  else raise "Unknown mode: #{@mode}"
  end
end
|
#default_template_file ⇒ Object
271
272
273
274
275
276
277
278
279
|
# File 'lib/excite/crfparser.rb', line 271
# Path of the CRF template file for the current mode.
#
# @return [String] TEMPLATE_FILE for :string mode, HTML_TEMPLATE_FILE for :html
# @raise [RuntimeError] for any other mode
def default_template_file
  case @mode
  when :string then TEMPLATE_FILE
  when :html then HTML_TEMPLATE_FILE
  else raise "Unknown mode: #{@mode}"
  end
end
|
#eval_crfpp(feat_seq, model) ⇒ Object
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
|
# File 'lib/excite/crfparser.rb', line 60
# Feeds a feature sequence to the tagger and collects its output.
#
# @param feat_seq [Array<Array>] one feature vector per token; each vector is
#   joined with single spaces into one tagger input line
# @param model [#clear, #add, #parse, #y2, #prob] tagger object (built by
#   #model as a CRFPP::Tagger)
# @return [Array] three elements: the tag for each position, the value of
#   +model.prob+ called without arguments, and a hash mapping each tag to the
#   product of +model.prob(i)+ over the positions carrying that tag
# @raise [RuntimeError] when the tagger rejects a line or fails to parse
def eval_crfpp(feat_seq, model)
  model.clear
  feat_seq.each do |vec|
    line = vec.join(" ").strip
    # A bare `raise` here surfaced only as "unhandled exception"; say what failed.
    raise "CRF++ tagger rejected feature line: #{line.inspect}" unless model.add(line)
  end
  raise "CRF++ tagger failed to parse #{feat_seq.length} feature line(s)" unless model.parse

  tags = []
  probs = {}
  feat_seq.length.times do |i|
    tag = model.y2(i) # looked up once per position instead of three times
    tags << tag
    probs[tag] ||= 1
    probs[tag] *= model.prob(i)
  end
  [tags, model.prob, probs]
end
|
#html_str_2_tokens(str) ⇒ Object
168
169
170
171
172
173
174
175
176
177
178
179
180
|
# File 'lib/excite/crfparser.rb', line 168
# Tokenizes an HTML string by walking the parsed fragment.
# Text nodes are expanded through #html_text_node_2_tokens, <br> elements
# become a Token.for_br token, and every other node is skipped.
#
# @param str [String] HTML markup
# @return [Array] tokens in traversal order
def html_str_2_tokens(str)
  # A space is inserted after every '>' before parsing.
  fragment = Nokogiri::HTML.fragment(str.gsub('>', '> '))
  collected = []
  fragment.traverse do |node|
    if node.text?
      collected.concat(html_text_node_2_tokens(node))
    elsif node.name == 'br'
      collected << Token.for_br(node)
    end
  end
  collected
end
|
#html_text_node_2_tokens(node) ⇒ Object
182
183
184
185
186
187
188
189
|
# File 'lib/excite/crfparser.rb', line 182
# Tokenizes the text of one HTML text node and tells each token where it
# sits inside that node.
#
# @param node [#text] text node
# @return [Array] tokens; empty when the unescaped text is blank
def html_text_node_2_tokens(node)
  decoded = CGI.unescapeHTML(node.text)
  return [] if decoded.blank?

  text_str_2_tokens(decoded).tap do |toks|
    total = toks.length
    # Each token receives the node, its index within it and the token count.
    toks.each_with_index { |tok, pos| tok.is_in_node!(node, pos, total) }
  end
end
|
#model ⇒ Object
41
42
43
|
# File 'lib/excite/crfparser.rb', line 41
# Lazily builds and memoizes the CRF++ tagger for the current mode's
# default model file.
#
# @return [CRFPP::Tagger]
def model
  @model ||= begin
    tagger_args = "-m #{default_model_file} -v 1"
    CRFPP::Tagger.new(tagger_args)
  end
end
|
#normalize_input_author(str) ⇒ Object
81
82
83
84
|
# File 'lib/excite/crfparser.rb', line 81
# Breaks a presumed-author string into lowercase name parts with punctuation
# removed, keeping only parts longer than two characters.
#
# @param str [String, nil]
# @return [Array<String>, nil] nil when +str+ is blank
def normalize_input_author(str)
  return nil if str.blank?

  str.split.each_with_object([]) do |word, kept|
    cleaned = self.class.strip_punct(word.downcase)
    kept << cleaned if cleaned.length > 2
  end
end
|
#parse(str, presumed_author = nil) ⇒ Object
45
46
47
48
49
50
51
52
53
54
55
56
57
58
|
# File 'lib/excite/crfparser.rb', line 45
# Parses one citation string into labelled fields.
#
# @param str [String] the citation text
# @param presumed_author [String, nil] forwarded to #str_2_features
# @return [Array] three elements: a hash of tag => joined text (plus
#   'raw_string' holding a copy of the input), and the overall probability
#   and per-tag probabilities as returned by #eval_crfpp
def parse(str, presumed_author=nil)
  original = str.dup
  tokens, features = str_2_features(str, false, presumed_author)
  tags, overall_prob, tag_probs = eval_crfpp(features, model)

  pieces_by_tag = {}
  tags.each_with_index do |tag, idx|
    pieces_by_tag[tag] ||= []
    # NOTE(review): for idx 0 the "previous" token is tokens[-1], i.e. the
    # last token. Kept as-is; confirm whether Token#for_join expects that.
    pieces_by_tag[tag] << tokens[idx].for_join(tokens[idx - 1])
  end

  fields = {}
  pieces_by_tag.each { |tag, pieces| fields[tag] = pieces.join('').strip }

  normalize_fields(fields)
  fields['raw_string'] = original
  [fields, overall_prob, tag_probs]
end
|
#prepare_token_data(raw_string, training = false) ⇒ Object
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
|
# File 'lib/excite/crfparser.rb', line 86
# Tokenizes a citation and, in training mode, labels each token.
#
# In training mode +raw_string+ is a tagged reference: each tag's name is the
# label and its unescaped inner content is the text. The untagged text is
# rebuilt from the tag contents (one per line), tokenized as a whole, and
# labels are assigned by comparing the running length of the concatenated
# token text against the offset at which each tag's tokens began.
#
# @param raw_string [String] plain citation, or tagged citation when training
# @param training [Boolean]
# @return [Array] tokens; in :html mode leading and trailing 'br' tokens are
#   dropped
# @raise [RuntimeError] on an unrecognized label, on tokens that overrun a
#   label boundary while labels remain, or on labels left unused
def prepare_token_data(raw_string, training=false)
  if training
    # Keep the tagged input: raw_string is rebuilt below, and the error
    # message previously interpolated an undefined name.
    tagged_input = raw_string.strip
    tags = tagged_string_2_tags(tagged_input)
    labels, raw_string, joined_tokens = [], '', ''
    tags.each do |tag|
      raw = CGI.unescapeHTML(tag.inner_html)
      label = tag.name
      if label.present? && !recognized_labels.include?(label)
        raise "Invalid label #{label} for:\n#{tagged_input}"
      end
      toks = str_2_tokens(raw)
      # Record the offset (in concatenated token text) where this label starts.
      labels << [label, joined_tokens.length]
      joined_tokens += toks.map(&:raw).join
      raw_string += "\n#{raw}"
    end
  end

  tokens = str_2_tokens(raw_string.strip)

  if training
    joined_tokens = ''
    label, _ = labels.shift
    next_label, end_idx = labels.shift unless labels.empty?
    tokens.each do |tok|
      tok.label = label
      joined_tokens += tok.raw
      if joined_tokens.length == end_idx
        label = next_label
        next_label, end_idx = labels.shift unless labels.empty?
      elsif !labels.empty? && joined_tokens.length > end_idx
        # labels.empty? is checked first: with fewer than two labels end_idx
        # is nil and comparing an Integer with nil would raise ArgumentError.
        raise "Tokens do not match labels"
      end
    end
    raise "Unused label" unless labels.empty?
  end

  if @mode == :html
    # Trim 'br' tokens from both ends.
    tokens = tokens.drop_while { |t| t.part_of_speech == 'br' }
    tokens.reverse!
    tokens = tokens.drop_while { |t| t.part_of_speech == 'br' }
    tokens.reverse!
  end

  clear
  tokens
end
|
#recognized_labels ⇒ Object
158
159
160
161
162
163
164
165
166
|
# File 'lib/excite/crfparser.rb', line 158
# Labels accepted in tagged training data for the current mode.
#
# @return [Array<String>] empty for any mode other than :string or :html
def recognized_labels
  shared = %w[author title editor booktitle date journal volume institution pages location publisher note]
  case @mode
  when :string then shared + %w[tech]
  when :html then shared + %w[workid link bullet]
  else []
  end
end
|
#str_2_features(raw_string, training = false, presumed_author = nil) ⇒ Object
Calculates features on the full citation string.
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
|
# File 'lib/excite/crfparser.rb', line 198
# Computes one feature row per token of the citation string.
#
# Each row is the token's raw text, followed by the feature values in
# @feature_order order, followed by the token's label when training.
#
# @param raw_string [String] citation (tagged when +training+ is true)
# @param training [Boolean]
# @param presumed_author [String, nil] normalized and passed to every
#   feature function
# @return [Array] the tokens and the list of feature rows
# @raise [RuntimeError] when training and a token has no label
def str_2_features(raw_string, training=false, presumed_author=nil)
  tokens = prepare_token_data(raw_string, training)
  author_names = normalize_input_author(presumed_author)

  rows = tokens.each_with_index.map do |tok, idx|
    raise "All tokens must be labeled" if training && tok.label.nil?

    # Feature functions are invoked in @token_features order, then their
    # values are emitted in @feature_order order.
    computed = {}
    @token_features.each { |name| computed[name] = send(name, tokens, idx, author_names) }

    row = [tok.raw]
    @feature_order.each { |name| row << computed[name] }
    row << tok.label if training
    row
  end

  [tokens, rows]
end
|
#str_2_tokens(str) ⇒ Object
148
149
150
151
152
153
154
155
156
|
# File 'lib/excite/crfparser.rb', line 148
# Tokenizes +str+ with the tokenizer for the current mode and drops empty
# tokens.
#
# @param str [String]
# @return [Array] non-empty tokens
# @raise [RuntimeError] for a mode other than :html or :string (previously
#   this surfaced as a NoMethodError on nil)
def str_2_tokens(str)
  toks =
    case @mode
    when :html then html_str_2_tokens(str)
    when :string then text_str_2_tokens(str)
    else raise "Unknown mode: #{@mode}"
    end
  toks.reject(&:empty?)
end
|
#tagged_string_2_tags(str) ⇒ Object
142
143
144
145
146
|
# File 'lib/excite/crfparser.rb', line 142
# Parses a tagged string and returns its top-level tags.
# The input is wrapped in a <string> element, parsed as an XML fragment, and
# the non-text children of that element are returned.
#
# @param str [String] text containing XML-style tags
# @return [Array] child nodes that are not text nodes
def tagged_string_2_tags(str)
  wrapped = "<string>#{str}</string>"
  Nokogiri::XML.fragment(wrapped).css('string').children.reject(&:text?)
end
|
#tagger ⇒ Object
138
139
140
|
# File 'lib/excite/crfparser.rb', line 138
# Lazily creates and memoizes the EngTagger instance used by
# #text_str_2_tokens.
#
# @return [EngTagger]
def tagger
  @tagger = EngTagger.new unless @tagger
  @tagger
end
|
#text_str_2_tokens(text) ⇒ Object
191
192
193
194
195
|
# File 'lib/excite/crfparser.rb', line 191
# Tokenizes plain citation text: the text is normalized, tagged by the
# EngTagger instance, and each resulting tag becomes a Token built from the
# tag's text and name.
#
# @param text [String]
# @return [Array<Token>]
def text_str_2_tokens(text)
  tagged = tagger.add_tags(normalize_citation(text))
  # Ampersands are escaped before the tagged string is parsed as XML.
  # NOTE(review): this listing showed gsub('&','&'), a no-op that looks like
  # an HTML-decoded '&amp;'; restored here, confirm against the gem source.
  tags = tagged_string_2_tags(tagged.gsub('&', '&amp;'))
  tags.map { |tag| Token.new(tag.text, tag.name) }
end
|
#train(tagged_refs = nil, model = nil, template = nil, training_data = nil) ⇒ Object
238
239
240
241
242
243
244
245
246
247
248
249
|
# File 'lib/excite/crfparser.rb', line 238
# Trains a model by shelling out to the +crf_learn+ command.
#
# @param tagged_refs [String, nil] tagged reference file; defaults per mode
# @param model [String, nil] output model path; defaults per mode
# @param template [String, nil] template path; defaults per mode
# @param training_data [String, nil] training data file. When nil,
#   TRAINING_DATA is used and is first regenerated from +tagged_refs+; when
#   given, the file is used as-is.
# @return [String] captured standard output of the command (its stdout is
#   redirected to stderr by the command line itself)
def train(tagged_refs=nil, model=nil, template=nil, training_data=nil)
  require 'shellwords' # stdlib; required here because this is its only use

  tagged_refs ||= default_tagged_references
  model ||= default_model_file
  template ||= default_template_file
  if training_data.nil?
    training_data = TRAINING_DATA
    write_training_file(tagged_refs, training_data)
  end

  # Escape each path so that spaces or shell metacharacters (e.g. in an
  # install directory) cannot split or alter the command.
  paths = [template, training_data, model].map { |path| Shellwords.escape(path) }.join(' ')
  `crf_learn #{paths} -f3 1>&2`
end
|
#write_training_file(tagged_refs = nil, training_data = TRAINING_DATA) ⇒ Object
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
|
# File 'lib/excite/crfparser.rb', line 221
# Converts a file of tagged references (one per line) into a training data
# file: one space-joined feature row per token, with a blank line after each
# reference.
#
# @param tagged_refs [String, nil] input path; defaults per mode
# @param training_data [String] output path (overwritten)
# @return [nil]
def write_training_file(tagged_refs=nil, training_data=TRAINING_DATA)
  tagged_refs ||= default_tagged_references
  # Block forms close both files even when feature extraction raises.
  File.open(tagged_refs, 'r') do |fin|
    File.open(training_data, 'w') do |fout|
      fin.each_line do |l|
        _, data = str_2_features(l.strip, true)
        data.each { |line| fout.write("#{line.join(" ")}\n") }
        fout.write("\n")
      end
    end
  end
  nil
end
|