Class: Excite::CRFParser

Inherits:
Object
  • Object
show all
Includes:
Postprocessor, Preprocessor, TokenFeatures
Defined in:
lib/excite/crfparser.rb

Constant Summary collapse

DIR =
File.dirname(__FILE__)
TAGGED_REFERENCES =
"#{DIR}/resources/trainingdata/tagged_references.txt"
TAGGED_HTML_REFERENCES =
"#{DIR}/resources/trainingdata/tagged_html_references.txt"
TRAINING_DATA =
"#{DIR}/resources/trainingdata/training_data.txt"
MODEL_FILE =
"#{DIR}/resources/model"
HTML_MODEL_FILE =
"#{DIR}/resources/html_model"
TEMPLATE_FILE =
"#{DIR}/resources/parsCit.template"
HTML_TEMPLATE_FILE =
"#{DIR}/resources/html.template"
CONFIG_FILE =
"#{DIR}/../../config/parscit_features.yml"

Constants included from Preprocessor

Preprocessor::CLEANUP_RULES_FILE, Preprocessor::MARKER_TYPES

Constants included from TokenFeatures

TokenFeatures::DICT, TokenFeatures::NODE_TYPES_BY_NAME

Instance Attribute Summary collapse

Class Method Summary collapse

Instance Method Summary collapse

Methods included from Postprocessor

#join_multi_word_names, #method_missing, #normalize, #normalize_author, #normalize_author_name, #normalize_date, #normalize_fields, #normalize_pages, #normalize_title, #normalize_volume, #pairable_quote_chars, #repair_and_tokenize_author_text

Methods included from Preprocessor

#cleanup_rules, #normalize_citation, #normalize_cite_text, #segment_citations, #split_citations_by_marker

Methods included from TokenFeatures

#a_is_in_dict, #capitalization, #clear, #dict_status, #firstName, #first_1_char, #first_2_chars, #first_3_chars, #first_4_chars, #first_5_chars, #is_in, #is_proceeding, #lastName, #last_1_char, #last_2_chars, #last_3_chars, #last_4_chars, #last_char, #location, #location_in_node, #monthName, #numbers, #part_of_speech, #placeName, #possible_chapter, #possible_editor, #possible_volume, #publisherName, #punct, #tag_name, #toklcnp

Constructor Details

#initialize(mode = :string) ⇒ CRFParser

Feature functions must be evaluated in alphabetical order, since later functions may depend on the results of earlier ones. TODO: This ordering requirement is confusing and depends on the current feature set.



32
33
34
35
36
37
38
39
# File 'lib/excite/crfparser.rb', line 32

# Loads the feature configuration for +mode+ from CONFIG_FILE.
#
# The "feature_order" list of the selected config section is kept twice:
# in its configured order (@feature_order) and sorted alphabetically
# (@token_features). Feature functions must be evaluated alphabetically,
# since later functions may depend on earlier ones.
#
# @param mode [Symbol] name of the config section to load (default :string)
def initialize(mode=:string)
  @mode = mode

  # Block form guarantees the file handle is closed, even if parsing raises.
  config = File.open(CONFIG_FILE, 'r') { |f| YAML::load(f) }
  names = config[mode.to_s]["feature_order"]
  @feature_order = names.map(&:to_sym)
  @token_features = names.sort.map(&:to_sym)
end

Dynamic Method Handling

This class handles dynamic methods through the method_missing method in the class Excite::Postprocessor

Instance Attribute Details

#feature_order ⇒ Object (readonly)

Returns the value of attribute feature_order.



12
13
14
# File 'lib/excite/crfparser.rb', line 12

# Reader for @feature_order: the feature names (as symbols) in the order
# listed under "feature_order" in the loaded config.
def feature_order
  @feature_order
end

#token_features ⇒ Object (readonly)

Returns the value of attribute token_features.



13
14
15
# File 'lib/excite/crfparser.rb', line 13

# Reader for @token_features: the configured feature names (as symbols)
# sorted alphabetically, which is the order they are evaluated in.
def token_features
  @token_features
end

Class Method Details

.strip_punct(str) ⇒ Object



77
78
79
80
81
# File 'lib/excite/crfparser.rb', line 77

# Drops every non-word character (anything outside [a-zA-Z0-9_]) from +str+.
#
# @param str [String] token text
# @return [String] the remaining word characters, or the placeholder
#   "EMPTY" when nothing is left
def self.strip_punct(str)
  word_chars = str.gsub(/\W/, '')
  # Only word characters can remain here, so a plain emptiness check is enough
  # to spot a token made entirely of punctuation.
  # TODO: the "EMPTY" sentinel looks hacky.
  word_chars.empty? ? "EMPTY" : word_chars
end

Instance Method Details

#default_model_file ⇒ Object



251
252
253
254
255
256
257
258
259
# File 'lib/excite/crfparser.rb', line 251

# Picks the model file that belongs to the parser's mode.
#
# @return [String] MODEL_FILE for :string mode, HTML_MODEL_FILE for :html mode
# @raise [RuntimeError] for any other mode
def default_model_file
  case @mode
  when :string then MODEL_FILE
  when :html then HTML_MODEL_FILE
  else raise "Unknown mode: #{@mode}"
  end
end

#default_tagged_references ⇒ Object



241
242
243
244
245
246
247
248
249
# File 'lib/excite/crfparser.rb', line 241

# Picks the tagged-references training file that belongs to the parser's mode.
#
# @return [String] TAGGED_REFERENCES for :string mode,
#   TAGGED_HTML_REFERENCES for :html mode
# @raise [RuntimeError] for any other mode
def default_tagged_references
  return TAGGED_REFERENCES if @mode == :string
  return TAGGED_HTML_REFERENCES if @mode == :html

  raise "Unknown mode: #{@mode}"
end

#default_template_file ⇒ Object



261
262
263
264
265
266
267
268
269
# File 'lib/excite/crfparser.rb', line 261

# Picks the template file that belongs to the parser's mode.
#
# @return [String] TEMPLATE_FILE for :string mode, HTML_TEMPLATE_FILE for
#   :html mode
# @raise [RuntimeError] for any other mode
def default_template_file
  templates = { :string => TEMPLATE_FILE, :html => HTML_TEMPLATE_FILE }
  templates.fetch(@mode) { raise "Unknown mode: #{@mode}" }
end

#eval_crfpp(feat_seq, model) ⇒ Object



60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
# File 'lib/excite/crfparser.rb', line 60

# Feeds a feature sequence to the tagger and collects its decisions.
#
# @param feat_seq [Array<Array>] one feature vector per token; each vector is
#   joined with single spaces to form the line handed to the tagger
# @param model [#clear, #add, #parse, #y2, #prob] tagger instance
# @return [Array] a triple of
#   * the tag reported by model.y2(i) for each token, in order,
#   * the value of model.prob for the whole sequence, and
#   * a hash mapping each tag to the product of model.prob(i) over the
#     tokens that received that tag
# @raise [RuntimeError] if the tagger rejects a line or fails to parse
def eval_crfpp(feat_seq, model)
  model.clear
  feat_seq.each do |vec|
    line = vec.join(" ").strip
    raise "CRF++ rejected feature line: #{line.inspect}" unless model.add(line)
  end
  raise "CRF++ failed to parse the feature sequence" unless model.parse

  tags = []
  probs = {}
  feat_seq.length.times do |i|
    tag = model.y2(i)
    tags << tag
    # Start each tag's running product at 1, then fold in this token's value.
    probs[tag] = (probs[tag] || 1) * model.prob(i)
  end
  [tags, model.prob, probs]
end

#html_str_2_tokens(str) ⇒ Object



162
163
164
165
166
167
168
169
170
# File 'lib/excite/crfparser.rb', line 162

# Tokenizes an HTML citation by tokenizing each text node of the parsed
# fragment, in traversal order.
#
# @param str [String] HTML source
# @return [Array] tokens from every text node
def html_str_2_tokens(str)
  # Pad each '>' with a space so text from adjacent tags always lands in
  # separate tokens, even when the HTML is malformed.
  spaced = str.gsub('>', '> ')
  fragment = Nokogiri::HTML.fragment(spaced)

  collected = []
  fragment.traverse do |node|
    next unless node.text?

    collected.concat(html_text_node_2_tokens(node))
  end
  collected
end

#html_text_node_2_tokens(node) ⇒ Object



172
173
174
175
176
177
178
179
# File 'lib/excite/crfparser.rb', line 172

# Tokenizes the text of a single HTML text node.
#
# The node text is HTML-unescaped and tokenized as plain text; every token is
# then handed the node, its index among the node's tokens and the node's
# token count through Token#is_in_node!.
#
# @param node [#text] text node
# @return [Array] the node's tokens, or [] when the text is blank
def html_text_node_2_tokens(node)
  decoded = CGI.unescapeHTML(node.text)
  if decoded.blank?
    []
  else
    toks = text_str_2_tokens(decoded)
    total = toks.length
    toks.each_with_index { |tok, pos| tok.is_in_node!(node, pos, total) }
    toks
  end
end

#model ⇒ Object



41
42
43
# File 'lib/excite/crfparser.rb', line 41

# Lazily builds and caches the CRFPP tagger, pointing it at the default model
# file for the current mode.
#
# @return [CRFPP::Tagger] the cached tagger
def model
  return @model if @model

  @model = CRFPP::Tagger.new("-m #{default_model_file} -v 1")
end

#normalize_input_author(str) ⇒ Object



83
84
85
86
# File 'lib/excite/crfparser.rb', line 83

# Turns a presumed-author string into a list of comparable name parts.
#
# Each whitespace-separated word is downcased and passed through
# strip_punct; results of two characters or fewer are dropped.
#
# @param str [String, nil] presumed author text
# @return [Array<String>, nil] normalized name parts, or nil for blank input
def normalize_input_author(str)
  return nil if str.blank?

  str.split.each_with_object([]) do |word, names|
    name = self.class.strip_punct(word.downcase)
    names << name if name.length > 2
  end
end

#parse(str, presumed_author = nil) ⇒ Object



45
46
47
48
49
50
51
52
53
54
55
56
57
58
# File 'lib/excite/crfparser.rb', line 45

# Parses one citation string into labelled fields.
#
# @param str [String] citation text
# @param presumed_author [String, nil] passed through to #str_2_features
# @return [Array] a triple of
#   * a hash of tag => joined text, run through normalize_fields, plus the
#     untouched input under 'raw_string',
#   * the overall probability reported by #eval_crfpp, and
#   * the per-tag probabilities reported by #eval_crfpp
def parse(str, presumed_author=nil)
  original = str.dup

  toks, features = str_2_features(str, false, presumed_author)
  tags, overall_prob, tag_probs = eval_crfpp(features, model)

  fields = {}
  tags.each_with_index do |tag, idx|
    fields[tag] ||= []
    # NOTE(review): for idx 0 the "previous" token is toks[-1], i.e. the last
    # token of the citation — confirm Token#for_join expects that.
    fields[tag] << toks[idx].for_join(toks[idx - 1])
  end
  fields.each_key { |tag| fields[tag] = fields[tag].join('').strip }

  normalize_fields(fields)
  fields['raw_string'] = original
  [fields, overall_prob, tag_probs]
end

#prepare_token_data(raw_string, training = false) ⇒ Object



88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
# File 'lib/excite/crfparser.rb', line 88

# Tokenizes a citation string and, in training mode, labels every token.
#
# In training mode +raw_string+ is parsed into tagged spans. Each span is
# tokenized on its own to find the offset (measured over concatenated raw
# token text) at which its label starts; the unescaped span contents are then
# tokenized as a whole and labels are assigned by walking those offsets.
# Finally the feature state is reset via #clear.
#
# @param raw_string [String] citation text (tagged markup when training)
# @param training [Boolean] whether to extract and assign labels
# @return [Array] the tokens produced by #str_2_tokens
# @raise [RuntimeError] on an unrecognized label, when a token straddles a
#   label boundary, or when labels are left over after the last token
def prepare_token_data(raw_string, training=false)
  if training
    # Keep the tagged input around: raw_string is rebuilt below from the
    # unescaped span contents, and error messages need the original.
    tagged_input = raw_string.strip
    labels, raw_string, label_offset = [], '', 0

    tagged_string_2_tags(tagged_input).each do |tag|
      raw = CGI.unescapeHTML(tag.inner_html)

      label = tag.name
      if label.present? && !recognized_labels.include?(label)
        raise "Invalid label #{label} for:\n#{tagged_input}"
      end

      # Each label starts where the raw text of all earlier spans' tokens ends.
      labels << [label, label_offset]
      label_offset += str_2_tokens(raw).map(&:raw).join.length
      raw_string += "\n#{raw}"
    end
  end

  tokens = str_2_tokens(raw_string.strip)

  if training
    consumed = 0
    label, _ = labels.shift
    next_label, end_idx = labels.shift unless labels.empty?

    tokens.each do |tok|
      tok.label = label
      consumed += tok.raw.length
      if consumed == end_idx
        label = next_label
        next_label, end_idx = labels.shift unless labels.empty?
      elsif !labels.empty? && consumed > end_idx
        # labels is checked first: with a single label end_idx is nil and
        # must not be compared against.
        # NOTE(review): once the label queue is empty, overshooting the last
        # pending boundary is not reported and that final label is never
        # applied — confirm whether that is intended.
        raise "Tokens do not match labels"
      end
    end
    raise "Unused label" unless labels.empty?
  end

  self.clear

  tokens
end

#recognized_labels ⇒ Object



152
153
154
155
156
157
158
159
160
# File 'lib/excite/crfparser.rb', line 152

# Lists the field labels accepted in training data for the current mode.
#
# Both known modes share the same leading labels; :string mode adds "tech",
# :html mode adds "workid", "link" and "bullet".
#
# @return [Array<String>] a fresh array of labels; empty for any other mode
def recognized_labels
  shared = %w[author title editor booktitle date journal volume institution pages location publisher note]

  case @mode
  when :string then shared + %w[tech]
  when :html then shared + %w[workid link bullet]
  else []
  end
end

#str_2_features(raw_string, training = false, presumed_author = nil) ⇒ Object

Calculates features on the full citation string.



188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
# File 'lib/excite/crfparser.rb', line 188

# Computes the feature rows for every token of a citation string.
#
# Feature functions are invoked by name in @token_features order (with the
# full token list, the token index and the normalized author names); each
# row then lists the token's raw text followed by the feature values in
# @feature_order order and, when training, the token's label.
#
# @param raw_string [String] citation text (tagged markup when training)
# @param training [Boolean] whether tokens carry labels to be emitted
# @param presumed_author [String, nil] optional author hint for the features
# @return [Array] pair of [tokens, feature rows]
# @raise [RuntimeError] when training and a token has no label
def str_2_features(raw_string, training=false, presumed_author=nil)
  tokens = prepare_token_data(raw_string, training)
  author_names = normalize_input_author(presumed_author)

  rows = tokens.each_with_index.map do |tok, idx|
    raise "All tokens must be labeled" if training && tok.label.nil?

    computed = @token_features.each_with_object({}) do |name, acc|
      acc[name] = send(name, tokens, idx, author_names)
    end

    row = [tok.raw]
    row.concat(@feature_order.map { |name| computed[name] })
    row << tok.label if training
    row
  end

  [tokens, rows]
end

#str_2_tokens(str) ⇒ Object



142
143
144
145
146
147
148
149
150
# File 'lib/excite/crfparser.rb', line 142

# Tokenizes +str+ with the tokenizer matching the parser's mode and drops
# tokens that report themselves as empty.
#
# @param str [String] HTML (in :html mode) or plain text (in :string mode)
# @return [Array] the non-empty tokens
# @raise [RuntimeError] for an unknown mode (previously this surfaced as a
#   NoMethodError on nil)
def str_2_tokens(str)
  toks =
    case @mode
    when :html then html_str_2_tokens(str)
    when :string then text_str_2_tokens(str)
    else raise "Unknown mode: #{@mode}"
    end

  toks.reject(&:empty?)
end

#tagged_string_2_tags(str) ⇒ Object



136
137
138
139
140
# File 'lib/excite/crfparser.rb', line 136

# Parses a tagged string as an XML fragment and returns its element nodes.
#
# The input is wrapped in a <string> element so it can be selected after
# parsing; text nodes directly under it are discarded.
#
# @param str [String] markup consisting of tagged spans
# @return [Array] the non-text child nodes of the wrapper
def tagged_string_2_tags(str)
  wrapped = "<string>#{str}</string>"
  wrapper = Nokogiri::XML.fragment(wrapped).css('string')
  wrapper.children.reject { |child| child.text? }
end

#tagger ⇒ Object



132
133
134
# File 'lib/excite/crfparser.rb', line 132

# Lazily creates and caches the EngTagger instance used by
# #text_str_2_tokens.
#
# @return [EngTagger] the cached tagger
def tagger
  @tagger = EngTagger.new unless @tagger
  @tagger
end

#text_str_2_tokens(text) ⇒ Object



181
182
183
184
185
# File 'lib/excite/crfparser.rb', line 181

# Tokenizes plain citation text.
#
# The text is normalized, tagged by #tagger, and the tagged output is parsed
# as markup; each resulting element becomes a Token built from the element's
# text and its tag name.
#
# @param text [String] plain citation text
# @return [Array<Token>] one token per tagged element
def text_str_2_tokens(text)
  pos_tagged = tagger.add_tags(normalize_citation(text))
  # Escape ampersands only: the angle brackets added by the tagger are real
  # markup and must stay parseable, while a bare '&' would not be valid XML.
  escaped = pos_tagged.gsub('&', '&amp;')
  tagged_string_2_tags(escaped).map { |el| Token.new(el.text, el.name) }
end

#train(tagged_refs = nil, model = nil, template = nil, training_data = nil) ⇒ Object



228
229
230
231
232
233
234
235
236
237
238
239
# File 'lib/excite/crfparser.rb', line 228

# Trains a model by running the external crf_learn command.
#
# Missing arguments fall back to the defaults for the current mode. When no
# training data file is given, TRAINING_DATA is (re)generated from the tagged
# references first.
#
# @param tagged_refs [String, nil] path of the tagged references file
# @param model [String, nil] path the trained model is written to
# @param template [String, nil] path of the template file
# @param training_data [String, nil] path of an existing training data file
# @return [String] captured standard output of the command (its stdout is
#   redirected to stderr by the command line itself)
def train(tagged_refs=nil, model=nil, template=nil, training_data=nil)
  require 'shellwords' # stdlib; needed only for the command line below

  tagged_refs ||= default_tagged_references
  model ||= default_model_file
  template ||= default_template_file

  if training_data.nil?
    training_data = TRAINING_DATA
    write_training_file(tagged_refs, training_data)
  end

  # Escape the paths so spaces or shell metacharacters (e.g. in the install
  # directory) reach crf_learn as single, literal arguments.
  paths = Shellwords.join([template, training_data, model])
  `crf_learn #{paths} -f3 1>&2`
end

#write_training_file(tagged_refs = nil, training_data = TRAINING_DATA) ⇒ Object



211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
# File 'lib/excite/crfparser.rb', line 211

# Converts a file of tagged references into a CRF training data file.
#
# Every input line is stripped and run through #str_2_features in training
# mode; each resulting feature row is written as one space-joined line, and
# a blank line terminates each reference.
#
# @param tagged_refs [String, nil] input path; defaults to the tagged
#   references file for the current mode
# @param training_data [String] output path (overwritten)
# @return [nil]
def write_training_file(tagged_refs=nil, training_data=TRAINING_DATA)
  tagged_refs ||= default_tagged_references

  # Block forms close both files even when feature extraction raises.
  File.open(tagged_refs, 'r') do |fin|
    File.open(training_data, 'w') do |fout|
      fin.each_line do |l|
        _, data = str_2_features(l.strip, true)
        data.each { |line| fout.write("#{line.join(" ")}\n") }
        fout.write("\n")
      end
    end
  end
  nil
end