Class: Anystyle::Parser::Parser

Inherits:
Object
  • Object
show all
Defined in:
lib/anystyle/parser/parser.rb

Class Attribute Summary collapse

Instance Attribute Summary collapse

Class Method Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(options = {}) ⇒ Parser


57
58
59
60
61
62
63
# File 'lib/anystyle/parser/parser.rb', line 57

# Creates a parser, loads its model and attaches the shared normalizer.
#
# @param options [Hash] settings merged on top of Parser.defaults;
#   entries given here take precedence over the defaults
def initialize(options = {})
  @options = Parser.defaults.merge(options)

  # reload reads @options[:model], so it has to run after @options is set.
  reload

  @normalizer = Normalizer.instance
end

Class Attribute Details

.defaults ⇒ Object (readonly)

Returns the value of attribute defaults


36
37
38
# File 'lib/anystyle/parser/parser.rb', line 36

# Class-level reader for @defaults. The constructor merges the options it
# receives on top of this value.
def defaults
  @defaults
end

.feature ⇒ Object (readonly)

Returns the value of attribute feature


36
37
38
# File 'lib/anystyle/parser/parser.rb', line 36

# Class-level reader for @feature.
def feature
  @feature
end

.features ⇒ Object (readonly)

Returns the value of attribute features


36
37
38
# File 'lib/anystyle/parser/parser.rb', line 36

# Class-level reader for @features.
def features
  @features
end

.formats ⇒ Object (readonly)

Returns the value of attribute formats


36
37
38
# File 'lib/anystyle/parser/parser.rb', line 36

# Class-level reader for @formats.
def formats
  @formats
end

Instance Attribute Details

#model ⇒ Object

Returns the value of attribute model


55
56
57
# File 'lib/anystyle/parser/parser.rb', line 55

# Returns the model currently held in @model. The model is assigned by
# #reload and replaced by #train when it is asked to truncate.
def model
  @model
end

#normalizer ⇒ Object

Returns the value of attribute normalizer


55
56
57
# File 'lib/anystyle/parser/parser.rb', line 55

# Returns the normalizer held in @normalizer; the constructor sets it to
# Normalizer.instance.
def normalizer
  @normalizer
end

#options ⇒ Object (readonly)

Returns the value of attribute options


53
54
55
# File 'lib/anystyle/parser/parser.rb', line 53

# Returns the effective options of this parser: Parser.defaults merged
# with whatever was passed to the constructor.
def options
  @options
end

Class Method Details

.instance ⇒ Object

Returns a default parser instance


43
44
45
# File 'lib/anystyle/parser/parser.rb', line 43

# Returns a default parser instance.
#
# The parser is built with +new+ (no arguments) the first time it is
# requested and cached in @instance; later calls return the cached object.
def instance
  @instance ||= new
end

.language(string) ⇒ Object


47
48
49
50
# File 'lib/anystyle/parser/parser.rb', line 47

# Asks the configured language detector for the language of +string+.
#
# @param string the text handed to the detector's +detect+ method
# @return whatever the detector returns, or nil when @language_detector
#   is not set
def language(string)
  @language_detector.detect(string) if @language_detector
end

.load(path) ⇒ Object


38
39
40
# File 'lib/anystyle/parser/parser.rb', line 38

# Builds a new parser whose :model option is +path+.
#
# @param path the value stored under :model; #reload passes that option
#   to Wapiti.load
# @return the object returned by +new+
def load(path)
  new :model => path
end

Instance Method Details

#classify(hash) ⇒ Object


205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
# File 'lib/anystyle/parser/parser.rb', line 205

# Guesses the bibliographic type of a parsed reference and stores it in
# hash[:type].
#
# A hash that already has a :type key is returned untouched. Otherwise the
# rules below are tried top to bottom and the first one that applies wins;
# :misc is the fallback. The hash is modified in place.
#
# @param hash [Hash] the reference's fields, keyed by label
# @return [Hash] the same hash object that was passed in
def classify(hash)
  return hash if hash.has_key?(:type)

  # Join the values with a space. Without a separator neighbouring fields
  # fuse into one word ("Patent" + "12345" => "Patent12345"), which defeats
  # the \b-anchored patterns and lets keywords appear across two unrelated
  # fields ("Joseph" + "Data" => "...phD...").
  text = hash.values.flatten.join(' ')

  hash[:type] =
    if hash.has_key?(:journal)
      :article
    elsif text =~ /proceedings/i
      :inproceedings
    elsif hash.has_key?(:medium)
      # Known audiovisual media map to :motion_picture; any other medium
      # is used as the type as-is.
      if hash[:medium].to_s =~ /dvd|video|vhs|motion|television/i
        :motion_picture
      else
        hash[:medium]
      end
    elsif hash.has_key?(:booktitle) || hash.has_key?(:source)
      :incollection
    elsif hash.has_key?(:publisher)
      :book
    elsif text =~ /ph(\.\s*)?d|diss(\.|ertation)|thesis/i
      :thesis
    elsif text =~ /\b[Pp]atent\b/
      :patent
    elsif text =~ /\b[Pp]ersonal [Cc]ommunication\b/
      :personal_communication
    elsif hash.has_key?(:authority)
      :techreport
    elsif text =~ /interview/i
      :interview
    elsif text =~ /videotape/i
      :videotape
    elsif text =~ /unpublished/i
      :unpublished
    else
      :misc
    end

  hash
end

#expand(token, sequence = [], offset = 0, label = nil) ⇒ Object

Expands the passed-in token string by appending a space separated list of all features for the token.


148
149
150
151
152
153
# File 'lib/anystyle/parser/parser.rb', line 148

# Builds the feature line for a single token: the token itself, then every
# feature returned by features_for, then the label (if one was given), all
# separated by single spaces.
#
# @param token [String] the token to expand
# @param sequence [Array] the sequence the token belongs to
# @param offset [Integer] position of the token within the sequence
# @param label the training label to append; skipped when nil
# @return [String] the space-separated line
def expand(token, sequence = [], offset = 0, label = nil)
  columns = features_for(token, strip(token), sequence, offset)
  columns.insert(0, token)
  columns << label unless label.nil?
  columns.join(' ')
end

#label(input, labelled = false) ⇒ Object

Returns an array of label/segment pairs for each line in the passed-in string.


81
82
83
84
85
86
87
88
89
90
91
92
93
# File 'lib/anystyle/parser/parser.rb', line 81

# Labels the input with the model and merges runs of equally-labelled
# tokens into segments.
#
# For every sequence returned by the model, consecutive tokens carrying the
# same label are joined with a single space. Only the leading run of
# non-whitespace characters of each returned token is kept.
#
# @param input the raw input, passed on to #prepare
# @param labelled [Boolean] passed on to #prepare as its second argument
# @return [Array] one array of [label, segment] pairs per sequence
def label(input, labelled = false)
  model.label(prepare(input, labelled)).map! do |sequence|
    sequence.each_with_object([]) do |(expanded, tag), segments|
      word = expanded[/^\S+/]
      tag = tag.to_sym
      current = segments.last

      if current && current.first == tag
        # Same label as the previous token: extend the open segment.
        current[1] << ' ' << word
      else
        segments << [tag, word]
      end
    end
  end
end

#learn(input) ⇒ Object

Trains the model by appending the training data without truncating the current model.

See Also:

  • #train


171
172
173
# File 'lib/anystyle/parser/parser.rb', line 171

# Trains the model by appending the training data without truncating the
# current model.
#
# Delegates to #train with truncate set to false, so the existing @model
# is kept instead of being replaced by a fresh one.
#
# @param input the training data, passed to #train unchanged
# @return whatever #train returns
def learn(input)
  train(input, false)
end

#lines(string) ⇒ Object


127
128
129
# File 'lib/anystyle/parser/parser.rb', line 127

# Splits +string+ into lines.
#
# A break is a single newline or carriage return together with the spaces
# and tabs before it and any whitespace after it, so trailing blanks,
# leading indentation and blank lines are dropped.
#
# @param string [String] the text to split
# @return [Array<String>] the lines
def lines(string)
  line_break = /[ \t]*[\n\r]\s*/
  string.split(line_break)
end

#localize(hash) ⇒ Object


193
194
195
196
197
198
199
200
201
202
203
# File 'lib/anystyle/parser/parser.rb', line 193

# Adds a :language entry to the reference unless it already has one.
#
# The language is looked up via Parser.language from the title, booktitle,
# location and publisher fields (missing ones are skipped), joined by
# single spaces. Nothing is added when that text is empty.
#
# @param hash [Hash] the reference; modified in place
# @return [Hash] the same hash object that was passed in
def localize(hash)
  unless hash.has_key?(:language)
    fields = [:title, :booktitle, :location, :publisher]
    sample = fields.map { |field| hash[field] }.compact.join(' ')

    hash[:language] = Parser.language(sample) unless sample.empty?
  end

  hash
end

#normalize(hash) ⇒ Object


180
181
182
183
184
185
186
187
188
189
190
191
# File 'lib/anystyle/parser/parser.rb', line 180

# Runs the per-label normalizers over the reference, then classifies and
# localizes it.
#
# For every key present when the method starts, the normalizer method
# named "normalize_<key>" is invoked with the whole hash. An error raised
# by one of these calls is reported with +warn+ and does not stop the
# remaining keys from being processed.
#
# @param hash [Hash] the reference; handed to each normalizer
# @return the result of #localize
def normalize(hash)
  hash.keys.each do |label|
    handler = "normalize_#{label}"

    begin
      normalizer.send(handler, hash)
    rescue => error
      warn error.message
    end
  end

  localize(classify(hash))
end

#parse(input, format = options[:format]) ⇒ Object

Raises:

  • (ArgumentError)

71
72
73
74
75
76
77
78
# File 'lib/anystyle/parser/parser.rb', line 71

# Labels the input and converts the result with the requested formatter.
#
# The formatter is the method named "format_<format>" on this object;
# private methods are accepted too.
#
# @param input the raw input, passed to #label
# @param format the output format; defaults to options[:format]
# @return whatever the formatter method returns
# @raise [ArgumentError] if no matching formatter method exists
def parse(input, format = options[:format])
  formatter = :"format_#{format}"

  unless respond_to?(formatter, true)
    raise ArgumentError, "format not supported: #{formatter}"
  end

  send(formatter, label(input))
end

#prepare(input, tagged = false) ⇒ Object

Prepares the passed-in string for processing by a CRF tagger. The string is split into separate lines; each line is tokenized and expanded. Returns an array of sequence arrays that can be labelled by the CRF model.

If the string is marked as being tagged by passing true as the second argument, training labels will be extracted from the string and appended after feature expansion. The returned sequence arrays can be used for training or testing the CRF model.


140
141
142
143
# File 'lib/anystyle/parser/parser.rb', line 140

# Turns the input into the sequences of feature lines the model works on.
#
# The input is converted with input_to_s and tokenized line by line; each
# token is then expanded with #expand, which receives the token, its whole
# sequence, its position in that sequence and its label. For untagged
# input the tokens are plain strings and the label is nil.
#
# @param input the raw input
# @param tagged [Boolean] passed on to #tokenize
# @return [Array<Array>] one array of expanded tokens per sequence
def prepare(input, tagged = false)
  sequences = tokenize(input_to_s(input), tagged)

  sequences.map do |sequence|
    sequence.each_with_index.map do |(token, label), offset|
      expand(token, sequence, offset, label)
    end
  end
end

#reload ⇒ Object


65
66
67
68
69
# File 'lib/anystyle/parser/parser.rb', line 65

# (Re)loads the model named by @options[:model] and applies the parser's
# options to it.
#
# @return [Parser] self
def reload
  @model = Wapiti.load(@options[:model])
  # NOTE(review): presumably copies the matching entries of @options onto
  # the model's own settings — confirm against Wapiti's options API.
  @model.options.update_attributes @options
  self
end

#test(input) ⇒ Object


175
176
177
178
# File 'lib/anystyle/parser/parser.rb', line 175

# Labels tagged input with the current model.
#
# The input is prepared with tagged = true, so it is expected to carry
# labels in the same format as training data.
#
# @param input the tagged input, passed to #prepare
# @return whatever the model's +label+ method returns
def test(input)
  # NOTE(review): check! presumably switches the model into a mode that
  # compares predictions with the supplied labels — confirm against Wapiti.
  model.options.check!
  model.label(prepare(input, true))
end

#tokenize(string, tagged = false) ⇒ Object

Returns an array of tokens for each line of input.

If the passed-in string is marked as being tagged, extracts labels from the string and returns an array of token/label pairs for each line of input.


100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
# File 'lib/anystyle/parser/parser.rb', line 100

# Returns an array of tokens for each line of input.
#
# Untagged input: every line is split on options[:separator] and empty
# pieces are discarded.
#
# Tagged input: every line is split on options[:tagged_separator]. Pieces
# of the form <name> open a tag, pieces of the form </name> close the most
# recently opened one, and every other non-empty piece becomes a
# [text, label] pair, where text is the piece run through decode_xml_text
# and label is the innermost open tag as a symbol (:unknown outside any
# tag).
#
# @param string [String] the input text
# @param tagged [Boolean] whether the input contains label tags
# @return [Array<Array>] per line, an array of tokens or [token, label] pairs
# @raise [ArgumentError] if a closing tag does not match the innermost
#   open tag; the message carries the zero-based line index
def tokenize(string, tagged = false)
  rows = lines(string)

  unless tagged
    return rows.map { |row| row.split(options[:separator]).reject(&:empty?) }
  end

  rows.each_with_index.map do |row, index|
    open_tags = []
    pairs = []

    row.split(options[:tagged_separator]).each do |piece|
      # Splitting leaves empty fragments between adjacent separators.
      next if piece =~ /^$/

      if (opening = piece.match(/^<([^\/>][^>]*)>$/))
        open_tags.push(opening[1])
      elsif (closing = piece.match(/^<\/([^>]+)>$/))
        current = open_tags.pop
        unless current == closing[1]
          raise ArgumentError, "mismatched tags on line #{index}: #{closing[1].inspect} (current tag was #{current.inspect})"
        end
      else
        pairs << [decode_xml_text(piece), (open_tags.last || :unknown).to_sym]
      end
    end

    pairs
  end
end

#train(input = options[:training_data], truncate = true) ⇒ Object


155
156
157
158
159
160
161
162
163
164
165
166
# File 'lib/anystyle/parser/parser.rb', line 155

# Trains the model on tagged input.
#
# When +truncate+ is true, @model is first replaced by a new Wapiti::Model
# built from the parser options without the :model entry. Training is
# skipped when +input+ is nil or empty. In every case the model's path is
# set to options[:model] before the model is returned.
#
# @param input the tagged training data; defaults to options[:training_data]
# @param truncate [Boolean] start from a fresh model instead of the current one
# @return the model held in @model
def train(input = options[:training_data], truncate = true)
  @model = Wapiti::Model.new(options.reject { |key, _| key == :model }) if truncate

  has_data = !(input.nil? || input.empty?)
  @model.train(prepare(input, true)) if has_data

  @model.path = options[:model]
  @model
end