Class: Anystyle::Parser::Parser
- Inherits:
-
Object
- Object
- Anystyle::Parser::Parser
- Defined in:
- lib/anystyle/parser/parser.rb
Class Attribute Summary collapse
-
.defaults ⇒ Object
readonly
Returns the value of attribute defaults.
-
.feature ⇒ Object
readonly
Returns the value of attribute feature.
-
.features ⇒ Object
readonly
Returns the value of attribute features.
-
.formats ⇒ Object
readonly
Returns the value of attribute formats.
Instance Attribute Summary collapse
-
#model ⇒ Object
Returns the value of attribute model.
-
#normalizer ⇒ Object
Returns the value of attribute normalizer.
-
#options ⇒ Object
readonly
Returns the value of attribute options.
Class Method Summary collapse
-
.instance ⇒ Object
Returns a default parser instance.
- .language(string) ⇒ Object
- .load(path) ⇒ Object
Instance Method Summary collapse
- #classify(hash) ⇒ Object
-
#expand(token, sequence = [], offset = 0, label = nil) ⇒ Object
Expands the passed-in token string by appending a space separated list of all features for the token.
-
#initialize(options = {}) ⇒ Parser
constructor
A new instance of Parser.
-
#label(input, labelled = false) ⇒ Object
Returns an array of label/segment pairs for each line in the passed-in string.
-
#learn(input) ⇒ Object
Trains the model by appending the training data without truncating the current model.
- #lines(string) ⇒ Object
- #localize(hash) ⇒ Object
- #normalize(hash) ⇒ Object
- #parse(input, format = options[:format]) ⇒ Object
-
#prepare(input, tagged = false) ⇒ Object
Prepares the passed-in string for processing by a CRF tagger.
- #reload ⇒ Object
- #test(input) ⇒ Object
-
#tokenize(string, tagged = false) ⇒ Object
Returns an array of tokens for each line of input.
- #train(input = options[:training_data], truncate = true) ⇒ Object
Constructor Details
#initialize(options = {}) ⇒ Parser
Returns a new instance of Parser.
57 58 59 60 61 62 63 |
# File 'lib/anystyle/parser/parser.rb', line 57 def initialize(options = {}) @options = Parser.defaults.merge(options) reload @normalizer = Normalizer.instance end |
Class Attribute Details
.defaults ⇒ Object (readonly)
Returns the value of attribute defaults.
36 37 38 |
# File 'lib/anystyle/parser/parser.rb', line 36 def defaults @defaults end |
.feature ⇒ Object (readonly)
Returns the value of attribute feature.
36 37 38 |
# File 'lib/anystyle/parser/parser.rb', line 36 def feature @feature end |
.features ⇒ Object (readonly)
Returns the value of attribute features.
36 37 38 |
# File 'lib/anystyle/parser/parser.rb', line 36 def features @features end |
.formats ⇒ Object (readonly)
Returns the value of attribute formats.
36 37 38 |
# File 'lib/anystyle/parser/parser.rb', line 36 def formats @formats end |
Instance Attribute Details
#model ⇒ Object
Returns the value of attribute model.
55 56 57 |
# File 'lib/anystyle/parser/parser.rb', line 55 def model @model end |
#normalizer ⇒ Object
Returns the value of attribute normalizer.
55 56 57 |
# File 'lib/anystyle/parser/parser.rb', line 55 def normalizer @normalizer end |
#options ⇒ Object (readonly)
Returns the value of attribute options.
53 54 55 |
# File 'lib/anystyle/parser/parser.rb', line 53 def options @options end |
Class Method Details
.instance ⇒ Object
Returns a default parser instance.
43 44 45 |
# File 'lib/anystyle/parser/parser.rb', line 43 def instance @instance ||= new end |
.language(string) ⇒ Object
47 48 49 50 |
# File 'lib/anystyle/parser/parser.rb', line 47 def language(string) return unless @language_detector @language_detector.detect string end |
.load(path) ⇒ Object
38 39 40 |
# File 'lib/anystyle/parser/parser.rb', line 38 def load(path) new :model => path end |
Instance Method Details
#classify(hash) ⇒ Object
205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 |
# File 'lib/anystyle/parser/parser.rb', line 205 def classify(hash) return hash if hash.has_key?(:type) keys = hash.keys text = hash.values.flatten.join case when keys.include?(:journal) hash[:type] = :article when text =~ /proceedings/i hash[:type] = :inproceedings when keys.include?(:medium) if hash[:medium].to_s =~ /dvd|video|vhs|motion|television/i hash[:type] = :motion_picture else hash[:type] = hash[:medium] end when keys.include?(:booktitle), keys.include?(:source) hash[:type] = :incollection when keys.include?(:publisher) hash[:type] = :book when text =~ /ph(\.\s*)?d|diss(\.|ertation)|thesis/i hash[:type] = :thesis when text =~ /\b[Pp]atent\b/ hash[:type] = :patent when text =~ /\b[Pp]ersonal [Cc]ommunication\b/ hash[:type] = :personal_communication when keys.include?(:authority) hash[:type] = :techreport when text =~ /interview/i hash[:type] = :interview when text =~ /videotape/i hash[:type] = :videotape when text =~ /unpublished/i hash[:type] = :unpublished else hash[:type] = :misc end hash end |
#expand(token, sequence = [], offset = 0, label = nil) ⇒ Object
Expands the passed-in token string by appending a space separated list of all features for the token.
148 149 150 151 152 153 |
# File 'lib/anystyle/parser/parser.rb', line 148 def expand(token, sequence = [], offset = 0, label = nil) f = features_for(token, strip(token), sequence, offset) f.unshift(token) f.push(label) unless label.nil? f.join(' ') end |
#label(input, labelled = false) ⇒ Object
Returns an array of label/segment pairs for each line in the passed-in string.
81 82 83 84 85 86 87 88 89 90 91 92 93 |
# File 'lib/anystyle/parser/parser.rb', line 81 def label(input, labelled = false) model.label(prepare(input, labelled)).map! do |sequence| sequence.inject([]) do |ts, (token, label)| token, label = token[/^\S+/], label.to_sym if (prev = ts[-1]) && prev[0] == label prev[1] << ' ' << token ts else ts << [label, token] end end end end |
#learn(input) ⇒ Object
Trains the model by appending the training data without truncating the current model.
171 172 173 |
# File 'lib/anystyle/parser/parser.rb', line 171 def learn(input) train(input, false) end |
#lines(string) ⇒ Object
127 128 129 |
# File 'lib/anystyle/parser/parser.rb', line 127 def lines(string) string.split(/[ \t]*[\n\r]\s*/) end |
#localize(hash) ⇒ Object
193 194 195 196 197 198 199 200 201 202 203 |
# File 'lib/anystyle/parser/parser.rb', line 193 def localize(hash) return hash if hash.has_key?(:language) text = hash.values_at( :title, :booktitle, :location, :publisher ).compact.join(' ') hash[:language] = Parser.language(text) unless text.empty? hash end |
#normalize(hash) ⇒ Object
180 181 182 183 184 185 186 187 188 189 190 191 |
# File 'lib/anystyle/parser/parser.rb', line 180 def normalize(hash) hash.keys.each do |label| begin normalizer.send("normalize_#{label}", hash) rescue => e warn e.message end end classify hash localize hash end |
#parse(input, format = options[:format]) ⇒ Object
71 72 73 74 75 76 77 78 |
# File 'lib/anystyle/parser/parser.rb', line 71 def parse(input, format = options[:format]) formatter = "format_#{format}".to_sym raise ArgumentError, "format not supported: #{formatter}" unless respond_to?(formatter, true) send(formatter, label(input)) end |
#prepare(input, tagged = false) ⇒ Object
Prepares the passed-in string for processing by a CRF tagger. The string is split into separate lines; each line is tokenized and expanded. Returns an array of sequence arrays that can be labelled by the CRF model.
If the string is marked as being tagged by passing true as the second argument, training labels will be extracted from the string and appended after feature expansion. The returned sequence arrays can be used for training or testing the CRF model.
140 141 142 143 |
# File 'lib/anystyle/parser/parser.rb', line 140 def prepare(input, tagged = false) string = input_to_s(input) tokenize(string, tagged).map { |tk| tk.each_with_index.map { |(t,l),i| expand(t,tk,i,l) } } end |
#reload ⇒ Object
65 66 67 68 69 |
# File 'lib/anystyle/parser/parser.rb', line 65 def reload @model = Wapiti.load(@options[:model]) @model.options.update_attributes @options self end |
#test(input) ⇒ Object
175 176 177 178 |
# File 'lib/anystyle/parser/parser.rb', line 175 def test(input) model.options.check! model.label(prepare(input, true)) end |
#tokenize(string, tagged = false) ⇒ Object
Returns an array of tokens for each line of input.
If the passed-in string is marked as being tagged, extracts labels from the string and returns an array of token/label pairs for each line of input.
100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 |
# File 'lib/anystyle/parser/parser.rb', line 100 def tokenize(string, tagged = false) if tagged lines(string).each_with_index.map do |s,i| tt, tokens, tags = s.split(options[:tagged_separator]), [], [] tt.each do |token| case token when /^$/ # skip when /^<([^\/>][^>]*)>$/ tags << $1 when /^<\/([^>]+)>$/ unless (tag = tags.pop) == $1 raise ArgumentError, "mismatched tags on line #{i}: #{$1.inspect} (current tag was #{tag.inspect})" end else tokens << [decode_xml_text(token), (tags[-1] || :unknown).to_sym] end end tokens end else lines(string).map { |s| s.split(options[:separator]).reject(&:empty?) } end end |
#train(input = options[:training_data], truncate = true) ⇒ Object
155 156 157 158 159 160 161 162 163 164 165 166 |
# File 'lib/anystyle/parser/parser.rb', line 155 def train(input = options[:training_data], truncate = true) if truncate @model = Wapiti::Model.new(options.reject { |k,_| k == :model }) end unless input.nil? || input.empty? @model.train(prepare(input, true)) end @model.path = options[:model] @model end |