Class: Anystyle::Parser::Parser
- Inherits:
-
Object
- Object
- Anystyle::Parser::Parser
- Defined in:
- lib/anystyle/parser/parser.rb
Class Attribute Summary collapse
-
.defaults ⇒ Object
readonly
Returns the value of attribute defaults.
-
.feature ⇒ Object
readonly
Returns the value of attribute feature.
-
.features ⇒ Object
readonly
Returns the value of attribute features.
-
.formats ⇒ Object
readonly
Returns the value of attribute formats.
-
.models ⇒ Object
readonly
Returns the value of attribute models.
Instance Attribute Summary collapse
-
#model ⇒ Object
Returns the value of attribute model.
-
#normalizer ⇒ Object
Returns the value of attribute normalizer.
-
#options ⇒ Object
readonly
Returns the value of attribute options.
Class Method Summary collapse
-
.instance ⇒ Object
Returns a default parser instance.
- .load(path) ⇒ Object
Instance Method Summary collapse
- #classify(hash) ⇒ Object
-
#expand(token, sequence = [], offset = 0, label = nil) ⇒ Object
Expands the passed-in token string by appending a space separated list of all features for the token.
-
#initialize(options = {}) ⇒ Parser
constructor
A new instance of Parser.
-
#label(input, labelled = false) ⇒ Object
Returns an array of label/segment pairs for each line in the passed-in string.
- #normalize(hash) ⇒ Object
- #parse(input, format = options[:format]) ⇒ Object
-
#prepare(input, tagged = false) ⇒ Object
Prepares the passed-in string for processing by a CRF tagger.
- #test(input) ⇒ Object
-
#tokenize(string, tagged = false) ⇒ Object
Returns an array of tokens for each line of input.
- #train(input, truncate = false) ⇒ Object
Constructor Details
Class Attribute Details
.defaults ⇒ Object (readonly)
Returns the value of attribute defaults.
27 28 29 |
# File 'lib/anystyle/parser/parser.rb', line 27 def defaults @defaults end |
.feature ⇒ Object (readonly)
Returns the value of attribute feature.
27 28 29 |
# File 'lib/anystyle/parser/parser.rb', line 27 def feature @feature end |
.features ⇒ Object (readonly)
Returns the value of attribute features.
27 28 29 |
# File 'lib/anystyle/parser/parser.rb', line 27 def features @features end |
.formats ⇒ Object (readonly)
Returns the value of attribute formats.
27 28 29 |
# File 'lib/anystyle/parser/parser.rb', line 27 def formats @formats end |
.models ⇒ Object (readonly)
Returns the value of attribute models.
27 28 29 |
# File 'lib/anystyle/parser/parser.rb', line 27 def models @models end |
Instance Attribute Details
#model ⇒ Object
Returns the value of attribute model.
44 45 46 |
# File 'lib/anystyle/parser/parser.rb', line 44 def model @model end |
#normalizer ⇒ Object
Returns the value of attribute normalizer.
44 45 46 |
# File 'lib/anystyle/parser/parser.rb', line 44 def normalizer @normalizer end |
#options ⇒ Object (readonly)
Returns the value of attribute options.
42 43 44 |
# File 'lib/anystyle/parser/parser.rb', line 42 def options @options end |
Class Method Details
.instance ⇒ Object
Returns a default parser instance
36 37 38 |
# File 'lib/anystyle/parser/parser.rb', line 36 def instance @instance ||= new end |
.load(path) ⇒ Object
29 30 31 32 33 |
# File 'lib/anystyle/parser/parser.rb', line 29 def load(path) p = new p.model = Wapiti.load(path) p end |
Instance Method Details
#classify(hash) ⇒ Object
155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 |
# File 'lib/anystyle/parser/parser.rb', line 155 def classify(hash) return hash if hash.has_key?(:type) keys = hash.keys text = hash.values.flatten.join case when keys.include?(:journal) hash[:type] = :article when text =~ /proceedings/i hash[:type] = :inproceedings when keys.include?(:booktitle), keys.include?(:container) hash[:type] = :incollection when keys.include?(:publisher) hash[:type] = :book when keys.include?(:institution) hash[:type] = :techreport when keys.include?(:school) hash[:type] = :mastersthesis when text =~ /unpublished/i hash[:type] = :unpublished else hash[:type] = :misc end hash end |
#expand(token, sequence = [], offset = 0, label = nil) ⇒ Object
Expands the passed-in token string by appending a space separated list of all features for the token.
126 127 128 129 130 131 |
# File 'lib/anystyle/parser/parser.rb', line 126 def expand(token, sequence = [], offset = 0, label = nil) f = features_for(token, strip(token), sequence, offset) f.unshift(token) f.push(label) unless label.nil? f.join(' ') end |
#label(input, labelled = false) ⇒ Object
Returns an array of label/segment pairs for each line in the passed-in string.
60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 |
# File 'lib/anystyle/parser/parser.rb', line 60 def label(input, labelled = false) string = input_to_s(input) model.label(prepare(string, labelled)).map! do |sequence| sequence.inject([]) do |ts, (token, label)| token, label = token[/^\S+/], label.to_sym if (prev = ts[-1]) && prev[0] == label prev[1] << ' ' << token ts else ts << [label, token] end end end end |
#normalize(hash) ⇒ Object
148 149 150 151 152 153 |
# File 'lib/anystyle/parser/parser.rb', line 148 def normalize(hash) hash.keys.each do |label| normalizer.send("normalize_#{label}", hash) end classify hash end |
#parse(input, format = options[:format]) ⇒ Object
52 53 54 55 56 57 |
# File 'lib/anystyle/parser/parser.rb', line 52 def parse(input, format = options[:format]) formatter = "format_#{format}".to_sym send(formatter, label(input)) rescue NoMethodError raise ArgumentError, "format not supported: #{formatter}" end |
#prepare(input, tagged = false) ⇒ Object
Prepares the passed-in string for processing by a CRF tagger. The string is split into separate lines; each line is tokenized and expanded. Returns an array of sequence arrays that can be labelled by the CRF model.
If the string is marked as being tagged by passing true as the second argument, training labels will be extracted from the string and appended after feature expansion. The returned sequence arrays can be used for training or testing the CRF model.
118 119 120 121 |
# File 'lib/anystyle/parser/parser.rb', line 118 def prepare(input, tagged = false) string = input_to_s(input) tokenize(string, tagged).map { |tk| tk.each_with_index.map { |(t,l),i| expand(t,tk,i,l) } } end |
#test(input) ⇒ Object
142 143 144 145 146 |
# File 'lib/anystyle/parser/parser.rb', line 142 def test(input) string = input_to_s(input) model.options.check! model.label(prepare(string, true)) end |
#tokenize(string, tagged = false) ⇒ Object
Returns an array of tokens for each line of input.
If the passed-in string is marked as being tagged, extracts labels from the string and returns an array of token/label pairs for each line of input.
82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 |
# File 'lib/anystyle/parser/parser.rb', line 82 def tokenize(string, tagged = false) if tagged string.split(/[\n\r]+/).each_with_index.map do |s,i| tt, tokens, tags = s.split(options[:tagged_separator]), [], [] tt.each do |token| case token when /^$/ # skip when /^<([^\/>][^>]*)>$/ tags << $1 when /^<\/([^>]+)>$/ unless (tag = tags.pop) == $1 raise ArgumentError, "mismatched tags on line #{i}: #{$1.inspect} (current tag was #{tag.inspect})" end else tokens << [token, (tags[-1] || :unknown).to_sym] end end tokens end else string.split(/[\n\r]+/).map { |s| s.split(options[:separator]) } end end |
#train(input, truncate = false) ⇒ Object
133 134 135 136 137 138 139 140 |
# File 'lib/anystyle/parser/parser.rb', line 133 def train(input, truncate = false) string = input_to_s(input) @model = Wapiti::Model.new(:pattern => options[:pattern]) if truncate @model.train(prepare(string, true)) @model.compact @model.path = Parser.models[options[:model]] @model end |