Class: PROIEL::Converter::CoNLLU::Token
- Inherits:
-
Object
- Object
- PROIEL::Converter::CoNLLU::Token
- Defined in:
- lib/proiel/cli/converters/conll-u.rb
Constant Summary collapse
- MORPHOLOGY_POSITIONAL_TAG_SEQUENCE =
[ :person, :number, :tense, :mood, :voice, :gender, :case, :degree, :strength, :inflection ]
Instance Attribute Summary collapse
-
#citation_part ⇒ Object
readonly
Returns the value of attribute citation_part.
-
#empty_token_sort ⇒ Object
readonly
Returns the value of attribute empty_token_sort.
-
#form ⇒ Object
readonly
Returns the value of attribute form.
-
#head_id ⇒ Object
Returns the value of attribute head_id.
-
#id ⇒ Object
readonly
Returns the value of attribute id.
-
#language ⇒ Object
readonly
Returns the value of attribute language.
-
#lemma ⇒ Object
readonly
Returns the value of attribute lemma.
-
#part_of_speech ⇒ Object
readonly
Returns the value of attribute part_of_speech.
-
#relation ⇒ Object
Returns the value of attribute relation.
-
#upos ⇒ Object
Returns the value of attribute upos.
Instance Method Summary collapse
- #add_slash!(slash) ⇒ Object
-
#adjectival? ⇒ Boolean
returns true if the node is an adjective or an ordinal.
- #adverb? ⇒ Boolean
- #cardinal? ⇒ Boolean
-
#change_coordinations! ⇒ Object
Changes coordinations recursively from the bottom of the graph.
-
#clausal? ⇒ Boolean
A node is clausal if it is a verb and not nominalized; or it has a copula dependent; or it has a subject (e.g. in an absolute construction without a verb); or if it is the root (e.g. in a nominal clause).
- #conj_head ⇒ Object
- #conjunction? ⇒ Boolean
- #coordinated? ⇒ Boolean
-
#copula? ⇒ Boolean
Returns true if the node has an xobj dependent and either 1) the lemma is copular or 2) the node is empty and has no pid slash or a pid slash to a node with a copular lemma.
- #count_subgraph ⇒ Object
- #dependents ⇒ Object
- #determiner? ⇒ Boolean
- #distribute_shared_modifiers! ⇒ Object
- #ellipsis? ⇒ Boolean
- #find_appositive_head ⇒ Object
- #find_relation(possible_relations) ⇒ Object
- #find_remnant ⇒ Object
- #foreign? ⇒ Boolean
- #format_features(features) ⇒ Object
- #has_content? ⇒ Boolean
- #head ⇒ Object
-
#initialize(id, head_id, form, lemma, part_of_speech, language, morphology, relation, empty_token_sort, slashes, citation_part, sentence) ⇒ Token
constructor
A new instance of Token.
- #interjection? ⇒ Boolean
-
#invert!(new_dependent_relation = nil, new_head_relation = nil) ⇒ Object
Inverts the direction of a dependency relation.
- #is_empty? ⇒ Boolean
- #left_corner ⇒ Object
- #map_morphology(morph) ⇒ Object
- #map_part_of_speech! ⇒ Object
- #mediopassive? ⇒ Boolean
- #negation? ⇒ Boolean
- #nominal? ⇒ Boolean
- #nominalized? ⇒ Boolean
- #particle? ⇒ Boolean
- #passive? ⇒ Boolean
- #pid ⇒ Object
- #preposition? ⇒ Boolean
- #process_coordination! ⇒ Object
- #process_copula! ⇒ Object
-
#process_ellipsis! ⇒ Object
TODO: process “implicit pid” through APOS chain too.
- #process_preposition! ⇒ Object
-
#process_subjunction! ⇒ Object
attach subjunctions with ‘mark’ under their verbs and promote the verb to take over the subjunction’s relation.
-
#promote!(new_sibling_relation = nil, new_dependent_relation = 'aux') ⇒ Object
promotes a node to its head’s place.
- #proper_noun? ⇒ Boolean
- #relabel_graph! ⇒ Object
- #remove_empties! ⇒ Object
- #root? ⇒ Boolean
- #siblings ⇒ Object
- #subgraph_set ⇒ Object
- #to_conll ⇒ Object
- #to_graph(indents = 0) ⇒ Object
- #to_n ⇒ Object
- #to_s ⇒ Object
Constructor Details
#initialize(id, head_id, form, lemma, part_of_speech, language, morphology, relation, empty_token_sort, slashes, citation_part, sentence) ⇒ Token
170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 |
# File 'lib/proiel/cli/converters/conll-u.rb', line 170
# Creates a token belonging to +sentence+.
#
# The positional morphology tag (when given) is translated into a feature
# string by #map_morphology; without one the feature string is empty.
# The citation part is stored with a "ref=" prefix and with every
# whitespace character replaced by an underscore. The universal POS tag
# starts out unset.
def initialize(id, head_id, form, lemma, part_of_speech, language, morphology, relation, empty_token_sort, slashes, citation_part, sentence)
  @id, @head_id = id, head_id
  @form, @lemma = form, lemma
  @part_of_speech = part_of_speech
  @language = language
  @morphology = morphology
  @relation = relation
  @empty_token_sort = empty_token_sort
  @slashes = slashes
  @sentence = sentence
  @features = morphology ? map_morphology(morphology) : ''
  @citation_part = "ref=" + (citation_part || "").gsub(/\s/, '_')
  @upos = nil
end
Instance Attribute Details
#citation_part ⇒ Object (readonly)
Returns the value of attribute citation_part.
168 169 170 |
# File 'lib/proiel/cli/converters/conll-u.rb', line 168 def citation_part @citation_part end |
#empty_token_sort ⇒ Object (readonly)
Returns the value of attribute empty_token_sort.
166 167 168 |
# File 'lib/proiel/cli/converters/conll-u.rb', line 166 def empty_token_sort @empty_token_sort end |
#form ⇒ Object (readonly)
Returns the value of attribute form.
167 168 169 |
# File 'lib/proiel/cli/converters/conll-u.rb', line 167 def form @form end |
#head_id ⇒ Object
Returns the value of attribute head_id.
159 160 161 |
# File 'lib/proiel/cli/converters/conll-u.rb', line 159 def head_id @head_id end |
#id ⇒ Object (readonly)
Returns the value of attribute id.
163 164 165 |
# File 'lib/proiel/cli/converters/conll-u.rb', line 163 def id @id end |
#language ⇒ Object (readonly)
Returns the value of attribute language.
165 166 167 |
# File 'lib/proiel/cli/converters/conll-u.rb', line 165 def language @language end |
#lemma ⇒ Object (readonly)
Returns the value of attribute lemma.
164 165 166 |
# File 'lib/proiel/cli/converters/conll-u.rb', line 164 def lemma @lemma end |
#part_of_speech ⇒ Object (readonly)
Returns the value of attribute part_of_speech.
162 163 164 |
# File 'lib/proiel/cli/converters/conll-u.rb', line 162 def part_of_speech @part_of_speech end |
#relation ⇒ Object
Returns the value of attribute relation.
161 162 163 |
# File 'lib/proiel/cli/converters/conll-u.rb', line 161 def relation @relation end |
#upos ⇒ Object
Returns the value of attribute upos.
160 161 162 |
# File 'lib/proiel/cli/converters/conll-u.rb', line 160 def upos @upos end |
Instance Method Details
#add_slash!(slash) ⇒ Object
537 538 539 |
# File 'lib/proiel/cli/converters/conll-u.rb', line 537
# Appends +slash+ to this token's list of slashes and returns that list.
def add_slash!(slash)
  @slashes.push(slash)
end
#adjectival? ⇒ Boolean
returns true if the node is an adjective or an ordinal
201 202 203 |
# File 'lib/proiel/cli/converters/conll-u.rb', line 201
# True when the part of speech is 'A-' (adjective) or 'Mo' (ordinal).
def adjectival?
  %w[A- Mo].include?(@part_of_speech)
end
#adverb? ⇒ Boolean
205 206 207 |
# File 'lib/proiel/cli/converters/conll-u.rb', line 205
# Truthy (the match position, 0) when the part-of-speech tag starts with
# 'D'; nil otherwise.
def adverb?
  /\AD/ =~ @part_of_speech
end
#cardinal? ⇒ Boolean
209 210 211 |
# File 'lib/proiel/cli/converters/conll-u.rb', line 209
# True when the part-of-speech tag is 'Ma'.
def cardinal?
  'Ma' == @part_of_speech
end
#change_coordinations! ⇒ Object
Changes coordinations recursively from the bottom of the graph
513 514 515 516 |
# File 'lib/proiel/cli/converters/conll-u.rb', line 513
# Rewrites coordinations bottom-up: every dependent subtree is handled
# first, then this node itself if it is a conjunction.
def change_coordinations!
  dependents.each { |dependent| dependent.change_coordinations! }
  return unless conjunction?

  process_coordination!
end
#clausal? ⇒ Boolean
A node is clausal if it is a verb and not nominalized; or it has a copula dependent; or it has a subject (e.g. in an absolute construction without a verb); or if it is the root (e.g. in a nominal clause)
214 215 216 217 218 219 |
# File 'lib/proiel/cli/converters/conll-u.rb', line 214
# A node counts as clausal when any of these holds, checked in order:
# it is a verb ('V-') that is not nominalized; one of its dependents is a
# copula; one of its dependents bears a subject relation; it is the root.
def clausal?
  return true if @part_of_speech == 'V-' && !nominalized?
  return true if dependents.any? { |d| d.copula? }

  subject_relations = ['sub', 'nsubj', 'nsubjpass', 'csubj', 'csubjpass']
  return true if dependents.any? { |d| subject_relations.include?(d.relation) }

  root?
end
#conj_head ⇒ Object
322 323 324 325 326 327 328 329 |
# File 'lib/proiel/cli/converters/conll-u.rb', line 322
# Follows a chain of 'conj' relations upwards and returns the first head
# whose own relation is not 'conj'.
# Raises if this token is not itself a conjunct.
def conj_head
  raise "Not a conjunct" unless @relation == 'conj'

  parent = head
  parent.relation == 'conj' ? parent.conj_head : parent
end
#conjunction? ⇒ Boolean
221 222 223 |
# File 'lib/proiel/cli/converters/conll-u.rb', line 221
# True for an overt conjunction (part of speech 'C-') or an empty token of
# sort 'C'.
def conjunction?
  return true if part_of_speech == 'C-'

  @empty_token_sort == 'C'
end
#coordinated? ⇒ Boolean
225 226 227 |
# File 'lib/proiel/cli/converters/conll-u.rb', line 225
# Truthy when this token hangs under a conjunction that carries the same
# relation as the token itself. Returns nil when there is no head.
def coordinated?
  parent = head
  parent && parent.conjunction? && parent.relation == @relation
end
#copula? ⇒ Boolean
Returns true if the node has an xobj dependent and either 1) the lemma is copular or 2) the node is empty and has no pid slash or a pid slash to a node with a copular lemma
232 233 234 235 236 237 |
# File 'lib/proiel/cli/converters/conll-u.rb', line 232
# True when the token is already labelled 'cop', or when it has an 'xobj'
# dependent and is copular. "Copular" means either
#   1. its lemma/part-of-speech/language key is listed in COPULAR_LEMMATA, or
#   2. it is an empty verbal token (sort 'V') whose pid antecedent is absent,
#      is itself empty, or has a key listed in COPULAR_LEMMATA.
def copula?
  return true if @relation == 'cop'

  own_key = [lemma, part_of_speech, language].join(',')
  unless COPULAR_LEMMATA.include?(own_key)
    return false unless @empty_token_sort == 'V'

    antecedent = pid
    unless antecedent.nil? || antecedent.is_empty?
      antecedent_key = [antecedent.lemma, antecedent.part_of_speech, antecedent.language].join(',')
      return false unless COPULAR_LEMMATA.include?(antecedent_key)
    end
  end

  dependents.any? { |d| d.relation == 'xobj' }
end
#count_subgraph ⇒ Object
310 311 312 |
# File 'lib/proiel/cli/converters/conll-u.rb', line 310
# Number of non-empty tokens in the subgraph rooted at this token
# (the token itself counts unless it is empty).
def count_subgraph
  own = is_empty? ? 0 : 1
  dependents.inject(own) { |total, dependent| total + dependent.count_subgraph }
end
#dependents ⇒ Object
380 381 382 |
# File 'lib/proiel/cli/converters/conll-u.rb', line 380
# All tokens of the sentence whose head is this token, ordered by id.
# The list is recomputed from the sentence on every call.
def dependents
  children = @sentence.tokens.select { |token| token.head_id == @id }
  children.sort_by { |token| token.id }
end
#determiner? ⇒ Boolean
239 240 241 |
# File 'lib/proiel/cli/converters/conll-u.rb', line 239
# True when the part-of-speech tag is one of those listed in DETERMINERS.
def determiner?
  DETERMINERS.include?(@part_of_speech)
end
#distribute_shared_modifiers! ⇒ Object
525 526 527 528 529 530 531 532 533 534 535 |
# File 'lib/proiel/cli/converters/conll-u.rb', line 525
# Re-attaches modifiers shared by the conjuncts of this conjunction.
#
# Non-'aux' dependents are split into conjuncts (same relation as the
# conjunction, or 'adv' under an 'xadv' conjunction) and modifiers (the
# rest). Each modifier is moved under the first conjunct, and every other
# conjunct gets a slash [modifier id, modifier relation] pointing at it.
#
# Raises unless called on a conjunction, when there is no first conjunct,
# or when the first conjunct is a dependent-less conjunction.
def distribute_shared_modifiers!
  raise "Can only distribute over a conjunction!" unless conjunction?

  candidates = dependents.reject { |d| d.relation == 'aux' }
  conjuncts, modifiers = candidates.partition do |d|
    d.relation == @relation || (d.relation == 'adv' && @relation == 'xadv')
  end

  first_conjunct = conjuncts.shift
  raise "No first conjunct under #{to_n}\n#{to_graph}" unless first_conjunct
  if first_conjunct.conjunction? && first_conjunct.dependents.empty?
    raise "The first conjunct is a misannotated conjunction in #{to_n}\n#{to_graph}"
  end

  modifiers.each do |modifier|
    modifier.head_id = first_conjunct.id
    conjuncts.each { |conjunct| conjunct.add_slash!([modifier.id, modifier.relation]) }
  end
end
#ellipsis? ⇒ Boolean
243 244 245 |
# File 'lib/proiel/cli/converters/conll-u.rb', line 243
# True when this is an empty token of sort 'V'.
def ellipsis?
  'V' == @empty_token_sort
end
#find_appositive_head ⇒ Object
384 385 386 387 388 389 390 391 |
# File 'lib/proiel/cli/converters/conll-u.rb', line 384
# Returns the head of this apposition, skipping upwards over any
# conjunctions that are themselves labelled 'apos'.
# Raises if this token's relation is not 'apos'.
def find_appositive_head
  raise "Not an apposition" unless @relation == 'apos'

  parent = head
  return parent unless parent.conjunction? && parent.relation == 'apos'

  parent.find_appositive_head
end
#find_relation(possible_relations) ⇒ Object
393 394 395 396 397 398 399 400 401 402 |
# File 'lib/proiel/cli/converters/conll-u.rb', line 393
# Picks the first applicable relation from a list of candidates.
#
# +possible_relations+ is a list of [relation, criterion] pairs; each
# criterion is called with this token. The relation of the first pair
# whose criterion is truthy is assigned to @relation and returned.
# Returns nil, leaving @relation untouched, when the list is exhausted or
# a pair with a nil relation is reached.
#
# The list is only read: unlike the earlier shift-and-recurse version,
# the caller's array is not consumed, and long lists no longer cost one
# stack frame per candidate.
def find_relation(possible_relations)
  possible_relations.each do |rel, crit|
    # A nil relation ends the search without a result.
    # (previously: raise "Found no relation")
    return nil if rel.nil?
    return @relation = rel if crit.call(self)
  end
  nil
end
#find_remnant ⇒ Object
482 483 484 485 486 487 488 |
# File 'lib/proiel/cli/converters/conll-u.rb', line 482
# Follows the chain of 'remnant' dependents downwards and returns the last
# token on it; returns the token itself when it has no 'remnant' dependent.
def find_remnant
  remnant = dependents.find { |d| d.relation == 'remnant' }
  remnant ? remnant.find_remnant : self
end
#foreign? ⇒ Boolean
247 248 249 |
# File 'lib/proiel/cli/converters/conll-u.rb', line 247
# True when the part-of-speech tag is 'F-'.
def foreign?
  'F-' == @part_of_speech
end
#format_features(features) ⇒ Object
339 340 341 342 343 344 345 |
# File 'lib/proiel/cli/converters/conll-u.rb', line 339
# Renders a '|'-separated feature string for output: an empty string
# becomes '_', anything else has its features sorted alphabetically.
def format_features(features)
  return '_' if features == ''

  features.split("|").sort.join("|")
end
#has_content? ⇒ Boolean
251 252 253 |
# File 'lib/proiel/cli/converters/conll-u.rb', line 251
# True when the token is not an empty token, i.e. its empty-token sort is
# nil or the empty string.
def has_content?
  [nil, ''].include?(@empty_token_sort)
end
#head ⇒ Object
376 377 378 |
# File 'lib/proiel/cli/converters/conll-u.rb', line 376
# The first token of the sentence whose id equals this token's head_id,
# or nil when there is none.
def head
  @sentence.tokens.find { |token| token.id == @head_id }
end
#interjection? ⇒ Boolean
255 256 257 |
# File 'lib/proiel/cli/converters/conll-u.rb', line 255
# True when the part-of-speech tag is 'I-'.
def interjection?
  'I-' == @part_of_speech
end
#invert!(new_dependent_relation = nil, new_head_relation = nil) ⇒ Object
Inverts the direction of a dependency relation. By default the labels are also swapped, but new relations can be specified for both the new dependent and the new head.
544 545 546 547 548 549 550 551 552 553 554 |
# File 'lib/proiel/cli/converters/conll-u.rb', line 544
# Swaps this token with its head: the former head becomes a dependent of
# this token, and this token is attached where the former head was.
#
# new_dependent_relation:: relation given to the former head
#                          (defaults to this token's current relation)
# new_head_relation::      relation given to this token
#                          (defaults to the former head's relation)
#
# Raises when the token is attached directly to the root (head_id 0).
def invert!(new_dependent_relation = nil, new_head_relation = nil)
  raise "Cannot promote a token under root!" if @head_id == 0

  former_head = head
  new_dependent_relation ||= @relation
  new_head_relation ||= former_head.relation
  grandparent_id = former_head.head_id

  former_head.head_id = @id
  former_head.relation = new_dependent_relation

  @head_id = grandparent_id
  # Goes through the relation writer on purpose, as the original code does.
  self.relation = new_head_relation
end
#is_empty? ⇒ Boolean
259 260 261 |
# File 'lib/proiel/cli/converters/conll-u.rb', line 259
# True when the token is an empty token; the negation of #has_content?.
def is_empty?
  return false if has_content?

  true
end
#left_corner ⇒ Object
318 319 320 |
# File 'lib/proiel/cli/converters/conll-u.rb', line 318
# The token with the lowest id among this token and its direct dependents.
def left_corner
  candidates = [self, *dependents]
  candidates.sort_by { |token| token.id }.first
end
#map_morphology(morph) ⇒ Object
192 193 194 195 196 197 198 |
# File 'lib/proiel/cli/converters/conll-u.rb', line 192
# Translates a positional morphology tag into a '|'-separated feature
# string. The character at position i is looked up in MORPHOLOGY_MAP under
# the field named by MORPHOLOGY_POSITIONAL_TAG_SEQUENCE[i]; positions that
# map to nothing are left out.
def map_morphology morph
  mapped = (0...morph.length).map do |position|
    field = MORPHOLOGY_POSITIONAL_TAG_SEQUENCE[position]
    MORPHOLOGY_MAP[field][morph[position]]
  end
  mapped.compact.join('|')
end
#map_part_of_speech! ⇒ Object
404 405 406 407 408 409 410 411 412 413 |
# File 'lib/proiel/cli/converters/conll-u.rb', line 404
# Assigns the universal part of speech for this token and, recursively,
# for its whole subgraph.
#
# POS_MAP maps the source tag to [upos, optional extra feature]. The extra
# feature, when present, is appended to the token's feature string.
#
# Raises RuntimeError ("No match found for pos ...") when the tag has no
# POS_MAP entry. A missing entry used to surface as a NoMethodError on nil
# before that check was reached.
def map_part_of_speech!
  dependents.each(&:map_part_of_speech!)

  entry = POS_MAP[@part_of_speech]
  @upos = entry && entry.first
  raise "No match found for pos #{part_of_speech.inspect}" unless @upos

  if feat = entry[1]
    @features += ((@features.empty? ? '' : '|') + feat)
  end

  # ugly, but the ugliness comes from UDEP
  @upos = 'ADJ' if @upos == 'DET' and @relation != 'det'
end
#mediopassive? ⇒ Boolean
263 264 265 |
# File 'lib/proiel/cli/converters/conll-u.rb', line 263
# Truthy when the voice position (index 4) of the morphology tag is one of
# 'm', 'p' or 'e'.
#
# Returns nil for a token without a morphology tag. The constructor
# accepts a nil morphology, and indexing it used to raise NoMethodError.
def mediopassive?
  @morphology && @morphology[4] =~ /[mpe]/
end
#negation? ⇒ Boolean
267 268 269 |
# File 'lib/proiel/cli/converters/conll-u.rb', line 267
# True when the token's "lemma,part_of_speech,language" key is listed in
# NEGATION_LEMMATA.
def negation?
  key = [lemma, part_of_speech, language].join(',')
  NEGATION_LEMMATA.include?(key)
end
#nominal? ⇒ Boolean
271 272 273 |
# File 'lib/proiel/cli/converters/conll-u.rb', line 271
# Truthy when the part-of-speech tag starts with 'N', 'P' or 'M', or when
# the token is nominalized. The return value is the regexp match position
# (0) in the first case and the result of #nominalized? otherwise.
def nominal?
  (/\A[NPM]/ =~ @part_of_speech) || nominalized?
end
#nominalized? ⇒ Boolean
275 276 277 278 279 |
# File 'lib/proiel/cli/converters/conll-u.rb', line 275
# True when some dependent is a determiner attached with the relation
# 'atr', 'aux' or 'det'.
def nominalized?
  dependents.any? { |d| d.determiner? && ['atr', 'aux', 'det'].include?(d.relation) }
end
#particle? ⇒ Boolean
281 282 283 |
# File 'lib/proiel/cli/converters/conll-u.rb', line 281
# True when the token is attached as 'aux' and its
# "lemma,part_of_speech,language" key is listed in PARTICLE_LEMMATA.
def particle?
  return false unless @relation == 'aux'

  key = [lemma, part_of_speech, language].join(',')
  PARTICLE_LEMMATA.include?(key)
end
#passive? ⇒ Boolean
285 286 287 |
# File 'lib/proiel/cli/converters/conll-u.rb', line 285
# True when the voice position (index 4) of the morphology tag is 'p'.
#
# Returns false for a token without a morphology tag. The constructor
# accepts a nil morphology, and indexing it used to raise NoMethodError.
def passive?
  !@morphology.nil? && @morphology[4] == 'p'
end
#pid ⇒ Object
331 332 333 334 335 336 337 |
# File 'lib/proiel/cli/converters/conll-u.rb', line 331
# The token targeted by this token's first 'pid' slash, or nil when there
# is no such slash (or no token with the target id). Slashes are
# [target id, relation] pairs.
def pid
  slash = @slashes.find { |_target, label| label == 'pid' }
  return nil unless slash

  @sentence.tokens.find { |token| slash.first == token.id }
end
#preposition? ⇒ Boolean
289 290 291 |
# File 'lib/proiel/cli/converters/conll-u.rb', line 289
# True when the part-of-speech tag is 'R-'.
def preposition?
  'R-' == @part_of_speech
end
#process_coordination! ⇒ Object
518 519 520 521 522 523 |
# File 'lib/proiel/cli/converters/conll-u.rb', line 518
# Restructures a coordination so that its first conjunct becomes the head.
#
# Does nothing when the conjunction has no dependents other than 'aux'.
# Otherwise shared modifiers are distributed first, and then the first
# non-'aux' dependent is promoted: its former siblings become 'conj' and
# the conjunction itself becomes 'cc'.
#
# Raises unless called on a conjunction.
def process_coordination!
  raise "Only coordinations can be processed this way!" unless conjunction?
  return if dependents.all? { |d| d.relation == 'aux' }

  distribute_shared_modifiers!

  # Dependents are looked up again: distribution has just re-attached some.
  first_conjunct = dependents.find { |d| !(d.relation == 'aux') }
  first_conjunct.promote!("conj", "cc")
end
#process_copula! ⇒ Object
490 491 492 493 494 |
# File 'lib/proiel/cli/converters/conll-u.rb', line 490
# Promotes the single 'xobj' dependent of this copula to the copula's
# place, leaving the copula attached to it as 'cop'.
# Raises unless there is exactly one 'xobj' dependent.
def process_copula!
  predicates = dependents.select { |d| d.relation == 'xobj' }
  unless predicates.size == 1
    raise "#{predicates.size} predicates under #{to_n}\n#{to_graph}"
  end

  predicates.first.promote!(nil, 'cop')
end
#process_ellipsis! ⇒ Object
TODO: process “implicit pid” through APOS chain too
453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 |
# File 'lib/proiel/cli/converters/conll-u.rb', line 453
# Resolves an elided token by moving its dependents to an overt token and
# then removing the elided token from the sentence.
#
# The overt token is, in order of preference: the pid antecedent (unless
# it lies inside this token's own subgraph), the head of the conjunct
# chain for a 'conj' token, or the appositive head for an 'apos' token.
# When none applies the method returns without changing anything.
#
# TODO: process "implicit pid" through APOS chain too
def process_ellipsis!
  antecedent = pid
  overt =
    if antecedent && !subgraph_set.include?(antecedent)
      # an explicit pid slash wins
      antecedent
    elsif @relation == 'conj'
      # otherwise, try a conjunct
      conj_head
    elsif @relation == 'apos'
      find_appositive_head
    else
      return
    end

  dependents.each do |orphan|
    # Look for a partner with the same relation under the overt node. The
    # overt node's dependents are re-read on every pass because earlier
    # passes may already have attached orphans there.
    # TODO: this isn't really very convincing when it comes to ADVs
    counterpart = overt.dependents.find { |cand| cand != self && cand.relation == orphan.relation }

    if counterpart
      anchor = counterpart.find_remnant
      orphan.head_id = anchor.id
      orphan.relation = 'remnant'
    else
      # no partner: attach under the overt node, preserving the relation
      orphan.head_id = overt.id
    end
  end

  @sentence.remove_token!(self)
end
#process_preposition! ⇒ Object
496 497 498 499 500 501 502 |
# File 'lib/proiel/cli/converters/conll-u.rb', line 496
# Inverts the dependency between this preposition and its single 'obl'
# dependent, so that the preposition ends up below it as 'case'.
#
# Raises when the token is not a preposition ('R-') or has more than one
# 'obl' dependent. Does nothing when it has none.
def process_preposition!
  raise "Only prepositions can be processed this way!" unless part_of_speech == 'R-'

  obliques = dependents.select { |d| d.relation == 'obl' }
  if obliques.size > 1
    raise "#{obliques.size} oblique dependents under #{to_n}\n#{to_graph}"
  end
  # shouldn't really happen, but in practice it does
  return if obliques.empty?

  obliques.first.invert!("case")
end
#process_subjunction! ⇒ Object
attach subjunctions with ‘mark’ under their verbs and promote the verb to take over the subjunction’s relation. If the verb is empty, the subjunction stays as head.
433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 |
# File 'lib/proiel/cli/converters/conll-u.rb', line 433
# Attaches a subjunction as 'mark' under its verb and lets the verb take
# over the subjunction's relation. If the verb is an empty token, the
# subjunction stays as head: the verb's dependents are moved under the
# subjunction and the verb is removed.
#
# Subjunctions with no dependents, or with only 'conj'/'cc' dependents,
# are left alone.
# NB: this requires that the method is called *after* processing
# conjunctions.
#
# Raises unless there is exactly one 'pred' dependent.
def process_subjunction!
  children = dependents
  return if children.reject { |d| ['conj', 'cc'].include?(d.relation) }.empty?

  preds = children.select { |d| d.relation == 'pred' }
  unless preds.one?
    raise "#{preds.size} PREDs under the subjunction #{to_n}:\n#{@sentence.to_graph}"
  end
  verb = preds.first

  if verb.is_empty?
    # promote the subjunction if the verb is empty
    verb.dependents.each { |d| d.head_id = id }
    @sentence.remove_token!(verb)
  else
    # else demote the subjunction
    verb.invert!('mark')
  end
end
#promote!(new_sibling_relation = nil, new_dependent_relation = 'aux') ⇒ Object
promotes a node to its head’s place. The node takes over its former head’s relation and all dependents. The new relation for these dependents can be specified; if it is not, they will keep their former relation. The former head is made a dependent of the node (with a specified relation) or, if it is an empty node, destroyed.
563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 |
# File 'lib/proiel/cli/converters/conll-u.rb', line 563
# Promotes this token to its head's place.
#
# The token takes over the former head's relation and attachment point,
# and all its former siblings become its dependents.
#
# new_sibling_relation::   when given, assigned to every former sibling
#                          except those labelled 'aux'; otherwise siblings
#                          keep their relation
# new_dependent_relation:: relation given to the former head, which becomes
#                          a dependent of this token; an empty former head
#                          is removed from the sentence instead
#
# Raises when the token is attached directly to the root (head_id 0).
def promote!(new_sibling_relation = nil, new_dependent_relation = 'aux')
  raise "Cannot promote a token under root!" if @head_id == 0

  former_head = head
  inherited_relation = former_head.relation
  grandparent_id = former_head.head_id

  # move all dependents of the former head to the new one
  siblings.each do |sibling|
    sibling.head_id = @id
    # ugly hack to avoid overwriting the aux relation here
    # (aux siblings aren't really siblings)
    next unless new_sibling_relation && sibling.relation != 'aux'

    sibling.relation = new_sibling_relation
  end

  if former_head.is_empty?
    # an empty former head is simply dropped
    @sentence.remove_token!(former_head)
  else
    # an overt one becomes a dependent of the new head
    former_head.head_id = @id
    former_head.relation = new_dependent_relation
  end

  @head_id = grandparent_id
  # don't use relation=, as we don't want this relation to be
  # copied down a tree of conjunctions
  @relation = inherited_relation
end
#proper_noun? ⇒ Boolean
293 294 295 |
# File 'lib/proiel/cli/converters/conll-u.rb', line 293
# True when the part-of-speech tag is 'Ne'.
def proper_noun?
  'Ne' == @part_of_speech
end
#relabel_graph! ⇒ Object
415 416 417 418 419 420 421 422 423 424 425 426 427 428 |
# File 'lib/proiel/cli/converters/conll-u.rb', line 415
# Relabels the relations of the whole subgraph, dependents first.
#
# RELATION_MAPPING is consulted with the current relation. A String is
# used directly as the new relation; an Array of candidates is resolved
# by #find_relation (on a copy of the array); no entry means the relation
# is left as it is. Any other value raises.
def relabel_graph!
  dependents.each { |dependent| dependent.relabel_graph! }

  mapped = RELATION_MAPPING[@relation]
  # no entry: the token has already changed its relation
  return if mapped.nil?

  if String === mapped
    @relation = mapped
  elsif Array === mapped
    find_relation(mapped.dup)
  else
    raise "Unknown value #{mapped.inspect} for #{@relation}"
  end
end
#remove_empties! ⇒ Object
504 505 506 507 508 509 510 |
# File 'lib/proiel/cli/converters/conll-u.rb', line 504
# Removes empty tokens from the subgraph, bottom-up. When this token is
# itself empty, its dependents are re-attached to its head with the
# relation 'remnant' and the token is removed from the sentence.
def remove_empties!
  dependents.each { |dependent| dependent.remove_empties! }
  return unless is_empty?

  dependents.each do |orphan|
    orphan.head_id = head_id
    orphan.relation = 'remnant'
  end
  @sentence.remove_token!(self)
end
#root? ⇒ Boolean
297 298 299 |
# File 'lib/proiel/cli/converters/conll-u.rb', line 297
# True when the token is attached to the root, i.e. its head_id is 0.
def root?
  0 == @head_id
end
#siblings ⇒ Object
372 373 374 |
# File 'lib/proiel/cli/converters/conll-u.rb', line 372 def siblings @sentence.tokens.select { |t| t.head_id == @head_id } - [self] end |
#subgraph_set ⇒ Object
314 315 316 |
# File 'lib/proiel/cli/converters/conll-u.rb', line 314
# Flat list of every token in the subgraph rooted at this token, starting
# with the token itself.
def subgraph_set
  below = dependents.map { |dependent| dependent.subgraph_set }.flatten
  [self, *below]
end
#to_conll ⇒ Object
347 348 349 350 351 352 353 354 355 356 357 358 |
# File 'lib/proiel/cli/converters/conll-u.rb', line 347
# Renders the token as one tab-separated line with ten columns: id, form,
# lemma, universal POS, source POS, features, head id, relation, a
# placeholder '_' and the citation part.
def to_conll
  # override non-root relations on root until we've found out how to
  # handle unembedded reports etc
  relation_column = @head_id == 0 ? 'root' : @relation
  slashes_column = '_' # slashes here

  columns = [
    @id,
    @form,
    @lemma,
    @upos,
    @part_of_speech,
    format_features(@features),
    @head_id,
    relation_column,
    slashes_column,
    @citation_part
  ]
  columns.join("\t")
end
#to_graph(indents = 0) ⇒ Object
368 369 370 |
# File 'lib/proiel/cli/converters/conll-u.rb', line 368
# Renders the subgraph as an indented outline: one #to_n label per line,
# each level of depth indented by one more tab, starting at +indents+.
def to_graph(indents = 0)
  own_line = ("\t" * indents) + to_n
  child_lines = dependents.map { |dependent| dependent.to_graph(indents + 1) }
  [own_line, *child_lines].join("\n")
end
#to_n ⇒ Object
364 365 366 |
# File 'lib/proiel/cli/converters/conll-u.rb', line 364
# Short label "relation-id-form-pos". For tokens without a form the empty
# token sort is shown instead; the universal POS is preferred over the
# source POS when it has been set.
def to_n
  shown_form = @form || @empty_token_sort
  shown_pos = @upos || @part_of_speech
  [@relation, @id, shown_form, shown_pos].join('-')
end
#to_s ⇒ Object
360 361 362 |
# File 'lib/proiel/cli/converters/conll-u.rb', line 360 def to_s [@id, @form, @head_id, @relation].join("\t") end |