Class: PROIEL::Converter::CoNLLU::Token

Inherits:
Object
  • Object
show all
Defined in:
lib/proiel/cli/converters/conll-u.rb

Constant Summary collapse

# Field names of a positional morphology tag, in tag order: the character at
# index i of a tag belongs to the i-th field listed here. Frozen so that the
# shared constant cannot be mutated by accident.
MORPHOLOGY_POSITIONAL_TAG_SEQUENCE = %i[
  person number tense mood voice gender case
  degree strength inflection
].freeze

Instance Attribute Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(id, head_id, form, lemma, part_of_speech, language, morphology, relation, empty_token_sort, slashes, citation_part, sentence) ⇒ Token

Returns a new instance of Token.



218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
# File 'lib/proiel/cli/converters/conll-u.rb', line 218

# Builds a token from its annotation.
#
# Every argument is stored in the instance variable of the same name, except:
# * the feature string is derived from the positional morphology tag via
#   map_morphology (empty string when no tag is given);
# * the citation part is stored as "ref=" followed by the citation with every
#   whitespace character replaced by an underscore (nil counts as empty);
# * the universal POS tag starts out as nil.
def initialize(id, head_id, form, lemma, part_of_speech, language, morphology, relation, empty_token_sort, slashes, citation_part, sentence)
  @id, @head_id = id, head_id
  @form, @lemma = form, lemma
  @part_of_speech, @language = part_of_speech, language
  @morphology = morphology
  @relation = relation
  @empty_token_sort = empty_token_sort
  @slashes = slashes
  @sentence = sentence

  @features = if morphology
                map_morphology(morphology)
              else
                ''
              end

  citation = citation_part || ""
  @citation_part = "ref=" + citation.gsub(/\s/, '_')
  @upos = nil
end

Instance Attribute Details

#citation_partObject (readonly)

Returns the value of attribute citation_part.



216
217
218
# File 'lib/proiel/cli/converters/conll-u.rb', line 216

# Citation string as stored by the constructor: "ref=" followed by the
# citation part with whitespace replaced by underscores.
def citation_part
  @citation_part
end

#empty_token_sortObject (readonly)

Returns the value of attribute empty_token_sort.



214
215
216
# File 'lib/proiel/cli/converters/conll-u.rb', line 214

# Empty-token sort given to the constructor. nil or '' means the token has
# content (see has_content?); 'C' and 'V' are the values tested by
# conjunction? and ellipsis?.
def empty_token_sort
  @empty_token_sort
end

#formObject (readonly)

Returns the value of attribute form.



215
216
217
# File 'lib/proiel/cli/converters/conll-u.rb', line 215

# Form of the token as given to the constructor (second column of to_conll).
def form
  @form
end

#head_idObject

Returns the value of attribute head_id.



207
208
209
# File 'lib/proiel/cli/converters/conll-u.rb', line 207

# ID of this token's head; 0 means the token is a root (see root?).
def head_id
  @head_id
end

#idObject (readonly)

Returns the value of attribute id.



211
212
213
# File 'lib/proiel/cli/converters/conll-u.rb', line 211

# ID of this token; other tokens refer to it through their head_id.
def id
  @id
end

#languageObject (readonly)

Returns the value of attribute language.



213
214
215
# File 'lib/proiel/cli/converters/conll-u.rb', line 213

# Language given to the constructor; it is the third component of the
# "lemma,part_of_speech,language" keys used for the lemma lists.
def language
  @language
end

#lemmaObject (readonly)

Returns the value of attribute lemma.



212
213
214
# File 'lib/proiel/cli/converters/conll-u.rb', line 212

# Lemma given to the constructor (third column of to_conll).
def lemma
  @lemma
end

#part_of_speechObject (readonly)

Returns the value of attribute part_of_speech.



210
211
212
# File 'lib/proiel/cli/converters/conll-u.rb', line 210

# Source part-of-speech tag given to the constructor (e.g. 'V-', 'C-', 'R-'
# as tested by the predicates of this class).
def part_of_speech
  @part_of_speech
end

#relationObject

Returns the value of attribute relation.



209
210
211
# File 'lib/proiel/cli/converters/conll-u.rb', line 209

# Current dependency relation label of the token.
def relation
  @relation
end

#uposObject

Returns the value of attribute upos.



208
209
210
# File 'lib/proiel/cli/converters/conll-u.rb', line 208

# Universal POS tag; nil after construction until find_postag assigns one.
def upos
  @upos
end

Instance Method Details

#add_slash!(slash) ⇒ Object



626
627
628
# File 'lib/proiel/cli/converters/conll-u.rb', line 626

# Appends a slash to this token's slash list and returns the list.
#
# Elsewhere in this class slashes are used as [target_id, relation] pairs.
def add_slash!(slash)
  @slashes.push(slash)
end

#adjectival?Boolean

Returns true if the node is an adjective or an ordinal.

Returns:

  • (Boolean)


253
254
255
# File 'lib/proiel/cli/converters/conll-u.rb', line 253

# Whether the part of speech is 'A-' (adjective) or 'Mo' (ordinal).
#
# @return [Boolean]
def adjectival?
  %w[A- Mo].include?(@part_of_speech)
end

#adverb?Boolean

Returns:

  • (Boolean)


261
262
263
# File 'lib/proiel/cli/converters/conll-u.rb', line 261

# Whether the part-of-speech tag starts with 'D'.
#
# @return [Integer, nil] 0 when the tag starts with 'D', nil otherwise
#   (use it only for its truthiness)
def adverb?
  /\AD/ =~ @part_of_speech
end

#auxiliary?Boolean

Returns:

  • (Boolean)


295
296
297
# File 'lib/proiel/cli/converters/conll-u.rb', line 295

# Whether the token's "lemma,part_of_speech,language" key is listed in
# AUXILIARIES.
#
# @return [Boolean]
def auxiliary?
  key = [lemma, part_of_speech, language].join(',')
  AUXILIARIES.include?(key)
end

#cardinal?Boolean

Returns:

  • (Boolean)


265
266
267
# File 'lib/proiel/cli/converters/conll-u.rb', line 265

# Whether the part-of-speech tag is 'Ma'.
#
# @return [Boolean]
def cardinal?
  @part_of_speech == 'Ma'
end

#change_coordinations!Object

Changes coordinations recursively from the bottom of the graph



602
603
604
605
# File 'lib/proiel/cli/converters/conll-u.rb', line 602

# Rewrites coordinations bottom-up: all dependents are handled first, then
# this token itself if it is a conjunction.
#
# @return the result of process_coordination! for a conjunction, nil otherwise
def change_coordinations!
  dependents.each { |child| child.change_coordinations! }
  return unless conjunction?

  process_coordination!
end

#clausal?Boolean

A node is clausal if it is a verb and not nominalized; or it has a copula dependent; or it has a subject (e.g. in an absolute construction without a verb); or it is the root (e.g. in a nominal clause).

Returns:

  • (Boolean)


270
271
272
273
274
275
# File 'lib/proiel/cli/converters/conll-u.rb', line 270

# Whether the token heads a clause. The checks are tried in this order:
# 1. it is a verb ('V-') that is not nominalized;
# 2. one of its dependents is a copula;
# 3. one of its dependents bears a subject relation;
# 4. it is a root.
#
# @return [Boolean]
def clausal?
  return true if @part_of_speech == 'V-' && !nominalized?
  return true if dependents.any? { |d| d.copula? }

  subject_relations = ['sub', 'nsubj', 'nsubjpass', 'csubj', 'csubjpass']
  return true if dependents.any? { |d| subject_relations.include?(d.relation) }

  root?
end

#conj_headObject



394
395
396
397
398
399
400
401
# File 'lib/proiel/cli/converters/conll-u.rb', line 394

# Follows a chain of 'conj' relations upwards and returns the first head whose
# own relation is not 'conj'.
#
# @raise [RuntimeError] if this token's relation is not 'conj'
def conj_head
  raise "Not a conjunct" unless @relation == 'conj'

  parent = head
  parent.relation == 'conj' ? parent.conj_head : parent
end

#conjunction?Boolean

Returns:

  • (Boolean)


277
278
279
# File 'lib/proiel/cli/converters/conll-u.rb', line 277

# Whether the token is a conjunction: either its part of speech is 'C-' or it
# is an empty token of sort 'C'.
#
# @return [Boolean]
def conjunction?
  return true if part_of_speech == 'C-'

  @empty_token_sort == 'C'
end

#coordinated?Boolean

Returns:

  • (Boolean)


281
282
283
# File 'lib/proiel/cli/converters/conll-u.rb', line 281

# Whether the token is a conjunct: its head exists, is a conjunction, and
# carries the same relation as the token itself.
#
# @return [Boolean, nil] nil when the token has no head
def coordinated?
  parent = head
  parent && parent.conjunction? && parent.relation == @relation
end

#copula?Boolean

Returns true if the node has an xobj dependent and either 1) the lemma is copular or 2) the node is empty and has no pid slash or a pid slash to a node with a copular lemma

Returns:

  • (Boolean)


288
289
290
291
292
293
# File 'lib/proiel/cli/converters/conll-u.rb', line 288

# Whether the token is a copula.
#
# True when the relation is already 'cop'. Otherwise the token must have an
# 'xobj' dependent and satisfy one of:
# * its "lemma,part_of_speech,language" key is in COPULAR_LEMMATA; or
# * it is an empty token of sort 'V' whose pid slash is missing, points to an
#   empty token, or points to a token with a copular lemma.
#
# @return [Boolean]
def copula?
  return true if @relation == 'cop'

  copular = COPULAR_LEMMATA.include?([lemma, part_of_speech, language].join(','))
  unless copular
    return false unless @empty_token_sort == 'V'

    antecedent = pid
    acceptable = antecedent.nil? || antecedent.is_empty? ||
                 COPULAR_LEMMATA.include?([antecedent.lemma, antecedent.part_of_speech, antecedent.language].join(','))
    return false unless acceptable
  end

  dependents.any? { |d| d.relation == 'xobj' }
end

#count_subgraphObject



382
383
384
# File 'lib/proiel/cli/converters/conll-u.rb', line 382

# Number of non-empty tokens in the subgraph rooted at this token (the token
# itself counts unless it is empty).
#
# @return [Integer]
def count_subgraph
  own = is_empty? ? 0 : 1
  dependents.inject(own) { |total, child| total + child.count_subgraph }
end

#dependentsObject



452
453
454
# File 'lib/proiel/cli/converters/conll-u.rb', line 452

# Tokens of the sentence whose head is this token, ordered by ID.
#
# @return [Array]
def dependents
  children = @sentence.tokens.select do |token|
    token.head_id == @id
  end
  children.sort_by { |token| token.id }
end

#deponent?Boolean

Returns:

  • (Boolean)


323
324
325
# File 'lib/proiel/cli/converters/conll-u.rb', line 323

# Whether the lemma matches the DEPONENTS pattern registered for the token's
# language.
#
# @return the match result when the lemma matches; nil when it does not or
#   when DEPONENTS has no entry for the language
def deponent?
  pattern = DEPONENTS[@language]
  pattern && pattern.match(@lemma)
end

#determiner?Boolean

Returns:

  • (Boolean)


299
300
301
# File 'lib/proiel/cli/converters/conll-u.rb', line 299

# Whether the part-of-speech tag is one of those listed in DETERMINERS.
#
# @return [Boolean]
def determiner?
  DETERMINERS.include?(@part_of_speech)
end

#distribute_shared_modifiers!Object



614
615
616
617
618
619
620
621
622
623
624
# File 'lib/proiel/cli/converters/conll-u.rb', line 614

# Moves the shared modifiers of a coordination under its first conjunct and
# records each of them as a slash on every other conjunct.
#
# 'aux' dependents are ignored. A dependent counts as a conjunct when it has
# the same relation as the conjunction, or is an 'adv' under an 'xadv'
# conjunction; every remaining dependent is a shared modifier.
#
# @raise [RuntimeError] if the token is not a conjunction, has no conjunct, or
#   its first conjunct is itself a conjunction without dependents
def distribute_shared_modifiers!
  raise "Can only distribute over a conjunction!" unless conjunction?

  candidates = dependents.reject { |d| d.relation == 'aux' }
  conjuncts, modifiers = candidates.partition do |d|
    d.relation == @relation || (d.relation == 'adv' && @relation == 'xadv')
  end

  first_conjunct = conjuncts.shift
  raise "No first conjunct under #{to_n}\n#{to_graph}" unless first_conjunct
  if first_conjunct.conjunction? && first_conjunct.dependents.empty?
    raise "The first conjunct is a misannotated conjunction in #{to_n}\n#{to_graph}"
  end

  modifiers.each do |modifier|
    modifier.head_id = first_conjunct.id
    conjuncts.each { |conjunct| conjunct.add_slash!([modifier.id, modifier.relation]) }
  end
end

#ellipsis?Boolean

Returns:

  • (Boolean)


303
304
305
# File 'lib/proiel/cli/converters/conll-u.rb', line 303

# Whether the token is an empty token of sort 'V'.
#
# @return [Boolean]
def ellipsis?
  @empty_token_sort == 'V'
end

#find_appositive_headObject



456
457
458
459
460
461
462
463
# File 'lib/proiel/cli/converters/conll-u.rb', line 456

# Returns the head of an apposition, skipping upwards over any conjunctions
# that themselves bear the 'apos' relation.
#
# @raise [RuntimeError] if this token's relation is not 'apos'
def find_appositive_head
  raise "Not an apposition" unless @relation == 'apos'

  parent = head
  return parent unless parent.conjunction? && parent.relation == 'apos'

  parent.find_appositive_head
end

#find_highest_daughterObject



571
572
573
# File 'lib/proiel/cli/converters/conll-u.rb', line 571

# Returns the dependent whose mapped relation ranks first in
# OBLIQUENESS_HIERARCHY. Only the part of the relation before any ':' is
# looked up; relations missing from the hierarchy rank last (1000).
#
# @return the chosen dependent, or nil when there are no dependents
def find_highest_daughter
  dependents.min_by do |daughter|
    base_label = daughter.map_relation[/[^:]*/]
    OBLIQUENESS_HIERARCHY.find_index(base_label) || 1000
  end
end

#find_postag(possible_postags) ⇒ Object



465
466
467
468
469
470
471
472
473
474
475
# File 'lib/proiel/cli/converters/conll-u.rb', line 465

# Picks the first applicable POS tag from a list of candidates.
#
# Each candidate is a [tag, criterion, features] triple. Candidates are
# consumed from the front of the list (the list is mutated) until one whose
# criterion, called with this token, is truthy; its tag becomes @upos and its
# features, if any, are appended to @features with a '|' separator. The search
# stops silently when the list is exhausted or a candidate has a nil tag.
#
# @return [String, nil] the updated feature string when features were added,
#   nil otherwise
def find_postag(possible_postags)
  loop do
    tag, crit, feats = possible_postags.shift
    # raise "Found no postag"
    return nil if tag.nil?
    next unless crit.call(self)

    @upos = tag
    return nil unless feats

    separator = @features.empty? ? '' : '|'
    return @features += separator + feats
  end
end

#find_relation(possible_relations) ⇒ Object



477
478
479
480
481
482
483
484
485
486
# File 'lib/proiel/cli/converters/conll-u.rb', line 477

# Picks the first applicable relation from a list of candidates.
#
# Each candidate is a [relation, criterion] pair. Candidates are consumed from
# the front of the list (the list is mutated) until one whose criterion,
# called with this token, is truthy.
#
# @return the chosen relation, or nil when the list is exhausted or a
#   candidate has a nil relation
def find_relation(possible_relations)
  loop do
    rel, crit = possible_relations.shift
    # raise "Found no relation"
    return nil if rel.nil?
    return rel if crit.call(self)
  end
end

#find_remnantObject



563
564
565
566
567
568
569
# File 'lib/proiel/cli/converters/conll-u.rb', line 563

# Follows the chain of 'remnant' dependents downwards and returns the last
# token on it (this token itself when it has no 'remnant' dependent).
def find_remnant
  remnant = dependents.find { |d| d.relation == 'remnant' }
  remnant ? remnant.find_remnant : self
end

#foreign?Boolean

Returns:

  • (Boolean)


307
308
309
# File 'lib/proiel/cli/converters/conll-u.rb', line 307

# Whether the part-of-speech tag is 'F-'.
#
# @return [Boolean]
def foreign?
  @part_of_speech == 'F-'
end

#format_features(features) ⇒ Object



411
412
413
414
415
416
417
# File 'lib/proiel/cli/converters/conll-u.rb', line 411

# Formats a '|'-separated feature string for output: the features are sorted,
# and an empty string becomes '_'.
#
# @param features [String]
# @return [String]
def format_features(features)
  return '_' if features == ''

  features.split("|").sort.join("|")
end

#genitive?Boolean

Returns:

  • (Boolean)


248
249
250
# File 'lib/proiel/cli/converters/conll-u.rb', line 248

# Whether the case field of the positional morphology tag is 'g'.
#
# The case field is the seventh character (index 6) of the tag. The previous
# unanchored pattern /......g.*/ also accepted a 'g' in any later position,
# and returned an index rather than a Boolean.
#
# @return [Boolean] false when the token has no morphology or the tag is too
#   short
def genitive?
  !@morphology.nil? && @morphology[6] == 'g'
end

#has_content?Boolean

Returns:

  • (Boolean)


311
312
313
# File 'lib/proiel/cli/converters/conll-u.rb', line 311

# Whether the token is an overt token, i.e. its empty-token sort is nil or
# the empty string.
#
# @return [Boolean]
def has_content?
  [nil, ''].include?(@empty_token_sort)
end

#has_preposition?Boolean

Returns:

  • (Boolean)


581
582
583
# File 'lib/proiel/cli/converters/conll-u.rb', line 581

# Whether any dependent is a preposition attached with the "case" relation.
#
# @return [Boolean]
def has_preposition?
  dependents.any? do |d|
    d.preposition? && d.relation == "case"
  end
end

#headObject



448
449
450
# File 'lib/proiel/cli/converters/conll-u.rb', line 448

# The token of the sentence whose ID equals this token's head_id.
#
# @return the head token, or nil when no token has that ID
def head
  @sentence.tokens.find { |candidate| candidate.id == @head_id }
end

#interjection?Boolean

Returns:

  • (Boolean)


315
316
317
# File 'lib/proiel/cli/converters/conll-u.rb', line 315

# Whether the part-of-speech tag is 'I-'.
#
# @return [Boolean]
def interjection?
  @part_of_speech == 'I-'
end

#invert!(new_dependent_relation = nil, new_head_relation = nil) ⇒ Object

Inverts the direction of a dependency relation. By default the labels are also swapped, but new relations can be specified for both the new dependent and the new head.



633
634
635
636
637
638
639
640
641
642
643
# File 'lib/proiel/cli/converters/conll-u.rb', line 633

# Swaps this token with its head: the token is attached where the head was,
# and the former head becomes a dependent of the token.
#
# @param new_dependent_relation relation given to the former head; defaults
#   to this token's current relation
# @param new_head_relation relation given to this token; defaults to the
#   former head's relation
# @raise [RuntimeError] if the token is attached directly to the root
def invert!(new_dependent_relation = nil, new_head_relation = nil)
  raise "Cannot promote a token under root!" if @head_id == 0

  former_head = head
  new_dependent_relation ||= @relation
  new_head_relation ||= former_head.relation
  grandparent_id = former_head.head_id

  former_head.head_id = @id
  former_head.relation = new_dependent_relation

  @head_id = grandparent_id
  # goes through the relation= writer on purpose, unlike promote!
  self.relation = new_head_relation
end

#is_empty?Boolean

Returns:

  • (Boolean)


319
320
321
# File 'lib/proiel/cli/converters/conll-u.rb', line 319

# Whether the token is an empty token; the negation of has_content?.
#
# @return [Boolean]
def is_empty?
  !has_content?
end

#left_cornerObject



390
391
392
# File 'lib/proiel/cli/converters/conll-u.rb', line 390

# Returns whichever of this token and its direct dependents has the smallest
# ID.
#
# Uses min_by instead of sorting the whole list just to take its first
# element; on equal IDs the earliest candidate wins.
def left_corner
  ([self] + dependents).min_by(&:id)
end

#map_morphology(morph) ⇒ Object



240
241
242
243
244
245
246
# File 'lib/proiel/cli/converters/conll-u.rb', line 240

# Converts a positional morphology tag into a '|'-separated feature string.
#
# The character at position i is looked up in MORPHOLOGY_MAP under the i-th
# field of MORPHOLOGY_POSITIONAL_TAG_SEQUENCE; values without a mapping are
# skipped. Characters beyond the known fields are ignored instead of failing
# with a NoMethodError on nil.
#
# @param morph [String] positional morphology tag
# @return [String] the mapped features joined by '|' ('' when none apply)
def map_morphology(morph)
  mapped = morph.chars.each_with_index.map do |value, position|
    field = MORPHOLOGY_POSITIONAL_TAG_SEQUENCE[position]
    table = field && MORPHOLOGY_MAP[field]
    table && table[value]
  end
  mapped.compact.join('|')
end

#map_part_of_speech!Object



488
489
490
491
492
493
494
# File 'lib/proiel/cli/converters/conll-u.rb', line 488

# Assigns universal POS tags bottom-up: dependents first, then this token.
#
# The candidates come from POS_MAP for the token's part of speech; a copy is
# handed to find_postag because it consumes its argument.
#
# @return ['ADJ', nil] 'ADJ' when the DET override applied, nil otherwise
def map_part_of_speech!
  dependents.each { |child| child.map_part_of_speech! }

  candidates = POS_MAP[@part_of_speech].dup
  find_postag(candidates)

  # ugly, but the ugliness comes from UDEP: DET is only kept under 'det'
  @upos = 'ADJ' if @upos == 'DET' && @relation != 'det'
end

#map_relationObject



503
504
505
506
507
508
509
510
511
512
513
514
515
516
# File 'lib/proiel/cli/converters/conll-u.rb', line 503

# Looks up the target relation for the token's current relation in
# RELATION_MAPPING.
#
# * a String entry is returned as is;
# * an Array entry is resolved with find_relation (on a copy, since
#   find_relation consumes its argument);
# * a missing entry means the relation has already been changed, so the
#   current relation is returned.
#
# @raise [RuntimeError] if the entry is of any other kind
def map_relation
  mapping = RELATION_MAPPING[@relation]
  case mapping
  when nil
    # do nothing: the token has already changed its relation
    @relation
  when String
    mapping
  when Array
    find_relation(mapping.dup)
  else
    raise "Unknown value #{mapping.inspect} for #{@relation}"
  end
end

#mediopassive?Boolean

Returns:

  • (Boolean)


327
328
329
# File 'lib/proiel/cli/converters/conll-u.rb', line 327

# Whether the voice field (index 4) of the morphology tag is 'm', 'p' or 'e'.
# Deponent lemmata and tokens without morphology never qualify.
#
# @return [Integer, nil, false] 0 on a match, nil when the voice field does
#   not match, false for deponents and tokens without morphology (use it only
#   for its truthiness)
def mediopassive?
  return false if deponent?
  return false unless @morphology

  @morphology[4] =~ /[mpe]/
end

#negation?Boolean

Returns:

  • (Boolean)


335
336
337
# File 'lib/proiel/cli/converters/conll-u.rb', line 335

# Whether the token's "lemma,part_of_speech,language" key is listed in
# NEGATION_LEMMATA.
#
# @return [Boolean]
def negation?
  key = [lemma, part_of_speech, language].join(',')
  NEGATION_LEMMATA.include?(key)
end

#nominal?Boolean

Returns:

  • (Boolean)


339
340
341
# File 'lib/proiel/cli/converters/conll-u.rb', line 339

# Whether the part-of-speech tag starts with 'N', 'P' or 'M', or the token is
# nominalized.
#
# @return 0 when the tag matches, otherwise the result of nominalized?
#   (use it only for its truthiness)
def nominal?
  position = /\A[NPM]/ =~ @part_of_speech
  position || nominalized?
end

#nominalized?Boolean

Returns:

  • (Boolean)


343
344
345
346
347
# File 'lib/proiel/cli/converters/conll-u.rb', line 343

# Whether the token has a determiner dependent attached as 'atr', 'aux' or
# 'det'.
#
# @return [Boolean]
def nominalized?
  determiner_relations = ['atr', 'aux', 'det']
  dependents.any? { |d| d.determiner? && determiner_relations.include?(d.relation) }
end

#particle?Boolean

Returns:

  • (Boolean)


353
354
355
# File 'lib/proiel/cli/converters/conll-u.rb', line 353

# Whether the token is attached as 'aux' and its
# "lemma,part_of_speech,language" key is listed in PARTICLE_LEMMATA.
#
# @return [Boolean]
def particle?
  return false unless @relation == 'aux'

  PARTICLE_LEMMATA.include?([lemma, part_of_speech, language].join(','))
end

#passive?Boolean

Returns:

  • (Boolean)


331
332
333
# File 'lib/proiel/cli/converters/conll-u.rb', line 331

# Whether the voice field (index 4) of the morphology tag is 'p'. Deponent
# lemmata and tokens without morphology never qualify.
#
# @return [Boolean]
def passive?
  return false if deponent?
  return false unless @morphology

  @morphology[4] == 'p'
end

#pidObject



403
404
405
406
407
408
409
# File 'lib/proiel/cli/converters/conll-u.rb', line 403

# The token targeted by this token's first 'pid' slash.
#
# @return the target token; nil when there is no 'pid' slash or no token of
#   the sentence has the target ID
def pid
  slash = @slashes.find { |_target, label| label == 'pid' }
  return nil unless slash

  target_id = slash.first
  @sentence.tokens.find { |t| target_id == t.id }
end

#preposition?Boolean

Returns:

  • (Boolean)


361
362
363
# File 'lib/proiel/cli/converters/conll-u.rb', line 361

# Whether the part-of-speech tag is 'R-'.
#
# @return [Boolean]
def preposition?
  @part_of_speech == 'R-'
end

#process_coordination!Object



607
608
609
610
611
612
# File 'lib/proiel/cli/converters/conll-u.rb', line 607

# Restructures a coordination headed by this conjunction.
#
# Shared modifiers are first distributed over the conjuncts; then the
# non-'aux' dependent whose left corner has the smallest ID is promoted into
# the conjunction's place, its siblings becoming "conj" and the conjunction
# itself "cc". Nothing happens when the conjunction has only 'aux' dependents.
#
# The leftmost dependent is picked with min_by rather than by sorting the
# whole list and taking the first element.
#
# @raise [RuntimeError] if the token is not a conjunction
def process_coordination!
  raise "Only coordinations can be processed this way!" unless conjunction?
  return if dependents.reject { |d| d.relation == 'aux' }.empty?

  distribute_shared_modifiers!
  # dependents must be re-read: the distribution step re-attaches modifiers
  leftmost = dependents.reject { |d| d.relation == 'aux' }.min_by { |d| d.left_corner.id }
  leftmost.promote!("conj", "cc")
end

#process_copula!Object



575
576
577
578
579
# File 'lib/proiel/cli/converters/conll-u.rb', line 575

# Promotes the single 'xobj' dependent into this token's place; the token
# itself is given the 'cop' relation by promote!.
#
# @raise [RuntimeError] unless there is exactly one 'xobj' dependent
def process_copula!
  predicates = dependents.select { |d| d.relation == 'xobj' }
  unless predicates.size == 1
    raise "#{predicates.size} predicates under #{to_n}\n#{to_graph}"
  end

  predicates.first.promote!(nil, 'cop')
end

#process_ellipsis!Object



538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
# File 'lib/proiel/cli/converters/conll-u.rb', line 538

# Resolves an elided token by promoting one of its dependents.
#
# If a dependent is an auxiliary, the first such dependent is promoted with
# the default relations and nothing else happens. Otherwise the dependent
# chosen by find_highest_daughter is promoted, its siblings become 'orphan',
# and this token is removed from the sentence.
#
# NOTE: an earlier strategy is disabled here. It re-attached each dependent
# as a 'remnant' under a partner with the same relation beneath the overt
# node, or under the overt node itself when there was no partner. Its TODO
# remarked that this was not very convincing for ADVs.
#
# @return nil in the auxiliary case, otherwise the result of removing the
#   token from the sentence
def process_ellipsis!
  auxiliary = dependents.find { |d| d.auxiliary? }
  if auxiliary
    auxiliary.promote!
    return
  end

  replacement = find_highest_daughter
  replacement.promote!('orphan')
  @sentence.remove_token!(self)
end

#process_preposition!Object



585
586
587
588
589
590
591
# File 'lib/proiel/cli/converters/conll-u.rb', line 585

# Inverts the relation between a preposition and its single 'obl' dependent,
# so that the preposition ends up as a "case" dependent of that token.
#
# @return nil when the preposition has no 'obl' dependent (shouldn't really
#   happen, but does in practice)
# @raise [RuntimeError] if the token is not a preposition or has more than one
#   'obl' dependent
def process_preposition!
  raise "Only prepositions can be processed this way!" unless part_of_speech == 'R-'

  obliques = dependents.select { |d| d.relation == 'obl' }
  case obliques.size
  when 0
    nil
  when 1
    obliques.first.invert!("case") # , "adv")
  else
    raise "#{obliques.size} oblique dependents under #{to_n}\n#{to_graph}"
  end
end

#process_subjunction!Object

Attaches subjunctions with ‘mark’ under their verbs and promotes the verb to take over the subjunction’s relation. If the verb is empty, the subjunction stays as head.



521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
# File 'lib/proiel/cli/converters/conll-u.rb', line 521

# Re-attaches a subjunction relative to its predicate.
#
# Nothing is done when the subjunction has no dependents other than 'conj'
# and 'cc' ones. NB: this requires that the method is called *after*
# conjunctions have been processed. Otherwise there must be exactly one
# 'pred' dependent:
# * if it is an empty token, its dependents are attached to the subjunction
#   and the empty token is removed (the subjunction stays as head);
# * otherwise the relation is inverted and the subjunction becomes a 'mark'
#   dependent of the predicate.
#
# @raise [RuntimeError] unless exactly one 'pred' dependent is found
def process_subjunction!
  substantive = dependents.reject { |d| ['conj', 'cc'].include?(d.relation) }
  return if substantive.empty?

  preds = dependents.select { |d| d.relation == 'pred' }
  raise "#{preds.size} PREDs under the subjunction #{to_n}:\n#{@sentence.to_graph}" unless preds.one?

  verb = preds.first
  # demote the subjunction under an overt verb
  return verb.invert!('mark') unless verb.is_empty?

  # promote the subjunction if the verb is empty
  verb.dependents.each { |d| d.head_id = id }
  @sentence.remove_token!(verb)
end

#promote!(new_sibling_relation = nil, new_dependent_relation = 'aux') ⇒ Object

Promotes a node to its head’s place. The node takes over its former head’s relation and all dependents. The new relation for these dependents can be specified; if it is not, they will keep their former relation. The former head is made a dependent of the node (with a specified relation) or, if it is an empty node, destroyed.



652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
# File 'lib/proiel/cli/converters/conll-u.rb', line 652

# Promotes this token into its head's place.
#
# The token takes over the former head's relation and attachment point, and
# all of its siblings become its dependents. The former head is removed from
# the sentence if it is an empty token; otherwise it too becomes a dependent
# of this token.
#
# @param new_sibling_relation relation given to the siblings; when nil they
#   keep their relation, and 'aux' siblings keep theirs in any case
# @param new_dependent_relation relation given to the former head when it is
#   kept
# @raise [RuntimeError] if the token is attached directly to the root
def promote!(new_sibling_relation = nil, new_dependent_relation = 'aux')
  raise "Cannot promote a token under root!" if @head_id == 0

  former_head = head
  inherited_relation = former_head.relation
  inherited_head_id = former_head.head_id

  # move all dependents of the former head to the new one
  siblings.each do |sibling|
    sibling.head_id = @id
    next unless new_sibling_relation

    # ugly hack to avoid overwriting the aux relation here (aux siblings aren't really siblings)
    sibling.relation = new_sibling_relation unless sibling.relation == 'aux'
  end

  if former_head.is_empty?
    # an empty former head is simply dropped
    @sentence.remove_token!(former_head)
  else
    # an overt former head becomes a dependent of the new head
    former_head.head_id = @id
    former_head.relation = new_dependent_relation
  end

  @head_id = inherited_head_id
  # don't use relation=, as we don't want this relation to be
  # copied down a tree of conjunctions
  @relation = inherited_relation
end

#pronominal?Boolean

Returns:

  • (Boolean)


357
358
359
# File 'lib/proiel/cli/converters/conll-u.rb', line 357

# Whether the part-of-speech tag starts with 'P' followed by a character
# other than 's' or 't'.
#
# @return [Integer, nil] 0 on a match, nil otherwise (use it only for its
#   truthiness)
def pronominal?
  # no evidence that possessives are pronoun/determiner-like
  /\AP[^st]/ =~ @part_of_speech
end

#proper_noun?Boolean

Returns:

  • (Boolean)


365
366
367
# File 'lib/proiel/cli/converters/conll-u.rb', line 365

# Whether the part-of-speech tag is 'Ne'.
#
# @return [Boolean]
def proper_noun?
  @part_of_speech == 'Ne'
end

#relabel_graph!Object



496
497
498
499
500
501
# File 'lib/proiel/cli/converters/conll-u.rb', line 496

# Relabels the subgraph bottom-up: dependents first, then this token, whose
# relation is replaced by the result of map_relation.
#
# @raise [RuntimeError] if no relation could be determined for the token
def relabel_graph!
  dependents.each { |child| child.relabel_graph! }
  # TODO: if there are iobjs without an obj among the dependents, one of them should be promoted to obj
  @relation = map_relation
  return if @relation

  raise "No relation for #{form}"
end

#remove_empties!Object



593
594
595
596
597
598
599
# File 'lib/proiel/cli/converters/conll-u.rb', line 593

# Removes empty tokens bottom-up. When this token is empty, its dependents
# are attached to its head with the 'remnant' relation and the token is
# removed from the sentence.
#
# @return nil for an overt token, otherwise the result of the removal
def remove_empties!
  dependents.each { |child| child.remove_empties! }
  return unless is_empty?

  dependents.each do |orphan|
    orphan.head_id = head_id
    orphan.relation = 'remnant'
  end
  @sentence.remove_token!(self)
end

#root?Boolean

Returns:

  • (Boolean)


369
370
371
# File 'lib/proiel/cli/converters/conll-u.rb', line 369

# Whether the token is attached directly to the root, i.e. its head_id is 0.
#
# @return [Boolean]
def root?
  @head_id == 0
end

#siblingsObject



444
445
446
# File 'lib/proiel/cli/converters/conll-u.rb', line 444

# The other tokens of the sentence that share this token's head, in sentence
# order.
#
# @return [Array]
def siblings
  same_head = @sentence.tokens.select do |token|
    token.head_id == @head_id
  end
  same_head - [self]
end

#subgraph_setObject



386
387
388
# File 'lib/proiel/cli/converters/conll-u.rb', line 386

# All tokens of the subgraph rooted at this token: the token itself followed
# by the subgraph of each dependent in turn.
#
# @return [Array]
def subgraph_set
  below = dependents.flat_map { |child| child.subgraph_set }
  [self, *below]
end

#subjunction?Boolean

Returns:

  • (Boolean)


257
258
259
# File 'lib/proiel/cli/converters/conll-u.rb', line 257

# Whether the part-of-speech tag is 'G-'.
#
# @return [Boolean]
def subjunction?
  @part_of_speech == 'G-'
end

#TAM_particle?Boolean

Returns:

  • (Boolean)


349
350
351
# File 'lib/proiel/cli/converters/conll-u.rb', line 349

# Whether the token is attached as 'aux' and its
# "lemma,part_of_speech,language" key is listed in TAM_PARTICLE_LEMMATA.
#
# @return [Boolean]
def TAM_particle?
  return false unless @relation == 'aux'

  TAM_PARTICLE_LEMMATA.include?([lemma, part_of_speech, language].join(','))
end

#to_conllObject



419
420
421
422
423
424
425
426
427
428
429
430
# File 'lib/proiel/cli/converters/conll-u.rb', line 419

# Renders the token as one tab-separated output line with the columns: id,
# form, lemma, universal POS, source POS, formatted features, head id,
# relation, '_' (slashes are not output) and citation part.
#
# @return [String]
def to_conll
  # override non-root relations on root until we've found out how to handle unembedded reports etc
  deprel = @head_id == 0 ? 'root' : @relation
  columns = [
    @id, @form, @lemma, @upos, @part_of_speech,
    format_features(@features),
    @head_id, deprel,
    '_', # slashes here
    @citation_part
  ]
  columns.join("\t")
end

#to_graph(indents = 0) ⇒ Object



440
441
442
# File 'lib/proiel/cli/converters/conll-u.rb', line 440

# Renders the subgraph as an indented outline: one line per token (see to_n),
# each level of depth indented by one more tab.
#
# @param indents [Integer] indentation depth of this token's line
# @return [String]
def to_graph(indents = 0)
  own_line = ("\t" * indents) + to_n
  child_blocks = dependents.map { |d| d.to_graph(indents + 1) }
  [own_line, *child_blocks].join("\n")
end

#to_nObject



436
437
438
# File 'lib/proiel/cli/converters/conll-u.rb', line 436

# Short one-line label for the token: relation, id, form (or the empty-token
# sort when there is no form) and universal POS (or the source POS when none
# has been assigned), joined by '-'.
#
# @return [String]
def to_n
  shown_form = @form || @empty_token_sort
  shown_pos = @upos || @part_of_speech
  [@relation, @id, shown_form, shown_pos].join('-')
end

#to_sObject



432
433
434
# File 'lib/proiel/cli/converters/conll-u.rb', line 432

# Tab-separated summary of the token: id, form, head id and relation.
#
# @return [String]
def to_s
  fields = [@id, @form, @head_id, @relation]
  fields.join("\t")
end