Class: PROIEL::Converter::CoNLLU::Token

Inherits:
Object
  • Object
show all
Defined in:
lib/proiel/cli/converters/conll-u.rb

Constant Summary collapse

# Field order of a positional morphology tag: map_morphology looks up the
# character at index i of a tag under the field named at index i of this list.
# Frozen so that the shared constant cannot be modified by accident.
MORPHOLOGY_POSITIONAL_TAG_SEQUENCE =
[
  :person, :number, :tense, :mood, :voice, :gender, :case,
  :degree, :strength, :inflection
].freeze

Instance Attribute Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(id, head_id, form, lemma, part_of_speech, language, morphology, relation, empty_token_sort, slashes, citation_part, sentence) ⇒ Token



170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
# File 'lib/proiel/cli/converters/conll-u.rb', line 170

# Builds a token from its annotation values.
#
# A morphology tag, when given, is converted to a feature string through
# map_morphology; without one the feature string starts out empty. The
# citation is stored as "ref=" followed by the citation text with every
# whitespace character replaced by an underscore (a missing citation yields
# just "ref="). The universal part of speech starts out unset.
def initialize(id, head_id, form, lemma, part_of_speech, language, morphology, relation, empty_token_sort, slashes, citation_part, sentence)
  # Position in the dependency graph.
  @id = id
  @head_id = head_id
  @relation = relation
  @slashes = slashes
  @sentence = sentence

  # Lexical annotation, stored as received.
  @form = form
  @lemma = lemma
  @part_of_speech = part_of_speech
  @language = language
  @morphology = morphology
  @empty_token_sort = empty_token_sort

  # Derived values.
  @upos = nil
  @features = morphology ? map_morphology(morphology) : ''
  citation_text = citation_part || ""
  @citation_part = "ref=" + citation_text.gsub(/\s/, '_')
end

Instance Attribute Details

#citation_partObject (readonly)

Returns the value of attribute citation_part.



168
169
170
# File 'lib/proiel/cli/converters/conll-u.rb', line 168

# Reader for the stored citation string. The constructor stores it as "ref="
# followed by the citation text with whitespace replaced by underscores.
def citation_part
  @citation_part
end

#empty_token_sortObject (readonly)

Returns the value of attribute empty_token_sort.



166
167
168
# File 'lib/proiel/cli/converters/conll-u.rb', line 166

# Reader for the empty-token sort. Elsewhere in this class nil or '' means the
# token has content (has_content?), 'C' counts as a conjunction (conjunction?)
# and 'V' marks an ellipsis (ellipsis?).
def empty_token_sort
  @empty_token_sort
end

#formObject (readonly)

Returns the value of attribute form.



167
168
169
# File 'lib/proiel/cli/converters/conll-u.rb', line 167

# Reader for the token's form as passed to the constructor; to_conll emits it
# as the second column.
def form
  @form
end

#head_idObject

Returns the value of attribute head_id.



159
160
161
# File 'lib/proiel/cli/converters/conll-u.rb', line 159

# Reader for the id of this token's head. A value of 0 means the token is
# attached to the root (see root?).
def head_id
  @head_id
end

#idObject (readonly)

Returns the value of attribute id.



163
164
165
# File 'lib/proiel/cli/converters/conll-u.rb', line 163

# Reader for the token's id, the key that head and dependents use to link
# tokens of the same sentence.
def id
  @id
end

#languageObject (readonly)

Returns the value of attribute language.



165
166
167
# File 'lib/proiel/cli/converters/conll-u.rb', line 165

# Reader for the token's language as passed to the constructor. It is the
# third component of the "lemma,part_of_speech,language" keys used for the
# lemma list lookups in this class.
def language
  @language
end

#lemmaObject (readonly)

Returns the value of attribute lemma.



164
165
166
# File 'lib/proiel/cli/converters/conll-u.rb', line 164

# Reader for the token's lemma as passed to the constructor; to_conll emits it
# as the third column.
def lemma
  @lemma
end

#part_of_speechObject (readonly)

Returns the value of attribute part_of_speech.



162
163
164
# File 'lib/proiel/cli/converters/conll-u.rb', line 162

# Reader for the source part-of-speech tag (two-character codes such as 'V-'
# or 'Ne' are tested by the predicates in this class).
def part_of_speech
  @part_of_speech
end

#relationObject

Returns the value of attribute relation.



161
162
163
# File 'lib/proiel/cli/converters/conll-u.rb', line 161

# Reader for the dependency relation label currently assigned to this token.
def relation
  @relation
end

#uposObject

Returns the value of attribute upos.



160
161
162
# File 'lib/proiel/cli/converters/conll-u.rb', line 160

# Reader for the universal part-of-speech tag. It is nil after construction
# and is assigned by map_part_of_speech!.
def upos
  @upos
end

Instance Method Details

#add_slash!(slash) ⇒ Object



537
538
539
# File 'lib/proiel/cli/converters/conll-u.rb', line 537

# Appends a slash to this token's slash list.
#
# @param slash the slash to record; other methods in this class build and
#   read slashes as [target_id, relation] pairs
# @return the slash list after the addition
def add_slash!(slash)
  @slashes.push(slash)
end

#adjectival?Boolean

returns true if the node is an adjective or an ordinal



201
202
203
# File 'lib/proiel/cli/converters/conll-u.rb', line 201

# True when the part-of-speech tag is 'A-' (adjective) or 'Mo' (ordinal).
def adjectival?
  %w[A- Mo].include?(@part_of_speech)
end

#adverb?Boolean



205
206
207
# File 'lib/proiel/cli/converters/conll-u.rb', line 205

# Truthy when the part-of-speech tag starts with 'D'.
#
# The result is the match position (0) on success and nil otherwise, so it
# should only be used for its truthiness.
def adverb?
  /\AD/ =~ @part_of_speech
end

#cardinal?Boolean



209
210
211
# File 'lib/proiel/cli/converters/conll-u.rb', line 209

# True when the part-of-speech tag is 'Ma'.
def cardinal?
  @part_of_speech == 'Ma'
end

#change_coordinations!Object

Changes coordinations recursively from the bottom of the graph



513
514
515
516
# File 'lib/proiel/cli/converters/conll-u.rb', line 513

# Rewrites coordinations bottom-up: every dependent is handled first, then
# this token itself if it is a conjunction.
#
# @return the result of process_coordination! for a conjunction, nil otherwise
def change_coordinations!
  dependents.each { |child| child.change_coordinations! }
  return unless conjunction?

  process_coordination!
end

#clausal?Boolean

A node is clausal if it is a verb and not nominalized; or it has a copula dependent; or it has a subject (e.g. in an absolute construction without a verb); or if it is the root (e.g. in a nominal clause).



214
215
216
217
218
219
# File 'lib/proiel/cli/converters/conll-u.rb', line 214

# Decides whether this node heads a clause. The checks run in order and the
# first that holds wins:
# 1. the node is a verb ('V-') that is not nominalized;
# 2. one of its dependents is a copula;
# 3. one of its dependents carries a subject relation;
# 4. the node is attached to the root.
def clausal?
  return true if @part_of_speech == 'V-' && !nominalized?
  return true if dependents.any? { |d| d.copula? }

  subject_relations = ['sub', 'nsubj', 'nsubjpass', 'csubj', 'csubjpass']
  return true if dependents.any? { |d| subject_relations.include?(d.relation) }

  root?
end

#conj_headObject



322
323
324
325
326
327
328
329
# File 'lib/proiel/cli/converters/conll-u.rb', line 322

# Finds the token a chain of conjuncts ultimately hangs from: climbs through
# heads for as long as the head is itself a 'conj'.
#
# @return the first head on the way up whose relation is not 'conj'
# @raise [RuntimeError] if this token's own relation is not 'conj'
def conj_head
  raise "Not a conjunct" unless @relation == 'conj'

  parent = head
  parent.relation == 'conj' ? parent.conj_head : parent
end

#conjunction?Boolean



221
222
223
# File 'lib/proiel/cli/converters/conll-u.rb', line 221

# True when the token is tagged as a conjunction ('C-') or is an empty token
# of sort 'C'.
def conjunction?
  return true if part_of_speech == 'C-'

  @empty_token_sort == 'C'
end

#coordinated?Boolean



225
226
227
# File 'lib/proiel/cli/converters/conll-u.rb', line 225

# Truthy when the token has a head, that head is a conjunction, and head and
# token carry the same relation. Returns nil for a token without a head.
def coordinated?
  parent = head
  parent && parent.conjunction? && parent.relation == @relation
end

#copula?Boolean

Returns true if the node has an xobj dependent and either 1) the lemma is copular or 2) the node is empty and has no pid slash or a pid slash to a node with a copular lemma



232
233
234
235
236
237
# File 'lib/proiel/cli/converters/conll-u.rb', line 232

# Decides whether this node is a copula.
#
# A node already labelled 'cop' always qualifies. Otherwise it must have an
# 'xobj' dependent and, in addition, either
# 1. its own "lemma,part_of_speech,language" key is listed in COPULAR_LEMMATA, or
# 2. it is an empty verb (sort 'V') whose pid slash is absent, points to an
#    empty token, or points to a token whose key is listed in COPULAR_LEMMATA.
def copula?
  return true if @relation == 'cop'

  own_key = [lemma, part_of_speech, language].join(',')
  copular = COPULAR_LEMMATA.include?(own_key)
  if !copular && @empty_token_sort == 'V'
    antecedent = pid
    copular = antecedent.nil? || antecedent.is_empty? ||
              COPULAR_LEMMATA.include?([antecedent.lemma, antecedent.part_of_speech, antecedent.language].join(','))
  end
  return false unless copular

  dependents.any? { |d| d.relation == 'xobj' }
end

#count_subgraphObject



310
311
312
# File 'lib/proiel/cli/converters/conll-u.rb', line 310

# Counts the tokens with content in the subgraph rooted at this token: the
# token itself counts as one unless it is empty, plus the counts of all of
# its dependents.
def count_subgraph
  own_weight = is_empty? ? 0 : 1
  dependents.inject(own_weight) { |total, child| total + child.count_subgraph }
end

#dependentsObject



380
381
382
# File 'lib/proiel/cli/converters/conll-u.rb', line 380

# Collects the tokens of the sentence whose head is this token.
#
# @return [Array] the direct dependents, ordered by id
def dependents
  children = @sentence.tokens.select { |tok| tok.head_id == @id }
  children.sort_by { |tok| tok.id }
end

#determiner?Boolean



239
240
241
# File 'lib/proiel/cli/converters/conll-u.rb', line 239

# True when the part-of-speech tag is listed in DETERMINERS (a constant
# defined outside this method).
def determiner?
  DETERMINERS.include? @part_of_speech
end

#distribute_shared_modifiers!Object



525
526
527
528
529
530
531
532
533
534
535
# File 'lib/proiel/cli/converters/conll-u.rb', line 525

# Re-attaches the modifiers shared by a coordination.
#
# Ignoring 'aux' dependents, the dependents of this conjunction are split
# into conjuncts (same relation as the conjunction, or 'adv' under an 'xadv'
# conjunction) and modifiers (everything else). Each modifier is moved under
# the first conjunct, and every later conjunct receives a slash
# [modifier id, modifier relation] pointing at it.
#
# @return [Array] the modifiers that were re-attached
# @raise [RuntimeError] if this token is not a conjunction, if there is no
#   conjunct at all, or if the first conjunct is a conjunction without
#   dependents
def distribute_shared_modifiers!
  raise "Can only distribute over a conjunction!" unless conjunction?

  candidates = dependents.reject { |d| d.relation == 'aux' }
  conjuncts, modifiers = candidates.partition do |d|
    d.relation == @relation || (d.relation == 'adv' && @relation == 'xadv')
  end

  first_conjunct, *later_conjuncts = conjuncts
  raise "No first conjunct under #{to_n}\n#{to_graph}" unless first_conjunct
  if first_conjunct.conjunction? && first_conjunct.dependents.empty?
    raise "The first conjunct is a misannotated conjunction in #{to_n}\n#{to_graph}"
  end

  modifiers.each do |modifier|
    modifier.head_id = first_conjunct.id
    later_conjuncts.each { |conjunct| conjunct.add_slash!([modifier.id, modifier.relation]) }
  end
end

#ellipsis?Boolean



243
244
245
# File 'lib/proiel/cli/converters/conll-u.rb', line 243

# True when the token is an empty token of sort 'V'.
def ellipsis?
  @empty_token_sort == 'V'
end

#find_appositive_headObject



384
385
386
387
388
389
390
391
# File 'lib/proiel/cli/converters/conll-u.rb', line 384

# Finds the token an apposition attaches to, climbing through heads for as
# long as the head is a conjunction that is itself labelled 'apos'.
#
# @return the first head on the way up that is not an 'apos' conjunction
# @raise [RuntimeError] if this token's own relation is not 'apos'
def find_appositive_head
  raise "Not an apposition" unless @relation == 'apos'

  parent = head
  return parent unless parent.conjunction? && parent.relation == 'apos'

  parent.find_appositive_head
end

#find_relation(possible_relations) ⇒ Object



393
394
395
396
397
398
399
400
401
402
# File 'lib/proiel/cli/converters/conll-u.rb', line 393

# Assigns the first candidate relation whose criterion accepts this token.
#
# The candidates are tried in order; the list itself is only read, never
# modified, so callers do not need to pass a copy.
#
# @param possible_relations [Array] pairs of [relation, criterion], where the
#   criterion responds to call and receives this token
# @return the relation that was assigned, or nil when no criterion matched
#   (in which case the current relation is left untouched)
def find_relation(possible_relations)
  possible_relations.each do |rel, crit|
    # A candidate without a relation ends the search, just like running off
    # the end of the list.
    return nil if rel.nil?
    return @relation = rel if crit.call(self)
  end
  nil
end

#find_remnantObject



482
483
484
485
486
487
488
# File 'lib/proiel/cli/converters/conll-u.rb', line 482

# Follows a chain of 'remnant' dependents downwards.
#
# @return the token at the end of the chain: the result of find_remnant on
#   the first 'remnant' dependent, or this token when it has none
def find_remnant
  remnant = dependents.find { |d| d.relation == 'remnant' }
  remnant ? remnant.find_remnant : self
end

#foreign?Boolean



247
248
249
# File 'lib/proiel/cli/converters/conll-u.rb', line 247

# True when the part-of-speech tag is 'F-'.
def foreign?
  @part_of_speech == 'F-'
end

#format_features(features) ⇒ Object



339
340
341
342
343
344
345
# File 'lib/proiel/cli/converters/conll-u.rb', line 339

# Renders a feature string for the FEATS column.
#
# An empty feature string becomes the placeholder '_'. Otherwise the
# '|'-separated features are ordered alphabetically without regard to case,
# as the CoNLL-U format requires (so "Number=Sing" precedes "NumType=Card");
# features that differ only in case fall back to plain string order so the
# result is deterministic.
#
# @param features [String] '|'-separated "Name=Value" pairs, possibly empty
# @return [String] the sorted feature string, or '_'
def format_features(features)
  return '_' if features == ''

  features.split("|").sort_by { |feature| [feature.downcase, feature] }.join("|")
end

#has_content?Boolean



251
252
253
# File 'lib/proiel/cli/converters/conll-u.rb', line 251

# True when the token is not an empty token, i.e. its empty-token sort is
# nil or the empty string.
def has_content?
  [nil, ''].include?(@empty_token_sort)
end

#headObject



376
377
378
# File 'lib/proiel/cli/converters/conll-u.rb', line 376

# Looks up this token's head in the sentence.
#
# @return the first token whose id equals this token's head id, or nil when
#   there is none (e.g. for a token attached to the root)
def head
  @sentence.tokens.find { |tok| tok.id == @head_id }
end

#interjection?Boolean



255
256
257
# File 'lib/proiel/cli/converters/conll-u.rb', line 255

# True when the part-of-speech tag is 'I-'.
def interjection?
  @part_of_speech == 'I-'
end

#invert!(new_dependent_relation = nil, new_head_relation = nil) ⇒ Object

Inverts the direction of a dependency relation. By default the labels are also swapped, but new relations can be specified for both the new dependent and the new head.



544
545
546
547
548
549
550
551
552
553
554
# File 'lib/proiel/cli/converters/conll-u.rb', line 544

# Swaps this token with its head: the former head becomes a dependent of this
# token and this token takes the former head's place in the graph.
#
# @param new_dependent_relation relation for the former head; defaults to
#   this token's current relation
# @param new_head_relation relation for this token; defaults to the former
#   head's current relation
# @return the relation assigned to this token
# @raise [RuntimeError] if this token is attached to the root
def invert!(new_dependent_relation = nil, new_head_relation = nil)
  raise "Cannot promote a token under root!" if @head_id == 0

  # Read everything that is needed from the old configuration before any
  # link is changed.
  former_head = head
  dependent_label = new_dependent_relation || @relation
  head_label = new_head_relation || former_head.relation
  grandparent_id = former_head.head_id

  former_head.head_id = @id
  former_head.relation = dependent_label

  @head_id = grandparent_id
  self.relation = head_label
end

#is_empty?Boolean



259
260
261
# File 'lib/proiel/cli/converters/conll-u.rb', line 259

# True when the token is an empty token; the exact negation of has_content?.
def is_empty?
  !has_content?
end

#left_cornerObject



318
319
320
# File 'lib/proiel/cli/converters/conll-u.rb', line 318

# Returns whichever of this token and its direct dependents has the lowest id.
def left_corner
  candidates = [self, *dependents]
  candidates.sort_by { |tok| tok.id }.first
end

#map_morphology(morph) ⇒ Object



192
193
194
195
196
197
198
# File 'lib/proiel/cli/converters/conll-u.rb', line 192

# Converts a positional morphology tag into a '|'-separated feature string.
#
# The character at each position is looked up in MORPHOLOGY_MAP under the
# field that MORPHOLOGY_POSITIONAL_TAG_SEQUENCE names for that position;
# positions without a mapping contribute nothing.
#
# @param morph the positional tag
# @return [String] the mapped features in tag order, '' if none map
def map_morphology(morph)
  mapped = (0...morph.length).map do |position|
    field = MORPHOLOGY_POSITIONAL_TAG_SEQUENCE[position]
    MORPHOLOGY_MAP[field][morph[position]]
  end
  mapped.compact.join('|')
end

#map_part_of_speech!Object



404
405
406
407
408
409
410
411
412
413
# File 'lib/proiel/cli/converters/conll-u.rb', line 404

# Assigns the universal part of speech (and any feature that comes with it)
# to this token and, first, to every token below it.
#
# POS_MAP is expected to map a source tag to [upos] or [upos, feature]. The
# feature, when present, is appended to the token's feature string.
#
# @return 'ADJ' when a determiner was retagged as an adjective, nil otherwise
# @raise [RuntimeError] if POS_MAP has no universal tag for the token's part
#   of speech
def map_part_of_speech!
  dependents.each(&:map_part_of_speech!)

  # Destructure the entry in one step: an unknown tag has no entry at all, and
  # indexing into the missing entry would fail with NoMethodError before the
  # descriptive error below could be raised.
  @upos, feature = POS_MAP[@part_of_speech]
  raise "No match found for pos #{@part_of_speech.inspect}" unless @upos

  if feature
    @features += ((@features.empty? ? '' : '|') + feature)
  end

  # ugly, but the ugliness comes from UDEP
  @upos = 'ADJ' if @upos == 'DET' && @relation != 'det'
end

#mediopassive?Boolean



263
264
265
# File 'lib/proiel/cli/converters/conll-u.rb', line 263

# True when the voice position of the morphology tag (index 4, the :voice
# slot of MORPHOLOGY_POSITIONAL_TAG_SEQUENCE) is 'm', 'p' or 'e'.
# NOTE(review): presumably middle, passive and middle-or-passive - confirm
# against the tag set.
#
# Tokens may be created without a morphology tag; those are never
# mediopassive.
#
# @return [Boolean]
def mediopassive?
  return false if @morphology.nil?

  !(@morphology[4] =~ /[mpe]/).nil?
end

#negation?Boolean



267
268
269
# File 'lib/proiel/cli/converters/conll-u.rb', line 267

# True when the token's "lemma,part_of_speech,language" key is listed in
# NEGATION_LEMMATA.
def negation?
  lexical_key = "#{lemma},#{part_of_speech},#{language}"
  NEGATION_LEMMATA.include?(lexical_key)
end

#nominal?Boolean



271
272
273
# File 'lib/proiel/cli/converters/conll-u.rb', line 271

# Truthy when the part-of-speech tag starts with 'N', 'P' or 'M', or when the
# token is nominalized.
#
# A matching tag yields the match position (0) rather than true, so the
# result should only be used for its truthiness.
def nominal?
  nominal_tag = /\A[NPM]/ =~ @part_of_speech
  nominal_tag || nominalized?
end

#nominalized?Boolean



275
276
277
278
279
# File 'lib/proiel/cli/converters/conll-u.rb', line 275

# True when at least one dependent is a determiner attached with the
# relation 'atr', 'aux' or 'det'.
def nominalized?
  dependents.any? { |d| d.determiner? && %w[atr aux det].include?(d.relation) }
end

#particle?Boolean



281
282
283
# File 'lib/proiel/cli/converters/conll-u.rb', line 281

# True when the token is attached as 'aux' and its
# "lemma,part_of_speech,language" key is listed in PARTICLE_LEMMATA.
def particle?
  return false unless @relation == 'aux'

  PARTICLE_LEMMATA.include?([lemma, part_of_speech, language].join(','))
end

#passive?Boolean



285
286
287
# File 'lib/proiel/cli/converters/conll-u.rb', line 285

# True when the voice position of the morphology tag (index 4, the :voice
# slot of MORPHOLOGY_POSITIONAL_TAG_SEQUENCE) is 'p'.
#
# Tokens may be created without a morphology tag; those are never passive.
#
# @return [Boolean]
def passive?
  return false if @morphology.nil?

  @morphology[4] == 'p'
end

#pidObject



331
332
333
334
335
336
337
# File 'lib/proiel/cli/converters/conll-u.rb', line 331

# Resolves the token this token's 'pid' slash points to.
#
# @return the first token of the sentence whose id equals the target of the
#   first 'pid' slash; nil when there is no such slash or no such token
def pid
  slash = @slashes.find { |_target, label| label == 'pid' }
  return nil unless slash

  target_id = slash.first
  @sentence.tokens.find { |tok| target_id == tok.id }
end

#preposition?Boolean



289
290
291
# File 'lib/proiel/cli/converters/conll-u.rb', line 289

# True when the part-of-speech tag is 'R-'.
def preposition?
  @part_of_speech == 'R-'
end

#process_coordination!Object



518
519
520
521
522
523
# File 'lib/proiel/cli/converters/conll-u.rb', line 518

# Restructures a coordination headed by this conjunction.
#
# Nothing happens when the conjunction has no dependents other than 'aux'
# ones. Otherwise the shared modifiers are distributed first, and then the
# first non-'aux' dependent (looked up again, after the redistribution) is
# promoted with "conj" for its siblings and "cc" for the conjunction.
#
# @return the result of promote!, or nil when there was nothing to do
# @raise [RuntimeError] if this token is not a conjunction
def process_coordination!
  raise "Only coordinations can be processed this way!" unless conjunction?
  return if dependents.all? { |d| d.relation == 'aux' }

  distribute_shared_modifiers!
  new_head = dependents.find { |d| d.relation != 'aux' }
  new_head.promote!("conj", "cc")
end

#process_copula!Object



490
491
492
493
494
# File 'lib/proiel/cli/converters/conll-u.rb', line 490

# Promotes the predicative complement of a copula: the single 'xobj'
# dependent takes this token's place and this token is attached to it as
# 'cop'.
#
# @return the result of promote!
# @raise [RuntimeError] unless there is exactly one 'xobj' dependent
def process_copula!
  xobj_children = dependents.select { |d| d.relation == 'xobj' }
  count = xobj_children.size
  raise "#{count} predicates under #{to_n}\n#{to_graph}" unless count == 1

  xobj_children[0].promote!(nil, 'cop')
end

#process_ellipsis!Object

TODO: process “implicit pid” through APOS chain too



453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
# File 'lib/proiel/cli/converters/conll-u.rb', line 453

# Dissolves this (elided) token by moving its dependents under a
# corresponding overt token and then removing this token from the sentence.
#
# The overt token is, in order of preference: the target of the pid slash
# (unless that target lies inside this token's own subgraph), the head of the
# conjunct chain for a 'conj' token, or the appositive head for an 'apos'
# token. When none of these applies the method returns without changing
# anything.
#
# Each dependent is then either attached as a 'remnant' to the end of the
# remnant chain of a same-relation dependent of the overt token, or - when
# there is no such partner - attached directly to the overt token with its
# relation unchanged.
def process_ellipsis!
  # First we find the corresponding overt token.
  # If there's an explicit pid slash, we'll grab that one.
  if pid and !subgraph_set.include?(pid)
    overt = pid
  # otherwise, try a conjunct
  elsif @relation == 'conj'
    overt = conj_head
  elsif @relation == 'apos'
    overt = find_appositive_head
  else
    return
  end

  # The dependents are collected once, up front; the overt token's dependents
  # are looked up afresh for every one of them, so a dependent that was
  # re-attached in an earlier iteration can serve as a partner in a later one.
  dependents.each do |d|
    # check if there's a partner with the same relation under the overt node.
    # TODO: this isn't really very convincing when it comes to ADVs
    if partner = overt.dependents.select { |p| p != self and p.relation == d.relation }.first #inserted p != self
      # attach at the end of any remnant chain the partner already has
      partner = partner.find_remnant
      d.head_id = partner.id
      d.relation = 'remnant'
    # if there's no partner, just attach under the overt node, preserving the relation
    else
      d.head_id = overt.id
    end
  end
  @sentence.remove_token!(self)
end

#process_preposition!Object



496
497
498
499
500
501
502
# File 'lib/proiel/cli/converters/conll-u.rb', line 496

# Turns a preposition into a 'case' dependent of its complement by inverting
# the link between the preposition and its single 'obl' dependent.
#
# @return the result of invert!, or nil when there is no 'obl' dependent
# @raise [RuntimeError] if the token is not a preposition ('R-') or has more
#   than one 'obl' dependent
def process_preposition!
  raise "Only prepositions can be processed this way!" unless part_of_speech == 'R-'

  obliques = dependents.select { |d| d.relation == 'obl' }
  raise "#{obliques.size} oblique dependents under #{to_n}\n#{to_graph}" if obliques.size > 1

  # A preposition without a complement shouldn't really happen, but in
  # practice it does; leave such a token alone.
  complement = obliques.first
  return unless complement

  complement.invert!("case")
end

#process_subjunction!Object

attach subjunctions with ‘mark’ under their verbs and promote the verb to take over the subjunction’s relation. If the verb is empty, the subjunction stays as head.



433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
# File 'lib/proiel/cli/converters/conll-u.rb', line 433

# Re-attaches a subjunction relative to its verb.
#
# A subjunction without dependents, or with only 'conj'/'cc' dependents, is
# left alone (this relies on conjunctions having been processed beforehand).
# Otherwise it must have exactly one 'pred' dependent. If that verb is an
# empty token, the verb's dependents are moved under the subjunction and the
# verb is removed from the sentence, so the subjunction stays the head. If
# the verb is overt, the link is inverted and the subjunction becomes a
# 'mark' dependent of the verb.
#
# @return nil when nothing was done, otherwise the result of removing or
#   inverting the verb
# @raise [RuntimeError] unless there is exactly one 'pred' dependent
def process_subjunction!
  return if dependents.all? { |d| ['conj', 'cc'].include?(d.relation) }

  preds = dependents.select { |d| d.relation == 'pred' }
  raise "#{preds.size} PREDs under the subjunction #{to_n}:\n#{@sentence.to_graph}" unless preds.one?

  verb = preds.first
  if verb.is_empty?
    # promote the subjunction: it adopts the empty verb's dependents
    verb.dependents.each { |orphan| orphan.head_id = id }
    @sentence.remove_token!(verb)
  else
    # demote the subjunction below the overt verb
    verb.invert!('mark')
  end
end

#promote!(new_sibling_relation = nil, new_dependent_relation = 'aux') ⇒ Object

promotes a node to its head’s place. The node takes over its former head’s relation and all dependents. The new relation for these dependents can be specified; if it is not, they will keep their former relation. The former head is made a dependent of the node (with a specified relation) or, if it is an empty node, destroyed.



563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
# File 'lib/proiel/cli/converters/conll-u.rb', line 563

# Promotes this token to its head's place in the graph.
#
# The token's siblings (the other dependents of its head) are moved under the
# token, the former head is either removed (when it is an empty token) or
# made a dependent of the token, and the token takes over the former head's
# head and relation.
#
# @param new_sibling_relation relation to give the moved siblings; when nil
#   they keep their relation. Siblings labelled 'aux' always keep theirs.
# @param new_dependent_relation relation for the former head when it is kept
#   as a dependent
# @return the relation taken over from the former head
# @raise [RuntimeError] if this token is attached to the root
def promote!(new_sibling_relation = nil, new_dependent_relation = 'aux')
  raise "Cannot promote a token under root!" if @head_id == 0
  # remember the former head's attachment before anything is re-linked
  new_head_relation = head.relation
  new_head_id = head.head_id

  # move all dependents of the former head to the new one
  siblings.each do |t|
    t.head_id = @id
    # ugly hack to avoid overwriting the aux relation here (aux siblings aren't really siblings)
    t.relation = new_sibling_relation if (new_sibling_relation and t.relation != 'aux')
  end

  # remove the former head if it was empty
  # (@head_id is still unchanged at this point, so head is still the former head)
  if head.is_empty?
    @sentence.remove_token!(head)
  # else make it a dependent of the new head
  else
    head.head_id = @id
    head.relation = new_dependent_relation
  end

  @head_id = new_head_id
  # don't use relation=, as we don't want this relation to be
  # copied down a tree of conjunctions
  @relation = new_head_relation
end

#proper_noun?Boolean



293
294
295
# File 'lib/proiel/cli/converters/conll-u.rb', line 293

# True when the part-of-speech tag is 'Ne'.
def proper_noun?
  @part_of_speech == 'Ne'
end

#relabel_graph!Object



415
416
417
418
419
420
421
422
423
424
425
426
427
428
# File 'lib/proiel/cli/converters/conll-u.rb', line 415

# Translates the relation labels of this token's subgraph, dependents first.
#
# RELATION_MAPPING is consulted with the token's current relation:
# * a String is the new relation;
# * an Array is a list of candidates that find_relation chooses from (it is
#   handed a copy of the list);
# * no entry means the relation is left as it is (the token has already
#   changed its relation);
# * anything else is an error.
#
# @raise [RuntimeError] if the mapping holds a value of an unexpected kind
def relabel_graph!
  dependents.each { |child| child.relabel_graph! }

  mapping = RELATION_MAPPING[@relation]
  if mapping.nil?
    nil
  elsif mapping.is_a?(String)
    @relation = mapping
  elsif mapping.is_a?(Array)
    find_relation(mapping.dup)
  else
    raise "Unknown value #{mapping.inspect} for #{@relation}"
  end
end

#remove_empties!Object



504
505
506
507
508
509
510
# File 'lib/proiel/cli/converters/conll-u.rb', line 504

# Removes the empty tokens of this subgraph, dependents first.
#
# When this token is itself empty, its (remaining) dependents are attached
# to this token's head with the relation 'remnant' and the token is removed
# from the sentence.
#
# @return nil for a token with content, otherwise the result of the removal
def remove_empties!
  dependents.each { |child| child.remove_empties! }
  return unless is_empty?

  dependents.each do |orphan|
    orphan.head_id = head_id
    orphan.relation = 'remnant'
  end
  @sentence.remove_token!(self)
end

#root?Boolean



297
298
299
# File 'lib/proiel/cli/converters/conll-u.rb', line 297

# True when the token is attached directly to the root, i.e. its head id is 0.
def root?
  @head_id == 0
end

#siblingsObject



372
373
374
# File 'lib/proiel/cli/converters/conll-u.rb', line 372

# Collects the other tokens of the sentence that share this token's head.
#
# @return [Array] the tokens with the same head id, in sentence order,
#   without this token itself
def siblings
  same_head = @sentence.tokens.select { |tok| tok.head_id == @head_id }
  same_head - [self]
end

#subgraph_setObject



314
315
316
# File 'lib/proiel/cli/converters/conll-u.rb', line 314

# Lists every token of the subgraph rooted at this token.
#
# @return [Array] this token followed by the subgraphs of its dependents
def subgraph_set
  descendants = dependents.map { |child| child.subgraph_set }.flatten
  [self, *descendants]
end

#to_conllObject



347
348
349
350
351
352
353
354
355
356
357
358
# File 'lib/proiel/cli/converters/conll-u.rb', line 347

# Renders the token as one tab-separated line of ten columns: id, form,
# lemma, universal part of speech, source part of speech, formatted features,
# head id, relation, a '_' placeholder (slashes are not emitted yet) and the
# citation.
def to_conll
  # override non-root relations on root until we've found out how to handle
  # unembedded reports etc
  deprel = @head_id == 0 ? 'root' : @relation
  columns = [
    @id, @form, @lemma, @upos, @part_of_speech,
    format_features(@features),
    @head_id, deprel,
    '_', # slashes here
    @citation_part
  ]
  columns.join("\t")
end

#to_graph(indents = 0) ⇒ Object



368
369
370
# File 'lib/proiel/cli/converters/conll-u.rb', line 368

# Renders the subgraph rooted at this token as an indented outline: one line
# per token (its to_n label), with each level of depth adding one tab.
#
# @param indents [Integer] number of tabs to put before this token's line
# @return [String] the outline, lines joined by newlines
def to_graph(indents = 0)
  own_line = ("\t" * indents) + to_n
  child_blocks = dependents.map { |child| child.to_graph(indents + 1) }
  [own_line, *child_blocks].join("\n")
end

#to_nObject



364
365
366
# File 'lib/proiel/cli/converters/conll-u.rb', line 364

# Short label for the token, used in graph dumps and error messages:
# relation, id, form (or the empty-token sort when there is no form) and
# universal part of speech (or the source tag while that is unset), joined
# with hyphens.
def to_n
  surface = @form || @empty_token_sort
  tag = @upos || @part_of_speech
  "#{@relation}-#{@id}-#{surface}-#{tag}"
end

#to_sObject



360
361
362
# File 'lib/proiel/cli/converters/conll-u.rb', line 360

def to_s
  [@id, @form, @head_id, @relation].join("\t")
end