Class: PROIEL::Converter::CoNLLX

Inherits:
Object
  • Object
show all
Defined in:
lib/proiel/cli/converters/conll-x.rb

Overview

Converter that outputs the CoNLL-X format as described on ilk.uvt.nl/conll/#dataformat.

The conversion removes empty tokens. PRO tokens are completely ignored, while null C and null V tokens are eliminated by attaching their dependents to the first non-null ancestor and labelling them with a concatenation of dependency relations.

Each sequence of whitespace characters in a form or lemma is replaced by a single ‘.’.

Class Method Summary collapse

Class Method Details

.find_lexical_head_and_relation(id_map, tb, t, rel = '') ⇒ Object



68
69
70
71
72
73
74
75
76
77
78
# File 'lib/proiel/cli/converters/conll-x.rb', line 68

# Resolves the head number and relation label to print for a token.
#
# Starting at +t+, climbs through heads for which +has_content?+ is false,
# extending the label at each step with "(<id of the skipped head>)" followed
# by that head's own relation.
#
# @param id_map maps tokens to their output numbers (looked up with +[]+)
# @param tb accepted for signature compatibility; not used by this method
# @param t the token to start from
# @param rel [String] prefix prepended to the first relation
# @return [Array] +[head_number, relation]+; +head_number+ is 0 when the
#   climb ends on a token for which +is_root?+ is true
def find_lexical_head_and_relation(id_map, tb, t, rel = '')
  node = t
  label = rel + node.relation

  loop do
    # Reached the top of the tree: attach to the artificial root, number 0.
    return [0, label] if node.is_root?

    parent = node.head
    # The head is a token with content: attach to it directly.
    return [id_map[parent], label] if parent.has_content?

    # The head has no content: record its id and relation in the label and
    # continue upwards from it.
    label = "#{label}(#{id_map[parent]})" + parent.relation
    node = parent
  end
end

.format_morphology(token) ⇒ Object



57
58
59
60
61
62
63
64
65
66
# File 'lib/proiel/cli/converters/conll-x.rb', line 57

# Builds the FEATS column for a token from its +morphology_hash+.
#
# Each entry becomes the first four characters of the upper-cased key
# followed by the value; entries are joined with '|'.
#
# @param token an object responding to +morphology_hash+
# @return [String] the '|'-separated features, or '_' when no feature is left
def format_morphology(token)
  features = token.morphology_hash.map do |k, v|
    # Drop the inflection tag when its value is 'i' (described as
    # "inflecting" by the original comment); any other value is kept.
    next if k == :inflection && v == 'i'

    "#{k.upcase[0..3]}#{v}"
  end.compact

  # A CoNLL-X column must never be empty: '_' is the placeholder for an
  # unavailable value, so use it when nothing is left to print.
  features.empty? ? '_' : features.join('|')
end

.format_pos(token) ⇒ Object



53
54
55
# File 'lib/proiel/cli/converters/conll-x.rb', line 53

# Returns the two part-of-speech columns for a token.
#
# @param token an object responding to +part_of_speech_hash+ and
#   +part_of_speech+
# @return [Array] the +:major+ entry of +part_of_speech_hash+ followed by
#   the value of +part_of_speech+
def format_pos(token)
  major_tag = token.part_of_speech_hash[:major]
  full_tag = token.part_of_speech

  [major_tag, full_tag]
end

.format_text(s) ⇒ Object



49
50
51
# File 'lib/proiel/cli/converters/conll-x.rb', line 49

# Replaces every run of one or more whitespace characters in +s+ with a
# single '.'.
#
# @param s [String] the text to convert
# @return [String] a new string; +s+ itself is not modified
def format_text(s)
  whitespace_run = /[[:space:]]+/

  s.gsub(whitespace_run) { '.' }
end

.process(tb, _) ⇒ Object



13
14
15
16
17
18
19
# File 'lib/proiel/cli/converters/conll-x.rb', line 13

# Converts a whole treebank by handing every sentence of every source, in
# order, to +process_sentence+.
#
# @param tb an object whose +sources+ each respond to +sentences+
# @param _ ignored
# @return whatever +tb.sources.each+ returns
def process(tb, _)
  tb.sources.each do |src|
    src.sentences.each { |s| process_sentence(tb, s) }
  end
end

.process_sentence(tb, sentence) ⇒ Object



21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
# File 'lib/proiel/cli/converters/conll-x.rb', line 21

# Prints one sentence in CoNLL-X layout: one tab-separated line per token
# that has content, followed by an empty line.
#
# @param tb passed through to +find_lexical_head_and_relation+
# @param sentence an object responding to +tokens+
# @return [nil] the result of the final +puts+
def process_sentence(tb, sentence)
  # Split the tokens once into those with content and those without.
  overt, empty = sentence.tokens.partition(&:has_content?)

  # Generate 1-based contiguous numbering of overt tokens with the
  # non-PRO empty tokens appended at the end. We do this manually to
  # ensure that the numbering is correct whatever the sequence is in
  # the treebank. The default block assigns the next free number on first
  # lookup; Hash#size avoids building a keys array for every insertion.
  id_map = Hash.new { |h, k| h[k] = h.size + 1 }
  overt.each { |t| id_map[t] } # these blocks have side-effects
  empty.reject(&:pro?).each { |t| id_map[t] }

  # Iterate overt tokens and print one formatted line per token.
  overt.each do |token|
    this_number = id_map[token]
    head_number, relation = find_lexical_head_and_relation(id_map, tb, token)
    form = format_text(token.form)
    lemma = format_text(token.lemma)
    pos_major, pos_full = format_pos(token)
    morphology = format_morphology(token)

    # The last two columns are always the '_' placeholder.
    puts [this_number, form, lemma, pos_major, pos_full,
          morphology, head_number, relation, '_', '_'].join("\t")
  end

  # Separate sentences by an empty line.
  puts
end