Class: PROIEL::Converter::CoNLLX

Inherits:
Object
  • Object
show all
Defined in:
lib/proiel/cli/converters/conll-x.rb

Overview

This converts to the CoNLL-X format as described on ilk.uvt.nl/conll/#dataformat.

Class Method Summary collapse

Class Method Details

.find_lexical_head_and_relation(id_to_number, id_to_token, t, rel = '') ⇒ Object



54
55
56
57
58
59
60
61
62
# File 'lib/proiel/cli/converters/conll-x.rb', line 54

# Finds the nearest head with content for token +t+ and the relation label
# that leads to it.
#
# Starting at +t+, the head chain is followed upwards while the head token
# reports no content. Each skipped step appends the current token's relation
# followed by the skipped head's number in parentheses to the label.
#
# @param id_to_number [Hash] maps token ids to their numbers in the output
# @param id_to_token [Hash] maps token ids to token objects
# @param t [#is_root?, #head_id, #relation] token to start from
# @param rel [String] label prefix accumulated so far
# @return [Array] two elements: the head number (0 when the walk ends on a
#   root token) and the accumulated relation label
def find_lexical_head_and_relation(id_to_number, id_to_token, t, rel = '')
  current = t
  label = rel

  loop do
    # FIXME: the root may be an empty token anyway
    return [0, label + current.relation] if current.is_root?

    head_id = current.head_id
    head = id_to_token[head_id]
    return [id_to_number[head_id], label + current.relation] if head.has_content?

    # The head has no content: record the step and continue from the head.
    label += "#{current.relation}(#{id_to_number[head_id]})"
    current = head
  end
end

.format_morphology(token) ⇒ Object



43
44
45
46
47
48
49
50
51
52
# File 'lib/proiel/cli/converters/conll-x.rb', line 43

# Builds a feature string from the token's morphology hash.
#
# Each entry becomes the first four characters of the upcased field name
# followed by its value, and the entries are joined with '|'. The
# +:inflection+ entry is left out when its value is 'i'.
#
# @param token [#morphology_hash] token whose morphology is formatted
# @return [String] '|'-separated features, empty when nothing remains
def format_morphology(token)
  features = token.morphology_hash.reject do |field, value|
    field == :inflection && value == 'i'
  end

  features.map { |field, value| "#{field.upcase[0..3]}#{value}" }.join('|')
end

.process(tb, options) ⇒ Object



6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
# File 'lib/proiel/cli/converters/conll-x.rb', line 6

# Prints every sentence of the treebank to standard output in CoNLL-X format:
# one line of ten tab-separated columns per token, and a blank line after
# each sentence. The last two columns are always written as "_".
#
# Tokens whose +empty_token_sort+ is 'P' are dropped before numbering. Other
# empty tokens keep their number but are not printed.
#
# @param tb [#sources] treebank whose sources, divs and sentences are walked
# @param options [Object] accepted for interface compatibility; not read here
# @return [Object] the value of +tb.sources.each+
def process(tb, options)
  tb.sources.each do |source|
    source.divs.each do |div|
      div.sentences.each do |sentence|
        # Do not care about prodrop tokens
        tk = sentence.tokens.reject { |t| t.empty_token_sort == 'P' }

        # Renumber to make the sequence contiguous after prodrop tokens were
        # left out, and index the remaining tokens by id. Both tables are
        # filled in a single pass.
        id_to_number = {}
        id_to_token = {}
        tk.each_with_index do |t, i|
          id_to_number[t.id] = i + 1
          id_to_token[t.id] = t
        end

        tk.each do |token|
          # Empty tokens take part in the numbering but produce no line.
          next if token.is_empty?

          this_number = id_to_number[token.id]
          head_number, relation = find_lexical_head_and_relation(id_to_number, id_to_token, token)
          # Whitespace inside a form or lemma is replaced by '.' so that it
          # cannot be confused with the tab column separator.
          form = token.form.gsub(/[[:space:]]/, '.')
          lemma = token.lemma.gsub(/[[:space:]]/, '.')
          pos_major = token.part_of_speech_hash[:major]
          pos_full = token.part_of_speech
          morphology = format_morphology(token)

          puts [this_number, form, lemma, pos_major, pos_full,
                morphology, head_number, relation, "_", "_"].join("\t")
        end

        # Blank line terminates the sentence.
        puts
      end
    end
  end
end