Class: PROIEL::Converter::CoNLLU::Sentence

Inherits:
Object
  • Object
show all
Defined in:
lib/proiel/cli/converters/conll-u.rb

Instance Attribute Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(sentence) ⇒ Sentence

Initializes a PROIEL::Converter::CoNLLU::Sentence from a PROIEL::PROIELXML::Sentence.



39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
# File 'lib/proiel/cli/converters/conll-u.rb', line 39

# Builds the converter sentence from a source sentence.
#
# Tokens whose +empty_token_sort+ is 'P' are left out. The remaining tokens
# are numbered consecutively from 1 in their original order, and head ids and
# slash target ids are translated to those numbers. Any id without a number
# (including +nil+) translates to 0, i.e. the root. Whitespace inside forms
# and lemmas is replaced by '.'.
#
# @param sentence [#tokens] the source sentence whose tokens are converted
def initialize(sentence)
  kept = sentence.tokens.reject { |token| token.empty_token_sort == 'P' }

  # Unknown ids fall back to 0 (the root), which also covers a nil head.
  numbering = Hash.new(0)
  kept.each_with_index { |token, index| numbering[token.id] = index + 1 }

  dotted = ->(value) { value.to_s.gsub(/[[:space:]]/, '.') }

  @tokens = kept.map do |token|
    Token.new(
      numbering[token.id],
      numbering[token.head_id],
      dotted.call(token.form),
      dotted.call(token.lemma),
      token.part_of_speech,
      token.language,
      token.morphology,
      token.relation,
      token.empty_token_sort,
      token.slashes.map { |label, target| [numbering[target], label] },
      token.citation_part,
      self
    )
  end
end

Instance Attribute Details

#tokens ⇒ Object

Returns the value of attribute tokens.



36
37
38
# File 'lib/proiel/cli/converters/conll-u.rb', line 36

# Returns the value of the +@tokens+ instance variable, i.e. the tokens
# currently held by this sentence.
def tokens
  @tokens
end

Instance Method Details

#convert ⇒ Object



66
67
68
69
70
71
# File 'lib/proiel/cli/converters/conll-u.rb', line 66

# Runs the conversion passes in order: graph restructuring, relation
# relabelling, then part-of-speech mapping.
#
# @return [self] the sentence itself, so calls can be chained
def convert
  tap do
    restructure_graph!
    relabel_graph!
    map_part_of_speech!
  end
end

#count_tokens ⇒ Object



85
86
87
# File 'lib/proiel/cli/converters/conll-u.rb', line 85

# Adds up +count_subgraph+ over every root token.
#
# @return the accumulated total; 0 when there are no roots
def count_tokens
  roots.reduce(0) { |total, root| total + root.count_subgraph }
end

#demote_parentheticals_and_vocatives! ⇒ Object



120
121
122
123
124
125
126
127
128
129
# File 'lib/proiel/cli/converters/conll-u.rb', line 120

# Attaches root-level vocatives ('voc') and parentheticals ('parpred') to the
# single remaining root.
#
# When every root is a vocative or parenthetical, the first one is relabelled
# 'pred' so that it can serve as the root for the others.
#
# @raise [RuntimeError] if there are vocative/parenthetical roots but not
#   exactly one other root to attach them to
# @return [Array] the roots that were re-attached
def demote_parentheticals_and_vocatives!
  split_roots = lambda do
    roots.partition { |node| !%w[voc parpred].include?(node.relation) }
  end

  proper, peripheral = split_roots.call
  if peripheral.any? && proper.none?
    # promote the first vocative/parenthetical to head in case there's nothing else
    peripheral.first.relation = 'pred'
    proper, peripheral = split_roots.call
  end

  raise "No unique root in this tree:\n#{to_graph}" if peripheral.any? && !proper.one?

  peripheral.each { |node| node.head_id = proper.first.id }
end

#demote_subjunctions! ⇒ Object



116
117
118
# File 'lib/proiel/cli/converters/conll-u.rb', line 116

# Calls +process_subjunction!+ on every token whose part of speech is 'G-'.
#
# The matching tokens are collected first, so changes made while processing
# do not affect which tokens are visited.
#
# @return [Array] the tokens that were processed
def demote_subjunctions!
  subjunctions = @tokens.select { |token| token.part_of_speech == 'G-' }
  subjunctions.each { |subjunction| subjunction.process_subjunction! }
end

#find_token(identifier) ⇒ Object



73
74
75
# File 'lib/proiel/cli/converters/conll-u.rb', line 73

# Looks up a token by its id.
#
# Uses +find+ so the scan stops at the first match instead of building an
# array of all matches.
#
# @param identifier the id to look for (compared with +==+)
# @return the first token whose +id+ equals +identifier+, or +nil+ if none does
def find_token(identifier)
  @tokens.find { |token| token.id == identifier }
end

#map_part_of_speech! ⇒ Object



135
136
137
# File 'lib/proiel/cli/converters/conll-u.rb', line 135

# Invokes +map_part_of_speech!+ on each root token, in root order.
# Presumably each root then handles its own subtree -- confirm in Token.
#
# @return [Array] the root tokens
def map_part_of_speech!
  roots.each { |root| root.map_part_of_speech! }
end

#prune_empty_rootnodes! ⇒ Object

TODO: this will leave several root nodes in many cases. For now, raise an error



102
103
104
105
106
107
108
109
110
111
112
113
114
# File 'lib/proiel/cli/converters/conll-u.rb', line 102

# Removes empty root tokens (+empty_token_sort+ 'V'), promoting the first
# dependent of each to root in its place.
#
# The promoted dependent takes over the removed root's relation, and the
# remaining dependents are re-attached to it. An empty root without any
# dependents is removed without promotion. The method repeats until no empty
# root is left, since a promoted dependent may itself be empty.
#
# TODO: this will leave several root nodes in many cases.
#
# @return [nil]
def prune_empty_rootnodes!
  empty_roots = roots.select { |root| root.empty_token_sort == 'V' }
  return if empty_roots.empty?

  empty_roots.each do |empty_root|
    # Take one snapshot of the dependents so the promoted node is never
    # re-attached to itself.
    new_root, *siblings = empty_root.dependents
    if new_root
      # promote the first dependent to root
      new_root.head_id = 0
      new_root.relation = empty_root.relation
      siblings.each { |sibling| sibling.head_id = new_root.id }
    end
    remove_token! empty_root
  end

  prune_empty_rootnodes!
end

#relabel_graph! ⇒ Object



131
132
133
# File 'lib/proiel/cli/converters/conll-u.rb', line 131

# Invokes +relabel_graph!+ on each root token, in root order.
# Presumably each root then relabels its own subtree -- confirm in Token.
#
# @return [Array] the root tokens
def relabel_graph!
  roots.each { |root| root.relabel_graph! }
end

#remove_token!(token) ⇒ Object



77
78
79
# File 'lib/proiel/cli/converters/conll-u.rb', line 77

# Deletes +token+ from this sentence's token list.
#
# Delegates to +delete+ on +@tokens+, so every entry equal (+==+) to +token+
# is removed. Nothing else is updated here: tokens that still point at the
# removed token through +head_id+ are left untouched.
#
# @param token the token to remove
# @return the result of +@tokens.delete+ (the removed token, or +nil+ when
#   it was not present)
def remove_token!(token)
  @tokens.delete(token)
end

#restructure_graph! ⇒ Object



139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
# File 'lib/proiel/cli/converters/conll-u.rb', line 139

# Rewrites the dependency graph in place through a fixed sequence of passes.
#
# The passes run in this order: drop tokens whose +empty_token_sort+ is 'P',
# process prepositions, change coordinations from each root, process copulas,
# prune empty root nodes, process ellipses, demote subjunctions, remove the
# remaining empty nodes, and finally demote parentheticals and vocatives.
# Each pass mutates the tokens, so the order of the statements matters.
def restructure_graph!
  @tokens.delete_if { |n| n.empty_token_sort == 'P' }
  # Each select takes a snapshot of the matching tokens before any of them
  # is processed.
  @tokens.select(&:preposition?).each(&:process_preposition!)
  roots.each(&:change_coordinations!)
  @tokens.select(&:copula?).each(&:process_copula!)
  prune_empty_rootnodes!
  # do ellipses from left to right for proper remnant treatment
  @tokens.select(&:ellipsis?).sort_by { |e| e.left_corner.id }.each(&:process_ellipsis!)
  demote_subjunctions!
  # DIRTY: remove the rest of the empty nodes by attaching them
  # to their grandmother with remnant. This is the best way to
  # do it given the current state of the UDEP scheme, but
  # revisions will come.
  roots.each(&:remove_empties!)
  demote_parentheticals_and_vocatives!
end

#roots ⇒ Object



89
90
91
# File 'lib/proiel/cli/converters/conll-u.rb', line 89

# Collects the root tokens, i.e. those whose +head_id+ equals 0.
#
# @return [Array] the root tokens ordered by +id+
def roots
  top_level = @tokens.select { |token| token.head_id == 0 }
  top_level.sort_by { |token| token.id }
end

#to_conll ⇒ Object



97
98
99
# File 'lib/proiel/cli/converters/conll-u.rb', line 97

# Renders the sentence by joining each token's +to_conll+ output with
# newlines, in token order.
#
# @return [String] one line per token, without a trailing newline
def to_conll
  lines = @tokens.map { |token| token.to_conll }
  lines.join("\n")
end

#to_graph ⇒ Object



93
94
95
# File 'lib/proiel/cli/converters/conll-u.rb', line 93

# Renders the sentence by joining each root token's +to_graph+ output with
# newlines, in root order.
#
# @return [String] the joined output; empty when there are no roots
def to_graph
  rendered = roots.map { |root| root.to_graph }
  rendered.join("\n")
end

#to_s ⇒ Object



81
82
83
# File 'lib/proiel/cli/converters/conll-u.rb', line 81

# Renders the sentence by joining each token's +to_s+ output with newlines,
# in token order.
#
# @return [String] one line per token, without a trailing newline
def to_s
  lines = @tokens.map { |token| token.to_s }
  lines.join("\n")
end