Class: PROIEL::Converter::CoNLLU::Sentence

Inherits:
Object
  • Object
show all
Defined in:
lib/proiel/cli/converters/conll-u.rb

Instance Attribute Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(sentence) ⇒ Sentence

Initializes a PROIEL::Converter::CoNLLU::Sentence from a PROIEL::PROIELXML::Sentence.



45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
# File 'lib/proiel/cli/converters/conll-u.rb', line 45

# Builds the converter's token list from a PROIEL sentence.
#
# * Tokens whose +empty_token_sort+ is 'P' are skipped.
# * A token whose form contains whitespace is split into one PROIEL::Token
#   per whitespace-separated part. The first part keeps the original id, head,
#   relation and annotation; every following part gets a synthetic id, is
#   attached to the first part and is given the relation 'fixed'.
# * Token ids are then renumbered 1..n in order of appearance; ids that are
#   nil or not present among the kept tokens map to 0 (the root).
#
# @param sentence the source sentence; only +tokens+ is called on it here, and
#   it is passed on as the parent of every PROIEL::Token created for a split
def initialize(sentence)
  id_to_number = Hash.new(0) # will return id 0 (i.e. root) for nil

  all_tokens = sentence.tokens.to_a

  # Synthetic ids for split-off parts start above every id already used in the
  # sentence, so they can never collide with a real token id. (A fixed base
  # such as 1000 can collide, and two tokens would then share one entry in
  # id_to_number.)
  next_synthetic_id = (all_tokens.map(&:id).max || 0) + 1

  # array holding the sentence tokens, with whitespace-containing forms split
  tks = []

  all_tokens.reject { |t| t.empty_token_sort == 'P' }.each do |tk|
    unless tk.form =~ /[[:space:]]/
      tks << tk
      next
    end

    subtoks = tk.form.split(/[[:space:]]/)
    # hope the lemmas split the same way as the tokens; a missing lemma part
    # (or a missing lemma) falls back to the form of the part
    sublemmas = tk.lemma.to_s.split(/[[:space:]]/)
    last_index = subtoks.size - 1

    subtoks.each_with_index do |subtok, i|
      # decide by position, not by string equality: a repeated part such as
      # "so so" must not be treated as the first part twice
      first = i.zero?

      if first
        id = tk.id
      else
        id = next_synthetic_id
        next_synthetic_id += 1
      end

      tks << PROIEL::Token.new(sentence,
                               id,
                               (first ? tk.head_id : tk.id), # head_id
                               subtok,
                               (sublemmas[i] || subtok),
                               tk.part_of_speech, # copy the postag
                               tk.morphology,
                               (first ? tk.relation : 'fixed'),
                               nil, # empty_token_sort
                               tk.citation_part,
                               (first ? tk.presentation_before : nil),
                               (i == last_index ? tk.presentation_after : nil),
                               (first ? tk.antecedent_id : nil),
                               (first ? tk.information_status : nil),
                               (first ? tk.contrast_group : nil),
                               (first ? tk.foreign_ids : nil),
                               # This needs to be given a real slash object for the initialization, although it throws away the info
                               (first ? tk.slashes.map { |rel, target| PROIEL::PROIELXML::Reader::Slash.new({:'target_id' => target, :relation => rel}) } : []),
                               (first ? tk.alignment_id : nil))
    end
  end

  # renumber: position in the sentence (1-based) replaces the original id
  tks.each_with_index do |t, i|
    id_to_number[t.id] = i + 1
  end

  @tokens = tks.map do |t|
    Token.new(id_to_number[t.id],
              id_to_number[t.head_id],
              # insert dots in any whitespace inside words and lemmata
              t.form.to_s.gsub(/[[:space:]]/, '.'),
              t.lemma.to_s.gsub(/[[:space:]]/, '.'),
              t.part_of_speech,
              t.language,
              t.morphology,
              t.relation,
              t.empty_token_sort,
              t.slashes.map { |relation, target_id| [id_to_number[target_id], relation] },
              t.citation_part,
              self)
  end
end

Instance Attribute Details

#tokens ⇒ Object

Returns the value of attribute tokens.



42
43
44
# File 'lib/proiel/cli/converters/conll-u.rb', line 42

# Returns the value of the @tokens instance variable, i.e. the list of
# converter tokens that #initialize builds for this sentence.
def tokens
  @tokens
end

Instance Method Details

#check_directionality! ⇒ Object



133
134
135
136
137
138
139
140
# File 'lib/proiel/cli/converters/conll-u.rb', line 133

# Enforces left-to-right attachment for relations that require it.
#
# Tokens labelled 'fixed', 'flat:foreign' or 'flat:name' that precede their
# head are promoted (promote! is called with nil and their own relation).
# Afterwards, any 'conj' token that precedes its head is treated as an error.
#
# @raise [RuntimeError] if a 'conj' token has a lower id than its head
def check_directionality!
  left_headed_relations = %w[fixed flat:foreign flat:name]

  # materialise the selection first, so promotions cannot change what is visited
  candidates = @tokens.select { |token| left_headed_relations.include?(token.relation) }
  candidates.each do |token|
    next unless token.id < token.head.id

    token.promote!(nil, token.relation)
  end

  conjuncts = @tokens.select { |token| token.relation == 'conj' }
  conjuncts.each do |token|
    next unless token.id < token.head.id

    raise "conj must go left-to-right (id: #{token.id}, head_id: #{token.head.id}, form: #{token.form}, head_form: #{token.head.form})"
  end
end

#convert ⇒ Object



111
112
113
114
115
116
117
118
# File 'lib/proiel/cli/converters/conll-u.rb', line 111

# Runs the conversion passes on this sentence, in a fixed order, and returns
# the sentence itself so that calls can be chained.
#
# @return [self]
def convert
  # the order of the passes is significant; keep it as listed
  %i[
    restructure_graph!
    relabel_graph!
    check_directionality!
    distribute_conjunctions!
    map_part_of_speech!
  ].each { |pass| send(pass) }

  self
end

#count_tokens ⇒ Object



154
155
156
# File 'lib/proiel/cli/converters/conll-u.rb', line 154

# Adds up count_subgraph over all root tokens.
#
# @return the sum of the roots' count_subgraph values; 0 when there are no roots
def count_tokens
  roots.reduce(0) { |total, root| total + root.count_subgraph }
end

#demote_parentheticals_and_vocatives! ⇒ Object



192
193
194
195
196
197
198
199
200
201
# File 'lib/proiel/cli/converters/conll-u.rb', line 192

# Attaches root-level vocatives ('voc') and parentheticals ('parpred') to the
# main root of the sentence.
#
# If every root is a vocative/parenthetical, the first of them is relabelled
# 'pred' so that it can serve as the main root.
#
# @raise [RuntimeError] if there are vocative/parenthetical roots but not
#   exactly one other root to attach them to
def demote_parentheticals_and_vocatives!
  detached_relations = %w[voc parpred]
  # roots is re-read on every call, so the split reflects relabelling done below
  split_roots = -> { roots.partition { |node| !detached_relations.include?(node.relation) } }

  main_roots, detached_roots = split_roots.call
  if detached_roots.any? && main_roots.none?
    # promote the first vocative/parenthetical to head in case there's nothing else
    detached_roots.first.relation = 'pred'
    main_roots, detached_roots = split_roots.call
  end

  if detached_roots.any? && !main_roots.one?
    raise "No unique root in this tree:\n#{to_graph}"
  end

  detached_roots.each { |node| node.head_id = main_roots.first.id }
end

#demote_subjunctions! ⇒ Object



188
189
190
# File 'lib/proiel/cli/converters/conll-u.rb', line 188

# Calls process_subjunction! on every token whose part of speech is 'G-'.
#
# @return [Array] the tokens that were processed
def demote_subjunctions!
  subjunctions = @tokens.select { |token| token.part_of_speech == 'G-' }
  subjunctions.each { |token| token.process_subjunction! }
end

#distribute_conjunctions! ⇒ Object



120
121
122
123
124
125
126
127
128
129
130
131
# File 'lib/proiel/cli/converters/conll-u.rb', line 120

# Re-attaches coordinating conjunctions to the conjunct that follows them.
#
# For every token that has_conjunct?, each 'cc' dependent positioned after
# that token is moved to the first 'conj' dependent whose id is greater than
# the conjunction's id. Conjunctions with no such conjunct are left alone.
def distribute_conjunctions!
  @tokens.select(&:has_conjunct?).each do |head|
    dependents = head.dependents
    conjuncts = dependents.select { |dep| dep.relation == 'conj' }
    # only conjunctions to the right of the head are redistributed
    trailing_conjunctions = dependents.select { |dep| dep.relation == 'cc' && dep.id > head.id }

    trailing_conjunctions.each do |conjunction|
      target = conjuncts.find { |conjunct| conjunct.id > conjunction.id }
      conjunction.head_id = target.id if target
    end
  end
end

#find_token(identifier) ⇒ Object



142
143
144
# File 'lib/proiel/cli/converters/conll-u.rb', line 142

# Looks up a token by id.
#
# @param identifier the id to look for (compared with ==)
# @return the first token in @tokens with that id, or nil if there is none
def find_token(identifier)
  @tokens.find { |token| token.id == identifier }
end

#map_part_of_speech! ⇒ Object



207
208
209
# File 'lib/proiel/cli/converters/conll-u.rb', line 207

# Calls map_part_of_speech! on every root token, in the order given by roots.
#
# @return [Array] the root tokens
def map_part_of_speech!
  roots.each { |root| root.map_part_of_speech! }
end

#prune_empty_rootnodes! ⇒ Object

TODO: this will leave several root nodes in many cases. For now, raise an error



171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
# File 'lib/proiel/cli/converters/conll-u.rb', line 171

# Replaces empty root nodes (empty_token_sort 'V') by their first 'xobj'
# dependent, where they have one.
#
# The xobj becomes a root, takes over the empty root's relation, and the
# empty root's dependents are re-attached to it before the empty root is
# removed. Empty roots without an xobj dependent are left untouched.
#
# TODO: this will leave several root nodes in many cases. A recursive re-run
# was considered in the original code but is disabled.
#
# @return [Array, nil] the empty roots that were examined, or nil if there
#   were none
def prune_empty_rootnodes!
  empty_roots = roots.select { |root| root.empty_token_sort == 'V' }
  return if empty_roots.empty?

  empty_roots.each do |empty_root|
    # promote xobj to root if there is one
    new_root = empty_root.dependents.find { |dep| dep.relation == 'xobj' }
    next unless new_root

    new_root.head_id = 0
    new_root.relation = empty_root.relation
    # dependents is read again here, after new_root has been detached
    empty_root.dependents.each { |dep| dep.head_id = new_root.id }
    remove_token! empty_root
  end
end

#relabel_graph! ⇒ Object



203
204
205
# File 'lib/proiel/cli/converters/conll-u.rb', line 203

# Calls relabel_graph! on every root token, in the order given by roots.
#
# @return [Array] the root tokens
def relabel_graph!
  roots.each { |root| root.relabel_graph! }
end

#remove_token!(token) ⇒ Object



146
147
148
# File 'lib/proiel/cli/converters/conll-u.rb', line 146

# Removes +token+ from @tokens via Array#delete, so every element equal (==)
# to +token+ is dropped. Head references held by other tokens are not touched
# here.
#
# @param token the token to remove
# @return the removed token, or nil if it was not in @tokens
def remove_token!(token)
  @tokens.delete(token)
end

#restructure_graph! ⇒ Object



211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
# File 'lib/proiel/cli/converters/conll-u.rb', line 211

# Runs the structural rewriting passes over the sentence, one after another.
#
# Each pass selects tokens from the current state of @tokens (or walks the
# current roots), so later passes see what earlier passes left behind; the
# notes below record two known interactions. Do not reorder the passes
# without checking them.
def restructure_graph!
  # drop tokens whose empty_token_sort is 'P'
  @tokens.delete_if { |n| n.empty_token_sort == 'P' }
  @tokens.select(&:preposition?).each(&:process_preposition!)
  # only comparison words with at least one sub/obj/obl/comp/adv dependent are processed
  @tokens.select { |t| t.comparison_word? and t.dependents and t.dependents.select { |d|  ['sub','obj','obl','comp','adv'].include?(d.relation) }.any? }.each(&:process_comparison!)
  roots.each(&:change_coordinations!)
  @tokens.select(&:copula?).each(&:process_copula!)
  demote_subjunctions!
  prune_empty_rootnodes!
  # do ellipses from left to right for proper remnant treatment
  @tokens.select(&:ellipsis?).sort_by { |e| e.left_corner.id }.each(&:process_ellipsis!)
  #NB! apos gets overridden by process_comparison so some dislocations are lost
  # only 'apos' tokens that precede their head are treated as dislocations
  @tokens.select { |t| t.relation == 'apos' and t.id < t.head_id }.each(&:process_dislocation!)
  # DIRTY: remove the rest of the empty nodes by attaching them
  # to their grandmother with remnant. This is the best way to
  # do it given the current state of the UDEP scheme, but
  # revisions will come.
  roots.each(&:remove_empties!)
  # may raise if no unique root remains to attach vocatives/parentheticals to
  demote_parentheticals_and_vocatives!
end

#roots ⇒ Object



158
159
160
# File 'lib/proiel/cli/converters/conll-u.rb', line 158

# Returns the root tokens of the sentence, i.e. the tokens whose head_id is 0,
# ordered by ascending id.
#
# @return [Array] root tokens sorted by id (empty if there are none)
def roots
  unattached = @tokens.select { |token| token.head_id == 0 }
  unattached.sort_by { |token| token.id }
end

#to_conll ⇒ Object



166
167
168
# File 'lib/proiel/cli/converters/conll-u.rb', line 166

# Renders the sentence as text: the to_conll output of every token, one token
# per line, in the order the tokens are stored.
#
# @return [String] newline-separated token lines (no trailing newline)
def to_conll
  token_lines = @tokens.map { |token| token.to_conll }
  token_lines.join("\n")
end

#to_graph ⇒ Object



162
163
164
# File 'lib/proiel/cli/converters/conll-u.rb', line 162

# Renders the dependency graph as text: the to_graph output of every root,
# one root per line, in the order given by roots.
#
# @return [String] newline-separated graph renderings (no trailing newline)
def to_graph
  rendered_roots = roots.map { |root| root.to_graph }
  rendered_roots.join("\n")
end

#to_s ⇒ Object



150
151
152
# File 'lib/proiel/cli/converters/conll-u.rb', line 150

# Renders the sentence as text: the to_s output of every token, one token per
# line, in the order the tokens are stored.
#
# @return [String] newline-separated token strings (no trailing newline)
def to_s
  token_strings = @tokens.map { |token| token.to_s }
  token_strings.join("\n")
end