Class: PROIEL::Converter::CoNLLU::Sentence
- Inherits:
-
Object
- Object
- PROIEL::Converter::CoNLLU::Sentence
- Defined in:
- lib/proiel/cli/converters/conll-u.rb
Instance Attribute Summary collapse
-
#tokens ⇒ Object
Returns the value of attribute tokens.
Instance Method Summary collapse
- #check_directionality! ⇒ Object
- #convert ⇒ Object
- #count_tokens ⇒ Object
- #demote_parentheticals_and_vocatives! ⇒ Object
- #demote_subjunctions! ⇒ Object
- #distribute_conjunctions! ⇒ Object
- #find_token(identifier) ⇒ Object
-
#initialize(sentence) ⇒ Sentence
constructor
Initializes a PROIEL::Converter::CoNLLU::Sentence from a PROIEL::PROIELXML::Sentence.
- #map_part_of_speech! ⇒ Object
-
#prune_empty_rootnodes! ⇒ Object
TODO: this will leave several root nodes in many cases.
- #relabel_graph! ⇒ Object
- #remove_token!(token) ⇒ Object
- #restructure_graph! ⇒ Object
- #roots ⇒ Object
- #to_conll ⇒ Object
- #to_graph ⇒ Object
- #to_s ⇒ Object
Constructor Details
#initialize(sentence) ⇒ Sentence
Initializes a PROIEL::Converter::CoNLLU::Sentence from a PROIEL::PROIELXML::Sentence.
45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 |
# File 'lib/proiel/cli/converters/conll-u.rb', line 45

# Builds the converter's token list from +sentence+.
#
# Tokens whose +empty_token_sort+ is 'P' are skipped. A token whose form
# contains whitespace is split into one token per whitespace-separated part:
# the first part keeps the original id, head, relation and annotation, every
# further part gets a synthetic id, is headed by the original token and is
# given the relation 'fixed'. Afterwards all ids are renumbered 1..n in token
# order; an id that is not in the sentence (e.g. a nil head) maps to 0.
#
# @param sentence [#tokens] the source sentence (presumably a PROIEL sentence
#   object -- confirm against the caller)
def initialize(sentence)
  id_to_number = Hash.new(0) # will return id 0 (i.e. root) for nil

  # initialize array to hold the sentence tokens
  tks = []
  # keep track of how many new tokens have been created
  offset = 0

  sentence.tokens.reject { |t| t.empty_token_sort == 'P' }.each do |tk|
    if tk.form =~ /[[:space:]]/
      subtoks = tk.form.split(/[[:space:]]/)

      subtoks.each_with_index do |subtok, i|
        # NOTE(review): synthetic ids are 1000 + offset; this assumes no real
        # token id in the sentence falls in that range -- confirm.
        tks << PROIEL::Token.new(sentence,
                                 (i == 0 ? tk.id : 1000 + offset), # id
                                 (i == 0 ? tk.head_id : tk.id), # head_id
                                 subtok,
                                 # hope the lemmas split the same way as the tokens. Grab the form if you don't find a lemma
                                 (tk.lemma.split(/[[:space:]]/)[i] || subtok),
                                 tk.part_of_speech, # copy the postag
                                 tk.morphology,
                                 (i == 0 ? tk.relation : 'fixed'),
                                 nil, # empty_token_sort
                                 tk.citation_part,
                                 (i == 0 ? tk.presentation_before : nil),
                                 (i == (subtoks.size - 1) ? tk.presentation_after : nil),
                                 (i == 0 ? tk.antecedent_id : nil),
                                 (i == 0 ? tk.information_status : nil),
                                 (i == 0 ? tk.contrast_group : nil),
                                 (i == 0 ? tk.foreign_ids : nil),
                                 # This needs to be given a real slash object for the initialization, although it throws away the info
                                 (i == 0 ? tk.slashes.map { |rel, target| PROIEL::PROIELXML::Reader::Slash.new({:'target_id' => target, :relation => rel} ) } : []),
                                 # Decide by position, not by value: a later part
                                 # that happens to equal the first one (e.g.
                                 # "la la") must not inherit the alignment id.
                                 (i == 0 ? tk.alignment_id : nil)
                                )
        offset += 1
      end
    else
      tks << tk
    end
  end

  # Number the tokens 1..n in their final order.
  tks.each_with_index do |t, i|
    id_to_number[t.id] = i + 1
  end

  @tokens = tks.map do |t|
    Token.new(id_to_number[t.id],
              id_to_number[t.head_id],
              # insert dots in any whitespace inside words and lemmata
              t.form.to_s.gsub(/[[:space:]]/, '.'),
              t.lemma.to_s.gsub(/[[:space:]]/, '.'),
              t.part_of_speech,
              t.language,
              t.morphology,
              t.relation,
              t.empty_token_sort,
              t.slashes.map { |relation, target_id| [id_to_number[target_id], relation] },
              t.citation_part,
              self
             )
  end
end
Instance Attribute Details
#tokens ⇒ Object
Returns the value of attribute tokens.
42 43 44 |
# File 'lib/proiel/cli/converters/conll-u.rb', line 42

# Reader for the sentence's token list.
#
# @return [Object] the current value of @tokens
def tokens
  @tokens
end
Instance Method Details
#check_directionality! ⇒ Object
133 134 135 136 137 138 139 140 |
# File 'lib/proiel/cli/converters/conll-u.rb', line 133

# Checks the direction of selected relations.
#
# Every token whose relation is 'fixed', 'flat:foreign' or 'flat:name' and
# whose id is lower than its head's id gets promote!(nil, relation) called on
# it. After that, any 'conj' token whose id is lower than its head's id makes
# the method raise.
#
# @raise [RuntimeError] if a 'conj' token precedes its head
def check_directionality!
  head_first_relations = ['fixed', 'flat:foreign', 'flat:name']

  # The candidates are collected before any of them is promoted.
  flat_tokens = @tokens.select { |tok| head_first_relations.include?(tok.relation) }
  flat_tokens.each do |tok|
    next unless tok.id < tok.head.id

    tok.promote!(nil, tok.relation)
  end

  conj_tokens = @tokens.select { |tok| tok.relation == 'conj' }
  conj_tokens.each do |tok|
    next unless tok.id < tok.head.id

    raise "conj must go left-to-right (id: #{tok.id}, head_id: #{tok.head.id}, form: #{tok.form}, head_form: #{tok.head.form})"
  end
end
#convert ⇒ Object
111 112 113 114 115 116 117 118 |
# File 'lib/proiel/cli/converters/conll-u.rb', line 111

# Runs the conversion steps on this sentence, one after the other, in the
# order listed below.
#
# @return [Sentence] self
def convert
  steps = %i[
    restructure_graph!
    relabel_graph!
    check_directionality!
    distribute_conjunctions!
    map_part_of_speech!
  ]
  steps.each { |step| send(step) }

  self
end
#count_tokens ⇒ Object
154 155 156 |
# File 'lib/proiel/cli/converters/conll-u.rb', line 154

# Adds up count_subgraph over all root tokens.
#
# @return [Integer] the total, 0 when there are no roots
def count_tokens
  roots.reduce(0) { |total, root| total + root.count_subgraph }
end
#demote_parentheticals_and_vocatives! ⇒ Object
192 193 194 195 196 197 198 199 200 201 |
# File 'lib/proiel/cli/converters/conll-u.rb', line 192

# Attaches root-level 'voc' and 'parpred' tokens to the remaining root.
#
# If the only roots are 'voc'/'parpred' tokens, the first of them is
# relabelled 'pred' so that it can serve as the root. Every remaining
# 'voc'/'parpred' root then gets the id of the first other root as head_id.
#
# @raise [RuntimeError] if there are 'voc'/'parpred' roots but not exactly
#   one other root
def demote_parentheticals_and_vocatives!
  side_relations = ['voc', 'parpred']
  split_roots = lambda do
    roots.partition { |node| !side_relations.include?(node.relation) }
  end

  main_roots, side_roots = split_roots.call

  if side_roots.any? && main_roots.none?
    # promote the first vocative/parenthetical to head in case there's nothing else
    side_roots.first.relation = 'pred'
    main_roots, side_roots = split_roots.call
  end

  if side_roots.any? && !main_roots.one?
    raise "No unique root in this tree:\n#{to_graph}"
  end

  side_roots.each { |node| node.head_id = main_roots.first.id }
end
#demote_subjunctions! ⇒ Object
188 189 190 |
# File 'lib/proiel/cli/converters/conll-u.rb', line 188

# Calls process_subjunction! on every token whose part of speech is 'G-'.
#
# @return [Array] the tokens that were processed
def demote_subjunctions!
  subjunctions = @tokens.select { |tok| tok.part_of_speech == 'G-' }
  subjunctions.each { |tok| tok.process_subjunction! }
end
#distribute_conjunctions! ⇒ Object
120 121 122 123 124 125 126 127 128 129 130 131 |
# File 'lib/proiel/cli/converters/conll-u.rb', line 120

# Re-attaches 'cc' dependents to a following conjunct.
#
# For every token that answers true to has_conjunct?, each of its 'cc'
# dependents with a higher id than the token itself is given, as new head,
# the first 'conj' dependent (in the order returned by dependents) whose id
# is higher than that of the 'cc' token. A 'cc' token with no such conjunct
# keeps its head.
def distribute_conjunctions!
  @tokens.select { |t| t.has_conjunct? }.each do |head|
    # Both lists are taken before any head_id is changed.
    conjuncts = head.dependents.select { |d| d.relation == 'conj' }
    conjunctions = head.dependents.select { |d| d.relation == 'cc' }

    conjunctions.each do |cc|
      next unless cc.id > head.id

      # find stops at the first match instead of building a filtered array
      new_head = conjuncts.find { |cj| cj.id > cc.id }
      cc.head_id = new_head.id if new_head
    end
  end
end
#find_token(identifier) ⇒ Object
142 143 144 |
# File 'lib/proiel/cli/converters/conll-u.rb', line 142

# Looks up a token by id.
#
# @param identifier [Object] the id to look for (compared with ==)
# @return [Object, nil] the first token in @tokens with that id, or nil if
#   there is none
def find_token(identifier)
  # find stops at the first match instead of filtering the whole list
  @tokens.find { |t| t.id == identifier }
end
#map_part_of_speech! ⇒ Object
207 208 209 |
# File 'lib/proiel/cli/converters/conll-u.rb', line 207

# Calls map_part_of_speech! on every root token.
#
# @return [Array] the root tokens
def map_part_of_speech!
  roots.each { |root| root.map_part_of_speech! }
end
#prune_empty_rootnodes! ⇒ Object
TODO: this will leave several root nodes in many cases. For now, raise an error
171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 |
# File 'lib/proiel/cli/converters/conll-u.rb', line 171

# Replaces empty root tokens (empty_token_sort 'V') by their first 'xobj'
# dependent, when they have one.
#
# The 'xobj' dependent becomes a root (head_id 0), takes over the empty
# root's relation, the empty root's dependents are re-headed to it, and the
# empty root is removed. Empty roots without an 'xobj' dependent are left
# untouched.
#
# TODO: this will leave several root nodes in many cases.
def prune_empty_rootnodes!
  empty_roots = roots.select { |root| root.empty_token_sort == 'V' }
  return if empty_roots.empty?

  empty_roots.each do |empty_root|
    # promote xobj to root if there is one
    new_root = empty_root.dependents.find { |dep| dep.relation == 'xobj' }
    next unless new_root

    new_root.head_id = 0
    new_root.relation = empty_root.relation
    # dependents is queried again here, after new_root's head was reset
    empty_root.dependents.each { |dep| dep.head_id = new_root.id }
    remove_token! empty_root
  end
  # (a recursive re-run is disabled here: prune_empty_rootnodes!)
end
#relabel_graph! ⇒ Object
203 204 205 |
# File 'lib/proiel/cli/converters/conll-u.rb', line 203

# Calls relabel_graph! on every root token.
#
# @return [Array] the root tokens
def relabel_graph!
  roots.each { |root| root.relabel_graph! }
end
#remove_token!(token) ⇒ Object
146 147 148 |
# File 'lib/proiel/cli/converters/conll-u.rb', line 146

# Deletes +token+ from the sentence's token list.
#
# @param token [Object] the token to remove (matched with ==)
# @return [Object, nil] the removed token, or nil if it was not in the list
def remove_token!(token)
  @tokens.delete(token)
end
#restructure_graph! ⇒ Object
211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 |
# File 'lib/proiel/cli/converters/conll-u.rb', line 211

# Rewrites the dependency graph in a fixed sequence of passes: drops tokens
# with empty_token_sort 'P', then processes prepositions, comparisons,
# coordinations, copulas, subjunctions, empty roots, ellipses, dislocations
# and remaining empty nodes, and finally demotes root-level parentheticals
# and vocatives. The passes run in exactly the order written below.
def restructure_graph!
  @tokens.delete_if { |tok| tok.empty_token_sort == 'P' }

  prepositions = @tokens.select { |tok| tok.preposition? }
  prepositions.each { |tok| tok.process_preposition! }

  comparison_relations = ['sub', 'obj', 'obl', 'comp', 'adv']
  comparisons = @tokens.select do |tok|
    tok.comparison_word? &&
      tok.dependents &&
      tok.dependents.any? { |dep| comparison_relations.include?(dep.relation) }
  end
  comparisons.each { |tok| tok.process_comparison! }

  roots.each { |root| root.change_coordinations! }

  copulas = @tokens.select { |tok| tok.copula? }
  copulas.each { |tok| tok.process_copula! }

  demote_subjunctions!
  prune_empty_rootnodes!

  # do ellipses from left to right for proper remnant treatment
  ellipses = @tokens.select { |tok| tok.ellipsis? }
  ellipses.sort_by { |tok| tok.left_corner.id }.each { |tok| tok.process_ellipsis! }

  # NB! apos gets overridden by process_comparison so some dislocations are lost
  dislocations = @tokens.select { |tok| tok.relation == 'apos' && tok.id < tok.head_id }
  dislocations.each { |tok| tok.process_dislocation! }

  # DIRTY: remove the rest of the empty nodes by attaching them
  # to their grandmother with remnant. This is the best way to
  # do it given the current state of the UDEP scheme, but
  # revisions will come.
  roots.each { |root| root.remove_empties! }

  demote_parentheticals_and_vocatives!
end
#roots ⇒ Object
158 159 160 |
# File 'lib/proiel/cli/converters/conll-u.rb', line 158

# Lists the tokens whose head_id is 0.
#
# @return [Array] those tokens, sorted by id
def roots
  headless = @tokens.select { |tok| tok.head_id == 0 }
  headless.sort_by { |tok| tok.id }
end
#to_conll ⇒ Object
166 167 168 |
# File 'lib/proiel/cli/converters/conll-u.rb', line 166

# Renders the sentence by joining each token's to_conll output with newlines.
#
# @return [String] one line per token, without a trailing newline
def to_conll
  lines = @tokens.map { |tok| tok.to_conll }
  lines.join("\n")
end
#to_graph ⇒ Object
162 163 164 |
# File 'lib/proiel/cli/converters/conll-u.rb', line 162

# Joins the to_graph output of every root token with newlines.
#
# @return [String]
def to_graph
  subgraphs = roots.map { |root| root.to_graph }
  subgraphs.join("\n")
end
#to_s ⇒ Object
150 151 152 |
# File 'lib/proiel/cli/converters/conll-u.rb', line 150

# Joins the to_s output of every token with newlines.
#
# @return [String]
def to_s
  rendered = @tokens.map { |tok| tok.to_s }
  rendered.join("\n")
end