Class: PROIEL::Converter::CoNLLU::Sentence
- Inherits:
-
Object
- Object
- PROIEL::Converter::CoNLLU::Sentence
- Defined in:
- lib/proiel/cli/converters/conll-u.rb
Instance Attribute Summary collapse
-
#tokens ⇒ Object
Returns the value of attribute tokens.
Instance Method Summary collapse
- #convert ⇒ Object
- #count_tokens ⇒ Object
- #demote_parentheticals_and_vocatives! ⇒ Object
- #demote_subjunctions! ⇒ Object
- #find_token(identifier) ⇒ Object
-
#initialize(sentence) ⇒ Sentence
constructor
Initializes a PROIEL::Converter::CoNLLU::Sentence from a PROIEL::PROIELXML::Sentence.
- #map_part_of_speech! ⇒ Object
-
#prune_empty_rootnodes! ⇒ Object
TODO: this will leave several root nodes in many cases.
- #relabel_graph! ⇒ Object
- #remove_token!(token) ⇒ Object
- #restructure_graph! ⇒ Object
- #roots ⇒ Object
- #to_conll ⇒ Object
- #to_graph ⇒ Object
- #to_s ⇒ Object
Constructor Details
#initialize(sentence) ⇒ Sentence
Initializes a PROIEL::Converter::CoNLLU::Sentence from a PROIEL::PROIELXML::Sentence.
39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 |
# File 'lib/proiel/cli/converters/conll-u.rb', line 39
#
# Builds the converter's token list from a source sentence.
#
# Tokens whose empty_token_sort is 'P' are left out. The remaining tokens are
# renumbered 1..n in their original order, and head ids and slash targets are
# translated through the same numbering. Any id that is not in the numbering
# (nil, or the id of a skipped token) is translated to 0, i.e. the root.
# Whitespace inside forms and lemmas is replaced by '.'.
#
# @param sentence the source sentence (a PROIEL::PROIELXML::Sentence according
#   to the class documentation); it must respond to #tokens
def initialize(sentence)
  kept = sentence.tokens.reject { |t| t.empty_token_sort == 'P' }

  # Default of 0 makes unknown ids (including nil) resolve to the root.
  numbering = Hash.new(0)
  kept.each.with_index(1) { |t, position| numbering[t.id] = position }

  @tokens = kept.map do |t|
    Token.new(
      numbering[t.id],
      numbering[t.head_id],
      t.form.to_s.gsub(/[[:space:]]/, '.'),
      t.lemma.to_s.gsub(/[[:space:]]/, '.'),
      t.part_of_speech,
      t.language,
      t.morphology,
      t.relation,
      t.empty_token_sort,
      t.slashes.map { |relation, target_id| [numbering[target_id], relation] },
      t.citation_part,
      self
    )
  end
end
Instance Attribute Details
#tokens ⇒ Object
Returns the value of attribute tokens.
36 37 38 |
# File 'lib/proiel/cli/converters/conll-u.rb', line 36
#
# Reader for the sentence's token list.
#
# @return [Object] whatever is currently stored in @tokens
def tokens
  @tokens
end
Instance Method Details
#convert ⇒ Object
66 67 68 69 70 71 |
# File 'lib/proiel/cli/converters/conll-u.rb', line 66
#
# Runs the three conversion passes in order (restructure, relabel, map parts
# of speech) and hands back the sentence itself so calls can be chained.
#
# @return [Sentence] self
def convert
  tap do
    restructure_graph!
    relabel_graph!
    map_part_of_speech!
  end
end
#count_tokens ⇒ Object
85 86 87 |
# File 'lib/proiel/cli/converters/conll-u.rb', line 85
#
# Adds up the subgraph sizes reported by every root token.
#
# @return the sum of #count_subgraph over all roots (0 when there are no roots)
def count_tokens
  roots.inject(0) { |total, root| total + root.count_subgraph }
end
#demote_parentheticals_and_vocatives! ⇒ Object
120 121 122 123 124 125 126 127 128 129 |
# File 'lib/proiel/cli/converters/conll-u.rb', line 120
#
# Attaches every root whose relation is 'voc' or 'parpred' underneath the one
# remaining root.
#
# If all roots are vocatives/parentheticals, the first of them is relabelled
# 'pred' so that it can serve as the head for the others.
#
# @raise [RuntimeError] when there are vocative/parenthetical roots to demote
#   but not exactly one other root to attach them to
# @return [Array] the roots that were demoted (empty when there were none)
def demote_parentheticals_and_vocatives!
  demotable_relations = %w[voc parpred]
  # Splits the current roots into [roots to keep, roots to demote].
  split_roots = -> { roots.partition { |n| !demotable_relations.include?(n.relation) } }

  heads, demoted = split_roots.call

  if demoted.any? && heads.none?
    # promote the first vocative/parenthetical to head in case there's nothing else
    demoted.first.relation = 'pred'
    heads, demoted = split_roots.call
  end

  raise "No unique root in this tree:\n#{to_graph}" if demoted.any? && !heads.one?

  demoted.each { |x| x.head_id = heads.first.id }
end
#demote_subjunctions! ⇒ Object
116 117 118 |
# File 'lib/proiel/cli/converters/conll-u.rb', line 116
#
# Calls #process_subjunction! on every token whose part of speech is 'G-'.
#
# The matching tokens are collected first, so the iteration runs over a
# snapshot rather than over @tokens itself.
#
# @return [Array] the tokens that were processed
def demote_subjunctions!
  subjunctions = @tokens.select { |token| token.part_of_speech == 'G-' }
  subjunctions.each { |token| token.process_subjunction! }
end
#find_token(identifier) ⇒ Object
73 74 75 |
# File 'lib/proiel/cli/converters/conll-u.rb', line 73
#
# Looks up a token by id.
#
# @param identifier the id to look for (compared with ==)
# @return the first token in @tokens whose id equals identifier, or nil when
#   there is none
def find_token(identifier)
  # find stops at the first hit instead of filtering the whole list first.
  @tokens.find { |t| t.id == identifier }
end
#map_part_of_speech! ⇒ Object
135 136 137 |
# File 'lib/proiel/cli/converters/conll-u.rb', line 135
#
# Asks every root token to map its part of speech by calling
# #map_part_of_speech! on it.
#
# @return [Array] the list of roots that was iterated
def map_part_of_speech!
  roots.each { |root| root.map_part_of_speech! }
end
#prune_empty_rootnodes! ⇒ Object
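TODO: this will leave several root nodes in many cases. For now, an error is raised later, by #demote_parentheticals_and_vocatives!, when vocative or parenthetical roots remain and there is no unique root to attach them to.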
102 103 104 105 106 107 108 109 110 111 112 113 114 |
# File 'lib/proiel/cli/converters/conll-u.rb', line 102
#
# Removes every root token whose empty_token_sort is 'V'.
#
# For each such root, its first dependent is promoted to root and inherits the
# removed root's relation; the remaining dependents are re-attached to the
# promoted token. An empty root without any dependents is simply removed.
# The pass repeats until no empty root is left.
#
# TODO: this will leave several root nodes in many cases.
#
# @return [nil]
def prune_empty_rootnodes!
  empty_roots = roots.select { |r| r.empty_token_sort == 'V' }
  return if empty_roots.empty?

  empty_roots.each do |r|
    # Read the dependents once, before any head is rewritten, so the promoted
    # token can never end up re-attached to itself.
    new_root, *siblings = r.dependents

    # A childless empty root has nothing to promote; skip straight to removal
    # instead of failing on nil.
    if new_root
      # promote the first dependent to root
      new_root.head_id = 0
      new_root.relation = r.relation
      siblings.each { |d| d.head_id = new_root.id }
    end

    remove_token! r
  end

  # A freshly promoted token may itself be an empty node, so check again.
  prune_empty_rootnodes!
end
#relabel_graph! ⇒ Object
131 132 133 |
# File 'lib/proiel/cli/converters/conll-u.rb', line 131
#
# Asks every root token to relabel its graph by calling #relabel_graph! on it.
#
# @return [Array] the list of roots that was iterated
def relabel_graph!
  roots.each { |root| root.relabel_graph! }
end
#remove_token!(token) ⇒ Object
77 78 79 |
# File 'lib/proiel/cli/converters/conll-u.rb', line 77
#
# Deletes the given token from the sentence's token list.
#
# @param token the token to delete; every element of @tokens that is == to it
#   is removed
# @return the deleted token, or nil when it was not in the list
def remove_token!(token)
  @tokens.delete(token)
end
#restructure_graph! ⇒ Object
139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 |
# File 'lib/proiel/cli/converters/conll-u.rb', line 139 def restructure_graph! @tokens.delete_if { |n| n.empty_token_sort == 'P' } @tokens.select(&:preposition?).each(&:process_preposition!) roots.each(&:change_coordinations!) @tokens.select(&:copula?).each(&:process_copula!) prune_empty_rootnodes! # do ellipses from left to right for proper remnant treatment @tokens.select(&:ellipsis?).sort_by { |e| e.left_corner.id }.each(&:process_ellipsis!) demote_subjunctions! # DIRTY: remove the rest of the empty nodes by attaching them # to their grandmother with remnant. This is the best way to # do it given the current state of the UDEP scheme, but # revisions will come. roots.each(&:remove_empties!) demote_parentheticals_and_vocatives! end |
#roots ⇒ Object
89 90 91 |
# File 'lib/proiel/cli/converters/conll-u.rb', line 89
#
# Lists the tokens that have no head, i.e. whose head_id is 0.
#
# @return [Array] a new array of the root tokens, ordered by id
def roots
  headless = @tokens.select { |token| token.head_id == 0 }
  headless.sort_by { |token| token.id }
end
#to_conll ⇒ Object
97 98 99 |
# File 'lib/proiel/cli/converters/conll-u.rb', line 97
#
# Renders the sentence by joining each token's #to_conll output, one token
# per line.
#
# @return [String] the newline-separated rows ('' when there are no tokens)
def to_conll
  rows = @tokens.map { |token| token.to_conll }
  rows.join("\n")
end
#to_graph ⇒ Object
93 94 95 |
# File 'lib/proiel/cli/converters/conll-u.rb', line 93
#
# Renders the sentence by joining each root's #to_graph output, one root per
# line.
#
# @return [String] the newline-separated graphs ('' when there are no roots)
def to_graph
  graphs = roots.map { |root| root.to_graph }
  graphs.join("\n")
end
#to_s ⇒ Object
81 82 83 |
# File 'lib/proiel/cli/converters/conll-u.rb', line 81 def to_s @tokens.map(&:to_s).join("\n") end |