Class: PROIEL::Converter::PROIELXML
- Inherits:
-
Object
- Object
- PROIEL::Converter::PROIELXML
- Defined in:
- lib/proiel/cli/converters/proielxml.rb
Overview
Converter that outputs PROIEL XML. This is primarily useful for filtering, merging or splitting PROIEL XML data. It is also useful for “upgrading” PROIEL XML to a new version or for testing round-tripping of data.
Class Method Summary collapse
- .grab_features(obj, mandatory_features, optional_features = [], overrides = {}) ⇒ Object
- .include_div?(div, options) ⇒ Boolean
- .include_sentence?(sentence, options) ⇒ Boolean
- .include_token?(token, options) ⇒ Boolean
- .process(tb, options) ⇒ Object
- .process_div(builder, tb, source, div, options, overrides) ⇒ Object
- .process_sentence(builder, tb, sentence, options, overrides) ⇒ Object
- .process_token(builder, tb, token, options, overrides) ⇒ Object
Class Method Details
.grab_features(obj, mandatory_features, optional_features = [], overrides = {}) ⇒ Object
189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 |
# File 'lib/proiel/cli/converters/proielxml.rb', line 189

# Collects attribute values from +obj+ into a hash that can be handed to the
# XML builder.
#
# Each feature is looked up in +overrides+ first; when the key is absent the
# value is read by sending the feature name to +obj+. Result keys are the
# feature names as strings with underscores replaced by hyphens.
#
# @param obj [Object] object responding to every listed feature name
# @param mandatory_features [Array<Symbol>] features that are always emitted,
#   even when their value is nil or empty
# @param optional_features [Array<Symbol>] features emitted only when their
#   value is truthy and its string form is non-empty
# @param overrides [Hash{Symbol => Object}] values that replace those read
#   from +obj+
# @return [Hash{String => Object}] mandatory features first, then optional
#   ones, each in the order given
def grab_features(obj, mandatory_features, optional_features = [], overrides = {})
  # Presence of the key (not truthiness of the value) decides whether an
  # override applies, so an explicit nil override suppresses an optional
  # attribute.
  value_of = ->(feature) { overrides.key?(feature) ? overrides[feature] : obj.send(feature) }
  attribute_name = ->(feature) { feature.to_s.tr('_', '-') }

  attrs = {}

  mandatory_features.each do |feature|
    attrs[attribute_name.call(feature)] = value_of.call(feature)
  end

  optional_features.each do |feature|
    value = value_of.call(feature)
    next unless value && value.to_s != ''

    attrs[attribute_name.call(feature)] = value
  end

  attrs
end
.include_div?(div, options) ⇒ Boolean
78 79 80 81 82 83 84 |
# File 'lib/proiel/cli/converters/proielxml.rb', line 78

# Decides whether a div should be written to the output.
#
# Without the 'remove-empty-divs' option every div is included. With it, a
# div is included only when at least one of its sentences passes
# include_sentence? under the same options.
#
# @param div [Object] object responding to +sentences+
# @param options [Hash{String => Object}] converter options
# @return [Boolean]
def include_div?(div, options)
  return true unless options['remove-empty-divs']

  div.sentences.any? { |sentence| include_sentence?(sentence, options) }
end
.include_sentence?(sentence, options) ⇒ Boolean
86 87 88 89 90 91 92 93 94 95 |
# File 'lib/proiel/cli/converters/proielxml.rb', line 86

# Decides whether a sentence should be written to the output, based on its
# status and the removal options.
#
# * status :reviewed  is dropped by 'remove-reviewed' or 'remove-annotated'
# * status :annotated is dropped by 'remove-not-reviewed' or 'remove-annotated'
# * any other status  is dropped by 'remove-not-reviewed' or 'remove-not-annotated'
#
# @param sentence [Object] object responding to +status+
# @param options [Hash{String => Object}] converter options
# @return [Boolean] true when none of the options relevant to the sentence's
#   status is set
def include_sentence?(sentence, options)
  blocking_options =
    case sentence.status
    when :reviewed
      %w(remove-reviewed remove-annotated)
    when :annotated
      %w(remove-not-reviewed remove-annotated)
    else
      %w(remove-not-reviewed remove-not-annotated)
    end

  blocking_options.none? { |name| options[name] }
end
.include_token?(token, options) ⇒ Boolean
97 98 99 100 101 102 103 104 105 |
# File 'lib/proiel/cli/converters/proielxml.rb', line 97

# Decides whether a token should be written to the output.
#
# Tokens whose empty_token_sort is 'C' or 'V' are dropped when 'remove-syntax'
# is set; tokens whose empty_token_sort is 'P' are dropped when
# 'remove-information-structure' is set. All other tokens are kept.
#
# @param token [Object] object responding to +empty_token_sort+
# @param options [Hash{String => Object}] converter options
# @return [Boolean]
def include_token?(token, options)
  case token.empty_token_sort
  when 'C', 'V'
    !options['remove-syntax']
  when 'P'
    !options['remove-information-structure']
  else
    true
  end
end
.process(tb, options) ⇒ Object
7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 |
# File 'lib/proiel/cli/converters/proielxml.rb', line 7

# Writes the treebank as a PROIEL XML document (schema version 2.1) to STDOUT.
#
# The document starts with the annotation schema (relations, parts of speech,
# morphology, information statuses), followed by one <source> element per
# source with its metadata elements and its divs.
#
# Options read here: 'remove-unaligned-sources' skips sources without an
# alignment_id; 'remove-alignments' omits the source's alignment-id attribute.
# The options hash is also passed on to include_div? and process_div.
#
# @param tb [Object] treebank responding to +annotation_schema+ and +sources+
# @param options [Hash{String => Object}] converter options
# @return [Object] whatever the builder's outermost call returns; the useful
#   effect is the XML written to STDOUT
def process(tb, options)
  builder = Builder::XmlMarkup.new(target: STDOUT, indent: 2)
  builder.instruct! :xml, version: '1.0', encoding: 'UTF-8'

  # Emits one <value> element carrying the tag plus the listed features.
  emit_value = lambda do |tag, definition, features|
    attrs = { tag: tag }
    attrs.merge!(grab_features(definition, features))
    builder.value(attrs)
  end

  builder.proiel('export-time' => DateTime.now.xmlschema, 'schema-version' => '2.1') do
    # NOTE(review): the four annotation_schema accessors used below
    # (relation_tags, part_of_speech_tags, morphology_tags,
    # information_status_tags) follow the PROIEL::AnnotationSchema API --
    # confirm against the installed proiel gem version.
    builder.annotation do
      builder.relations do
        tb.annotation_schema.relation_tags.each do |tag, value|
          emit_value.call(tag, value, %i(summary primary secondary))
        end
      end

      builder.tag! 'parts-of-speech' do
        tb.annotation_schema.part_of_speech_tags.each do |tag, value|
          emit_value.call(tag, value, %i(summary))
        end
      end

      builder.morphology do
        tb.annotation_schema.morphology_tags.each do |cat_tag, cat_values|
          builder.field(tag: cat_tag) do
            cat_values.each do |tag, value|
              emit_value.call(tag, value, %i(summary))
            end
          end
        end
      end

      builder.tag! 'information-statuses' do
        tb.annotation_schema.information_status_tags.each do |tag, value|
          emit_value.call(tag, value, %i(summary))
        end
      end
    end

    tb.sources.each do |source|
      next if options['remove-unaligned-sources'] && source.alignment_id.nil?

      mandatory_features = %i(id language)
      optional_features = []
      optional_features += %i(alignment_id) unless options['remove-alignments']

      builder.source(grab_features(source, mandatory_features, optional_features)) do
        # Metadata fields become child elements; fields without a value are skipped.
        PROIEL::Treebank::METADATA_ELEMENTS.each do |field|
          value = source.send(field)
          builder.tag!(field.to_s.tr('_', '-'), value) if value
        end

        source.divs.each do |div|
          next unless include_div?(div, options)

          # A fresh overrides hash per div keeps overrides set while
          # processing one div from carrying over to the next.
          overrides = { div: {}, sentence: {}, token: {} }
          process_div(builder, tb, source, div, options, overrides)
        end
      end
    end
  end
end
.process_div(builder, tb, source, div, options, overrides) ⇒ Object
107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 |
# File 'lib/proiel/cli/converters/proielxml.rb', line 107

# Writes one <div> element, with its optional title and every sentence that
# passes include_sentence?.
#
# Options read here: 'remove-alignments' omits the id and alignment-id
# attributes; 'infer-alignments' (only when the source has an alignment_id)
# stores an alignment id for the div in overrides[:div].
#
# @param builder [Object] XML builder the element is written to
# @param tb [Object] treebank; used to look up the aligned source
# @param source [Object] source that owns the div
# @param div [Object] div to write
# @param options [Hash{String => Object}] converter options
# @param overrides [Hash{Symbol => Hash}] per-level attribute overrides;
#   overrides[:div] may be modified by this method
# @return [Object] whatever the builder call returns
def process_div(builder, tb, source, div, options, overrides)
  mandatory_features = %i()

  optional_features = %i(presentation_before presentation_after)
  optional_features += %i(id alignment_id) unless options['remove-alignments']

  if options['infer-alignments'] && source.alignment_id
    aligned_source = tb.find_source(source.alignment_id)
    # FIXME: how to behave here? overwrite existing? what if nil? how to deal with multiple aligned divs?
    # An existing alignment id wins; otherwise the ids of the inferred
    # aligned divs are joined with commas.
    overrides[:div][:alignment_id] =
      div.alignment_id || div.inferred_alignment(aligned_source).map(&:id).join(',')
  end

  builder.div(grab_features(div, mandatory_features, optional_features, overrides[:div])) do
    builder.title div.title if div.title

    included_sentences = div.sentences.select { |sentence| include_sentence?(sentence, options) }
    included_sentences.each do |sentence|
      process_sentence(builder, tb, sentence, options, overrides)
    end
  end
end
.process_sentence(builder, tb, sentence, options, overrides) ⇒ Object
131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 |
# File 'lib/proiel/cli/converters/proielxml.rb', line 131

# Writes one <sentence> element with every token that passes include_token?.
#
# Options read here: 'remove-status', 'remove-alignments', 'remove-annotator'
# and 'remove-reviewer' each omit the corresponding optional attributes.
#
# @param builder [Object] XML builder the element is written to
# @param tb [Object] treebank, passed through to process_token
# @param sentence [Object] sentence to write
# @param options [Hash{String => Object}] converter options
# @param overrides [Hash{Symbol => Hash}] per-level attribute overrides,
#   passed through to process_token
# @return [Object] whatever the builder call returns
def process_sentence(builder, tb, sentence, options, overrides)
  mandatory_features = %i(id)

  # The attributes are appended in this exact sequence to preserve the order
  # of status and presentation_* so that diffing files is easier.
  optional_features = []
  optional_features += %i(status) unless options['remove-status']
  optional_features += %i(presentation_before presentation_after)
  optional_features += %i(alignment_id) unless options['remove-alignments']
  optional_features += %i(annotated_at) unless options['remove-annotator']
  optional_features += %i(reviewed_at) unless options['remove-reviewer']
  optional_features += %i(annotated_by) unless options['remove-annotator']
  optional_features += %i(reviewed_by) unless options['remove-reviewer']

  builder.sentence(grab_features(sentence, mandatory_features, optional_features)) do
    included_tokens = sentence.tokens.select { |token| include_token?(token, options) }
    included_tokens.each do |token|
      process_token(builder, tb, token, options, overrides)
    end
  end
end
.process_token(builder, tb, token, options, overrides) ⇒ Object
152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 |
# File 'lib/proiel/cli/converters/proielxml.rb', line 152

# Writes one <token> element, including its <slash> children when syntax is
# kept and the token has slashes.
#
# Options read here: 'remove-morphology', 'remove-syntax',
# 'remove-information-structure' and 'remove-alignments' omit the
# corresponding attributes. When any sentence-removal option is set, the
# antecedent-id attribute is kept only if the antecedent's sentence is itself
# included in the output.
#
# @param builder [Object] XML builder the element is written to
# @param tb [Object] treebank; used to look up the antecedent token
# @param token [Object] token to write
# @param options [Hash{String => Object}] converter options
# @param overrides [Hash{Symbol => Hash}] per-level attribute overrides;
#   overrides[:token] may be modified by this method
# @return [Object, nil] whatever the builder call returns, or nil when no
#   element is written
def process_token(builder, tb, token, options, overrides)
  mandatory_features = %i(id)

  optional_features = %i(citation_part)
  optional_features += %i(lemma part_of_speech morphology) unless options['remove-morphology']
  optional_features += %i(head_id relation) unless options['remove-syntax']
  optional_features += %i(antecedent_id information_status contrast_group) unless options['remove-information-structure']

  if token.is_empty?
    mandatory_features << :empty_token_sort
  else
    mandatory_features << :form
    optional_features += %i(presentation_before presentation_after foreign_ids)
  end

  # Every option that include_sentence? consults can remove the sentence
  # holding the antecedent, so all four must trigger the check; otherwise the
  # output could contain an antecedent-id that points at a removed token.
  sentence_filter_options = %w(remove-not-reviewed remove-not-annotated remove-annotated remove-reviewed)
  if sentence_filter_options.any? { |name| options[name] }
    antecedent_kept =
      token.antecedent_id &&
      include_sentence?(tb.find_token(token.antecedent_id.to_i).sentence, options)
    overrides[:token][:antecedent_id] = antecedent_kept ? token.antecedent_id : nil
  end

  optional_features += %i(alignment_id) unless options['remove-alignments']

  attrs = grab_features(token, mandatory_features, optional_features, overrides[:token])

  if options['remove-syntax']
    # Without syntax no slashes are written, and empty tokens are not written at all.
    builder.token(attrs) unless token.is_empty?
  elsif token.slashes.empty?
    # Calling the builder without a block avoids <token></token> style XML.
    builder.token(attrs)
  else
    builder.token(attrs) do
      token.slashes.each do |relation, target_id|
        builder.slash(:"target-id" => target_id, relation: relation)
      end
    end
  end
end