Class: PROIEL::Converter::PROIELXML

Inherits:
Object
  • Object
show all
Defined in:
lib/proiel/cli/converters/proielxml.rb

Overview

Converter that outputs PROIEL XML. This is primarily useful for filtering, merging or splitting PROIEL XML data. It is also useful for “upgrading” PROIEL XML to a new version or for testing round-tripping of data.

Class Method Summary collapse

Class Method Details

.grab_features(obj, mandatory_features, optional_features = [], overrides = {}) ⇒ Object



189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
# File 'lib/proiel/cli/converters/proielxml.rb', line 189

# Builds the attribute hash for an XML element from the features of +obj+.
#
# A feature's value is taken from +overrides+ when that hash has an entry for
# the feature (even if the entry is nil); otherwise the feature name is sent
# to +obj+ as a method call. Mandatory features are always written. Optional
# features are written only when their value is truthy and does not
# stringify to the empty string.
#
# @param obj [Object] object responding to every feature not overridden
# @param mandatory_features [Array<Symbol>] features that are always emitted
# @param optional_features [Array<Symbol>] features emitted only when non-blank
# @param overrides [Hash{Symbol => Object}] values that replace those of +obj+
# @return [Hash{String => Object}] values keyed by the feature name with
#   underscores replaced by dashes, mandatory features first
def grab_features(obj, mandatory_features, optional_features = [], overrides = {})
  lookup = ->(feature) { overrides.fetch(feature) { obj.send(feature) } }
  result = {}

  mandatory_features.each do |feature|
    value = lookup.call(feature)
    result[feature.to_s.tr('_', '-')] = value
  end

  optional_features.each do |feature|
    value = lookup.call(feature)
    # Skip nil/false and anything whose string form is empty.
    next if !value || value.to_s == ''

    result[feature.to_s.tr('_', '-')] = value
  end

  result
end

.include_div?(div, options) ⇒ Boolean

Returns:

  • (Boolean)


78
79
80
81
82
83
84
# File 'lib/proiel/cli/converters/proielxml.rb', line 78

# Tells whether +div+ should be written to the output.
#
# Every div is kept unless the 'remove-empty-divs' option is set; with that
# option a div is kept only when at least one of its sentences passes
# include_sentence? under the same options.
#
# @param div [#sentences] div whose sentences are inspected
# @param options [Hash{String => Object}] converter options
# @return [Boolean]
def include_div?(div, options)
  return true unless options['remove-empty-divs']

  div.sentences.any? { |candidate| include_sentence?(candidate, options) }
end

.include_sentence?(sentence, options) ⇒ Boolean

Returns:

  • (Boolean)


86
87
88
89
90
91
92
93
94
95
# File 'lib/proiel/cli/converters/proielxml.rb', line 86

# Tells whether +sentence+ survives the status-based removal options.
#
# The sentence status selects the pair of options that would remove it:
# * :reviewed  - 'remove-reviewed' or 'remove-annotated'
# * :annotated - 'remove-not-reviewed' or 'remove-annotated'
# * any other  - 'remove-not-reviewed' or 'remove-not-annotated'
#
# @param sentence [#status] sentence to test
# @param options [Hash{String => Object}] converter options
# @return [Boolean] true when neither of the relevant options is set
def include_sentence?(sentence, options)
  blocking_options =
    case sentence.status
    when :reviewed  then %w[remove-reviewed remove-annotated]
    when :annotated then %w[remove-not-reviewed remove-annotated]
    else                 %w[remove-not-reviewed remove-not-annotated]
    end

  blocking_options.none? { |name| options[name] }
end

.include_token?(token, options) ⇒ Boolean

Returns:

  • (Boolean)


97
98
99
100
101
102
103
104
105
# File 'lib/proiel/cli/converters/proielxml.rb', line 97

# Tells whether +token+ should be written to the output.
#
# A token is dropped when 'remove-syntax' is set and its empty_token_sort is
# 'C' or 'V', or when 'remove-information-structure' is set and its
# empty_token_sort is 'P'. Every other token is kept.
#
# @param token [#empty_token_sort] token to test
# @param options [Hash{String => Object}] converter options
# @return [Boolean]
def include_token?(token, options)
  return false if options['remove-syntax'] && %w[C V].include?(token.empty_token_sort)
  return false if token.empty_token_sort == 'P' && options['remove-information-structure']

  true
end

.process(tb, options) ⇒ Object



7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
# File 'lib/proiel/cli/converters/proielxml.rb', line 7

# Writes the treebank +tb+ as a PROIEL XML document through a
# Builder::XmlMarkup instance that targets STDOUT.
#
# The document consists of the XML declaration and a <proiel> root element
# (stamped with the export time and schema version '2.1') containing:
# 1. an <annotation> element listing the relation, part-of-speech,
#    morphology and information-status tags of the annotation schema;
# 2. one <source> element per source, with its metadata elements and divs.
#
# Options read here: 'remove-unaligned-sources' skips sources without an
# alignment_id, 'remove-alignments' omits the source alignment-id attribute;
# divs are filtered through include_div?.
#
# @param tb [Object] treebank responding to annotation_schema and sources
# @param options [Hash{String => Object}] converter options
def process(tb, options)
  builder = Builder::XmlMarkup.new(target: STDOUT, indent: 2)
  builder.instruct! :xml, version: '1.0', encoding: 'UTF-8'

  # Emits one <value> element per (tag, definition) pair, with the tag itself
  # followed by the requested features of the definition as attributes.
  emit_values = lambda do |definitions, features|
    definitions.each do |tag, definition|
      builder.value({ tag: tag }.merge(grab_features(definition, features)))
    end
  end

  builder.proiel('export-time' => DateTime.now.xmlschema, 'schema-version' => '2.1') do
    builder.annotation do
      builder.relations do
        emit_values.call(tb.annotation_schema.relation_tags, %i(summary primary secondary))
      end

      builder.tag!('parts-of-speech') do
        emit_values.call(tb.annotation_schema.part_of_speech_tags, %i(summary))
      end

      builder.morphology do
        tb.annotation_schema.morphology_tags.each do |field_tag, field_values|
          builder.field(tag: field_tag) do
            emit_values.call(field_values, %i(summary))
          end
        end
      end

      builder.tag!('information-statuses') do
        emit_values.call(tb.annotation_schema.information_status_tags, %i(summary))
      end
    end

    tb.sources.each do |source|
      next if options['remove-unaligned-sources'] && source.alignment_id.nil?

      source_optional = options['remove-alignments'] ? [] : %i(alignment_id)

      builder.source(grab_features(source, %i(id language), source_optional)) do
        # Metadata fields are written only when the source has a value for them.
        PROIEL::Treebank::METADATA_ELEMENTS.each do |field|
          builder.tag!(field.to_s.tr('_', '-'), source.send(field)) if source.send(field)
        end

        source.divs.each do |div|
          next unless include_div?(div, options)

          # Each div starts from a fresh, empty set of attribute overrides.
          process_div(builder, tb, source, div, options, { div: {}, sentence: {}, token: {} })
        end
      end
    end
  end
end

.process_div(builder, tb, source, div, options, overrides) ⇒ Object



107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
# File 'lib/proiel/cli/converters/proielxml.rb', line 107

# Writes one <div> element, its optional <title> and its included sentences.
#
# The div has no mandatory attributes. presentation_before and
# presentation_after are optional attributes; id and alignment_id are added
# as optional attributes unless 'remove-alignments' is set. With
# 'infer-alignments' set and an aligned source, the div's alignment_id is
# overridden (via overrides[:div], which is mutated) with its own
# alignment_id or, failing that, the comma-joined ids returned by
# div.inferred_alignment.
#
# @param builder [Object] XML builder receiving the div/title calls
# @param tb [#find_source] treebank used to look up the aligned source
# @param source [#alignment_id] source that owns the div
# @param div [Object] div to write
# @param options [Hash{String => Object}] converter options
# @param overrides [Hash{Symbol => Hash}] per-level attribute overrides
def process_div(builder, tb, source, div, options, overrides)
  optional_features = %i(presentation_before presentation_after)
  optional_features += %i(id alignment_id) unless options['remove-alignments']

  if options['infer-alignments'] && source.alignment_id
    aligned_source = tb.find_source(source.alignment_id)
    # FIXME: how to behave here? overwrite existing? what if nil? how to deal with multiple aligned divs?
    overrides[:div][:alignment_id] =
      div.alignment_id || div.inferred_alignment(aligned_source).map(&:id).join(',')
  end

  div_attrs = grab_features(div, [], optional_features, overrides[:div])

  builder.div(div_attrs) do
    builder.title(div.title) if div.title

    kept = div.sentences.select { |sentence| include_sentence?(sentence, options) }
    kept.each { |sentence| process_sentence(builder, tb, sentence, options, overrides) }
  end
end

.process_sentence(builder, tb, sentence, options, overrides) ⇒ Object



131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
# File 'lib/proiel/cli/converters/proielxml.rb', line 131

# Writes one <sentence> element together with its included tokens.
#
# id is the only mandatory attribute. The optional attributes are assembled
# in a fixed order (status, presentation_*, alignment, timestamps, then
# annotator/reviewer) so that diffing exported files is easier. The options
# 'remove-status', 'remove-alignments', 'remove-annotator' and
# 'remove-reviewer' each leave out the corresponding attributes.
#
# @param builder [Object] XML builder receiving the sentence call
# @param tb [Object] treebank, passed through to process_token
# @param sentence [#tokens] sentence to write
# @param options [Hash{String => Object}] converter options
# @param overrides [Hash{Symbol => Hash}] per-level attribute overrides,
#   passed through to process_token
def process_sentence(builder, tb, sentence, options, overrides)
  optional_features = []
  optional_features << :status unless options['remove-status']
  optional_features.push(:presentation_before, :presentation_after)
  optional_features << :alignment_id unless options['remove-alignments']
  optional_features << :annotated_at unless options['remove-annotator']
  optional_features << :reviewed_at unless options['remove-reviewer']
  optional_features << :annotated_by unless options['remove-annotator']
  optional_features << :reviewed_by unless options['remove-reviewer']

  sentence_attrs = grab_features(sentence, %i(id), optional_features)

  builder.sentence(sentence_attrs) do
    kept = sentence.tokens.select { |token| include_token?(token, options) }
    kept.each { |token| process_token(builder, tb, token, options, overrides) }
  end
end

.process_token(builder, tb, token, options, overrides) ⇒ Object



152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
# File 'lib/proiel/cli/converters/proielxml.rb', line 152

# Writes one <token> element, including its <slash> children when present.
#
# id is always mandatory; non-empty tokens additionally require form and may
# carry presentation_before, presentation_after and foreign_ids, while empty
# tokens require empty_token_sort instead. Morphology, syntax,
# information-structure and alignment attributes are left out when the
# matching 'remove-*' option is set.
#
# When any option that can remove whole sentences is active
# ('remove-not-reviewed', 'remove-not-annotated', 'remove-reviewed' or
# 'remove-annotated'), the antecedent_id is overridden (through
# overrides[:token], which is mutated) so that it is only kept if the
# sentence of the antecedent token is itself included in the output.
#
# @param builder [Object] XML builder receiving the token/slash calls
# @param tb [#find_token] treebank used to look up the antecedent token
# @param token [Object] token to write
# @param options [Hash{String => Object}] converter options
# @param overrides [Hash{Symbol => Hash}] per-level attribute overrides
def process_token(builder, tb, token, options, overrides)
  mandatory_features = %i(id)

  optional_features = %i(citation_part)
  optional_features += %i(lemma part_of_speech morphology) unless options['remove-morphology']
  optional_features += %i(head_id relation) unless options['remove-syntax']
  optional_features += %i(antecedent_id information_status contrast_group) unless options['remove-information-structure']

  if token.is_empty?
    mandatory_features << :empty_token_sort
  else
    mandatory_features << :form
    optional_features += %i(presentation_before presentation_after foreign_ids)
  end

  # Every option consulted by include_sentence? must be listed here; otherwise
  # a removed sentence can leave behind a dangling antecedent-id reference.
  sentence_filters = %w[remove-not-reviewed remove-not-annotated remove-reviewed remove-annotated]

  if sentence_filters.any? { |name| options[name] }
    antecedent_id = token.antecedent_id
    antecedent_kept =
      antecedent_id && include_sentence?(tb.find_token(antecedent_id.to_i).sentence, options)

    overrides[:token][:antecedent_id] = antecedent_kept ? antecedent_id : nil
  end

  optional_features += %i(alignment_id) unless options['remove-alignments']

  attrs = grab_features(token, mandatory_features, optional_features, overrides[:token])

  if token.slashes.empty? || options['remove-syntax']
    # Calling the builder without a block avoids <token></token> style XML.
    # With 'remove-syntax', empty tokens are not written at all.
    builder.token(attrs) unless options['remove-syntax'] && token.is_empty?
  else
    builder.token(attrs) do
      token.slashes.each do |relation, target_id|
        builder.slash(:"target-id" => target_id, relation: relation)
      end
    end
  end
end