Class: PROIEL::Converter::PROIELXML

Inherits:
Object
  • Object
show all
Defined in:
lib/proiel/cli/converters/proielxml.rb

Class Method Summary collapse

Class Method Details

.grab_features(obj, mandatory_features, optional_features = []) ⇒ Object



147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
# File 'lib/proiel/cli/converters/proielxml.rb', line 147

# Builds an attribute hash for +obj+ by calling each named feature on it.
#
# Keys are the feature names as strings with underscores replaced by hyphens.
# Mandatory features are always included, whatever value they return.
# Optional features are included only when the value is truthy and its string
# form is not empty. Mandatory keys are inserted before optional ones.
#
# @param obj [Object] object that responds to every listed feature
# @param mandatory_features [Array<Symbol>] features that are always emitted
# @param optional_features [Array<Symbol>] features emitted only when present
# @return [Hash{String => Object}] attribute name to value
def grab_features(obj, mandatory_features, optional_features = [])
  collected = mandatory_features.each_with_object({}) do |feature, memo|
    memo[feature.to_s.tr('_', '-')] = obj.send(feature)
  end

  optional_features.each_with_object(collected) do |feature, memo|
    value = obj.send(feature)
    next if !value || value.to_s == ''

    memo[feature.to_s.tr('_', '-')] = value
  end
end

.include_div?(div, options) ⇒ Boolean

Returns:

  • (Boolean)


128
129
130
131
132
133
134
# File 'lib/proiel/cli/converters/proielxml.rb', line 128

# Decides whether +div+ should be included in the output.
#
# Without the 'remove-empty-divs' option every div is included. With it, a div
# is included only if at least one of its sentences passes include_sentence?.
#
# @param div [#sentences] div whose sentences are inspected
# @param options [Hash{String => Object}] conversion options
# @return [Boolean]
def include_div?(div, options)
  return true unless options['remove-empty-divs']

  div.sentences.any? { |candidate| include_sentence?(candidate, options) }
end

.include_sentence?(sentence, options) ⇒ Boolean

Returns:

  • (Boolean)


136
137
138
139
140
141
142
143
144
145
# File 'lib/proiel/cli/converters/proielxml.rb', line 136

# Decides whether +sentence+ should be included, based on its status.
#
# * :reviewed sentences are always included.
# * :annotated sentences are included unless 'remove-not-reviewed' is set.
# * Sentences with any other status are included only when neither
#   'remove-not-reviewed' nor 'remove-not-annotated' is set.
#
# @param sentence [#status] sentence whose status is inspected
# @param options [Hash{String => Object}] conversion options
# @return [Boolean]
def include_sentence?(sentence, options)
  status = sentence.status

  return true if status == :reviewed
  # From here on the sentence is not reviewed, so this option rules it out.
  return false if options['remove-not-reviewed']

  status == :annotated || !options['remove-not-annotated']
end

.process(tb, options) ⇒ Object



5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
# File 'lib/proiel/cli/converters/proielxml.rb', line 5

# Serialises the treebank +tb+ as a PROIEL XML document (schema version 2.1),
# written to STDOUT through a Builder::XmlMarkup instance.
#
# The document consists of an <annotation> section describing the tag sets in
# tb.annotation_schema, followed by one <source> element per entry in
# tb.sources, each containing nested <div>, <sentence> and <token> elements.
#
# +options+ is indexed with string keys. Keys read directly in this method:
# 'remove-alignments', 'remove-status', 'remove-annotator', 'remove-reviewer',
# 'remove-morphology', 'remove-syntax' and 'remove-information-structure'.
# +options+ is also handed to include_div? and include_sentence?, which decide
# whether a given div or sentence is emitted at all.
#
# The return value is whatever the outermost builder.proiel call returns; the
# XML itself goes to STDOUT.
def process(tb, options)
  builder = Builder::XmlMarkup.new(target: STDOUT, indent: 2)
  builder.instruct! :xml, version: '1.0', encoding: 'UTF-8'
  # export-time is taken from the clock, so the output differs between runs.
  builder.proiel('export-time' => DateTime.now.xmlschema, 'schema-version' => '2.1') do
    # Annotation schema: one <value> element per tag in each tag set.
    builder.annotation do
      builder.relations do
        tb.annotation_schema.relation_tags.each do |tag, value|
          attrs = { tag: tag }
          attrs.merge!(grab_features(value, %i(summary primary secondary)))
          builder.value(attrs)
        end
      end

      # tag! is used here because the hyphenated element name cannot be
      # written as a plain Ruby method call on the builder.
      builder.tag! 'parts-of-speech' do
        tb.annotation_schema.part_of_speech_tags.each do |tag, value|
          attrs = { tag: tag }
          attrs.merge!(grab_features(value, %i(summary)))
          builder.value(attrs)
        end
      end

      # Morphology is two-level: one <field> per key of morphology_tags, with
      # one <value> per tag inside it.
      builder.morphology do
        tb.annotation_schema.morphology_tags.each do |cat_tag, cat_values|
          builder.field(tag: cat_tag) do
            cat_values.each do |tag, value|
              attrs = { tag: tag }
              attrs.merge!(grab_features(value, %i(summary)))
              builder.value(attrs)
            end
          end
        end
      end

      builder.tag! 'information-statuses' do
        tb.annotation_schema.information_status_tags.each do |tag, value|
          attrs = { tag: tag }
          attrs.merge!(grab_features(value, %i(summary)))
          builder.value(attrs)
        end
      end
    end

    # NOTE: mandatory_features and optional_features are reassigned at every
    # nesting level below (source, div, sentence, token). This is safe because
    # each level computes its attributes via grab_features before the nested
    # block runs, and each iteration reassigns them before use.
    tb.sources.each do |source|
      mandatory_features = %i(id language)
      optional_features = []
      optional_features += %i(alignment_id) unless options['remove-alignments']

      builder.source(grab_features(source, mandatory_features, optional_features)) do
        # Metadata fields become child elements (underscores turned into
        # hyphens); a field is skipped when the source returns nil/false for it.
        PROIEL::Treebank::METADATA_ELEMENTS.each do |field|
          builder.tag!(field.to_s.gsub('_', '-'), source.send(field)) if source.send(field)
        end

        source.divs.each do |div|
          if include_div?(div, options)
            # Divs have no mandatory attributes.
            mandatory_features = %i()

            optional_features = []
            optional_features += %i(presentation_before presentation_after)
            optional_features += %i(alignment_id) unless options['remove-alignments']

            builder.div(grab_features(div, mandatory_features, optional_features)) do
              # The <title> child is only written when the div has a title.
              builder.title div.title if div.title

              div.sentences.each do |sentence|
                if include_sentence?(sentence, options)
                  mandatory_features = %i(id)

                  optional_features = [] # we do it this way to preserve the order of status and presentation_* so that diffing files is easier
                  optional_features += %i(status) unless options['remove-status']
                  optional_features += %i(presentation_before presentation_after)
                  optional_features += %i(alignment_id) unless options['remove-alignments']
                  optional_features += %i(annotated_at) unless options['remove-annotator']
                  optional_features += %i(reviewed_at) unless options['remove-reviewer']
                  optional_features += %i(annotated_by) unless options['remove-annotator']
                  optional_features += %i(reviewed_by) unless options['remove-reviewer']

                  builder.sentence(grab_features(sentence, mandatory_features, optional_features)) do
                    sentence.tokens.each do |token|
                      # Tokens with empty_token_sort 'P' are dropped when
                      # information structure is removed; sorts 'C' and 'V' are
                      # dropped when syntax is removed.
                      # NOTE(review): the meaning of the P/C/V codes is not
                      # visible here -- confirm against the PROIEL schema.
                      next if token.empty_token_sort == 'P' and options['remove-information-structure']
                      next if token.empty_token_sort == 'C' and options['remove-syntax']
                      next if token.empty_token_sort == 'V' and options['remove-syntax']

                      mandatory_features = %i(id)

                      optional_features = %i(citation_part)
                      optional_features += %i(lemma part_of_speech morphology) unless options['remove-morphology']
                      optional_features += %i(head_id relation) unless options['remove-syntax']
                      optional_features += %i(antecedent_id information_status contrast_group) unless options['remove-information-structure']

                      # Non-empty tokens must carry a form (plus optional
                      # presentation and foreign-id attributes); empty tokens
                      # must carry their empty_token_sort instead.
                      unless token.is_empty?
                        mandatory_features << :form
                        optional_features += %i(presentation_before presentation_after foreign_ids)
                      else
                        mandatory_features << :empty_token_sort
                      end

                      optional_features += %i(alignment_id) unless options['remove-alignments']

                      attrs = grab_features(token, mandatory_features, optional_features)

                      # With slashes present and syntax kept, each slash is
                      # written as a <slash> child of the <token>.
                      unless token.slashes.empty? or options['remove-syntax'] # this extra test avoids <token></token> style XML
                        builder.token(attrs) do
                          token.slashes.each do |relation, target_id|
                            builder.slash(:"target-id" => target_id, relation: relation)
                          end
                        end
                      else
                        # Reached when there are no slashes or syntax is being
                        # removed: write a childless <token>, except that empty
                        # tokens are omitted entirely when syntax is removed.
                        unless options['remove-syntax'] and token.is_empty?
                          builder.token(attrs)
                        end
                      end
                    end
                  end
                end
              end
            end
          end
        end
      end
    end
  end
end