Class: Opener::PropertyTagger::Processor

Inherits:
Object
  • Object
show all
Defined in:
lib/opener/property_tagger/processor.rb

Overview

Class that applies property tagging to a given input KAF file.

Constant Summary collapse

FILE_ASPECTS_CACHE =

Global cache used for storing loaded aspects.

FileAspectsCache.new
REMOTE_ASPECTS_CACHE =
RemoteAspectsCache.new

Instance Attribute Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(file, params: {}, url: nil, path: nil, timestamp: true, pretty: false) ⇒ Processor

Returns a new instance of Processor.

Parameters:

  • file (String|IO)

    The KAF file/input to process.

  • path (String) (defaults to: nil)

    Path to the aspects.

  • timestamp (TrueClass|FalseClass) (defaults to: true)

    Add timestamps to the KAF.

  • pretty (TrueClass|FalseClass) (defaults to: false)

    Enable pretty formatting, disabled by default due to the performance overhead.



27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
# File 'lib/opener/property_tagger/processor.rb', line 27

# Parses the KAF input and loads the aspects used for tagging.
#
# @param file [String, IO] the KAF input, handed to Nokogiri.XML.
# @param params [Hash] additional options; only :cache_keys is read here.
#   When that key holds a hash, it is updated in place with the document
#   language (:lang).
# @param url [String, nil] aspects URL; any non-nil value makes the aspects
#   come from REMOTE_ASPECTS_CACHE instead of FILE_ASPECTS_CACHE.
# @param path [String, nil] path to the aspects (exposed as aspects_path).
# @param timestamp [Boolean] add timestamps to the KAF.
# @param pretty [Boolean] enable pretty formatting of the output.
# @raise [RuntimeError] when is_kaf? reports that the input is not KAF.
def initialize(file, params: {}, url: nil, path: nil, timestamp: true, pretty: false)
  @document     = Nokogiri.XML file
  raise 'Error parsing input. Input is required to be KAF' unless is_kaf?
  @timestamp    = timestamp
  @pretty       = pretty

  @params       = params
  @remote       = !url.nil?
  @aspects_path = path
  @aspects_url  = url

  # :cache_keys is optional (params defaults to {}), so fall back to an empty
  # hash rather than calling merge! on nil.
  @cache_keys   = params[:cache_keys] || {}
  @cache_keys.merge! lang: @document.root.attr('xml:lang')

  @aspects =
    if @remote
      REMOTE_ASPECTS_CACHE[**@cache_keys].aspects
    else
      FILE_ASPECTS_CACHE[aspects_file]
    end
end

Instance Attribute Details

#aspects ⇒ Object

Returns the value of attribute aspects.



9
10
11
# File 'lib/opener/property_tagger/processor.rb', line 9

# Reader for the aspects assigned by the constructor.
#
# @return [Object, nil] the current value of @aspects.
def aspects
  instance_variable_get(:@aspects)
end

#aspects_path ⇒ Object

Returns the value of attribute aspects_path.



9
10
11
# File 'lib/opener/property_tagger/processor.rb', line 9

# Reader for the aspects path given to the constructor (its +path+ argument).
#
# @return [Object, nil] the current value of @aspects_path.
def aspects_path
  instance_variable_get(:@aspects_path)
end

#aspects_url ⇒ Object

Returns the value of attribute aspects_url.



9
10
11
# File 'lib/opener/property_tagger/processor.rb', line 9

# Reader for the aspects URL given to the constructor (its +url+ argument).
#
# @return [Object, nil] the current value of @aspects_url.
def aspects_url
  instance_variable_get(:@aspects_url)
end

#document ⇒ Object

Returns the value of attribute document.



8
9
10
# File 'lib/opener/property_tagger/processor.rb', line 8

# Reader for the document the constructor obtained from Nokogiri.XML.
#
# @return [Object, nil] the current value of @document.
def document
  instance_variable_get(:@document)
end

#pretty ⇒ Object

Returns the value of attribute pretty.



10
11
12
# File 'lib/opener/property_tagger/processor.rb', line 10

# Reader for the pretty-formatting flag given to the constructor.
#
# @return [Object, nil] the current value of @pretty.
def pretty
  instance_variable_get(:@pretty)
end

#timestamp ⇒ Object

Returns the value of attribute timestamp.



10
11
12
# File 'lib/opener/property_tagger/processor.rb', line 10

# Reader for the timestamp flag given to the constructor.
#
# @return [Object, nil] the current value of @timestamp.
def timestamp
  instance_variable_get(:@timestamp)
end

Instance Method Details

#add_features_layer ⇒ Object

Remove the features layer from the KAF file if it exists and add a new one.



124
125
126
127
128
129
130
# File 'lib/opener/property_tagger/processor.rb', line 124

# Replaces the features layer: drops the node found at 'KAF/features' (if
# there is one) and then requests a new 'features' node under 'KAF'.
#
# @return [Object] whatever new_node returns for the new layer.
def add_features_layer
  stale_layer = document.at_xpath('KAF/features')
  stale_layer.remove unless stale_layer.nil?

  new_node('features', 'KAF')
end

#add_linguistic_processor ⇒ Object



160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
# File 'lib/opener/property_tagger/processor.rb', line 160

# Records this tool in the KAF header: requests a 'linguisticProcessors' node
# under 'KAF/kafHeader' (layer "features") and gives it an 'lp' child carrying
# the version, name and timestamp attributes, set in that order.
#
# The timestamp is the current time when the timestamp flag is set, and the
# placeholder '*' otherwise.
#
# @return [String] the timestamp value that was written.
def add_linguistic_processor
  processors = new_node('linguisticProcessors', 'KAF/kafHeader')
  processors['layer'] = 'features'

  entry = new_node('lp', processors)
  entry['version'] = ['16jan2015', '2.0'].join('-')
  entry['name']    = 'VUA property tagger'

  entry['timestamp'] = timestamp ? Time.now.strftime('%Y-%m-%dT%H:%M:%S%Z') : '*'
end

#add_properties_layer ⇒ Object

Add the properties layer as a child to the features layer.



134
135
136
# File 'lib/opener/property_tagger/processor.rb', line 134

# Requests a 'properties' node underneath the features layer.
#
# @return [Object] whatever new_node returns for the new layer.
def add_properties_layer
  features_path = 'KAF/features'

  new_node('properties', features_path)
end

#add_property(key, value, index) ⇒ Object



138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
# File 'lib/opener/property_tagger/processor.rb', line 138

# Builds one 'property' node (requested under 'KAF/features/properties') with
# a 'references' child describing every distinct occurrence of the property.
#
# For each occurrence an XML comment holding the matched text is added to the
# references node, followed by a 'span' node with one 'target' per term id.
#
# @param key [#to_s] property name, written to the "lemma" attribute.
# @param value [Array<Array>] occurrences; the first item of each entry is
#   the list of term ids and the last item is the matched text.
# @param index [#to_s] number used for the "pid" attribute ("p<index>").
# @return [Array] the de-duplicated occurrences that were written.
def add_property(key, value, index)
  property = new_node('property', 'KAF/features/properties')
  property['lemma'] = key.to_s
  property['pid']   = "p#{index}"

  references = new_node('references', property)

  value.uniq.each do |occurrence|
    # The matched text goes in as a comment right before its span.
    references.add_child(Nokogiri::XML::Comment.new(references, " #{occurrence.last} "))

    span = new_node('span', references)
    occurrence.first.each { |term_id| new_node('target', span)['id'] = term_id.to_s }
  end
end

#extract_aspects ⇒ Hash

Check which terms belong to an aspect (property). Text has priority over lemmas, overriding them if there is a conflict.

Returns:

  • (Hash)


85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
# File 'lib/opener/property_tagger/processor.rb', line 85

# Finds the aspects (properties) mentioned by the terms of the document.
#
# Every window of one to three consecutive terms is joined with single
# spaces, downcased and looked up (as a Symbol) in +aspects+. The scan runs
# once over the :lemma values and once over the :text values; hits from both
# passes are accumulated and exact duplicates are dropped.
#
# @return [Hash{Symbol => Array<Array>}] property name => list of
#   [term_ids, ngram] pairs, with the properties in sorted order.
def extract_aspects
  term_ids   = terms.keys
  term_attrs = terms.values
  term_count = term_ids.length
  # Number of extra terms a window may span beyond its first term, so
  # windows hold 1..(max_extra + 1) terms.
  max_extra  = 2
  found      = Hash.new { |hash, key| hash[key] = [] }

  [:lemma, :text].each do |field|
    term_count.times do |first|
      (0..max_extra).each do |extra|
        last = first + extra
        # A window that runs past the final term would only repeat a shorter
        # n-gram that has already been checked, so stop widening here.
        break if last >= term_count

        ngram      = term_attrs[first..last].map { |attrs| attrs[field] }.join(' ').downcase
        properties = aspects[ngram.to_sym]
        next unless properties

        entry = [term_ids[first..last], ngram]

        properties.uniq.each do |property|
          # Skip missing or blank property names.
          next if !property || property.strip.empty?

          matches = found[property.to_sym]
          matches << entry unless matches.include?(entry)
        end
      end
    end
  end

  # Sorting by property name also yields a plain Hash without the default
  # block used while collecting.
  Hash[found.sort]
end

#language ⇒ Object



64
65
66
# File 'lib/opener/property_tagger/processor.rb', line 64

# Language of the document, read from the xml:lang attribute of the node at
# 'KAF' and remembered in @language once a truthy value has been found.
#
# @return [Object] the attribute value as returned by the document node.
def language
  return @language if @language

  @language = document.at_xpath('KAF').attr('xml:lang')
end

#pretty_print(document) ⇒ String

Format the output document properly.

TODO: this should be handled by Oga in a nice way.

Returns:

  • (String)


189
190
191
192
193
194
195
196
197
198
# File 'lib/opener/property_tagger/processor.rb', line 189

# Re-serialises the document through REXML's pretty formatter (compact mode,
# attributes in double quotes) and strips surrounding whitespace.
#
# @param document [#to_xml] the document to format.
# @return [String] the formatted XML.
def pretty_print(document)
  tree = REXML::Document.new(document.to_xml)
  tree.context[:attribute_quote] = :quote

  printer = REXML::Formatters::Pretty.new
  printer.compact = true

  buffer = ''
  printer.write(tree, buffer)
  buffer.strip
end

#process ⇒ String

Processes the input and returns the new KAF output.

Returns:

  • (String)


47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
# File 'lib/opener/property_tagger/processor.rb', line 47

# Runs the tagger: collects the aspects, rebuilds the features and properties
# layers, writes one property per aspect (numbered from 1), records the
# linguistic processor and serialises the document.
#
# @return [String] the pretty-printed output when the pretty flag is set,
#   otherwise the result of document.to_xml.
def process
  found_aspects = extract_aspects

  add_features_layer
  add_properties_layer

  found_aspects.each.with_index(1) do |(name, occurrences), position|
    add_property(name, occurrences, position)
  end

  add_linguistic_processor

  return pretty_print(document) if pretty

  document.to_xml
end

#terms ⇒ Object



68
69
70
71
72
73
74
75
76
77
78
# File 'lib/opener/property_tagger/processor.rb', line 68

# Terms of the document, indexed by term id and built on first use.
#
# Each node matched by 'KAF/terms/term' contributes one entry: its 'tid'
# attribute (as a Symbol) maps to a hash with its 'lemma' and 'text'
# attribute values. Entries keep document order.
#
# @return [Hash{Symbol => Hash}] tid => { lemma: ..., text: ... }.
def terms
  return @terms if @terms

  @terms = {}

  document.xpath('KAF/terms/term').each do |node|
    tid = node.attr('tid').to_sym
    @terms[tid] = { lemma: node.attr('lemma'), text: node.attr('text') }
  end

  @terms
end