Class: Opener::PropertyTagger::Processor

Inherits:
Object
  • Object
show all
Defined in:
lib/opener/property_tagger/processor.rb

Overview

Class that applies property tagging to a given input KAF file.

Constant Summary collapse

FILE_ASPECTS_CACHE =

Global cache used for storing loaded aspects.

FileAspectsCache.new
REMOTE_ASPECTS_CACHE =
RemoteAspectsCache.new
MAX_NGRAM =

N-grams are used to determine whether a unigram (1 lemma) or a bigram (2 lemmas) belongs to a property.

2

Instance Attribute Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(file, params: {}, url: nil, path: nil, timestamp: true, pretty: false) ⇒ Processor



28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
# File 'lib/opener/property_tagger/processor.rb', line 28

# Parses the input document and loads the aspect lexicons for its language.
#
# @param file input handed to Nokogiri.XML (presumably a KAF XML string or
#   IO — confirm against callers)
# @param params [Hash] extra options; only :cache_keys is read here
# @param url [String, nil] aspects URL; when non-nil the lexicons are read
#   from REMOTE_ASPECTS_CACHE, otherwise from FILE_ASPECTS_CACHE
# @param path [String, nil] aspects path, stored as given
# @param timestamp [Boolean] stored; read when the linguistic processor
#   node is written
# @param pretty [Boolean] stored; read when the output is serialised
# @raise [RuntimeError] when is_kaf? returns a falsy value for the document
def initialize(file, params: {}, url: nil, path: nil, timestamp: true, pretty: false)
  @document = Nokogiri.XML file
  raise 'Error parsing input. Input is required to be KAF' unless is_kaf?

  @timestamp    = timestamp
  @pretty       = pretty
  @params       = params
  @remote       = !url.nil?
  @aspects_path = path
  @aspects_url  = url

  # Build a fresh hash instead of merge!-ing into params[:cache_keys]: the
  # caller owns that hash and may reuse it for documents in other languages.
  @cache_keys = (params[:cache_keys] || {}).merge(lang: @document.root.attr('xml:lang'))

  @lexicons =
    if @remote
      REMOTE_ASPECTS_CACHE[**@cache_keys].aspects
    else
      FILE_ASPECTS_CACHE[aspects_file]
    end
end

Instance Attribute Details

#aspectsObject

Returns the value of attribute aspects.



10
11
12
# File 'lib/opener/property_tagger/processor.rb', line 10

# Reader for the aspects attribute.
#
# @return [Object, nil] whatever is stored in @aspects; nil when it has
#   not been assigned
def aspects; @aspects; end

#aspects_pathObject

Returns the value of attribute aspects_path.



9
10
11
# File 'lib/opener/property_tagger/processor.rb', line 9

# Reader for the aspects path given to the constructor.
#
# @return [Object, nil] the stored @aspects_path value
def aspects_path; @aspects_path; end

#aspects_urlObject

Returns the value of attribute aspects_url.



9
10
11
# File 'lib/opener/property_tagger/processor.rb', line 9

# Reader for the aspects URL given to the constructor.
#
# @return [Object, nil] the stored @aspects_url value
def aspects_url; @aspects_url; end

#documentObject

Returns the value of attribute document.



8
9
10
# File 'lib/opener/property_tagger/processor.rb', line 8

# Reader for the parsed input document.
#
# @return [Object, nil] the stored @document value
def document; @document; end

#lexiconsObject

Returns the value of attribute lexicons.



10
11
12
# File 'lib/opener/property_tagger/processor.rb', line 10

# Reader for the loaded lexicons.
#
# @return [Object, nil] the stored @lexicons value
def lexicons; @lexicons; end

#prettyObject

Returns the value of attribute pretty.



11
12
13
# File 'lib/opener/property_tagger/processor.rb', line 11

# Reader for the pretty-print flag.
#
# @return [Object, nil] the stored @pretty value
def pretty; @pretty; end

#timestampObject

Returns the value of attribute timestamp.



11
12
13
# File 'lib/opener/property_tagger/processor.rb', line 11

# Reader for the timestamp flag.
#
# @return [Object, nil] the stored @timestamp value
def timestamp; @timestamp; end

Instance Method Details

#add_features_layerObject

Remove the features layer from the KAF file if it exists and add a new one.



127
128
129
130
131
132
133
# File 'lib/opener/property_tagger/processor.rb', line 127

# Replaces the features layer: removes the node found at 'KAF/features'
# (when there is one) and then creates a new 'features' node under 'KAF'.
#
# @return [Object] whatever new_node returns for the new features node
def add_features_layer
  document.at_xpath('KAF/features')&.remove

  new_node 'features', 'KAF'
end

#add_linguistic_processorObject



164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
# File 'lib/opener/property_tagger/processor.rb', line 164

# Records this tagger in the KAF header: creates a 'linguisticProcessors'
# node for the 'features' layer under 'KAF/kafHeader' and gives it an 'lp'
# child carrying version, name and timestamp attributes.
#
# The timestamp attribute is the current time when the timestamp flag is
# truthy and the literal '*' otherwise.
#
# @return [String] the value written to the timestamp attribute
def add_linguistic_processor
  processors = new_node('linguisticProcessors', 'KAF/kafHeader')
  processors['layer'] = 'features'

  entry = new_node('lp', processors)

  # Attributes are assigned in the order version, name, timestamp.
  entry['version'] = %w[16jan2015 2.0].join('-')
  entry['name']    = 'VUA property tagger'
  entry['timestamp'] = timestamp ? Time.now.strftime('%Y-%m-%dT%H:%M:%S%Z') : '*'
end

#add_properties_layerObject

Add the properties layer as a child to the features layer.



137
138
139
# File 'lib/opener/property_tagger/processor.rb', line 137

# Creates a 'properties' node under 'KAF/features'.
#
# @return [Object] whatever new_node returns for the new node
def add_properties_layer
  new_node 'properties', 'KAF/features'
end

#add_property(lemma, values, index) ⇒ Object



141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
# File 'lib/opener/property_tagger/processor.rb', line 141

# Writes one 'property' node under 'KAF/features/properties'.
#
# For each entry in +values+ a comment holding the matched n-gram and a
# 'span' node are appended to the property's 'references' node; the span
# gets one 'target' per term id, tagged with the lexicon id when it is set.
#
# @param lemma [#to_s] value for the 'lemma' attribute
# @param values [Enumerable] entries responding to ngram, term_ids and lexicon
# @param index [#to_s] number used to build the 'pid' attribute ("p<index>")
# @return [Enumerable] the +values+ argument
def add_property lemma, values, index
  property = new_node('property', 'KAF/features/properties')
  property['lemma'] = lemma.to_s
  property['pid']   = "p#{index}"

  references = new_node('references', property)

  values.each do |value|
    references.add_child(Nokogiri::XML::Comment.new(references, " #{value.ngram} "))
    span = new_node('span', references)

    value.term_ids.each do |term_id|
      target = new_node('target', span)
      target['id'] = term_id.to_s

      lexicon_id = value.lexicon.id
      target['lexicon-id'] = lexicon_id if lexicon_id
    end
  end
end

#extract_aspectsHash

Check which terms belong to an aspect (property). Text has priority over lemmas, overriding if there is a conflict.



88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
# File 'lib/opener/property_tagger/processor.rb', line 88

# Finds the aspects (properties) matched by the document's terms.
#
# Slides a window over the terms in document order. For every start
# position it builds n-grams of 1 up to MAX_NGRAM + 1 consecutive terms,
# first from the :lemma values and then from the :text values, lower-cases
# them and looks them up in @lexicons by symbol. Each lexicon entry
# contributes its aspects list when present, otherwise its single aspect.
#
# A match is skipped when the aspect already has an entry for exactly the
# same term ids, so a lemma match (collected first) is kept over a later
# text match on the same terms.
#
# @return [Hash{Symbol => Array<Hashie::Mash>}] aspect name => matches
#   (each with term_ids, ngram and lexicon), sorted by aspect name
def extract_aspects
  term_ids   = terms.keys
  term_attrs = terms.values
  term_count = term_attrs.size
  matches    = Hash.new { |hash, aspect| hash[aspect] = [] }

  %i[lemma text].each do |field|
    term_count.times do |first|
      (0..MAX_NGRAM).each do |extra|
        last = first + extra
        # Stop once the window would run past the final term. A clipped
        # slice only repeats the previous, shorter window, whose matches
        # have already been recorded.
        break if last >= term_count

        ngram      = term_attrs[first..last].map { |attrs| attrs[field] }.join(' ').downcase
        window_ids = term_ids[first..last]

        @lexicons[ngram.to_sym]&.each do |lexicon|
          aspects = lexicon.aspects.present? ? lexicon.aspects : [lexicon.aspect]

          aspects.each do |aspect|
            next if aspect.blank?

            bucket = matches[aspect.to_sym]
            next if bucket.any? { |match| match.term_ids == window_ids }

            bucket << Hashie::Mash.new(
              term_ids: window_ids,
              ngram:    ngram,
              lexicon:  lexicon,
            )
          end
        end
      end
    end
  end

  Hash[matches.sort]
end

#languageObject



63
64
65
# File 'lib/opener/property_tagger/processor.rb', line 63

# Language of the document, read from the 'xml:lang' attribute of the node
# at 'KAF'. The value is cached in @language once it is truthy.
#
# @return [String, nil] the attribute value
def language
  return @language if @language

  @language = document.at_xpath('KAF').attr('xml:lang')
end

#pretty_print(document) ⇒ String

Format the output document properly.

TODO: this should be handled by Oga in a nice way.



193
194
195
196
197
198
199
200
201
202
# File 'lib/opener/property_tagger/processor.rb', line 193

# Re-serialises a document through REXML's Pretty formatter.
#
# The document's to_xml output is re-parsed with REXML, the
# :attribute_quote context option is set to :quote, and the result is
# written with the formatter in compact mode.
#
# @param document [#to_xml] document to format
# @return [String] the formatted XML with surrounding whitespace stripped
def pretty_print(document)
  rexml = REXML::Document.new(document.to_xml)
  rexml.context[:attribute_quote] = :quote

  printer = REXML::Formatters::Pretty.new.tap { |fmt| fmt.compact = true }

  buffer = +''
  printer.write(rexml, buffer)
  buffer.strip
end

#processString

Processes the input and returns the new KAF output.



48
49
50
51
52
53
54
55
56
57
58
59
60
61
# File 'lib/opener/property_tagger/processor.rb', line 48

# Runs the tagger: rebuilds the features and properties layers, writes one
# property per extracted aspect (numbered from 1 in iteration order), adds
# the linguistic processor entry and serialises the document.
#
# @return [String] pretty-printed XML when the pretty flag is truthy,
#   otherwise the document's to_xml output
def process
  add_features_layer
  add_properties_layer

  extract_aspects.each.with_index(1) do |(lemma, values), position|
    add_property lemma, values, position
  end

  add_linguistic_processor

  return pretty_print(document) if pretty

  document.to_xml
end

#termsObject



67
68
69
70
71
72
73
74
75
76
77
# File 'lib/opener/property_tagger/processor.rb', line 67

def terms
  unless @terms
    @terms = {}

    document.xpath('KAF/terms/term').each do |term|
      @terms[term.attr('tid').to_sym] = { lemma: term.attr('lemma'), text: term.attr('text')}
    end
  end

  @terms
end