Class: Treat::Workers::Processors::Parsers::Enju

Inherits:
Object
  • Object
show all
Defined in:
lib/treat/workers/processors/parsers/enju.rb

Overview

This class is a wrapper for the Enju syntactic parser for English. Given an entity’s string value, the parser formats it, runs it through Enju, and parses the XML output by Enju using the Nokogiri XML reader. It creates wrappers for the sentences, syntactical phrases and tokens that Enju identified.

Original paper: Takuya M., Yusuke M., and Jun’ichi T. 2007. Efficient HPSG Parsing with Supertagging and CFG-filtering. In Proceedings of IJCAI 2007.

Constant Summary collapse

Ectc =

A hash of Enju cat tags mapped to word categories.

Treat.tags.enju.cat_to_category
Ecxtp =

A hash of Enju cat/xcat pairs mapped to PTB tags.

Treat.tags.enju.xcat_to_ptb
@@parser =

Create only one process and hold on to it.

nil

Class Method Summary collapse

Class Method Details

.add_edges(entity2) ⇒ Object

Add edges a posteriori to a parsed entity.



181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
# File 'lib/treat/workers/processors/parsers/enju.rb', line 181

# Add edges a posteriori to a parsed entity.
#
# Reads the edges recorded in @@edges_table (source entity id => hash of
# argument id => edge type) and links each source entity to the entity
# that @@id_table maps the argument id to.
#
# entity2 - an entity of the parsed tree; its root is used to look up
#           the edge sources by id.
def self.add_edges(entity2)
  root = entity2.root
  # A single pass over the table visits every recorded edge; there is
  # no need to repeat it once per word or phrase of the tree.
  @@edges_table.each_pair do |id, edges|
    next if edges.nil?
    source = root.find(id)
    next if source.nil?
    edges.each_pair do |argument, type|
      # Skip this argument if we
      # don't know the target node.
      next if argument == 'unk'
      target = @@id_table[argument]
      # Same when the argument names an id that was never recorded.
      next if target.nil?
      source.link(target, type.intern)
    end
  end
end

.build(xml, remove_last = false) ⇒ Object

Parses an Enju XML output file using the Nokogiri XML reader and converts that structure into a tree of wrappers for textual entities.



81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
# File 'lib/treat/workers/processors/parsers/enju.rb', line 81

# Parses Enju's XML output with the Nokogiri XML reader and converts
# that structure into a tree of wrappers for textual entities.
#
# 'sentence' elements become Treat::Entities::Sentence, 'cons' elements
# become Treat::Entities::Phrase, and the text found after a 'tok'
# element becomes a token carrying that tok's attributes. Ids and edges
# are recorded in @@id_table and @@edges_table along the way.
#
# xml         - String holding the XML to read.
# remove_last - accepted for interface compatibility; not used here.
#
# Returns the entity the walk ends on, or nil when xml is nil or blank.
def self.build(xml, remove_last = false)
  # Nothing to read, so there is no tree to build.
  return nil if xml.nil? || xml.strip.empty?
  # Read in the XML file.
  reader = Nokogiri::XML::Reader.from_memory(xml)
  entity = nil
  pd = 0
  # Id, attributes and edges of the most recent 'tok' element, kept
  # until the text node that follows it is turned into a token.
  tok_id = nil
  tok_attributes = nil
  tok_edges = nil
  # Read the XML file entity by entity.
  while reader.read
    # The depth in the XML tree.
    cd = reader.depth
    # If we are at the end of the
    # children stack, pop up.
    if pd > cd
      parent = entity && entity.parent
      # Stay on the current entity when it has no parent, so that
      # closing the outermost element does not discard the tree.
      entity = parent unless parent.nil?
    end
    # If an end element has been reached,
    # change the depth and pop up on next
    # iteration.
    if reader.node_type ==
      Nokogiri::XML::Reader::TYPE_END_ELEMENT
      pd = cd
      next
    end
    # Get and format attributes and edges.
    attributes = reader.attributes
    id = attributes.delete('id')
    new_attr = {}; edges = {}
    unless attributes.size == 0
      new_attr, edges =
      cleanup_attributes(reader.name, attributes)
    end
    # Create the appropriate entity for the
    # element.
    case reader.name
    when 'sentence'
      entity = Treat::Entities::Sentence.new('')
      @@id_table[id] = entity.id
      @@edges_table[entity.id] = edges
      entity.features = new_attr
    when 'cons'
      entity = entity <<
      Treat::Entities::Phrase.new('')
      @@id_table[id] = entity.id
      @@edges_table[entity.id] = edges
      entity.features = new_attr
    when 'tok'
      # The token itself is created when its text is read. A text
      # node has no attributes of its own, so the tok's id has to be
      # remembered here together with its attributes and edges.
      tok_id = id
      tok_attributes = new_attr
      tok_edges = edges
    else
      current_value = reader.value.to_s.gsub(/\s+/, "")
      unless current_value.empty?
        entity = entity <<
        Treat::Entities::Token.from_string(current_value)
        if entity.is_a?(Treat::Entities::Word)
          entity.features = tok_attributes || {}
          # Register the word under the id of its tok element.
          @@id_table[tok_id] = entity.id unless tok_id.nil?
          @@edges_table[entity.id] = tok_edges
        else
          # Do something useful here
          entity.set :tag, 'SYM'
        end
      end
    end
    pd = cd
  end
  entity
end

.cleanup_attributes(name, attributes) ⇒ Object

Helper function to convert Enju attributes to Treat attributes.



203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
# File 'lib/treat/workers/processors/parsers/enju.rb', line 203

# Helper function to convert Enju attributes to Treat attributes.
#
# name       - name of the XML element the attributes belong to; 'tok'
#              selects the word-category handling of 'cat', any other
#              name selects the cat/xcat tag lookup.
# attributes - Hash of attribute name => value. The 'pred' entry is
#              removed from this hash.
#
# Returns a two-element array: the feature hash (symbol keys) and the
# edges hash (value of arg1/arg2 => value of 'pred').
def self.cleanup_attributes(name, attributes)
  features = {}
  links = {}
  # 'pred' is not a feature: it labels every arg1/arg2 edge below.
  predicate = attributes.delete('pred')

  attributes.each_pair do |raw_key, value|
    key = raw_key.strip
    case key
    when 'arg1', 'arg2'
      links[value] = predicate
    when 'cat'
      features[:cat] = value
      if name == 'tok'
        suffix = value[-1]
        # A trailing P or X (except on 'PN') is recorded as the
        # :saturated flag and stripped before the category lookup.
        if value.length > 1 && value != 'PN' &&
           (suffix == 'P' || suffix == 'X')
          features[:saturated] = (suffix == 'P')
          value = value[0..-2]
        end
        features[:category] = Ectc[value]
      else
        xcat = attributes['xcat']
        # Rows of Ecxtp are read as [cat, xcat, tag].
        rows = Ecxtp.select do |row|
          row[0] == value && row[1] == xcat
        end
        features[:tag] = (rows.size == 0) ? 'FW' : rows[0][2]
      end
    else
      features[key.to_sym] = value
    end
  end

  # Handle naming conventions: pos => tag (Penn tag set), base => lemma.
  if attributes.has_key?('pos')
    features[:tag] = features.delete(:pos)
    features[:tag_set] = :penn
  end
  features[:lemma] = features.delete(:base) if attributes.has_key?('base')

  [features, links]
end

.link_heads(entity) ⇒ Object

Link the head and sem_head to their entities.



163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
# File 'lib/treat/workers/processors/parsers/enju.rb', line 163

# Link the head and sem_head to their entities.
#
# For every phrase of the entity that has a :head or :sem_head feature,
# the feature's value is looked up in @@id_table, the phrase is linked
# to the result with the edge type 'head' or 'sem_head', and the
# feature is then unset on the phrase.
def self.link_heads(entity)
  entity.each_phrase do |phrase|
    %w[head sem_head].each do |kind|
      feature = kind.to_sym
      next unless phrase.has?(feature)
      target = @@id_table[phrase.public_send(feature)]
      phrase.link(target, kind, true, -1)
      phrase.unset(feature)
    end
  end
end

.parse(entity, options = {}) ⇒ Object

Parse the entity into its syntactical phrases using Enju.

Options: none.



33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
# File 'lib/treat/workers/processors/parsers/enju.rb', line 33

# Parse the entity into its syntactical phrases using Enju.
#
# The entity's text is normalised with valid_text, written to the
# process returned by proc, and the first line of output is handed to
# build. The children of the tree that build returns are moved under
# the entity, then heads and edges are linked.
#
# entity  - the entity to parse; it must not have children yet.
# options - accepted for interface compatibility. Options: none.
#
# Returns nil, after emitting a warning, when no parse was obtained.
def self.parse(entity, options = {})

  entity.check_hasnt_children
  val = entity.to_s

  @@id_table = {}
  @@edges_table = {}

  stdin, stdout, stderr = proc
  begin
    text, remove_last = valid_text(val)
    stdin.puts(text + "\n")
    # gets returns nil when the process produced no output; treat
    # that like an unparsable text instead of handing nil to build.
    output = stdout.gets
    parsed = output && build(output, remove_last)
  ensure
    # proc starts a new process on every call, so the pipes opened
    # for this parse are released here.
    [stdin, stdout, stderr].each do |io|
      io.close if io && !io.closed?
    end
  end

  unless parsed
    warn "Warning - Enju couldn't " +
    "parse the text '#{entity.short_value}'."
    return
  end

  entity.remove_all!
  parsed.children.each do |child|
    entity << child
  end
  # Remove the period we added at the end.
  if remove_last
    last = entity.punctuations[-1]
    entity.remove!(last) unless last.nil?
  end

  link_heads(entity)
  add_edges(entity)
end

.proc ⇒ Object

Return the process running Enju.



68
69
70
71
72
73
74
75
76
# File 'lib/treat/workers/processors/parsers/enju.rb', line 68

# Return the process running Enju.
#
# Starts "enju -xml -i" through Open3.popen3, stores the result in
# @@parser and returns it. Note that a new process is started on every
# call; nothing in this method closes a previously stored one.
#
# Raises Treat::Exception when the process cannot be started. Only
# StandardError is converted, so interrupts and exit requests are
# passed through unchanged.
def self.proc
  @@parser = ::Open3.popen3("enju -xml -i")
rescue StandardError => e
  raise Treat::Exception,
  "Couldn't initialize Enju: #{e.message}."
end

.valid_text(val) ⇒ Object

Validate a text - Enju wants period to parse a sentence.



150
151
152
153
154
155
156
157
158
159
160
# File 'lib/treat/workers/processors/parsers/enju.rb', line 150

# Validate a text - Enju wants period to parse a sentence.
#
# val - the String to prepare.
#
# Returns [text, remove_last]. Without any period in val, a period is
# appended and remove_last is true. Otherwise every period is removed,
# one is appended unless the result ends in '!' or '?', and
# remove_last is false.
def self.valid_text(val)
  return ["#{val}.", true] unless val.include?('.')

  stripped = val.delete('.')
  text = stripped.end_with?('!', '?') ? stripped : "#{stripped}."
  [text, false]
end