Class: Treat::Workers::Formatters::Readers::XML

Inherits:
Object
  • Object
show all
Defined in:
lib/treat/workers/formatters/readers/xml.rb

Constant Summary collapse

DefaultOptions =

By default, don’t back up the XML document while cleaning it.

{
  :keep_html => false
}
@@xml_reader =

Hold one instance of the XML cleaner.

nil

Class Method Summary collapse

Class Method Details

.read(document, options = {}) ⇒ Object

Read the XML document and strip it of its markup. Also segments and tokenizes the text.

Options:

  • (Boolean) :keep_xml => whether to back up the XML markup while cleaning.



22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
# File 'lib/treat/workers/formatters/readers/xml.rb', line 22

# Reads an XML document through the Stanford CoreNLP pipeline
# (:tokenize, :ssplit, :cleanxml) and rebuilds it as sentence
# and token entities.
#
# This reader is currently switched off: the first statement
# raises, so nothing below it ever executes. The remainder is
# kept as the disabled implementation.
#
# @param document the document entity to read; the disabled body
#   reads +document.file+ and writes +document.value+.
# @param options [Hash] merged over DefaultOptions by the disabled
#   body; only +:backup+ is consulted there.
# @return the same +document+ (only reachable once the guard is removed)
# @raise [RuntimeError] always, with the message 'Not implemented.'
def self.read(document, options = {})
  # Guard: the reader is disabled, every call fails here.
  raise 'Not implemented.'

  # ---- Disabled implementation (unreachable) ----

  options = DefaultOptions.merge(options)
  raw_xml = File.read(document.file)

  # Build the CoreNLP pipeline once and reuse it across calls.
  @@xml_reader ||= StanfordCoreNLP.load(:tokenize, :ssplit, :cleanxml)

  annotation = StanfordCoreNLP::Annotation.new(raw_xml)
  @@xml_reader.annotate(annotation)

  annotation.get(:sentences).each do |core_sentence|
    sentence_entity = Treat::Entities::Sentence.from_string(core_sentence.to_s, true)

    core_sentence.get(:tokens).each do |core_token|
      # Undo the escaped slashes in the token text.
      token_text = core_token.value.to_s.strip.gsub('\/', '/')
      # Drop tokens that are nothing but a single markup tag.
      next if token_text =~ /^<[^>]+>$/

      token_entity = Treat::Entities::Token.from_string(token_text)

      # Copy the enclosing XML tags (when present) as plain strings.
      tags = core_token.get(:xml_context)
      if tags
        context = []
        tags.each { |tag| context << tag.to_s }
        token_entity.set(:xml_context, context)
      end

      sentence_entity << token_entity
    end

    # NOTE(review): `section` is not defined anywhere in this method,
    # and the freshly built Zone is only used for its truthiness —
    # this looks unfinished; confirm the intended container.
    section << sentence_entity if Treat::Entities::Zone.from_string('')

    # NOTE(review): the key read here is :backup, while DefaultOptions
    # declares :keep_html and the docs mention :keep_xml — confirm which
    # name is intended. This also runs once per sentence.
    document.set(:xml_value, CGI.escapeHTML(annotation.to_s)) if options[:backup]

    document.value = ''
  end

  document.set(:format, 'xml')
  document
end