Class: Dphil::TeiXML

Inherits:
Object
  • Object
show all
Defined in:
lib/dphil/tei_xml.rb

Instance Method Summary collapse

Constructor Details

#initialize(source) ⇒ TeiXML

Public: Initialize a TeiXML object



8
9
10
11
# File 'lib/dphil/tei_xml.rb', line 8

# Public: Initialize a TeiXML object.
#
# source - the raw TEI XML as a String. A nil, empty or whitespace-only
#          value is replaced with an empty TEI root element.
#
# Only stores the source in @raw_xml; nothing is parsed here.
def initialize(source)
  # nil.to_s is "", so a missing source takes the same path as a blank one
  # instead of raising NoMethodError on nil.strip.
  source = %(<TEI version="5.0" xmlns="http://www.tei-c.org/ns/1.0"></TEI>) if source.to_s.strip.empty?
  @raw_xml = source
end

Instance Method Details

#crop(expr) ⇒ Object

Public: Return a portion of the document as a new document

expr - a CSS selector or XPath expression

Returns a new document.



40
41
42
43
44
45
46
47
48
49
50
51
52
53
# File 'lib/dphil/tei_xml.rb', line 40

# Public: Return a portion of the document as a new document.
#
# expr - a CSS selector or XPath expression
#
# Everything matched by expr is serialized into a fresh TEI root. It is
# preceded by a <pre> element holding whatever page_of and line_of return for
# the match (a nil result contributes nothing) and followed by an empty
# <post> element.
#
# Returns a new document.
def crop(expr)
  matched = xml.search(expr)
  page_break = page_of(matched)
  line_break = line_of(matched)

  self.class.new(<<~XML)
    <TEI version="5.0" xmlns="http://www.tei-c.org/ns/1.0">
      <pre>#{page_break&.to_xml}#{line_break&.to_xml}</pre>
      #{matched.to_xml}
      <post></post>
    </TEI>
  XML
end

#crop_each(expr) ⇒ Object



55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
# File 'lib/dphil/tei_xml.rb', line 55

# Public: Return every match of an expression as its own new document.
#
# expr - a CSS selector or XPath expression
#
# Each matched node is serialized into a fresh TEI root, preceded by a <pre>
# element holding whatever page_of and line_of return for that node (a nil
# result contributes nothing) and followed by an empty <post> element.
#
# Returns the new documents, one per match, in match order.
def crop_each(expr)
  xml.search(expr).map do |node|
    page_break = page_of(node)
    line_break = line_of(node)

    self.class.new(<<~XML)
      <TEI version="5.0" xmlns="http://www.tei-c.org/ns/1.0">
        <pre>#{page_break&.to_xml}#{line_break&.to_xml}</pre>
        #{node.to_xml}
        <post></post>
      </TEI>
    XML
  end
end

#empty? ⇒ Boolean

Returns:

  • (Boolean)


31
32
33
# File 'lib/dphil/tei_xml.rb', line 31

# Public: Check whether the document contains any non-blank text.
#
# Returns true when the XPath query for text nodes with non-whitespace
# content matches nothing, false otherwise.
def empty?
  text_nodes = xml.xpath("//text()[normalize-space()]")
  text_nodes.empty?
end

#reject(expr) ⇒ Object

Public: Remove elements matching a CSS selector or XPath expression from the document, keeping any pb and lb elements they contain.

expr - a CSS selector or XPath expression

Returns a new document.



76
77
78
79
80
81
82
# File 'lib/dphil/tei_xml.rb', line 76

# Public: Remove elements from the document.
#
# expr - a CSS selector or XPath expression
#
# Works on a copy of the parsed document. Each matched node is replaced by
# the result of searching it for "pb, lb", so those descendants stay in the
# output while the rest of the node is dropped.
#
# Returns a new document.
def reject(expr)
  copy = xml.dup
  copy.search(expr).each { |node| node.replace(node.search("pb, lb")) }
  self.class.new(copy.to_xml)
end

#subst(expr, subst_text = nil) ⇒ Object

Public: Substitute elements from the document based on a CSS selector or XPath expression with ID-based token text-nodes.

expr - a CSS selector or XPath expression
subst_text - an optional text identifier

Returns a new document.



91
92
93
94
95
96
97
98
99
100
101
102
103
# File 'lib/dphil/tei_xml.rb', line 91

# Public: Substitute elements from the document with ID-based token
# text-nodes.
#
# expr       - a CSS selector or XPath expression
# subst_text - an optional text identifier used as the token label; when nil
#              the matched element's own name is used instead
#
# Each matched node is replaced by a text node of the form
# " {{label:id}} " followed by the result of searching the node for "pb, lb".
# Whitespace runs in both the label and the id attribute value are collapsed
# to "_". A node without an id attribute yields an empty id part.
#
# Returns a new document.
def subst(expr, subst_text = nil)
  # Read the document through the memoized xml accessor, as reject, crop and
  # the other methods here do, and dup it so the receiver is not modified.
  source = xml.dup
  label = subst_text&.to_s&.gsub(/\s+/, "_")

  source.search(expr).each do |node|
    node_id = node.attribute("id").to_s.gsub(/\s+/, "_")
    token = Nokogiri::XML::Text.new(" {{#{label || node.name}:#{node_id}}} ", source)

    replacement = Nokogiri::XML::NodeSet.new(source)
    replacement << token
    node.replace(replacement + node.search("pb, lb"))
  end
  self.class.new(source.to_xml)
end

#to_xml ⇒ Object (also known as: to_s)



25
26
27
# File 'lib/dphil/tei_xml.rb', line 25

# Public: Serialize the document.
#
# Returns whatever the parsed document's own to_xml produces.
def to_xml
  document = xml
  document.to_xml
end

#xml ⇒ Object

Return the parsed XML document, parsing the raw source on first use and reusing the result afterwards.



14
15
16
17
18
19
20
21
22
23
# File 'lib/dphil/tei_xml.rb', line 14

# Return the parsed document, parsing @raw_xml on first use.
#
# The source is parsed with the strict and noent options, the encoding is set
# to UTF-8, namespaces are removed, and the result of xml_normalize! is
# memoized in @xml.
#
# Returns the value produced by xml_normalize!.
# Raises RuntimeError (with the raw source in the message) when the parser
# reports a Nokogiri::XML::SyntaxError.
def xml
  return @xml if @xml

  # NOTE(review): noent presumably turns on entity substitution in the
  # parser; confirm that sources given to this class are trusted.
  doc = Nokogiri::XML(@raw_xml) { |config| config.strict.noent }
  doc.encoding = "UTF-8"
  doc.remove_namespaces!
  @xml = xml_normalize!(doc)
rescue Nokogiri::XML::SyntaxError => e
  raise "TEIDocument (source: #{@raw_xml}) caught exception: #{e}"
end