Class: XMLRowFinder

Inherits:
Object
  • Object
show all
Defined in:
lib/xml_row_finder.rb

Instance Attribute Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(raws, debug: false) ⇒ XMLRowFinder

Returns a new instance of XMLRowFinder.



12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
# File 'lib/xml_row_finder.rb', line 12

# Parses the supplied markup, works out which element path repeats most
# often (the "rows") and which ancestor element contains all of those rows.
#
# Instance state set here:
#   @doc        - Rexle copy of the parsed document, taken before attributes
#                 are stripped from the working copy
#   @doc2       - Nokogiri copy (taken after stripping) used to count matches
#   @to_a       - [occurrences, xpath] pairs, in order of first appearance
#   @rows       - elements matched by the chosen rows xpath
#   @xpath      - the rows xpath
#   @cont_xpath - xpath of the container element
#
# @param raws  raw markup handed straight to Nokorexi.new
#              (accepted forms depend on Nokorexi -- TODO confirm)
# @param debug [Boolean] when true, diagnostic output is written to stdout
# @raise [ArgumentError] if the document has no elements beneath its root or
#   no candidate xpath matches more than one element
def initialize(raws, debug: false)

  @debug = debug

  doc = Nokorexi.new(raws, filter: true).to_doc

  # Snapshot taken before the loop below mutates +doc+.
  @doc = Rexle.new(doc.root.xml)

  xpaths = []

  doc.root.each_recursive do |e|
    # NOTE(review): presumably clears every attribute so the backtracked
    # xpath carries no attribute predicates -- confirm against Rexle.
    e.attributes.delete
    xpaths << e.backtrack.to_xpath
  end

  # Count each distinct xpath in a single pass; grouping keeps the order of
  # first appearance and the original string objects.
  @to_a = xpaths.group_by { |xp| xp }.values.map { |grp| [grp.size, grp.first] }
  raise ArgumentError, 'no elements found beneath the document root' if @to_a.empty?

  xpath = @to_a.max_by(&:first).last

  # Build candidate xpaths by moving trailing path segments into a
  # predicate one at a time, e.g. a/b/c -> a/b[c] -> a[b/c] -> [a/b/c].
  segments = xpath.split('/')
  candidates = [xpath]
  tail = []

  until segments.empty?
    tail.unshift(segments.pop)
    candidates << "#{segments.join('/')}[#{tail.join('/')}]"
  end

  # using Nokogiri since Rexle has a bug with xpath predicates
  #
  @doc2 = Nokogiri::XML(doc.root.xml)

  # The final candidate has an empty path in front of its predicate, so it
  # is left out.
  counted = candidates[0..-2].map do |candidate|
    [@doc2.xpath(candidate).length, candidate]
  end

  puts 'a5: ' + counted.inspect if @debug

  # Prefer the candidate with the shortest leading path that still matches
  # more than one element.
  match = counted.reverse.detect { |num, _candidate| num > 1 }
  raise ArgumentError, 'no xpath matching more than one element was found' unless match

  rows_xpath = match.last

  # NOTE(review): Document and XPath are unqualified here; presumably REXML
  # is mixed in outside this view -- confirm.
  doc3 = Document.new @doc.root.xml
  @rows = XPath.match(doc3, rows_xpath)
  @xpath = rows_xpath
  #@xpath = BacktrackXPath.new(@rows.first).to_xpath.gsub("[@class='']",'')

  last_row = @rows.last
  puts '@xpath: ' + @xpath.inspect if @debug

  # find the container element: start from the rows path with any predicate
  # removed and walk upwards until the element's markup includes the last row
  xpath = @xpath[/^[^\[]+/]
  axpath = xpath.split('/')

  e = XPath.first(doc3, xpath)
  puts 'e: ' + e.to_s if @debug

  last_row_xml = last_row.to_s

  until e.nil? || e.to_s.include?(last_row_xml)
    axpath.pop
    e = XPath.first(doc3, axpath.join('/'))
  end

  @cont_xpath = axpath.join('/')

end

Instance Attribute Details

#to_a ⇒ Object (readonly)

Returns the value of attribute to_a.



10
11
12
# File 'lib/xml_row_finder.rb', line 10

# Reader for the [occurrences, xpath] pairs stored in @to_a by the
# constructor.
def to_a
  @to_a
end

Instance Method Details

#body ⇒ Object

returns the container element for all rows



74
75
76
# File 'lib/xml_row_finder.rb', line 74

# Looks up the container element in @doc using the container xpath and
# returns its XML wrapped in a new Rexle document.
def body
  container = @doc.element(@cont_xpath)
  Rexle.new(container.xml)
end

#body_xpath ⇒ Object

returns the xpath pointing to the container element for all rows



80
81
82
# File 'lib/xml_row_finder.rb', line 80

# Reader for the xpath of the element that contains all of the rows
# (@cont_xpath, set by the constructor).
def body_xpath
  @cont_xpath
end

#rows ⇒ Object

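Returns the rows: an array of the XML element objects matched by the rows XPath. (The constructor obtains them with XPath.match on a Document, not from the Nokogiri document.)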



87
88
89
# File 'lib/xml_row_finder.rb', line 87

# Reader for the row elements stored in @rows by the constructor.
def rows
  @rows
end

#to_xpath ⇒ Object Also known as: rows_xpath

returns the xpath pointing to the rows



93
94
95
# File 'lib/xml_row_finder.rb', line 93

# Reader for the xpath that selects the rows (@xpath, set by the
# constructor).
def to_xpath
  @xpath
end