Class: XMLRowFinder

Inherits:
Object
  • Object
show all
Defined in:
lib/xml_row_finder.rb

Instance Attribute Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(raws, debug: false) ⇒ XMLRowFinder

Returns a new instance of XMLRowFinder.



12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
# File 'lib/xml_row_finder.rb', line 12

# Analyses an XML/HTML document to locate its repeating "row" elements.
#
# The xpath of every element is collected; the most frequent one is taken as
# the row pattern. From it the constructor derives the xpath selecting the
# rows (@xpath) and the xpath of the element containing them (@cont_xpath).
#
# @param raws [String] either a URL (a string starting with "http") or a raw
#   XML string
# @param debug [Boolean] stored in @debug; not otherwise used by this method
def initialize(raws, debug: false)

  @debug = debug

  # \A anchors at the start of the string. The earlier ^ anchor matched at the
  # start of any line, so multi-line XML containing a line that begins with
  # "http" was wrongly treated as a URL.
  doc = if raws =~ /\Ahttp/

    nki = Nokorexi.new(raws) do |doc1|
      # blank out the inline event handler attributes
      %w[onclick onmousedown].each do |handler|
        doc1.xpath("//*[@#{handler}]").each do |e|
          e.attributes[handler].value = ''
        end
      end
    end

    nki.to_doc

  else
    Rexle.new(raws)
  end

  # separate copy built from the XML before the attribute removal below
  @doc = Rexle.new(doc.xml)

  paths = []

  doc.root.each_recursive do |e|
    # NOTE(review): presumably strips every attribute so that the generated
    # xpaths carry element names only -- confirm against Rexle
    e.attributes.delete
    paths << e.backtrack.to_xpath
  end

  # [occurrences, xpath] pairs, one per distinct xpath, in first-seen order
  @to_a = counted = paths.map { |path| [paths.count(path), path] }.uniq
  xpath = counted.max_by(&:first).last

  # Build candidate xpaths by moving trailing segments of the most frequent
  # xpath into a predicate, e.g. a/b/c -> a/b[c] -> a[b/c] -> [a/b/c]
  segments = xpath.split('/')
  candidates = [xpath]
  tail = []

  until segments.empty?
    tail << segments.pop
    candidates << segments.join('/') + "[%s]" % tail.reverse.join('/')
  end

  # using Nokogiri since Rexle has a bug with xpath predicates
  #
  @doc2 = Nokogiri::XML(doc.root.xml)

  # the final candidate has an empty element path, so it is left out
  matches = candidates[0..-2].map do |xpath2|
    [@doc2.xpath(xpath2).length, xpath2]
  end

  # starting from the candidate with the shortest element path, pick the
  # first one which selects more than one node
  @xpath = matches.reverse.detect { |num, _xpath2| num > 1 }.last

  last_row = @doc2.xpath(@xpath).last

  # find the container element: begin at the rows xpath without its predicate
  # and walk up one segment at a time until the element's XML includes the
  # last row
  # NOTE(review): last_row is a Nokogiri node handed to String#include?, so
  # this relies on an implicit string conversion -- confirm what is compared
  xpath = @xpath[/^[^\[]+/]
  axpath = xpath.split('/')
  e = doc.element xpath

  until e.xml.include?(last_row)
    axpath.pop
    e = doc.element axpath.join('/')
  end

  @cont_xpath = axpath.join('/')

end

Instance Attribute Details

#to_a ⇒ Object (readonly)

Returns the value of attribute to_a.



10
11
12
# File 'lib/xml_row_finder.rb', line 10

# Returns the value of @to_a, which the constructor fills with
# [occurrence count, xpath] pairs, one pair per distinct element xpath.
#
# @return [Array<Array>]
def to_a
  @to_a
end

Instance Method Details

#body ⇒ Object

returns the container element for all rows



83
84
85
# File 'lib/xml_row_finder.rb', line 83

# Returns the container element for all rows, as a new Rexle document built
# from the XML of the element found at @cont_xpath.
#
# @return [Rexle]
def body
  container = @doc.element(@cont_xpath)
  Rexle.new(container.xml)
end

#body_xpath ⇒ Object

returns the xpath pointing to the container element for all rows



89
90
91
# File 'lib/xml_row_finder.rb', line 89

# Returns the xpath pointing to the container element for all rows.
#
# @return [String] the value of @cont_xpath
def body_xpath
  @cont_xpath
end

#rows ⇒ Object

Returns the rows. Object returned: an array of Nokogiri XML Element objects.



96
97
98
# File 'lib/xml_row_finder.rb', line 96

# Returns the rows: the result of evaluating the rows xpath (@xpath)
# against the Nokogiri document held in @doc2.
def rows
  @doc2.xpath(@xpath)
end

#to_xpath ⇒ Object Also known as: rows_xpath

returns the xpath pointing to the rows



102
103
104
# File 'lib/xml_row_finder.rb', line 102

# Returns the xpath pointing to the rows.
#
# @return [String] the value of @xpath
def to_xpath
  @xpath
end