Class: MediaWikiTableScraper

Inherits:
Object
  • Object
show all
Defined in:
lib/mediawiki_table_scraper.rb

Instance Attribute Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(url) ⇒ MediaWikiTableScraper

Returns a new instance of MediaWikiTableScraper.



13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
# File 'lib/mediawiki_table_scraper.rb', line 13

# Loads the document at +url+ through Nokorexi and converts every element
# matching the +.wikitable+ CSS selector into a list of records.
#
# The result is stored in @tables: one array per matched table, each holding
# one hash per data row. Hash keys are the header texts of the table's first
# row, downcased and converted to symbols; values are the cell texts.
#
# @param url [String] location passed straight to Nokorexi.new
#   (presumably a wiki page URL — confirm against Nokorexi's documentation)
def initialize(url)
  doc = Nokorexi.new(url).to_doc

  @tables = doc.root.css('.wikitable').map do |table|
    header, *body = table.xpath('tr')

    # A table without any rows has no header to take column names from;
    # produce an empty record list instead of calling xpath on nil.
    next [] unless header

    # fetch the column names
    names = header.xpath('th/text()').map { |label| label.downcase.to_sym }

    body.map do |row|
      values = row.xpath('td').map do |cell|
        if cell.has_elements?
          # Mixed content: text children are kept verbatim, element children
          # have their markup tags stripped; the pieces are joined by a space.
          cell.children.map do |child|
            child.is_a?(String) ? child : child.xml.gsub(/<\/?\w+[^>]*>/, '')
          end.join(' ')
        else
          cell.text.to_s
        end
      end

      # Pair each column name with the cell value at the same position.
      names.zip(values).to_h
    end
  end
end

Instance Attribute Details

#tables ⇒ Object (readonly)

Returns the value of attribute tables.



11
12
13
# File 'lib/mediawiki_table_scraper.rb', line 11

# Reader for the data collected by #initialize.
#
# @return [Array<Array<Hash>>] the value of @tables: one array per matched
#   table, each containing one hash per data row
def tables
  @tables
end

Instance Method Details

#to_a ⇒ Object



51
52
53
# File 'lib/mediawiki_table_scraper.rb', line 51

# Array form of the scraper: hands back the very object stored in @tables
# (no copy is made).
#
# @return [Array<Array<Hash>>] one array of row hashes per matched table
def to_a
  @tables
end