Module: MediaartsScraper::Page::CommonTableParser

Included in:
PageBase
Defined in:
lib/mediaarts_scraper/page/common_table_parser.rb

Constant Summary

KEY_SEPARATOR =
"/"

Instance Method Summary

Instance Method Details

#parse_common_key_value_table(table) ⇒ Object



6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
# File 'lib/mediaarts_scraper/page/common_table_parser.rb', line 6

# Flattens a key/value table into a Hash keyed by header text.
#
# Every `tbody/tr` row must hold as many `th` cells as `td` cells; the stripped
# text of the i-th `th` becomes the key for the i-th `td`.
#
# * A `td` with exactly one direct `p` child contributes that paragraph's
#   stripped text as its value. Each `div/div/dl` list below such a `td` adds
#   extra entries keyed "<th text><KEY_SEPARATOR><dt text>", whose value is the
#   stripped text of the `dd` at the same position as the `dt`.
# * Any other `td` contributes its whole stripped text.
#
# @param table [#xpath] table node (presumably a Nokogiri element -- confirm
#   with callers)
# @return [Hash{String => String}] row keys and nested "key/dt" keys mapped to
#   their stripped text values
# @raise [ParseError] if a row's `th` and `td` counts differ, or if a `dl` has
#   a different number of `dt` and `dd` children
def parse_common_key_value_table(table)
  result = {}

  table.xpath("tbody/tr").each do |tr|
    ths = tr.xpath("th")
    tds = tr.xpath("td")
    raise ParseError unless ths.count == tds.count

    keys = ths.map { |th| th.text.strip }

    values = tds.each_with_index.map do |td, i|
      paragraphs = td.xpath("p")
      next td.text.strip unless paragraphs.count == 1

      td.xpath("div/div/dl").each do |dl|
        dts = dl.xpath("dt").map { |dt| dt.text.strip }
        dds = dl.xpath("dd").map { |dd| dd.text.strip }
        raise ParseError unless dts.count == dds.count

        # Pair each dt with the dd at the same position. Iterating the dds
        # inside the dts would leave every nested key holding the last dd.
        dts.zip(dds).each do |dt, dd|
          result["#{keys[i]}#{KEY_SEPARATOR}#{dt}"] = dd
        end
      end

      paragraphs.first.text.strip
    end

    # Row-level keys are written after the nested ones, so on a key collision
    # the row-level value wins.
    keys.zip(values).each do |key, value|
      result[key] = value
    end
  end

  result
end

#parse_common_serial_rows_table(table) ⇒ Object



54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
# File 'lib/mediaarts_scraper/page/common_table_parser.rb', line 54

# Converts a table with a header row into one Hash per body row.
#
# Column names are the stripped texts of the `thead/tr/th` cells. For each
# `tbody/tr` row, every `td` yields the stripped text of its first child node,
# or of the `td` itself when it has no child. Names and values are paired by
# position. If any `td` in the row has a direct `a` child, the `href` attribute
# value of the first such anchor is stored under the extra key "href".
#
# @param table [#xpath] table node (presumably a Nokogiri element -- confirm
#   with callers)
# @return [Array<Hash>] one Hash per body row, in document order
def parse_common_serial_rows_table(table)
  column_names = table.xpath("thead/tr/th").map { |th| th.text.strip }

  table.xpath("tbody/tr").map do |row|
    cells = row.xpath("td")

    cell_texts = cells.map do |cell|
      # Only the first child's text is used when the cell has children.
      source = cell.child || cell
      source.text.strip
    end

    record = column_names.zip(cell_texts).to_h

    link_cell = cells.find { |cell| cell.xpath("a").first }
    if link_cell
      anchor = link_cell.xpath("a").first
      record["href"] = anchor.attributes["href"].value
    end

    record
  end
end