Module: RemoteTable::ProcessedByNokogiri

Defined in:
lib/remote_table/processed_by_nokogiri.rb

Overview

Mixed in to process XML and XHTML.

Constant Summary

WHITESPACE =
/\s+/
SINGLE_SPACE =
' '
SOFT_HYPHEN =
'­'

Instance Method Summary

Instance Method Details

#_each ⇒ Object

Yield each row using Nokogiri.



9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
# File 'lib/remote_table/processed_by_nokogiri.rb', line 9

# Parses the local copy as XML/HTML with Nokogiri and yields one entry per
# matched row.
#
# Row nodes are selected with +row_css+ when it is set, otherwise with
# +row_xpath+. Within each row, cells are selected with +column_css+, then
# +column_xpath+; when neither is set the row node itself is the only cell.
# Every cell's text has runs of whitespace collapsed to a single space and is
# stripped of leading/trailing whitespace.
#
# When +headers+ is +:first_row+, the first matched row supplies the header
# names (blank cells are dropped) and is not yielded. Rows in which no cell is
# present are skipped unless +keep_blank_rows+ is set.
#
# Yields the plain array of cell strings when +headers+ is falsy, otherwise
# the result of +zip(current headers, cell strings)+.
#
# Raises ::ArgumentError when neither +row_css+ nor +row_xpath+ is set.
# +local_copy.cleanup+ always runs, whether or not an error is raised.
def _each
  require 'nokogiri'
  require 'cgi'

  # Work on a local: it is replaced by the real header names once the
  # header row has been consumed.
  header_row = headers

  raise ::ArgumentError, "[remote_table] Need :row_css or :row_xpath in order to process XML or HTML" unless row_css || row_xpath

  delete_harmful!
  transliterate_whole_file_to_utf8!

  document = nokogiri_class.parse(unescaped_xml_without_soft_hyphens, nil, RemoteTable::EXTERNAL_ENCODING)
  row_nodes = row_css ? document.css(row_css) : document.xpath(row_xpath)

  row_nodes.each do |row_node|
    found_content = false

    cell_nodes =
      if column_css
        row_node.css(column_css)
      elsif column_xpath
        row_node.xpath(column_xpath)
      else
        [row_node]
      end

    cells = cell_nodes.map do |cell_node|
      text = assume_utf8(cell_node.content.dup)
      text.gsub!(WHITESPACE, SINGLE_SPACE)
      text.strip!
      # Only track presence when blank rows are going to be filtered out.
      found_content = true if !found_content && !keep_blank_rows && text.present?
      text
    end

    # The header row is consumed here and never yielded.
    if header_row == :first_row
      header_row = cells.select(&:present?)
      next
    end

    next unless keep_blank_rows || found_content

    if headers
      yield zip(header_row, cells)
    else
      yield cells
    end
  end
ensure
  local_copy.cleanup
end