Class: CSVKit

Inherits:
Object
  • Object
show all
Defined in:
lib/csvkit/csvkit.rb,
lib/csvkit/version.rb,
lib/csvkit/middleware.rb

Defined Under Namespace

Classes: Middleware

Constant Summary collapse

VERSION =
"0.1.4"

Instance Attribute Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(body_content, options = {}) ⇒ CSVKit

Returns a new instance of CSVKit.



9
10
11
12
13
14
# File 'lib/csvkit/csvkit.rb', line 9

# Stores the content to convert together with the effective options.
#
# @param body_content [Object] kept as-is in @content; to_csv later parses it as HTML
# @param options [Hash] caller overrides, merged over the default
#   { :show_link => false }
# @return [CSVKit]
def initialize(body_content, options = {})
  defaults = { :show_link => false }

  @options = defaults.merge(options)
  @content = body_content
end

Instance Attribute Details

#content ⇒ Object

Returns the value of attribute content.



5
6
7
# File 'lib/csvkit/csvkit.rb', line 5

# Reader for the content attribute.
#
# @return [Object] the value held in @content (assigned from body_content
#   by the constructor)
def content
  @content
end

#options ⇒ Object (readonly)

Returns the value of attribute options.



7
8
9
# File 'lib/csvkit/csvkit.rb', line 7

# Read-only accessor for the options attribute.
#
# @return [Hash] the value held in @options (the constructor stores the
#   caller's options merged over { :show_link => false })
def options
  @options
end

Instance Method Details

#clean_cell_string(cell_string) ⇒ Object



46
47
48
49
50
51
52
53
# File 'lib/csvkit/csvkit.rb', line 46

# Normalizes the text of a single table cell before it is written out.
#
# Steps, in order: every whitespace character (newlines, tabs, non-breaking
# spaces, ...) becomes a plain space; double quotes are prefixed with a
# backslash; runs of spaces collapse to one; commas and dollar signs are
# removed; surrounding whitespace is stripped.
#
# Numeric-looking cells are returned as cleaned strings like any other cell.
# The former numeric branch called String#to_b, which core Ruby does not
# define, so it raised NoMethodError whenever it was reached.
#
# @param cell_string [String] raw cell text
# @return [String] the cleaned cell text
def clean_cell_string(cell_string)
  # [[:space:]] already covers "\n", so no separate newline pass is needed.
  cleaned = cell_string.gsub(/[[:space:]]/, ' ')
  # NOTE(review): the CSV writer used by to_csv quotes fields itself, so this
  # backslash escape may be redundant -- confirm before removing it.
  cleaned = cleaned.gsub('"', '\"')
  # Only plain spaces remain at this point, so squeeze collapses every run.
  cleaned = cleaned.squeeze(' ')
  cleaned = cleaned.gsub(/[\,\$]/, '')
  cleaned.strip
end

#header_with_colspan?(cell) ⇒ Boolean

Returns:

  • (Boolean)


55
56
57
# File 'lib/csvkit/csvkit.rb', line 55

# Tells whether a cell is a header cell (th) that carries a colspan attribute.
#
# @param cell [#name, #key?] table cell node
# @return [Boolean] false unless the cell's name is 'th'; otherwise whatever
#   cell.key?('colspan') reports
def header_with_colspan?(cell)
  return false unless cell.name == 'th'

  cell.key?('colspan')
end

#is_numeric?(string) ⇒ Boolean

Returns:

  • (Boolean)


59
60
61
62
# File 'lib/csvkit/csvkit.rb', line 59

# Tells whether the given value reads as a number.
#
# A string made up solely of ASCII digits counts as numeric; anything else is
# numeric only if Kernel#Float accepts it.
#
# The check is made against the +string+ argument. It previously inspected
# +self+ (the receiver object), so the argument was never examined.
#
# @param string [String, Object] candidate value
# @return [Boolean] true when the value is numeric, false otherwise
def is_numeric?(string)
  # \A and \z anchor the whole string; ^ and $ would accept "abc\n123".
  return true if string.is_a?(String) && string =~ /\A\d+\z/

  Float(string)
  true
rescue ArgumentError, TypeError
  # Float raises ArgumentError for malformed text and TypeError for nil
  # and other unconvertible objects.
  false
end

#to_csv ⇒ Object



16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
# File 'lib/csvkit/csvkit.rb', line 16

# Converts every table row found in the stored content into delimited text.
#
# The content is parsed as HTML with Nokogiri; each tr inside a table becomes
# one output row built from its direct td/th children. Despite the method
# name, fields are separated by tabs. The result is UTF-16LE text that
# starts with a byte order mark.
#
# @return [String] UTF-16LE encoded, BOM-prefixed tab-separated rows
# @raise [RuntimeError] when no output was produced (e.g. no table rows)
def to_csv
  doc = Nokogiri::HTML(@content)

  tsv_str = CSV.generate(col_sep: "\t", headers: :first_row, encoding: 'utf-8') do |tsv|
    doc.xpath('//table//tr').each do |row|
      tsv_row = []
      row.xpath('td | th').each do |cell|
        # A header cell spanning several columns is written once per column.
        # A zero, negative or non-numeric colspan still yields one cell
        # instead of silently dropping the header.
        repeat = header_with_colspan?(cell) ? [cell['colspan'].to_i, 1].max : 1

        # With :show_link set, a cell containing a link is exported as the
        # link target. A link without an href falls back to the cell text
        # rather than handing nil to clean_cell_string.
        link = options[:show_link] ? cell.search('a').first : nil
        text = (link && link['href']) || cell.text

        # The value is the same for every repetition, so compute it once.
        value = clean_cell_string(text)
        repeat.times { tsv_row << value }
      end
      tsv << tsv_row
    end
  end

  raise "command failed" if tsv_str.to_s.strip.empty?

  # Non-mutating encode keeps this working when string literals are frozen.
  bom = "\xEF\xBB\xBF".encode(Encoding::UTF_16LE, Encoding::UTF_8, invalid: :replace, undef: :replace, replace: '')
  bom + tsv_str.encode(Encoding::UTF_16LE, Encoding::UTF_8, invalid: :replace, undef: :replace, replace: '')
end