Class: CsvUtils

Inherits: Object
Defined in:
lib/csvutils/version.rb,
lib/csvutils/cut.rb,
lib/csvutils/head.rb,
lib/csvutils/stat.rb,
lib/csvutils/test.rb,
lib/csvutils/split.rb,
lib/csvutils/utils.rb,
lib/csvutils/header.rb

Overview

note: for now CsvUtils is a class!!! NOT a module - change - why? why not?

Constant Summary

MAJOR   = 0
MINOR   = 3
PATCH   = 0
VERSION = [MAJOR,MINOR,PATCH].join('.')

todo: namespace inside version or something - why? why not?

Class Method Summary

Class Method Details



.banner ⇒ Object

# File 'lib/csvutils/version.rb', line 16

def self.banner
  "csvutils/#{VERSION} on Ruby #{RUBY_VERSION} (#{RUBY_RELEASE_DATE}) [#{RUBY_PLATFORM}]"
end

.csv_row(*values, sep: ',') ⇒ Object

(simple) helper for “csv-encoding” values / row

todo: check for newline in value too? why? why not?


# File 'lib/csvutils/utils.rb', line 18

def self.csv_row( *values, sep: ',' )
  values.map do |value|
     if value && (value.index( sep ) || value.index('"'))
       ## escape double quotes (by doubling them) and enclose the value in double quotes
       value = %Q{"#{value.gsub('"', '""')}"}
     else
       value
     end
  end
end
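
For example (a usage sketch; the values and the require are assumptions for illustration) - only fields that contain the separator or a double quote get quoted, and join builds the final line:

require 'csvutils'

values = ['Arsenal', 'says "hi"', '1,5']
puts CsvUtils.csv_row( *values ).join( ',' )
#=> Arsenal,"says ""hi""","1,5"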

.cut(path, *columns, output: path, sep: ',') ⇒ Object



# File 'lib/csvutils/cut.rb', line 8

def self.cut( path, *columns, output: path, sep: ',' )

  inpath  = path
  outpath = output   # note: output defaults to inpath (overwrites datafile in-place!!!)

  puts "cvscut in: >#{inpath}<  out: >#{outpath}<"

  ##  ["Date", "HomeTeam", "AwayTeam", "FTHG", "FTAG", "HTHG", "HTAG"]
  puts "columns:"
  pp columns

  csv_options = { sep: sep }

  recs = CsvHash.read( inpath, csv_options )


  ## for convenience - make sure parent folders/directories exist
  FileUtils.mkdir_p( File.dirname( outpath ))  unless Dir.exist?( File.dirname( outpath ))


  ## note:
  ##  todo/fix: add two trailing spaces for pretty printing - why? why not?
  File.open( outpath, 'w:utf-8' ) do |out|
    out << csv_row( *columns, sep: sep ).join( sep )   ## for row add headers/columns
    out << "\n"
    recs.each do |rec|
      values = columns.map { |col| rec[col] }  ## find data for column
      out << csv_row( *values, sep: sep ).join( sep )
      out << "\n"
    end
  end

  puts 'Done.'
end
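
Usage sketch (datafile and column names are hypothetical). Note that output: defaults to the input path, so pass an explicit output path unless you really want to overwrite the datafile in-place:

require 'csvutils'

CsvUtils.cut( './games.csv', 'Date', 'HomeTeam', 'AwayTeam',
              output: './games_cut.csv' )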

.head(path, sep: ',', n: 4) ⇒ Object

test or dry run - pretty prints the first n records to check if rows can get read/scanned



# File 'lib/csvutils/head.rb', line 7

def self.head( path, sep: ',', n: 4 )
  i = 0
  csv_options = { sep: sep }

  CsvHash.foreach( path, csv_options ) do |rec|
    i += 1

    pp rec

    break if i >= n
  end

  puts " #{i} records"
end
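
Usage sketch (hypothetical datafile) - pretty print the first two records:

require 'csvutils'

CsvUtils.head( './games.csv', n: 2 )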

.header(path, sep: ',', debug: false) ⇒ Object

use header or headers - or use both (with alias)?



# File 'lib/csvutils/header.rb', line 6

def self.header( path, sep: ',', debug: false )   ## use header or headers - or use both (with alias)?
  row = CsvReader.header( path, sep: sep )

  pp row   if debug
  ## e.g.:
  #  "Country,League,Season,Date,Time,Home,Away,HG,AG,Res,PH,PD,PA,MaxH,MaxD,MaxA,AvgH,AvgD,AvgA\n"

  row
end
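
Usage sketch (hypothetical datafile) - read just the header row:

require 'csvutils'

row = CsvUtils.header( './games.csv' )
pp row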

.pp_header(headers) ⇒ Object

check: rename to print_headers or prettyprint_header - why? why not?



# File 'lib/csvutils/utils.rb', line 6

def self.pp_header( headers )  ## check: rename to print_headers or prettyprint_header - why? why not?
  puts "#{headers.size} columns:"
  headers.each_with_index do |header,i|
    puts "  #{i+1}: #{header}"
  end
end
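
For example - the output below follows directly from the code above:

require 'csvutils'

CsvUtils.pp_header( ['Date', 'HomeTeam', 'AwayTeam'] )
## prints:
##  3 columns:
##    1: Date
##    2: HomeTeam
##    3: AwayTeam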

.rootObject



# File 'lib/csvutils/version.rb', line 20

def self.root
  File.expand_path( File.dirname(File.dirname(File.dirname(__FILE__))) )
end

.split(path, *columns, sep: ',', &blk) ⇒ Object



# File 'lib/csvutils/split.rb', line 8

def self.split( path, *columns, sep: ',', &blk )

  puts "cvssplit in: >#{path}<"

  ##  ["Date", "HomeTeam", "AwayTeam", "FTHG", "FTAG", "HTHG", "HTAG"]
  puts "columns:"
  pp columns

  ## note: do NOT use headers
  ##   for easy sorting use "plain" array of array for records
  csv_options = { sep: sep }

  data = CsvReader.read( path, csv_options )

  ## todo/check: (auto-) strip (remove all leading and trailing spaces)
  ##     from all values - why? why not?
  ##   check if CSV.parse has an option for it?

  headers = data.shift   ## remove top array item (that is, row with headers)

  header_mapping = {}
  headers.each_with_index  { | header,i | header_mapping[header]=i }
  pp header_mapping

  ## map columns to array indices e.g. ['Season', 'Div'] => [1,2]
  column_indices = columns.map { |col| header_mapping[col] }
  pp column_indices


  ###################################################
  ## note: sort data by columns (before split)
  data = data.sort do |row1,row2|
     res = 0
     column_indices.each do |col|
       res = row1[col] <=> row2[col]    if res == 0
     end
     res
  end

  chunk = []
  data.each_with_index do |row,i|
    chunk << row

    next_row = data[i+1]

    changed = false
    if next_row.nil?   ## end-of-file
      changed = true
    else
      column_indices.each do |col|
        if row[col] != next_row[col]
           changed = true
           break   ## out of each column_indices loop
         end
      end
    end

    if changed
      puts "save new chunk:"
      column_values = column_indices.map {|col| row[col] }
      pp column_values

      # note: add header(s) row upfront (as first row) to chunk (with unshift)
      chunk_with_headers = chunk.unshift( headers )
      if blk
        yield( column_values, chunk_with_headers )
      else
        ## auto-save (write-to-file) by default - why? why not?
        split_write( path, column_values, chunk_with_headers, sep: sep )
      end

      chunk = []   ## reset chunk for next batch of records
    end
  end

  puts 'Done.'
end
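
Usage sketch (datafile and column names are hypothetical). Without a block every chunk gets auto-saved via split_write; with a block you handle every chunk yourself:

require 'csvutils'

## auto-save every chunk next to the input file
CsvUtils.split( './games.csv', 'Season', 'Div' )

## or process every chunk in a block
CsvUtils.split( './games.csv', 'Season', 'Div' ) do |values, rows|
  puts "chunk #{values.inspect} - #{rows.size-1} records (plus header row)"
end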

.split_write(inpath, values, chunk, sep:) ⇒ Object



# File 'lib/csvutils/split.rb', line 87

def self.split_write( inpath, values, chunk, sep: )
  basename = File.basename( inpath, '.*' )
  dirname  = File.dirname( inpath )

  ## check/change invalid filename chars
  ##  e.g. change 1990/91 to 1990-91
  extraname = values.map {|value| value.tr('/','-')}.join('~')

  outpath = "#{dirname}/#{basename}_#{extraname}.csv"
  puts "saving >#{basename}_#{extraname}.csv<..."

  File.open( outpath, 'w:utf-8' ) do |out|
    chunk.each do |row|
      out << csv_row( *row, sep: sep ).join( sep )
      out << "\n"
    end
  end
end
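
For illustration (path and values are hypothetical) - the output name is built from the input basename plus the column values joined with ~, with '/' changed to '-':

require 'csvutils'

chunk = [ ['Season', 'Div', 'Home'],
          ['1990/91', 'E0', 'Arsenal'] ]
CsvUtils.split_write( './games.csv', ['1990/91', 'E0'], chunk, sep: ',' )
## saves ./games_1990-91~E0.csv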

.stat(path, *columns, sep: ',', debug: false) ⇒ Object



# File 'lib/csvutils/stat.rb', line 6

def self.stat( path, *columns, sep: ',', debug: false )

  csv_options = { sep: sep }

  values = {}
  nulls  = {}
  # check 1) nulls/nils (e.g. empty strings ""),
  #       2) not applicable / not available - e.g. n/a, NA, NaN, ...
  #       3) missing - e.g. ?

  i=0
  CsvHash.foreach( path, csv_options ) do |rec|
    i += 1

    pp rec    if i == 1 && debug

    print '.' if i % 100 == 0

    ## collect unique values for passed in columns
    columns.each do |col|
      value = rec[col]    ## note: value might be nil!!!!!

      values[col] ||= Hash.new(0)
      values[col][ value ? value : '<nil>' ] +=1
    end

    ## always track nulls - why? why not?
    rec.each do |col,value|
      ## if value.nil?    ## todo/check - nil value possible (not always empty string - why? why not?)
      ##   puts "[debug] nil value in row:"
      ##   puts "#{col} = #{value.inspect} : #{value.class.name}"
      ## end

      if value.nil?
        nulls[col] ||= Hash.new(0)
        nulls[col]['nil'] +=1
      elsif value.empty?
        nulls[col] ||= Hash.new(0)
        nulls[col]['empty'] +=1
      elsif ['na', 'n/a', '-'].include?( value.downcase )
        nulls[col] ||= Hash.new(0)
        nulls[col]['na'] +=1
      elsif value == '?'    ## check for (?) e.g. value.include?( '(?)') - why? why not?
        nulls[col] ||= Hash.new(0)
        nulls[col]['?'] +=1
      else
        # do nothing; "regular" value
      end
    end
  end

  puts " #{i} rows"
  puts
  puts " nils/nulls :: empty strings :: na / n/a / undefined :: missing (?):"
  puts "   #{nulls.inspect}"
  puts

  ## dump headers first (first row with names of columns)
  headers = header( path, sep: sep, debug: debug )
  pp_header( headers )  ## pretty print header columns
  puts

  if values.any?
     ## pretty print (pp) / dump unique values for passed in columns
     values.each do |col,h|
       puts " column >#{col}< #{h.size} unique values:"
       ## sort by name/value for now (not frequency) - change - why? why not?
       sorted_values = h.to_a.sort {|l,r| l[0] <=> r[0] }
       sorted_values.each do |rec|
         puts "   #{rec[1]} x  #{rec[0]}"
       end
     end
  end
end
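
Usage sketch (datafile and column names are hypothetical) - reports nil/empty/na counts for all columns plus the unique values for the passed-in columns:

require 'csvutils'

CsvUtils.stat( './games.csv', 'Season', 'Div' )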

.test(path, sep: ',') ⇒ Object

test or dry run to check if rows can get read/scanned



# File 'lib/csvutils/test.rb', line 7

def self.test( path, sep: ',' )
  i = 0
  csv_options = { sep: sep }

  CsvHash.foreach( path, csv_options ) do |rec|
    i += 1
    print '.' if i % 100 == 0
  end

  puts " #{i} rows"
end
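
Usage sketch (hypothetical datafile) - read all rows and print the row count:

require 'csvutils'

CsvUtils.test( './games.csv' )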

.versionObject



# File 'lib/csvutils/version.rb', line 12

def self.version
  VERSION
end