Class: CsvUtils

Inherits:

Object

Object
CsvUtils

show all

Defined in: lib/csvutils/version.rb,
lib/csvutils/cut.rb,
lib/csvutils/head.rb,
lib/csvutils/stat.rb,
lib/csvutils/test.rb,
lib/csvutils/split.rb,
lib/csvutils/utils.rb,
lib/csvutils/header.rb

Overview

Note: for now CsvUtils is a class, NOT a module. Open question: change it to a module - why? why not?

Constant Summary collapse

MAJOR = ## todo: namespace the version constants inside a Version module or similar - why? why not?

MINOR =

PATCH =

VERSION =

[MAJOR,MINOR,PATCH].join('.')

Class Method Summary collapse

.banner ⇒ Object
.cut(inpath, outpath, *columns, sep: ',') ⇒ Object
.head(path, sep: ',', n: 4) ⇒ Object

print (pretty print) the first n rows (default: 4), followed by the number of rows printed.
.header(path, sep: ',', debug: false) ⇒ Object

open question: name this header or headers - or offer both (with an alias)?
.pp_header(headers) ⇒ Object

check: rename to print_headers or prettyprint_header - why? why not?
.root ⇒ Object
.split(path, *columns, sep: ',', &blk) ⇒ Object
.split_write(inpath, values, chunk) ⇒ Object
.stat(path, *columns, sep: ',', debug: false) ⇒ Object
.test(path, sep: ',') ⇒ Object

test or dry run to check if rows can get read/scanned.
.version ⇒ Object

Class Method Details



16
17
18

# File 'lib/csvutils/version.rb', line 16

# Builds a one-line banner that names this library's version and the running Ruby.
#
# @return [String] "csvutils/<VERSION> on Ruby <version> (<release date>) [<platform>]"
def self.banner
  ruby_info = "Ruby #{RUBY_VERSION} (#{RUBY_RELEASE_DATE}) [#{RUBY_PLATFORM}]"
  "csvutils/#{VERSION} on #{ruby_info}"
end

.cut(inpath, outpath, *columns, sep: ',') ⇒ `Object`

# File 'lib/csvutils/cut.rb', line 8

# Copies the selected columns of a csv file into a new csv file.
#
# Reads +inpath+ as utf-8 (the first row is the header row) and writes +outpath+
# as utf-8 with +columns+ as the header row, followed by one row per input
# record holding the (stripped) values of those columns.
# Missing parent directories of +outpath+ get created.
#
# @param inpath  [String] path of the csv file to read
# @param outpath [String] path of the csv file to write
# @param columns [Array<String>] header names of the columns to keep (in output order)
# @param sep     [String] column separator of the input file
#   (note: the output is written with the csv library's default separator)
# @return [nil]
def self.cut( inpath, outpath, *columns, sep: ',' )
  puts "cvscut in: >#{inpath}<  out: >#{outpath}<"

  ##  e.g. ["Date", "HomeTeam", "AwayTeam", "FTHG", "FTAG", "HTHG", "HTAG"]
  puts "columns:"
  pp columns

  ## note: make sure to use (assume) utf-8;
  ##   the block form closes the file handle again once the text is read
  text = File.open( inpath, 'r:utf-8' ) { |f| f.read }

  csv_options = { headers: true,
                  col_sep: sep }

  ## note: pass options as keywords (**) - a positional hash raises on ruby 3+
  table = CSV.parse( text, **csv_options )

  ## for convenience - make sure parent folders/directories exist
  ##   (mkdir_p is a no-op if the directory is already there)
  FileUtils.mkdir_p( File.dirname( outpath ))

  ## note: use the csv writer (not a "regular" file) for output,
  ##   so values with a comma (e.g. "Beethoven, van") get quoted
  CSV.open( outpath, 'w:utf-8' ) do |out|
    out << columns   ## first row - add headers/columns

    table.each do |row|
      ## find data for column; note: value is nil for an empty cell
      ##   (or an unknown column) - keep nil as is (written as an empty value)
      out << columns.map { |col| row[col]&.strip }
    end
  end

  puts 'Done.'
end

.head(path, sep: ',', n: 4) ⇒ `Object`

print (pretty print) the first n rows (default: 4), followed by the number of rows printed

# File 'lib/csvutils/head.rb', line 7

# Pretty prints the first +n+ records of a csv file, followed by a line
# with the number of records printed.
#
# @param path [String] path of the csv file (first row is the header row)
# @param sep  [String] column separator
# @param n    [Integer] maximum number of records to print; nothing gets read if n <= 0
# @return [nil]
def self.head( path, sep: ',', n: 4 )
  csv_options = { headers: true,
                  col_sep: sep,
                  external_encoding: 'utf-8'  ## note:  always (auto-)add utf-8 external encoding!!!
                }

  i = 0
  if n > 0    ## note: do NOT print a first row if zero (or less) rows requested
    ## note: pass options as keywords (**) - a positional hash gets taken
    ##   as the (file) mode argument on ruby 3+
    CSV.foreach( path, **csv_options ) do |row|
      i += 1
      pp row
      break if i >= n
    end
  end

  puts " #{i} rows"
end

.header(path, sep: ',', debug: false) ⇒ `Object`

use header or headers - or use both (with alias)?

# File 'lib/csvutils/header.rb', line 6

# Reads the header row (that is, the first line) of a csv file.
#
# Only the first line of the file gets read (as utf-8); the line is parsed
# with the csv library to split it into the column names.
#
# @param path  [String] path of the csv file
# @param sep   [String] column separator
# @param debug [Boolean] pretty print the raw line and the parsed rows if true
# @return [Array] values of the first row (the column names)
# @raise [EOFError] if the file is empty (raised by readline)
def self.header( path, sep: ',', debug: false )   ## use header or headers - or use both (with alias)?
  # read first line (only)
  #  and parse with csv to get header from csv library itself
  #
  #  check - if there's an easier or built-in way for the csv library
  line = File.open( path, 'r:utf-8' ) { |f| f.readline }

  pp line   if debug
  ## e.g.:
  #  "Country,League,Season,Date,Time,Home,Away,HG,AG,Res,PH,PD,PA,MaxH,MaxD,MaxA,AvgH,AvgD,AvgA\n"

  csv_options = { col_sep: sep }

  ## note: do NOT use headers: true to get "plain" data array (no hash records)
  ##   hash record does NOT work for single line/row
  ## note: pass options as keywords (**) - a positional hash raises on ruby 3+
  rows = CSV.parse( line, **csv_options )
  pp rows   if debug

  rows[0]   ## return first row
end

.pp_header(headers) ⇒ `Object`

check: rename to print_headers or prettyprint_header - why? why not?

# File 'lib/csvutils/utils.rb', line 6

# Prints the number of columns, followed by a numbered list (starting with 1)
# of the column names - one per line.
#
# @param headers [Array] column names (as returned by header)
# @return [Array] the headers passed in
def self.pp_header( headers )  ## check: rename to print_headers or prettyprint_header - why? why not?
  puts "#{headers.size} columns:"

  ## note: count columns starting with 1 (not 0)
  headers.each.with_index( 1 ) do |name,position|
    puts "  #{position}: #{name}"
  end
end

.root ⇒ `Object`



20
21
22

# File 'lib/csvutils/version.rb', line 20

# Returns the absolute path of the directory three levels up from this
# source file (for lib/csvutils/version.rb that is the folder holding lib/).
#
# @return [String] absolute directory path
def self.root
  file_dir    = File.dirname( __FILE__ )
  parent_dir  = File.dirname( file_dir )
  grandparent = File.dirname( parent_dir )

  File.expand_path( grandparent )
end

.split(path, *columns, sep: ',', &blk) ⇒ `Object`

# File 'lib/csvutils/split.rb', line 8

# Splits the records of a csv file into chunks by the values of one or more columns.
#
# The records get sorted by the passed-in columns (empty/nil values first;
# records with the same values keep their order from the file) and
# every run of records with the same column values makes up a chunk.
# For every chunk the block gets called with the column values and
# the chunk (header row first, followed by the records); if no block is
# passed in the chunk gets written to a file (see split_write).
#
# @param path    [String] path of the csv file to read (as utf-8)
# @param columns [Array<String>] header names of the columns to split by
# @param sep     [String] column separator
# @yield [column_values, chunk_with_headers]
# @yieldparam column_values [Array] values of the split columns for this chunk
# @yieldparam chunk_with_headers [Array<Array>] header row plus records of this chunk
# @raise [ArgumentError] if a column is not found in the header row
# @return [nil]
def self.split( path, *columns, sep: ',', &blk )
  puts "cvssplit in: >#{path}<"

  ##  e.g. ["Date", "HomeTeam", "AwayTeam", "FTHG", "FTAG", "HTHG", "HTAG"]
  puts "columns:"
  pp columns

  ## note: make sure to use (assume) utf-8;
  ##   the block form closes the file handle again once the text is read
  text = File.open( path, 'r:utf-8' ) { |f| f.read }

  ## note: do NOT use headers
  ##   for easy sorting use "plain" array of array for records
  csv_options = { col_sep: sep }

  ## note: pass options as keywords (**) - a positional hash raises on ruby 3+
  data = CSV.parse( text, **csv_options )

  ## todo/check: (auto-) strip (remove all leading and trailing spaces)
  ##     from all values - why? why not?
  ##   check if CSV.parse has an option for it?

  ## remove top array item (that is, row with headers);
  ##   note: an empty file has no rows at all - use empty headers
  headers = data.shift || []

  header_mapping = {}
  headers.each_with_index { |header,i| header_mapping[header] = i }
  pp header_mapping

  ## fail early (with a readable message) on unknown columns
  unknown = columns.reject { |col| header_mapping.key?( col ) }
  raise ArgumentError, "unknown column(s): #{unknown.join(', ')}"   unless unknown.empty?

  ## map columns to array indices e.g. ['Season', 'Div'] => [1,2]
  column_indices = columns.map { |col| header_mapping[col] }
  pp column_indices

  ###################################################
  ## note: sort data by columns (before split)
  ##   - nil (empty) values cannot get compared to strings;
  ##       use a [0|1, string] pair per value to sort nils first
  ##   - add the row index as last sort key to keep the file order
  ##       for records with the same column values (stable sort)
  sort_key = lambda do |row|
    column_indices.map { |col| [row[col].nil? ? 0 : 1, row[col].to_s] }
  end

  sorted = data.each_with_index
               .sort_by { |row,i| [sort_key.call( row ), i] }
               .map { |row,_i| row }

  ## start a new chunk whenever the value of any split column changes
  chunks = sorted.slice_when do |row,next_row|
    column_indices.any? { |col| row[col] != next_row[col] }
  end

  chunks.each do |chunk|
    puts "save new chunk:"
    column_values = column_indices.map { |col| chunk.last[col] }
    pp column_values

    # note: add header(s) row upfront (as first row) to chunk
    chunk_with_headers = [headers] + chunk
    if blk
      yield( column_values, chunk_with_headers )
    else
      ## auto-save (write-to-file) by default - why? why not?
      split_write( path, column_values, chunk_with_headers )
    end
  end

  puts 'Done.'
end

.split_write(inpath, values, chunk) ⇒ `Object`

# File 'lib/csvutils/split.rb', line 89

# Writes a chunk of rows to a new csv file next to the input file.
#
# The file gets named <basename of inpath>_<values joined by ~>.csv
# and is written (as utf-8) into the directory of +inpath+.
#
# @param inpath [String] path of the csv file the chunk was split from
# @param values [Array] column values of the chunk (used for the file name);
#   nil values count as empty strings
# @param chunk  [Array<Array>] rows to write (header row first)
# @return [Object] whatever the CSV.open block returns
def self.split_write( inpath, values, chunk )
  basename = File.basename( inpath, '.*' )
  dirname  = File.dirname( inpath )

  ## check/change invalid filename chars
  ##  e.g. change 1990/91 to 1990-91
  ## note: value might be nil (empty cell) - use to_s to get an empty string
  extraname = values.map { |value| value.to_s.tr( '/', '-' ) }.join( '~' )

  outname = "#{basename}_#{extraname}.csv"
  outpath = File.join( dirname, outname )
  puts "saving >#{outname}<..."

  CSV.open( outpath, 'w:utf-8' ) do |out|
    chunk.each { |row| out << row }
  end
end

.stat(path, *columns, sep: ',', debug: false) ⇒ `Object`

# File 'lib/csvutils/stat.rb', line 6

# Prints some statistics for a csv file: the number of records, the count of
# "null-like" values per column, the header columns and - for the passed-in
# columns - all unique values with their frequencies.
#
# "Null-like" values get counted (after stripping spaces) by kind:
#   'nil'   - no value (nil)
#   'empty' - empty string
#   'na'    - na, n/a or - (ignoring case)
#   '?'     - question mark
#
# @param path    [String] path of the csv file (first row is the header row)
# @param columns [Array<String>] header names of the columns to list unique values for
# @param sep     [String] column separator
# @param debug   [Boolean] pretty print the first record (and pass debug on to header) if true
# @return [nil]
def self.stat( path, *columns, sep: ',', debug: false )
  csv_options = { headers: true,
                  col_sep: sep,
                  external_encoding: 'utf-8'  ## note:  always (auto-)add utf-8 external encoding!!!
                }

  values = {}   ## unique values with count by (passed-in) column
  nulls  = {}   ## "null-like" values with count by column (and kind)

  i = 0
  ## note: pass options as keywords (**) - a positional hash gets taken
  ##   as the (file) mode argument on ruby 3+
  CSV.foreach( path, **csv_options ) do |row|
    i += 1

    pp row    if i == 1 && debug

    print '.' if i % 100 == 0    ## progress indicator

    ## collect unique values for passed in columns
    columns.each do |col|
      value = row[col]    ## note: value might be nil!!!!!
      value = value.strip   if value   ## use strip - why? why not? report/track trailing spaces?

      values[col] ||= Hash.new(0)
      values[col][ value || '<nil>' ] += 1
    end

    ## alway track nulls - why? why not
    row.each do |col,value|
      value = value.strip   if value   ## use strip - why? why not? report/track trailing spaces?

      kind = if value.nil?
               'nil'
             elsif value.empty?
               'empty'
             elsif ['na', 'n/a', '-'].include?( value.downcase )
               'na'
             elsif value == '?'    ## check for (?) e.g. value.include?( '(?)') - why? why not?
               '?'
             end
      next unless kind    ## do nothing; "regular" value

      nulls[col] ||= Hash.new(0)
      nulls[col][kind] += 1
    end
  end

  puts " #{i} rows"
  puts
  puts " nils/nulls :: empty strings :: na / n/a / undefined :: missing (?):"
  puts "   #{nulls.inspect}"
  puts

  ## dump headers first (first row with names of columns)
  headers = header( path, sep: sep, debug: debug )
  pp_header( headers )  ## pretty print header columns

  puts

  ## pretty print (pp) / dump unique values for passed in columns
  values.each do |col,counts|
    puts " column >#{col}< #{counts.size} unique values:"
    ## sort by name/value for now (not frequency) - change - why? why not?
    counts.sort_by { |value,_count| value }.each do |value,count|
      puts "   #{count} x  #{value}"
    end
  end
end

.test(path, sep: ',') ⇒ `Object`

test or dry run to check if rows can get read/scanned

# File 'lib/csvutils/test.rb', line 7

# Test or dry run to check if all rows of a csv file can get read/scanned.
#
# Reads the file record by record (printing a dot for every 100 records)
# and prints the number of records read at the end.
#
# @param path [String] path of the csv file (first row is the header row)
# @param sep  [String] column separator
# @return [nil]
def self.test( path, sep: ',' )
  csv_options = { headers: true,
                  col_sep: sep,
                  external_encoding: 'utf-8'  ## note:  always (auto-)add utf-8 external encoding!!!
                }

  i = 0
  ## note: pass options as keywords (**) - a positional hash gets taken
  ##   as the (file) mode argument on ruby 3+
  CSV.foreach( path, **csv_options ) do |_row|
    i += 1
    print '.' if i % 100 == 0    ## progress indicator
  end

  puts " #{i} rows"
end

.version ⇒ `Object`



12
13
14

# File 'lib/csvutils/version.rb', line 12

# Returns the library version, that is, the VERSION constant.
#
# @return [Object] the value of VERSION
def self.version
  VERSION
end

Class: CsvUtils

Overview

Constant Summary collapse

Class Method Summary collapse

Class Method Details

.banner ⇒ Object

.cut(inpath, outpath, *columns, sep: ',') ⇒ Object

.head(path, sep: ',', n: 4) ⇒ Object

.header(path, sep: ',', debug: false) ⇒ Object

.pp_header(headers) ⇒ Object

.root ⇒ Object

.split(path, *columns, sep: ',', &blk) ⇒ Object

.split_write(inpath, values, chunk) ⇒ Object

.stat(path, *columns, sep: ',', debug: false) ⇒ Object

.test(path, sep: ',') ⇒ Object

.version ⇒ Object

.banner ⇒ `Object`