Class: CsvUtils
- Inherits:
-
Object
- Object
- CsvUtils
- Defined in:
- lib/csvutils/version.rb,
lib/csvutils/cut.rb,
lib/csvutils/head.rb,
lib/csvutils/stat.rb,
lib/csvutils/test.rb,
lib/csvutils/split.rb,
lib/csvutils/utils.rb,
lib/csvutils/header.rb
Overview
note: for now CsvUtils is a class!!! NOT a module - change - why? why not?
Constant Summary collapse
- MAJOR =
todo: namespace inside version or something - why? why not??
0
- MINOR =
2
- PATCH =
0
- VERSION =
[MAJOR,MINOR,PATCH].join('.')
Class Method Summary collapse
- .banner ⇒ Object
- .cut(inpath, outpath, *columns, sep: ',') ⇒ Object
-
.head(path, sep: ',', n: 4) ⇒ Object
test or dry run to check if rows can get read/scanned.
-
.header(path, sep: ',', debug: false) ⇒ Object
use header or headers - or use both (with alias)?.
-
.pp_header(headers) ⇒ Object
check: rename to print_headers or prettyprint_header - why? why not?.
- .root ⇒ Object
- .split(path, *columns, sep: ',', &blk) ⇒ Object
- .split_write(inpath, values, chunk) ⇒ Object
- .stat(path, *columns, sep: ',', debug: false) ⇒ Object
-
.test(path, sep: ',') ⇒ Object
test or dry run to check if rows can get read/scanned.
- .version ⇒ Object
Class Method Details
.banner ⇒ Object
16 17 18 |
# File 'lib/csvutils/version.rb', line 16 def self. "csvutils/#{VERSION} on Ruby #{RUBY_VERSION} (#{RUBY_RELEASE_DATE}) [#{RUBY_PLATFORM}]" end |
.cut(inpath, outpath, *columns, sep: ',') ⇒ Object
8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 |
# File 'lib/csvutils/cut.rb', line 8 def self.cut( inpath, outpath, *columns, sep: ',' ) puts "cvscut in: >#{inpath}< out: >#{outpath}<" ## ["Date", "HomeTeam", "AwayTeam", "FTHG", "FTAG", "HTHG", "HTAG"] puts "columns:" pp columns text = File.open( inpath, 'r:utf-8' ).read ## note: make sure to use (assume) utf-8 = { headers: true, col_sep: sep } table = CSV.parse( text, ) ## for convenience - make sure parent folders/directories exist FileUtils.mkdir_p( File.dirname( outpath )) unless Dir.exists?( File.dirname( outpath )) ## use wb mode - why? why not? ## assumes same encoding as input? ## fix/todo: better (always) use utf8!!!! ## CSV.open( out_path, 'wb' ) do |out| ## use just "regular" File for output - why? why not? ## downside will not encode comma (for now) if present ("Beethoven, van") ## all values will be unquoted etc. - keep it simple? CSV.open( outpath, 'w:utf-8' ) do |out| out << columns ## for row add headers/columns table.each do |row| values = columns.map { |col| row[col].strip } ## find data for column out << values end end puts 'Done.' end |
.head(path, sep: ',', n: 4) ⇒ Object
test or dry run to check if rows can get read/scanned
7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 |
# File 'lib/csvutils/head.rb', line 7 def self.head( path, sep: ',', n: 4 ) i = 0 = { headers: true, col_sep: sep, external_encoding: 'utf-8' ## note: always (auto-)add utf-8 external encoding!!! } CSV.foreach( path, ) do |row| i += 1 pp row break if i >= n end puts " #{i} rows" end |
.header(path, sep: ',', debug: false) ⇒ Object
use header or headers - or use both (with alias)?
6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 |
# File 'lib/csvutils/header.rb', line 6 def self.header( path, sep: ',', debug: false ) ## use header or headers - or use both (with alias)? # read first line (only) # and parse with csv to get header from csv library itself # # check - if there's an easier or built-in way for the csv library line = File.open( path, 'r:utf-8' ) { |f| f.readline } pp line if debug ## e.g.: # "Country,League,Season,Date,Time,Home,Away,HG,AG,Res,PH,PD,PA,MaxH,MaxD,MaxA,AvgH,AvgD,AvgA\n" = { col_sep: sep } ## note: do NOT use headers: true to get "plain" data array (no hash records) ## hash record does NOT work for single line/row rows = CSV.parse( line, ) pp rows if debug rows[0] ## return first row end |
.pp_header(headers) ⇒ Object
check: rename to print_headers or prettyprint_header - why? why not?
6 7 8 9 10 11 |
# File 'lib/csvutils/utils.rb', line 6 def self.pp_header( headers ) ## check: rename to print_headers or prettyprint_header - why? why not? puts "#{headers.size} columns:" headers.each_with_index do |header,i| puts " #{i+1}: #{header}" end end |
.root ⇒ Object
20 21 22 |
# File 'lib/csvutils/version.rb', line 20 def self.root File.( File.dirname(File.dirname(File.dirname(__FILE__))) ) end |
.split(path, *columns, sep: ',', &blk) ⇒ Object
8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 |
# File 'lib/csvutils/split.rb', line 8 def self.split( path, *columns, sep: ',', &blk ) puts "cvssplit in: >#{path}<" ## ["Date", "HomeTeam", "AwayTeam", "FTHG", "FTAG", "HTHG", "HTAG"] puts "columns:" pp columns text = File.open( path, 'r:utf-8' ).read ## note: make sure to use (assume) utf-8 ## note: do NOT use headers ## for easy sorting use "plain" array of array for records = { col_sep: sep } data = CSV.parse( text, ) ## todo/check: (auto-) strip (remove all leading and trailing spaces) ## from all values - why? why not? ## check if CSV.parse has an option for it? headers = data.shift ## remove top array item (that is, row with headers) header_mapping = {} headers.each_with_index { | header,i | header_mapping[header]=i } pp header_mapping ## map columns to array indices e.g. ['Season', 'Div'] => [1,2] column_indices = columns.map { |col| header_mapping[col] } pp column_indices ################################################### ## note: sort data by columns (before split) data = data.sort do |row1,row2| res = 0 column_indices.each do |col| res = row1[col] <=> row2[col] if res == 0 end res end chunk = [] data.each_with_index do |row,i| chunk << row next_row = data[i+1] changed = false if next_row.nil? ## end-of-file changed = true else column_indices.each do |col| if row[col] != next_row[col] changed = true break ## out of each column_indices loop end end end if changed puts "save new chunk:" column_values = column_indices.map {|col| row[col] } pp column_values # note: add header(s) row upfront (as first row) to chunk (with unshift) chunk_with_headers = chunk.unshift( headers ) if blk yield( column_values, chunk_with_headers ) else ## auto-save (write-to-file) by default - why? why not? split_write( path, column_values, chunk_with_headers ) end chunk = [] ## reset chunk for next batch of records end end puts 'Done.' end |
.split_write(inpath, values, chunk) ⇒ Object
89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 |
# File 'lib/csvutils/split.rb', line 89 def self.split_write( inpath, values, chunk ) basename = File.basename( inpath, '.*' ) dirname = File.dirname( inpath ) ## check/change invalid filename chars ## e.g. change 1990/91 to 1990-91 extraname = values.map {|value| value.tr('/','-')}.join('~') outpath = "#{dirname}/#{basename}_#{extraname}.csv" puts "saving >#{basename}_#{extraname}.csv<..." CSV.open( outpath, 'w:utf-8' ) do |out| chunk.each do |row| out << row end end end |
.stat(path, *columns, sep: ',', debug: false) ⇒ Object
6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 |
# File 'lib/csvutils/stat.rb', line 6 def self.stat( path, *columns, sep: ',', debug: false ) = { headers: true, col_sep: sep, external_encoding: 'utf-8' ## note: always (auto-)add utf-8 external encoding!!! } values = {} nulls = {} # check 1) nulls/nils (e.g. empty strings ""), # 2) not/appliation or available n/a NA or NaN or ... # 3) missing - e.g. ? i=0 CSV.foreach( path, ) do |row| i += 1 pp row if i == 1 && debug print '.' if i % 100 == 0 ## collect unique values for passed in columns columns.each do |col| value = row[col] ## note: value might be nil!!!!! value = value.strip if value ## use strip - why? why not? report/track trailing spaces? values[col] ||= Hash.new(0) values[col][ value ? value : '<nil>' ] +=1 end ## alway track nulls - why? why not row.each do |col,value| ## if value.nil? ## todo/check - nil value possible (not always empty string - why? why not?) ## puts "[debug] nil value in row:" ## puts "#{col} = #{value.inspect} : #{value.class.name}" ## end value = value.strip if value ## use strip - why? why not? report/track trailing spaces? if value.nil? nulls[col] ||= Hash.new(0) nulls[col]['nil'] +=1 elsif value.empty? nulls[col] ||= Hash.new(0) nulls[col]['empty'] +=1 elsif ['na', 'n/a', '-'].include?( value.downcase ) nulls[col] ||= Hash.new(0) nulls[col]['na'] +=1 elsif value == '?' ## check for (?) e.g. value.include?( '(?)') - why? why not? nulls[col] ||= Hash.new(0) nulls[col]['?'] +=1 else # do nothing; "regular" value end end end puts " #{i} rows" puts puts " nils/nulls :: empty strings :: na / n/a / undefined :: missing (?):" puts " #{nulls.inspect}" puts ## dump headers first (first row with names of columns) headers = header( path, sep: sep, debug: debug ) pp_header( headers ) ## pretty print header columns puts if values.any? ## pretty print (pp) / dump unique values for passed in columns values.each do |col,h| puts " column >#{col}< #{h.size} unique values:" ## sort by name/value for now (not frequency) - change - why? why not? sorted_values = h.to_a.sort {|l,r| l[0] <=> r[0] } sorted_values.each do |rec| puts " #{rec[1]} x #{rec[0]}" end end end end |
.test(path, sep: ',') ⇒ Object
test or dry run to check if rows can get read/scanned
7 8 9 10 11 12 13 14 15 16 17 18 19 20 |
# File 'lib/csvutils/test.rb', line 7 def self.test( path, sep: ',' ) i = 0 = { headers: true, col_sep: sep, external_encoding: 'utf-8' ## note: always (auto-)add utf-8 external encoding!!! } CSV.foreach( path, ) do |row| i += 1 print '.' if i % 100 == 0 end puts " #{i} rows" end |
.version ⇒ Object
12 13 14 |
# File 'lib/csvutils/version.rb', line 12 def self.version VERSION end |