Class: CsvReader::ParserStd

Inherits:
Object
  • Object
show all
Defined in:
lib/csvreader/parser_std.rb

Constant Summary collapse

DOUBLE_QUOTE =

char constants

"\""
SINGLE_QUOTE =
"'"
BACKSLASH =

use BACKSLASH_ESCAPE ??

"\\"
COMMENT_HASH =

use COMMENT1 or COMMENT_HASH or HASH or ??

"#"
COMMENT_PERCENT =

use COMMENT2 or COMMENT_PERCENT or PERCENT or ??

"%"
DIRECTIVE =

use a different name e.g. AT or ??

"@"
SPACE =

\s == ASCII 32 (dec) = (Space)

" "
TAB =

\t == ASCII 0x09 (hex) = HT (Tab/horizontal tab)

"\t"
LF =

\n == ASCII 0x0A (hex) 10 (dec) = LF (Newline/line feed)

"\n"
CR =

\r == ASCII 0x0D (hex) 13 (dec) = CR (Carriage return)

"\r"
SEPARATORS =
",;|^:"

Instance Attribute Summary collapse

Class Method Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(sep: ',', null: ['\N', 'NA'], numeric: false, nan: nil, space: nil, hashtag: false) ⇒ ParserStd

todo/check:

null values - include NA - why? why not?
    make null values case sensitive or add an option for case sensitive
    or better allow a proc as option for checking too!!!


51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
# File 'lib/csvreader/parser_std.rb', line 51

## Sets up a new parser and records all dialect options in the @config hash.
##
##  sep     - field separator; gets validated by check_sep before it is stored
##  null    - null / not available (na) values; set to nil for no null values
##              note: null values must get handled by the parser -
##                only unquoted strings get checked (and NOT quoted strings);
##                "higher-level" code only knows about strings and
##                has no longer any info if a string was quoted or unquoted
##  numeric - (auto-)convert all non-quoted values to float
##  nan     - only if numeric - mappings for Float::NAN (not a number) values
##  space   - treat/convert char to space e.g. _-+• etc
##              e.g. Man_Utd => Man Utd
##              or use it for leading and trailing spaces without quotes
##              todo/check: only use for unquoted values? why? why not?
##  hashtag - hxl - humanitarian eXchange language uses a hashtag row for "meta data"
##              e.g. #sector+en,#subsector,#org,#country,#sex+#targeted,#sex+#targeted,#adm1
##              do NOT treat # as a comment (always use % for now)
def initialize( sep:      ',',
                null:     ['\N', 'NA'],
                numeric:  false,
                nan:      nil,
                space:    nil,
                hashtag:  false )
  ## todo/fix: change config to proper dialect class/struct - why? why not?
  @config = {}

  ## an invalid separator raises here, that is, before any option gets stored
  check_sep( sep )

  @config.update( sep:     sep,
                  null:    null,
                  numeric: numeric,
                  nan:     nan,
                  space:   space,
                  hashtag: hashtag )

  ## no meta data block   (use empty hash {} - why? why not?)
  @meta = nil
end

Instance Attribute Details

#configObject (readonly)

todo/fix: change config to proper dialect class/struct - why? why not?



43
44
45
# File 'lib/csvreader/parser_std.rb', line 43

## reader for the hash holding the parser's config(uration) options
##   todo/fix: change config to proper dialect class/struct - why? why not?
def config() @config; end

#metaObject (readonly)

Returns the value of attribute meta.



44
45
46
# File 'lib/csvreader/parser_std.rb', line 44

## reader for the meta attribute (whatever is stored in @meta)
def meta() @meta; end

Class Method Details

.build_loggerObject

add simple logger with debug flag/switch

use Parser.debug = true   # to turn on

todo/fix: use logutils instead of std logger - why? why not?


32
33
34
35
36
# File 'lib/csvreader/parser_std.rb', line 32

## builds a new std logger writing to STDOUT
##   note: level gets set to :info on start; is 0 (debug) by default
def self.build_logger
  Logger.new( STDOUT ).tap do |log|
    log.level = :info
  end
end

.loggerObject



37
# File 'lib/csvreader/parser_std.rb', line 37

## returns the logger kept in the class variable @@logger;
##   gets built (via build_logger) on first use and reused afterwards
def self.logger
  @@logger ||= build_logger
end

Instance Method Details

#check_sep(sep) ⇒ Object



87
88
89
90
91
92
93
94
95
96
97
# File 'lib/csvreader/parser_std.rb', line 87

## Validates a field separator.
##
## note: parse does NOT support space or tab as separator!!
##    leading and trailing space or tab (whitespace) gets by default trimmed
##      unless quoted (or alternative space char used e.g. _-+ if configured)
##
## Only a string holding exactly ONE of the chars listed in SEPARATORS is ok.
##   note: String#include? is a substring check, that is,
##     the empty string or a multi-char string such as ",;" would pass too -
##     thus, type and size get checked first.
##
## Returns nil if the separator is ok.
## Raises ArgumentError for everything else (incl. nil and other non-strings).
def check_sep( sep )
  return if sep.is_a?( String ) && sep.size == 1 && SEPARATORS.include?( sep )

  raise ArgumentError, "invalid/unsupported sep >#{sep}< - for now only >#{SEPARATORS}< allowed; sorry"
end

#hashtag=(value) ⇒ Object



110
# File 'lib/csvreader/parser_std.rb', line 110

## config convenience helper - stores value as @config[:hashtag]
def hashtag=( value )
  @config[:hashtag] = value
end

#loggerObject



38
# File 'lib/csvreader/parser_std.rb', line 38

## instance-level shortcut - hands back the logger of the object's class
def logger
  self.class.logger
end

#nan=(value) ⇒ Object



108
# File 'lib/csvreader/parser_std.rb', line 108

## config convenience helper - stores value as @config[:nan]
def nan=( value )
  @config[:nan] = value
end

#null=(value) ⇒ Object



106
# File 'lib/csvreader/parser_std.rb', line 106

## config convenience helper - stores value as @config[:null]
def null=( value )
  @config[:null] = value
end

#numeric=(value) ⇒ Object



107
# File 'lib/csvreader/parser_std.rb', line 107

## config convenience helper - stores value as @config[:numeric]
def numeric=( value )
  @config[:numeric] = value
end

#parse(str_or_readable, sep: , &block) ⇒ Object



115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
# File 'lib/csvreader/parser_std.rb', line 115

## Parses the passed-in data record by record.
##
##  str_or_readable - data to parse; gets wrapped into a Buffer
##                      unless it is a Buffer already
##  sep             - field separator; defaults to config[:sep] and
##                      gets validated by check_sep on every call
##
## With a block every record gets passed on to the block (via parse_lines)
##   and the result of parse_lines gets returned.
## Without a block all records get collected and returned in an array.
def parse( str_or_readable, sep: config[:sep], &block )
  check_sep( sep )

  ## note: kwargs NOT used for now (but required for "protocol/interface" by other parsers)

  ## make sure data (string or io) is wrapped into a Buffer!!!
  ##   note: allow (re)use of Buffer if managed from "outside"
  input = str_or_readable.is_a?( Buffer ) ? str_or_readable : Buffer.new( str_or_readable )

  return parse_lines( input, sep: sep, &block )  if block_given?

  records = []
  parse_lines( input, sep: sep ) { |record| records << record }
  records
end

#sep=(value) ⇒ Object

config convenience helpers

e.g. use like  Csv.default.null = '\N'   etc.   instead of
               Csv.default.config[:null] = '\N'


104
# File 'lib/csvreader/parser_std.rb', line 104

## config convenience helper - stores value as @config[:sep]
##   note: value gets passed to check_sep first (before it is stored)
def sep=( value )
  check_sep( value )
  @config[:sep] = value
end

#space=(value) ⇒ Object



109
# File 'lib/csvreader/parser_std.rb', line 109

## config convenience helper - stores value as @config[:space]
def space=( value )
  @config[:space] = value
end