Class: CsvReader::Parser

Inherits:
Object
  • Object
show all
Defined in:
lib/csvreader/parser.rb

Constant Summary collapse

DOUBLE_QUOTE =

char constants

"\""
COMMENT =

use COMMENT_HASH or HASH or ??

"#"
SPACE =
" "
TAB =
"\t"
LF =

0A (hex) 10 (dec)

"\n"
CR =

0D (hex) 13 (dec)

"\r"

Class Method Summary collapse

Instance Method Summary collapse

Class Method Details

.foreach(path, &block) ⇒ Object



44
45
46
47
48
49
# File 'lib/csvreader/parser.rb', line 44

## Opens the file at path in 'r:bom|utf-8' mode and passes the open file
##   plus the given block on to the #foreach of a newly created parser.
##  returns whatever that #foreach call returns
##   (the file gets closed by the File.open block form)
def self.foreach( path, &block )
  reader = new
  File.open( path, 'r:bom|utf-8' ) { |io| reader.foreach( io, &block ) }
end

.parse(data) ⇒ Object



16
17
18
19
20
21
22
# File 'lib/csvreader/parser.rb', line 16

## Convenience wrapper - creates a new parser and returns the result
##   of its #parse for data.
##  note: writes a debug trace ("parse:" followed by a pp dump of data) to stdout first
def self.parse( data )
  puts "parse:"
  pp data

  new.parse( data )
end

.parse_line(data) ⇒ Object



24
25
26
27
28
29
30
31
32
33
# File 'lib/csvreader/parser.rb', line 24

## Parses a single record from data using a newly created parser
##   (asks #parse for one record only via limit: 1).
##  returns the record or nil if there is none
##   (open question kept from the original: return an empty record e.g. [] instead - why? why not?)
##  note: writes the debug trace "parse_line:" to stdout
def self.parse_line( data )
  puts "parse_line:"

  ## first is nil for an empty list - no need to check the size
  new.parse( data, limit: 1 ).first
end

.parse_lines(data, &block) ⇒ Object



51
52
53
54
# File 'lib/csvreader/parser.rb', line 51

## Convenience wrapper - forwards data and the block to the #parse_lines
##   of a newly created parser and returns its result.
def self.parse_lines( data, &block )
  new.parse_lines( data, &block )
end

.read(path) ⇒ Object



37
38
39
40
41
42
# File 'lib/csvreader/parser.rb', line 37

## Opens the file at path in 'r:bom|utf-8' mode and returns the result of
##   running the #parse of a newly created parser on the open file.
##   (the file gets closed by the File.open block form)
def self.read( path )
  reader = new
  File.open( path, 'r:bom|utf-8' ) { |io| reader.parse( io ) }
end

Instance Method Details

#foreach(io_maybe, trim: true, comments: true, blanks: true, &block) ⇒ Object



242
243
244
245
246
# File 'lib/csvreader/parser.rb', line 242

## Same as #parse_lines - passes the input, all options and the block
##   on unchanged and returns what #parse_lines returns.
def foreach( io_maybe, trim: true,
                 comments: true,
                 blanks: true,    &block )
  options = { trim: trim, comments: comments, blanks: blanks }
  parse_lines( io_maybe, **options, &block )
end

#parse(io_maybe, trim: true, comments: true, blanks: true, limit: nil) ⇒ Object



225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
# File 'lib/csvreader/parser.rb', line 225

## Collects the records yielded by #parse_lines into an array.
##
##  io_maybe - input passed on to #parse_lines as is
##  trim, comments, blanks - passed on to #parse_lines as is
##  limit - max. number of records to collect; nil (default) for no limit
##            e.g. set limit to 1 for processing a "single" line (that is, get one record)
##
##  returns an array of records (empty if there are no records or limit is zero or less)
def parse( io_maybe, trim: true,
               comments: true,
               blanks: true,
               limit: nil )
  records = []

  ## a limit of zero (or less) asks for no records at all - do not touch the input
  return records   if limit && limit <= 0

  parse_lines( io_maybe, trim: trim, comments: comments, blanks: blanks ) do |record|
    records << record

    ## stop as soon as limit records are collected
    ##   note: the check MUST be records.size >= limit;
    ##     the reverse (limit >= records.size) is already true
    ##     after the very first record for any limit of one or more
    return records   if limit && records.size >= limit
  end

  records
end

#parse_field(io, trim: true) ⇒ Object



60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
# File 'lib/csvreader/parser.rb', line 60

## Reads one field (value) from io and returns it as a string.
##
##  three cases (decided after reading the leading spaces and tabs):
##   - empty field - next char is "," or newline or the input is at its end;
##       the leading spaces are the value (stripped if trim)
##   - quoted field - next char is a double quote (");
##       leading spaces always get stripped, a doubled up quote ("") adds
##       a single quote to the value and spaces and tabs after the
##       closing quote always get skipped; trim is NOT used for the quoted text
##   - regular field - everything up to "," or newline or the end of input
##       (stripped if trim)
##
##  note: the "," or newline ending the field is NOT consumed
##  note: writes debug traces to stdout
def parse_field( io, trim: true )
  ## true if the next char ends a field (or there is no next char)
  ##   note: always peek first and ask for eof? last
  at_field_end = lambda do
    ch = io.peek
    ch == "," || ch == LF || ch == CR || io.eof?
  end

  field = ""
  field << parse_spaces( io )   ## start with the leading spaces

  if at_field_end.call    ## empty field
    field = field.strip    if trim
  elsif io.peek == DOUBLE_QUOTE
    puts "start double_quote field - value >#{field}<"
    field = field.strip    ## note: leading spaces never count for a quoted value

    puts "start double_quote field - peek >#{io.peek}< (#{io.peek.ord})"
    io.getc   ## drop the opening double quote

    loop do
      ## copy everything until the next quote (") or the end of input
      field << io.getc   until io.peek == DOUBLE_QUOTE || io.eof?

      break if io.eof?    ## quote never closed - keep what we have

      io.getc   ## drop the double quote

      ## a single quote closes the field; a doubled up quote is data
      break unless io.peek == DOUBLE_QUOTE
      field << io.getc   ## add double quote and continue
    end

    ## note: spaces (" ") and tabs (\t) after the closing quote always get dropped
    skip_spaces( io )
    puts "end double_quote field - peek >#{io.peek}< (#{io.peek.ord})"
  else
    puts "start reg field - peek >#{io.peek}< (#{io.peek.ord})"
    ## plain value - runs until "," or "\n" or "\r" (or the end of input)
    ##   note: quotes inside the value end up in the value too
    until at_field_end.call
      puts "  add char >#{io.peek}< (#{io.peek.ord})"
      field << io.getc
    end
    field = field.strip    if trim
    puts "end reg field - peek >#{io.peek}< (#{io.peek.ord})"
  end

  field
end

#parse_lines(io_maybe, trim: true, comments: true, blanks: true, &block) ⇒ Object



176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
# File 'lib/csvreader/parser.rb', line 176

## Reads record by record from the input and calls the block with every record.
##
##  io_maybe - a BufferIO gets (re)used as is (allows managing the buffer from "outside");
##               anything else gets wrapped into a new BufferIO
##  trim     - passed on to #parse_record
##  comments - if true, lines starting (after leading spaces and tabs) with COMMENT get skipped
##  blanks   - if true, lines with nothing but spaces and tabs get skipped
##
##  note: requires a block (the block gets called without any check)
##  note: writes debug traces to stdout
def parse_lines( io_maybe, trim: true,
                           comments: true,
                           blanks: true,   &block )

  ## todo: find a better name for io_maybe
  io = io_maybe.is_a?( BufferIO ) ? io_maybe : BufferIO.new( io_maybe )

  ## leading spaces only get read ahead if we check for comments or blank lines
  lookahead = comments || blanks

  loop do
    break if io.eof?

    ## hack: keep the spaces in an own buffer to look ahead more than one char
    leading = parse_spaces( io )   if lookahead

    if comments && io.peek == COMMENT        ## comment line
      puts "skipping comment - peek >#{io.peek}< (#{io.peek.ord})"
      skip_until_eol( io )
      skip_newlines( io )
      next
    end

    if blanks && (ch=io.peek; ch==LF || ch==CR || io.eof?)    ## blank line
      puts "skipping blank - peek >#{io.peek}< (#{io.peek.ord})"
      skip_newlines( io )
      next
    end

    puts "start record - peek >#{io.peek}< (#{io.peek.ord})"

    ## a record starts here - put the spaces read ahead back
    ##   note: MUST ungetc in "reverse" order (buffer is a last in/first out queue)
    leading.reverse.each_char { |ch| io.ungetc( ch ) }   if lookahead

    block.call( parse_record( io, trim: trim ) )
  end  # loop
end

#parse_record(io, trim: true) ⇒ Object



111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
# File 'lib/csvreader/parser.rb', line 111

## Reads one record (line) from io and returns its values as an array of strings.
##
##  io   - buffered input (must respond to peek, getc and eof?)
##  trim - passed on to #parse_field
##
##  every field must be followed by "," (next field) or a newline / the end of
##    input (end of record); the newlines ending the record get consumed
##
##  raises RuntimeError if anything else follows a field
##    (e.g. text right after the closing quote of a quoted value);
##    note: a library MUST NOT end the whole process (exit) on bad input -
##          let the caller decide what to do
##  note: writes a debug trace for every value to stdout
def parse_record( io, trim: true )
  values = []

  loop do
     value = parse_field( io, trim: trim )
     puts "value: »#{value}«"
     values << value

     break if io.eof?

     c = io.peek
     if c==LF || c==CR
       skip_newlines( io )
       break
     elsif c == ","
       io.getc   ## eat-up FS(,)
     else
       raise "csv parse error: found >#{c}< (#{c.ord}) - FS (,) or RS (\\n) expected"
     end
  end

  values
end

#parse_spaces(io) ⇒ Object

helper method



164
165
166
167
168
169
170
171
# File 'lib/csvreader/parser.rb', line 164

## helper method
##   reads (consumes) the run of spaces (" ") and tabs (\t) at the current
##   position of io and returns it as a string ("" if there is none)
def parse_spaces( io )
  run = ""
  ## stop at the first char that is neither a space nor a tab
  until (ch = io.peek) != SPACE && ch != TAB
    run << io.getc
  end
  run
end

#skip_newlines(io) ⇒ Object



136
137
138
139
140
141
142
# File 'lib/csvreader/parser.rb', line 136

## Consumes all newline chars (\n and \r) at the current position of io.
##   does nothing if io is at its end; always returns nil
def skip_newlines( io )
  unless io.eof?
    io.getc   while (ch = io.peek) == LF || ch == CR
  end
end

#skip_spaces(io) ⇒ Object



153
154
155
156
157
158
159
# File 'lib/csvreader/parser.rb', line 153

## Consumes all spaces (" ") and tabs (\t) at the current position of io.
##   does nothing if io is at its end; always returns nil
def skip_spaces( io )
  unless io.eof?
    io.getc   while (ch = io.peek) == SPACE || ch == TAB
  end
end

#skip_until_eol(io) ⇒ Object



145
146
147
148
149
150
151
# File 'lib/csvreader/parser.rb', line 145

## Consumes everything up to (but not including) the next newline char
##   (\n or \r) or up to the end of the input.
##   does nothing if io is at its end; always returns nil
def skip_until_eol( io )
  unless io.eof?
    io.getc   until (ch = io.peek) == LF || ch == CR || io.eof?
  end
end