Class: Censive

Inherits:
StringScanner
  • Object
show all
Defined in:
lib/censive.rb

Instance Attribute Summary collapse

Class Method Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(str = nil, drop: false, encoding: nil, excel: false, mode: :compact, out: nil, quote: '"', relax: false, rowsep: "\n", sep: ",", strip: false, **opts) ⇒ Censive



45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
# File 'lib/censive.rb', line 45

# Builds a CSV scanner/writer.
#
# +str+ is either CSV text or, when it is shorter than 100 characters and
# names a readable file, a path whose contents are loaded as the data.
# With no +str+ the scanner starts on an empty string.
#
# All patterns are compiled per instance from +sep+ and +quote+. They are
# deliberately built WITHOUT the /o flag: /o interpolates a regexp literal
# only once per process, so a second instance created with a different
# separator or quote would silently reuse the first instance's patterns.
# +sep+ and +quote+ are passed through Regexp.escape so characters that are
# special in a regexp (or in a character class) are matched literally.
def initialize(str=nil,
  drop:     false   , # drop trailing empty columns?
  encoding: nil     , # character encoding
  excel:    false   , # literals ="01" formulas =A1 + B2 http://bit.ly/3Y7jIvc
  mode:     :compact, # export mode: compact or full
  out:      nil     , # output stream, needs to respond to <<
  quote:    '"'     , # quote character
  relax:    false   , # relax quote parsing so ,"Fo"o, => ,"Fo""o",
  rowsep:   "\n"    , # row separator for export
  sep:      ","     , # column separator character
  strip:    false   , # strip columns when reading
  **opts              # grab bag
)
  # initialize data source
  if str && str.size < 100 && File.readable?(str)
    # block form closes the file handle as soon as the data has been read
    str = File.open(str, encoding ? "r:#{encoding}" : "r", &:read)
  else
    str ||= ""
    str = str.encode(encoding) if encoding
  end
  super(str)
  reset

  # config options
  @cheat    = true # try the fast split-based row reader until a row defeats it
  @drop     = drop
  @encoding = str.encoding
  @excel    = excel
  @mode     = mode
  @out      = out || $stdout
  @relax    = relax
  @strip    = strip

  # config strings
  @quote    = quote
  @rowsep   = rowsep
  @sep      = sep

  # static strings
  @cr       = "\r"
  @lf       = "\n"
  @es       = ""
  @eq       = "="

  # combinations
  @esc      = (@quote * 2)
  @seq      = [@sep, @eq].join # used for parsing in excel mode

  # regexp-safe spellings of the configurable strings
  sep_re    = Regexp.escape(@sep)
  quote_re  = Regexp.escape(@quote)

  # regexes
  @eoc      = /(?=#{sep_re}|#{@cr}|#{@lf}|\z)/ # end of cell
  @eol      = /#{@cr}#{@lf}?|#{@lf}/           # end of line
  @escapes  = /(#{quote_re})|#{sep_re}|#{@cr}|#{@lf}/
  @quotable = /#{sep_re}|#{@cr}|#{@lf}/
  @quotes   = /#{quote_re}/
  @seps     = /(?:#{sep_re})+/
  @quoted   = @excel ? /(?:=)?#{quote_re}/ : @quote
  @unquoted = /[^#{sep_re}#{@cr}#{@lf}][^#{quote_re}#{@cr}#{@lf}]*/
  @leadzero = /\A0\d*\z/
end

Instance Attribute Details

#encoding ⇒ Object (readonly)

Returns the value of attribute encoding.



31
32
33
# File 'lib/censive.rb', line 31

# Encoding of the data string, as recorded by initialize and refreshed on
# every reset.
def encoding
  @encoding
end

#out ⇒ Object (readonly)

Returns the value of attribute out.



31
32
33
# File 'lib/censive.rb', line 31

# Output stream that #<< appends rows to; initialize falls back to $stdout
# when no +out:+ is given.
def out
  @out
end

#rows ⇒ Object (readonly)

Returns the value of attribute rows.



31
32
33
# File 'lib/censive.rb', line 31

# Rows collected by the last #parse; nil after #reset until the data has
# been parsed (directly or through #each).
def rows
  @rows
end

Class Method Details

.parse(...) ⇒ Object



33
34
35
# File 'lib/censive.rb', line 33

# One-shot helper: forwards every argument to the constructor and returns
# whatever the new instance's #parse returns.
def self.parse(...)
  scanner = new(...)
  scanner.parse
end

.writer(obj = nil, **opts, &code) ⇒ Object



37
38
39
40
41
42
43
# File 'lib/censive.rb', line 37

# Builds a writer whose destination depends on +obj+:
#
# * String  - treated as a path: the file is opened for writing, a new
#             instance bound to it is yielded to the block, and the block's
#             value is returned once the file is closed
# * IO, nil - returns a new instance bound to +obj+ (the constructor picks
#             its own default when +obj+ is nil)
# * other   - aborts the program with an "invalid ... object" message
#
# +opts+ are passed straight through to the constructor.
def self.writer(obj=nil, **opts, &code)
  if obj.is_a?(String)
    return File.open(obj, "w") do |io|
      yield new(out: io, **opts, &code)
    end
  end

  return new(out: obj, **opts, &code) if obj.nil? || obj.is_a?(IO)

  abort "#{File.basename($0)}: invalid #{obj.class} object in writer"
end

Instance Method Details

#<<(row) ⇒ Object

output a row



200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
# File 'lib/censive.rb', line 200

# Formats +row+ (an Array of Strings) as one CSV line and appends it, plus
# the row separator, to the output stream. Returns the output stream.
#
# With the drop option set, trailing empty columns are removed from +row+
# itself before formatting.
#
# Quoting depends on the export mode:
# * :compact - quote/escape only what needs it, as decided by #grok; in
#              excel mode every cell is examined individually and cells
#              that look like zero-led numbers are written as ="0123"
# * :full    - quote every cell (zero-led numbers still become ="0123" in
#              excel mode)
def <<(row)

  # drop trailing empty columns; `&.` ends the loop once the row has been
  # emptied instead of calling empty? on nil
  if @drop
    row.pop while row.last&.empty?
  end

  s, q = @sep, @quote
  cells = case @mode
  when :compact
    # excel mode always takes the per-cell path; otherwise classify the row
    # as a whole first so rows needing no quoting are passed through as-is
    case @excel ? 2 : grok(row.join)
    when 0
      row
    when 1
      row.map { |col| col.match?(@quotable) ? "#{q}#{col}#{q}" : col }
    else
      row.map do |col|
        if @excel && col.match?(@leadzero)
          "=#{q}#{col}#{q}"
        else
          case grok(col)
          when 0 then col
          when 1 then "#{q}#{col}#{q}"
          else        "#{q}#{col.gsub(q, @esc)}#{q}"
          end
        end
      end
    end
  when :full
    if @excel
      row.map do |col|
        col.match?(@leadzero) ? "=#{q}#{col}#{q}" : "#{q}#{col.gsub(q, @esc)}#{q}"
      end
    else
      row.map { |col| "#{q}#{col.gsub(q, @esc)}#{q}" }
    end
  end

  line = cells.join(s)
  @out << line + @rowsep
end

#bomb(msg) ⇒ Object



246
247
248
# File 'lib/censive.rb', line 246

# Aborts the program with +msg+, the character position of the scanner and
# a short excerpt of the input around it (up to four characters before the
# position and three from it onward).
#
# The excerpt start is clamped at zero: a negative start index would make
# String#[] count from the END of the input and show unrelated text when the
# error is within the first four characters. The character position is used
# rather than the byte position so the index lines up with String#[] on
# multibyte data.
def bomb(msg)
  at    = charpos
  first = [at - 4, 0].max
  near  = string[first...(at + 3)]
  abort "\n#{File.basename($0)}: #{msg} at character #{at} near '#{near}'"
end

#each ⇒ Object



177
178
179
180
# File 'lib/censive.rb', line 177

# Yields each parsed row in order, parsing the data on first use and reusing
# the stored rows afterwards. Returns the array of rows.
#
# Without a block, returns an Enumerator over the rows instead of raising
# LocalJumpError at the first yield.
def each
  return enum_for(:each) unless block_given?

  @rows ||= parse
  @rows.each { |row| yield row }
end

#export(**opts) ⇒ Object



182
183
184
185
186
# File 'lib/censive.rb', line 182

# Sends every row (see #each) to a destination via << and returns that
# destination. With no options the destination is this object; otherwise it
# is a fresh writer built by the class-level writer from +opts+.
def export(**opts)
  target =
    if opts.empty?
      self
    else
      self.class.writer(**opts)
    end

  each do |row|
    target << row
  end

  target
end

#grok(str) ⇒ Object

returns 2 (must be quoted and escaped), 1 (must be quoted), 0 (neither)



191
192
193
194
195
196
197
# File 'lib/censive.rb', line 191

# Classifies how +str+ must be written:
#   0 - contains no quote, separator, CR or LF: write as-is
#   1 - contains a separator/CR/LF but no quote from that point on: quote it
#   2 - contains a quote character: quote it and escape the quotes
def grok(str)
  first = str.index(@escapes)
  return 0 unless first

  # group 1 is set only when the first special character is itself a quote
  return 2 if Regexp.last_match(1)

  str.index(@quotes, first) ? 2 : 1
end

#next_row ⇒ Object



127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
# File 'lib/censive.rb', line 127

# Returns the next row as an Array of Strings, or nil at the end of input.
#
# Fast path (while @cheat is set): take the whole line, split it on the
# separator, and accept the result as long as every cell that contains quote
# characters is simply wrapped in one pair of them. The first line that does
# not fit turns the fast path off for good, rewinds the scanner, and is
# re-read token by token.
#
# Token path: collect tokens from #next_token until it returns nil. A nil
# first token means either end of input or a blank line; the two are told
# apart by whether the scanner advanced, so a blank line yields an empty row
# (as the fast path does) instead of ending the parse early and discarding
# the rest of the input.
def next_row
  if @cheat && (line = scan_until(@eol))
    line.chomp!
    row = line.split(@sep, -1)
    if line.include?(@quote)
      row.each do |col|
        next if (saw = col.count(@quote)).zero?
        # exactly two quotes, one at each end: unwrap in place and move on
        next if saw == 2 && col.delete_prefix!(@quote) && col.delete_suffix!(@quote)
        @cheat = false
        break
      end
    end
    if @cheat
      row.each(&:strip!) if @strip
      return row
    end
    unscan
  end

  start = pos
  token = next_token
  if token.nil?
    # advanced past a bare line ending => blank line; no movement => done
    return pos > start ? [] : nil
  end

  row = []
  row.push(*token)
  row.push(*token) while (token = next_token)
  row
end

#next_token ⇒ Object



147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
# File 'lib/censive.rb', line 147

# Scans one token at the current position and returns it:
#
# * a String for a quoted cell (quotes removed, doubled quotes collapsed)
# * an Array of Strings for a run of unquoted cells
# * an Array of empty Strings, or the shared empty String, for one or more
#   separators at the current position (one empty cell per separator)
# * nil when none of the above match; an end-of-line sequence at the current
#   position, if any, is consumed first
#
# Malformed quoting ends the program through #bomb.
def next_token
  if scan(@quoted) # quoted cell
    token = ""
    while true
      # take everything up to the next quote, leaving the quote itself out
      token << (scan_until(@quotes) or bomb "unclosed quote")[0..-2]
      # a second quote right behind it is an escaped quote: keep one, go on
      token << @quote and next if scan(@quote)
      # otherwise the cell is closed if a separator, CR, LF or end of input
      # follows (lookahead only, nothing is consumed here)
      scan(@eoc) and break
      @relax or bomb "invalid character after quote"
      # relaxed parsing: keep the quote as data together with the text up to
      # and including the next quote, then continue the loop
      token << @quote + (scan_until(@quotes) or bomb "bad inline quote")
    end
    scan(@sep) # step over the separator that ends this cell, if present
    @strip ? token.strip : token
  elsif match = scan(@unquoted) # unquoted cell(s)
    # the match runs up to the next quote or line ending, so it may hold
    # several separator-delimited cells
    if check(@quote) && !match.chomp!(@sep) # if we see a stray quote
      # (a match ending in a separator just means the next cell is quoted;
      # chomp! has already removed that trailing separator)
      unless @excel && match.chomp!(@seq) # unless an excel literal, fix it
        # quote in the middle of a cell: take the rest of the cell verbatim
        match << (scan_until(@eoc) or bomb "stray quote")
        scan(@sep)
      end
    end
    tokens = match.split(@sep, -1) # -1 keeps trailing empty cells
    @strip ? tokens.map!(&:strip) : tokens
  elsif scan(@sep)
    # one separator consumed above; any immediately following separators are
    # consumed here, giving one empty cell per separator
    match = scan(@seps)
    match ? match.split(@sep, -1) : @es
  else
    scan(@eol)
    nil
  end
end

#parse ⇒ Object

==[ Parser ]==



116
117
118
119
120
121
122
123
124
125
# File 'lib/censive.rb', line 116

# Reads rows with #next_row until it returns nil, storing them in @rows and
# returning that array. Along the way it tracks the widest row seen (@cols)
# and the running total of cells (@cells).
def parse
  @rows = []
  while (row = next_row)
    width = row.size
    @rows.push(row)
    @cols = [@cols, width].max
    @cells += width
  end
  @rows
end

#reset(str = nil) ⇒ Object



105
106
107
108
109
110
111
112
# File 'lib/censive.rb', line 105

# Clears parse results and counters and rewinds the scanner; when +str+ is
# given it first replaces the data being scanned. The recorded encoding is
# refreshed from the current data string.
# NOTE(review): @cheat is not touched here, so a disabled fast path stays
# disabled after a reset - confirm that is intended.
def reset(str=nil)
  @rows  = nil
  @cols  = 0
  @cells = 0

  if str
    self.string = str
  end
  @encoding = string.encoding
  super()
end

#stats ⇒ Object



238
239
240
241
242
243
244
# File 'lib/censive.rb', line 238

# Prints row, column, cell and byte counts for the data, right-aligned to a
# common width.
#
# The data is parsed first if it has not been yet (the same lazy step #each
# takes), so calling this before #parse no longer fails on a nil row list.
# The byte figure is the data's bytesize, which differs from its character
# count for multibyte text.
def stats
  @rows ||= parse
  bytes = string.bytesize
  wide  = [@rows.size, @cols, @cells, bytes].map { |n| n.to_s.size }.max
  puts "%#{wide}d rows"    % @rows.size
  puts "%#{wide}d columns" % @cols
  puts "%#{wide}d cells"   % @cells
  puts "%#{wide}d bytes"   % bytes
end