Class: ImportCSV

Inherits:
Object
  • Object
show all
Defined in:
lib/import_csv.rb

Instance Attribute Summary collapse

Class Method Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(file_path, options = Hash.new) ⇒ ImportCSV

Class constructor. set file path and preload data if block given, then will call each so you can add block line using each

Example :

ImportCSV.new(Rails.root.join('db/seeds/development/tx_locations.csv')) do |line|
  p line.current[0]
  p line.current[1]
  ... your code ...
end


181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
# File 'lib/import_csv.rb', line 181

def initialize(file_path, options = Hash.new)
  self.preload = options[:preload] || 1000
  self.line_count = 1
  self.file_path = file_path
  self.parse_options = options[:parse_options] || {}
  self.return_preload_only = options[:return_preload_only] || false
  self.file_object = File.open(self.file_path, 'r')
  self.background_task = nil
  self.next_preload = []
  self.file_eof = false
  if options[:header]
    self.has_header = true
    if options[:header].kind_of?(Array)
      self.header = options[:header]
    end
    # create attribute
    self.header_generator
  end
  self.current = []
  self.current_preload = []
  self.query = Hash.new
  if block_given?
    self.each { |line, line_count| yield line, line_count }
  else
    self
  end
end

Instance Attribute Details

#background_taskObject

Returns the value of attribute background_task.



164
165
166
# File 'lib/import_csv.rb', line 164

def background_task
  @background_task
end

#currentObject

current line in csv file if header has been define



150
151
152
# File 'lib/import_csv.rb', line 150

def current
  @current
end

#current_preloadObject

current preload data



153
154
155
# File 'lib/import_csv.rb', line 153

def current_preload
  @current_preload
end

#define_header_by_your_selfObject

Returns the value of attribute define_header_by_your_self.



147
148
149
# File 'lib/import_csv.rb', line 147

def define_header_by_your_self
  @define_header_by_your_self
end

#file_eofObject

Returns the value of attribute file_eof.



168
169
170
# File 'lib/import_csv.rb', line 168

def file_eof
  @file_eof
end

#file_objectObject

file object (File)



140
141
142
# File 'lib/import_csv.rb', line 140

def file_object
  @file_object
end

#file_pathObject

file path (string)



137
138
139
# File 'lib/import_csv.rb', line 137

def file_path
  @file_path
end

#has_headerObject

header (Boolean)



146
147
148
# File 'lib/import_csv.rb', line 146

def has_header
  @has_header
end

#headerObject

header (Array)



143
144
145
# File 'lib/import_csv.rb', line 143

def header
  @header
end

#line_countObject

current line number in file



134
135
136
# File 'lib/import_csv.rb', line 134

def line_count
  @line_count
end

#next_preloadObject

Returns the value of attribute next_preload.



166
167
168
# File 'lib/import_csv.rb', line 166

def next_preload
  @next_preload
end

#parse_optionsObject

set parse options



160
161
162
# File 'lib/import_csv.rb', line 160

def parse_options
  @parse_options
end

#preloadObject

preload data



131
132
133
# File 'lib/import_csv.rb', line 131

def preload
  @preload
end

#queryObject

Returns the value of attribute query.



162
163
164
# File 'lib/import_csv.rb', line 162

def query
  @query
end

#return_preload_onlyObject

Boolean. If true, will loop through file and send current preload data to block function



157
158
159
# File 'lib/import_csv.rb', line 157

def return_preload_only
  @return_preload_only
end

Class Method Details

.export(file) ⇒ Object



738
739
740
741
742
743
744
745
746
747
# File 'lib/import_csv.rb', line 738

def self.export(file)
  if block_given?
    File.open(file, 'w')
    File.open(file, 'a') do |file|
      yield file
    end
  else
    raise NotImplementedError, "block must given."
  end
end

Instance Method Details

#_background_preloadObject

Experimental Currently not working

run preload on background



422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
# File 'lib/import_csv.rb', line 422

def _background_preload
  mutex = Mutex.new
  self.background_task = Thread.fork do
    mutex.synchronize do
      # for temporary data before parse to array
      _row = String.new
      for i in 1.upto(self.preload)
        # if self.file_object.eof?
          # parse last data
          # break
        # else
        begin
          # add line in file to temporary data
          _row += self.file_object.readline
        rescue EOFError => e
          self.file_eof = true
          break
        end
        # end
      end

      # parse data
      begin
        self.next_preload = CSV.parse(_row, self.parse_options)
      rescue => e
        debugger
        raise e
      end
    end
    Thread.current.exit
  end
end

#_preloadObject



455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
# File 'lib/import_csv.rb', line 455

def _preload
  return false if self.file_object.eof?

  # for temporary data before parse to array
  row = ''
  for i in 1.upto(self.preload)
    if self.file_object.eof?
      # parse last data
      # self.current_preload = CSV.parse(row, self.parse_options)
      break
    else
      # add line in file to temporary data
      row << self.file_object.readline
    end
  end
  # parse data
  self.current_preload = CSV.parse(row, self.parse_options)
  return true
end

#clear_filterObject



581
582
583
584
# File 'lib/import_csv.rb', line 581

def clear_filter
  self.query = Hash.new
  self
end

#create_hashObject



243
244
245
246
247
248
249
# File 'lib/import_csv.rb', line 243

def create_hash
  result_hash = []
  self.current_preload.each do |preload|
    result_hash << Hash[self.header.zip(preload)]
  end
  return result_hash
end

#eachObject

Loop through csv file.

Example :

csv = ImportCSV.new(Rails.root.join('db/seeds/development/tx_locations.csv'))
csv.each do |line, line_count|
  p line[0]
  p line[1]
  ... your code ...
end

Example :

csv = ImportCSV.new(Rails.root.join('db/seeds/development/tx_locations.csv'), header: true)
csv.each do |line, line_count|
  csv.location_id
  csv.location_name
  ... your code ...
end


325
326
327
328
329
330
331
332
333
334
335
# File 'lib/import_csv.rb', line 325

def each
  if self.has_header
    while self.next
      yield self, self.line_count
    end
  else
    while self.next
      yield self.current, self.line_count
    end
  end
end

#each_preload(return_hash = self.has_header) ⇒ Object

return preload data, not a single line but preload data. Size of preload data is depend on preload attribute, default is 1000

this method can be usefull for parallel processing



219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
# File 'lib/import_csv.rb', line 219

def each_preload(return_hash = self.has_header)
  if self.query.empty?
    if return_hash
      while self.perform_preload
        yield create_hash
      end
    else
      while self.perform_preload
        yield self.current_preload
      end
    end
  else
    if return_hash
      while self.perform_filter
        yield create_hash
      end
    else
      while self.perform_filter
        yield self.current_preload
      end
    end
  end
end

#file_nameObject

return file name



210
211
212
# File 'lib/import_csv.rb', line 210

def file_name
  File.basename self.file_object
end

#get_header_index(header) ⇒ Object



275
276
277
278
# File 'lib/import_csv.rb', line 275

def get_header_index(header)
  return header if header.kind_of?(Integer)
  return self.header.index(header.to_s.downcase)
end

#header_generatorObject

create atribute based on header. you don’t have to call this method in your code.



284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
# File 'lib/import_csv.rb', line 284

def header_generator
  if self.has_header
    if self.header.kind_of?(Array)
      header_line = self.header
    else
      self.header = CSV.parse(self.file_object.readline).first.map(&:downcase)
      header_line = self.header
    end
    # create atribute based on csv header
    header_line.each_with_index do |header, index|
      self.define_singleton_method(header.downcase.gsub(/[^A-Za-z]/, '_')) do
        return self.current[index]
      end
    end
    self.define_singleton_method(:[]) do |key|
      return self.current[header_line.index(key.downcase)]
    end
    return self.header
  else
    return false
  end
end

#nextObject

Get next line from CSV. This method actualy return data from preload variable, if preload empty this method will call perform_preload or filter -if query is not empty- to fill data to preload variable.

After call this method, data will store on object (in current variable or in method with same name with header).

Example :

csv = ImportCSV.new(Rails.root.join('db/seeds/development/tx_locations.csv'), header: true)
csv.next
p csv.location_id => `return first line`
csv.next
p csv.location_id => `return second line`

Example using while :

csv = ImportCSV.new(Rails.root.join('db/seeds/development/tx_locations.csv'), header: true)
while csv.next
  p csv.location_id
end


357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
# File 'lib/import_csv.rb', line 357

def next
  # if current_preload is empty, this method will call `perform_preload` or
  # `filter` to fill current_preload with the data.
  if self.current_preload.empty?
    # if cursor reach end of file, then will return false. This is usefull if
    # you call this method in `while`. See example above.
    if self.file_object.eof?
      self.current = []
      return false
    else
      # determine which preload method will call, perform_preload which is get
      # data without any filter.
      #
      # to perfom `filter preload` you must set filter in `where` method. See
      # example in that method.
      if self.query.empty?
        self.perform_preload
      else
        self.perform_filter
      end
    end
  end
  # if preload method above return empty data, return false.
  if !self.current_preload.empty?
    # set current parsed line from first element in `current` attribute
    self.current = self.current_preload.first
    # delete first element in current_preload atribute
    self.current_preload.shift
    self.line_count += 1
    return true
  else
    return false
  end
end

#perform_filter(query = self.query) ⇒ Object

Perform preload with filter data. Call where with query first before call this method.



589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
# File 'lib/import_csv.rb', line 589

def perform_filter(query = self.query)
  # list of operator that can be used.
  filter_operation = ['>', '<', '!', '%']
  col_sep = self.parse_options[:col_sep] || ','
  row_sep = self.parse_options[:row_sep] || "\n"
  # temporary preload data.
  row = []
  # first loop to make sure temporary preload size is same as defined preload
  # size
  loop do
    row_tmp = []
    # preload data
    for i in 1.upto(self.preload)
      if self.file_object.eof?
        # break the loop if reach end of line
        break
      else
        # get line and split to array elament by column separator,
        _row_tmp = self.file_object.readline.split(col_sep)
        is_insert = false
        # loop throug query difined in `where`
        query.each do |key, values|
          # if value is Range, (integer)1..2 or (date)01-01-2015..01-01-2016
          if values.kind_of?(Range)
            # scan for data type insert brackets.
            # if values.first.scan(/\(([^\)]+)\)/)[0][0].downcase == "date"
            if values.first.kind_of?(Date) || values.first.kind_of?(Time)
              # tmp_range_first = Date.parse(values.first[6..values.first.size])
              tmp_range_first = values.first
              tmp_range_last = values.last
              # remove quote \" and `new line` from string
              tmp_value = Date.parse(_row_tmp[self.get_header_index(key)].gsub(/\A"|"\Z/, '').gsub(row_sep, ''))
            # elsif values.first.scan(/\(([^\)]+)\)/)[0][0].downcase == "integer"
            elsif values.first.kind_of?(Integer) || values.first.kind_of?(Float)
              # value with type integer will convert to float
              # tmp_range_first = (values.first[9..values.first.size]).to_f
              tmp_range_first = (values.first).to_f
              tmp_range_last = (values.last).to_f
              tmp_value = (_row_tmp[self.get_header_index(key)].gsub(/\A"|"\Z/, '').gsub(row_sep, '')).to_f
            end

            # comparation
            if (tmp_value >= tmp_range_first) && (tmp_value <= tmp_range_last)
              is_insert = true
              break
            else
              is_insert = false
            end
          else
            # if value from query is not array, then will be conver to array
            # with only one element
            if !values.kind_of?(Array)
              values = [values]
            end

            #loop throug value from query
            values.each do |value|
              # check operator from first caracter in value, is any operator
              # define or not. if not, then will goto `equal`
              if filter_operation.include?(value[0])
                if value[0] === '>' || value[0] === '<'
                  # scan for data type insert brackets.
                  if value.scan(/\(([^\)]+)\)/)[0][0].downcase == "date"
                    tmp_filter = Date.parse(value[7..value.size])
                    # remove quote \" and `new line` from string
                    tmp_value = Date.parse(_row_tmp[self.get_header_index(key)].gsub(/\A"|"\Z/, '').gsub(row_sep, ''))
                  elsif value.scan(/\(([^\)]+)\)/)[0][0].downcase == "integer"
                    tmp_filter = value[10..value.size].to_f
                    tmp_value = _row_tmp[self.get_header_index(key)].gsub(/\A"|"\Z/, '').gsub(row_sep, '').to_f
                  end

                  # comparation
                  if value[0] === '>'
                    if tmp_value > tmp_filter
                      is_insert = true
                      break
                    else
                      is_insert = false
                    end
                  elsif value[0] === '<'
                    if tmp_value < tmp_filter
                      is_insert = true
                      break
                    else
                      is_insert = false
                    end
                  end

                elsif value[0] === '!'
                  # remove quote \" and `new line` from string
                  if _row_tmp[self.get_header_index(key)].gsub(/\A"|"\Z/, '').gsub(row_sep, '') != value[1..value.size]
                    is_insert = true
                  else
                    is_insert = false
                    break
                  end
                elsif value[0] === '%'
                  # remove quote \" and `new line` from string
                  if _row_tmp[self.get_header_index(key)].gsub(/\A"|"\Z/, '').gsub(row_sep, '').include?(value[1..value.size])
                    is_insert = true
                    break
                  else
                    is_insert = false
                  end

                else
                  # raise an ArgumentError (Exception) if opertor is not one
                  # of which has been defined
                  raise ArgumentError, 'Operator not allowed. Use one of this [>, <, !, %].'
                end
              else
                # remove quote \" and `new line` from string
                if _row_tmp[self.get_header_index(key)].gsub(/\A"|"\Z/, '').gsub(row_sep, '') === (value)
                  is_insert = true
                  break
                else
                  is_insert = false
                end
              end
            end
          end
          # go to next line if query return false
          break if !is_insert
        end
        # insert to temporary accepted row if all query return true
        row_tmp << _row_tmp.join(col_sep) if is_insert
      end
    end
    row.push(*row_tmp) if row_tmp.size > 0
    break if row.size >= self.preload || self.file_object.eof?
  end
  self.current_preload = CSV.parse(row.join(col_sep).gsub("#{row_sep},", row_sep), self.parse_options)
end

#perform_preloadObject

Get n line from csv file and parse. n is preload attribute. Default value for preload is 1000, you can change this value in this class constructor. See initialize method for more example.

This method will return false if cursor has been reach end of line in csv file. Otherwise, return true.



399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
# File 'lib/import_csv.rb', line 399

def perform_preload
  _preload
  # if self.background_task.nil?
  #   if _preload
  #     _background_preload
  #     return true
  #   else
  #     return false
  #   end
  # else
  #   return false if self.file_eof
  #   ThreadsWait.join(self.background_task)
  #   self.current_preload = self.next_preload
  #   _background_preload
  #   return true
  # end
end

#reopenObject

if object from this class will reuse, call this method to reopen file so you can read file again.



725
726
727
728
729
730
731
732
733
734
735
736
# File 'lib/import_csv.rb', line 725

def reopen
  begin
    self.file_object = File.open(self.file_path, 'r')
    self.line_count = 1
    self.current = []
    self.current_preload = []
    self.file_object.readline if self.has_header && !self.define_header_by_your_self
    true
  rescue => e
    raise e
  end
end

#set_header(header) ⇒ Object Also known as: define_header

set header, so you can call atribute based on header. Example:

csv = ImportCSV.new(Rails.root.join('db/seeds/development/tx_locations.csv'))
csv.set_header ['location_id', 'location_date', 'departure_date']
csv.each do
  p csv.location_id
  p csv.location_date
  ... your code ...
end

Parameter must be an array, if not, will raise an ArgumentError



263
264
265
266
267
268
269
270
271
272
273
# File 'lib/import_csv.rb', line 263

def set_header(header)
  if header.kind_of?(Array)
    # set header & has_header
    self.header = header.map(&:downcase)
    self.has_header = true
    self.define_header_by_your_self = true
    self.header_generator
  else
    raise ArgumentError, "header must be an array"
  end
end

#where(query = Hash.new) ⇒ Object

Set filter. You can use this operator [‘>’, ‘<’, ‘!’, ‘%’] and Range to perform filter.

Before set filter, you must set header true or define header by yourself. For set header true and define header, see example above.

Example:

CSV data:
___________________________
|id |  name    | birth     |
|1  |  shania  | 27-06-1998|
|2  |  jessica | 19-08-1993|
|3  |  michelle| 28-10-1999|
|___|__________|___________|

- Equal.
    * csv = ImportCSV.new('member.csv'), header: true)
      csv.where(name: 'michelle')
      ... use csv.each or while csv.next ...
        => will return [3, 'michelle', '28-10-1999']

    * csv = ImportCSV.new('member.csv'), header: true)
      csv.where(name: ['shania', 'jessica'])
      ... use csv.each or while csv.next ...
        => will return [[1, 'shania', '27-06-1998'], [2, 'jessica', '19-08-1993']]

- Range. Only for Date, Integer and Float data type. Define datatype in
  first range. Use `integer` for Integer & Float, use `date` for Date.
  See Example below.
  * csv = ImportCSV.new('member.csv'), header: true)
      csv.where(id: '(integer)1'..'2')
      ... use csv.each or while csv.next ...
      => will return [[1, 'shania', '27-06-1998'], [2, 'jessica', '19-08-1993']]

  * csv = ImportCSV.new('member.csv'), header: true)
      csv.where(birth: '(date)01-01-1993'..'01-01-1999')
      ... use csv.each or while csv.next ...
       => will return [[1, 'shania', '27-06-1998'], [2, 'jessica', '19-08-1993'], [3, 'michelle', '28-10-1999']]

- Operator '>' & '<'. Only for column with data type Integer, Float or Date
  Like `id` or `birth` in example csv data above.
  - Data type must defined in filter, use `integer` for Integer or Float
    and use `date` for Date. Put operator & data type together without
    space. See example below.
  - For filter with Date data type (in csv or in filter), any value that
    can be parse using `Date.parse` are acceptable.
    * csv = ImportCSV.new('member.csv'), header: true)
      csv.where(id: '>(integer)1')
      ... use csv.each or while csv.next ...
      => will return [[2, 'jessica', '19-08-1993'], [1, 'michelle', '28-10-1999']]

    * csv = ImportCSV.new('member.csv'), header: true)
      csv.where(birth: '<(date)01-01-1997')
      ... use csv.each or while csv.next ...
      => will return [2, 'jessica', '19-08-1993']

- Operator '!'. Put this operator in first character
  and folow with query without space.
    * csv = ImportCSV.new('member.csv'), header: true)
      csv.where(name: '!michelle')
      ... use csv.each or while csv.next ...
        => will return [[1, 'shania', '27-06-1998'], [2, 'jessica', '19-08-1993']]

    * csv = ImportCSV.new('member.csv'), header: true)
      csv.where(name: ['!shania', '!jessica'])
      ... use csv.each or while csv.next ...
        => will return [3, 'michelle', '28-10-1999']

- Operator '%'. `Like` Operator. Put this operator in first character and
  folow with query without space.
    * csv = ImportCSV.new('member.csv'), header: true)
      csv.where(name: '%jes')
      ... use csv.each or while csv.next ...
        => will return [2, 'jessica', '19-08-1993']

    * csv = ImportCSV.new('member.csv'), header: true)
      csv.where(name: ['%jes', '%shan'])
      ... use csv.each or while csv.next ...
        => will return [[1, 'shania', '27-06-1998'], [2, 'jessica', '19-08-1993']]

Note :

- Data type must define if you use `<` or `>` or in Range. Don't define
  data type in other operator.


559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
# File 'lib/import_csv.rb', line 559

def where(query = Hash.new)
  # if !self.has_header
  #   raise ArgumentError, 'Header not detected.'
  # end

  query.each do |key, values|
    if values.kind_of?(Range)
      if values.first.kind_of?(String) || values.last.kind_of?(String)
        raise ArgumentError, "Range filter only accept Date, Time, Integer or Float data type."
      end

      if values.first > values.last
        raise ArgumentError, "First value is larger than last value."
      end
    end
  end

  self.query = self.query.merge(query)
  # for chaining
  self
end