Class: ImportCSV
- Inherits:
-
Object
- Object
- ImportCSV
- Defined in:
- lib/import_csv.rb
Instance Attribute Summary collapse
-
#background_task ⇒ Object
Returns the value of attribute background_task.
-
#current ⇒ Object
current line in csv file if header has been define.
-
#current_preload ⇒ Object
current preload data.
-
#define_header_by_your_self ⇒ Object
Returns the value of attribute define_header_by_your_self.
-
#file_eof ⇒ Object
Returns the value of attribute file_eof.
-
#file_object ⇒ Object
file object (File).
-
#file_path ⇒ Object
file path (string).
-
#has_header ⇒ Object
header (Boolean).
-
#header ⇒ Object
header (Array).
-
#line_count ⇒ Object
current line number in file.
-
#next_preload ⇒ Object
Returns the value of attribute next_preload.
-
#parse_options ⇒ Object
set parse options.
-
#preload ⇒ Object
preload data.
-
#query ⇒ Object
Returns the value of attribute query.
-
#return_preload_only ⇒ Object
Boolean.
Class Method Summary collapse
Instance Method Summary collapse
-
#_background_preload ⇒ Object
Experimental Currently not working.
- #_preload ⇒ Object
- #clear_filter ⇒ Object
- #create_hash ⇒ Object
-
#each ⇒ Object
Loop through csv file.
-
#each_preload(return_hash = self.has_header) ⇒ Object
return preload data, not a single line but preload data.
-
#file_name ⇒ Object
return file name.
- #get_header_index(header) ⇒ Object
-
#header_generator ⇒ Object
create atribute based on header.
-
#initialize(file_path, options = Hash.new) ⇒ ImportCSV
constructor
Class constructor.
-
#next ⇒ Object
Get next line from CSV.
-
#perform_filter(query = self.query) ⇒ Object
Perform preload with filter data.
-
#perform_preload ⇒ Object
Get n line from csv file and parse.
-
#reopen ⇒ Object
if object from this class will reuse, call this method to reopen file so you can read file again.
-
#set_header(header) ⇒ Object
(also: #define_header)
set header, so you can call atribute based on header.
-
#where(query = Hash.new) ⇒ Object
Set filter.
Constructor Details
#initialize(file_path, options = Hash.new) ⇒ ImportCSV
Class constructor. set file path and preload data if block given, then will call each so you can add block line using each
Example :
ImportCSV.new(Rails.root.join('db/seeds/development/tx_locations.csv')) do |line|
p line.current[0]
p line.current[1]
... your code ...
end
181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 |
# File 'lib/import_csv.rb', line 181 def initialize(file_path, = Hash.new) self.preload = [:preload] || 1000 self.line_count = 1 self.file_path = file_path self. = [:parse_options] || {} self.return_preload_only = [:return_preload_only] || false self.file_object = File.open(self.file_path, 'r') self.background_task = nil self.next_preload = [] self.file_eof = false if [:header] self.has_header = true if [:header].kind_of?(Array) self.header = [:header] end # create attribute self.header_generator end self.current = [] self.current_preload = [] self.query = Hash.new if block_given? self.each { |line, line_count| yield line, line_count } else self end end |
Instance Attribute Details
#background_task ⇒ Object
Returns the value of attribute background_task.
164 165 166 |
# File 'lib/import_csv.rb', line 164 def background_task @background_task end |
#current ⇒ Object
current line in csv file if header has been define
150 151 152 |
# File 'lib/import_csv.rb', line 150 def current @current end |
#current_preload ⇒ Object
current preload data
153 154 155 |
# File 'lib/import_csv.rb', line 153 def current_preload @current_preload end |
#define_header_by_your_self ⇒ Object
Returns the value of attribute define_header_by_your_self.
147 148 149 |
# File 'lib/import_csv.rb', line 147 def define_header_by_your_self @define_header_by_your_self end |
#file_eof ⇒ Object
Returns the value of attribute file_eof.
168 169 170 |
# File 'lib/import_csv.rb', line 168 def file_eof @file_eof end |
#file_object ⇒ Object
file object (File)
140 141 142 |
# File 'lib/import_csv.rb', line 140 def file_object @file_object end |
#file_path ⇒ Object
file path (string)
137 138 139 |
# File 'lib/import_csv.rb', line 137 def file_path @file_path end |
#has_header ⇒ Object
header (Boolean)
146 147 148 |
# File 'lib/import_csv.rb', line 146 def has_header @has_header end |
#header ⇒ Object
header (Array)
143 144 145 |
# File 'lib/import_csv.rb', line 143 def header @header end |
#line_count ⇒ Object
current line number in file
134 135 136 |
# File 'lib/import_csv.rb', line 134 def line_count @line_count end |
#next_preload ⇒ Object
Returns the value of attribute next_preload.
166 167 168 |
# File 'lib/import_csv.rb', line 166 def next_preload @next_preload end |
#parse_options ⇒ Object
set parse options
160 161 162 |
# File 'lib/import_csv.rb', line 160 def end |
#preload ⇒ Object
preload data
131 132 133 |
# File 'lib/import_csv.rb', line 131 def preload @preload end |
#query ⇒ Object
Returns the value of attribute query.
162 163 164 |
# File 'lib/import_csv.rb', line 162 def query @query end |
#return_preload_only ⇒ Object
Boolean. If true, will loop through file and send current preload data to block function
157 158 159 |
# File 'lib/import_csv.rb', line 157 def return_preload_only @return_preload_only end |
Class Method Details
.export(file) ⇒ Object
738 739 740 741 742 743 744 745 746 747 |
# File 'lib/import_csv.rb', line 738 def self.export(file) if block_given? File.open(file, 'w') File.open(file, 'a') do |file| yield file end else raise NotImplementedError, "block must given." end end |
Instance Method Details
#_background_preload ⇒ Object
Experimental Currently not working
run preload on background
422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 |
# File 'lib/import_csv.rb', line 422 def _background_preload mutex = Mutex.new self.background_task = Thread.fork do mutex.synchronize do # for temporary data before parse to array _row = String.new for i in 1.upto(self.preload) # if self.file_object.eof? # parse last data # break # else begin # add line in file to temporary data _row += self.file_object.readline rescue EOFError => e self.file_eof = true break end # end end # parse data begin self.next_preload = CSV.parse(_row, self.) rescue => e debugger raise e end end Thread.current.exit end end |
#_preload ⇒ Object
455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 |
# File 'lib/import_csv.rb', line 455 def _preload return false if self.file_object.eof? # for temporary data before parse to array row = '' for i in 1.upto(self.preload) if self.file_object.eof? # parse last data # self.current_preload = CSV.parse(row, self.parse_options) break else # add line in file to temporary data row << self.file_object.readline end end # parse data self.current_preload = CSV.parse(row, self.) return true end |
#clear_filter ⇒ Object
581 582 583 584 |
# File 'lib/import_csv.rb', line 581 def clear_filter self.query = Hash.new self end |
#create_hash ⇒ Object
243 244 245 246 247 248 249 |
# File 'lib/import_csv.rb', line 243 def create_hash result_hash = [] self.current_preload.each do |preload| result_hash << Hash[self.header.zip(preload)] end return result_hash end |
#each ⇒ Object
Loop through csv file.
Example :
csv = ImportCSV.new(Rails.root.join('db/seeds/development/tx_locations.csv'))
csv.each do |line, line_count|
p line[0]
p line[1]
... your code ...
end
Example :
csv = ImportCSV.new(Rails.root.join('db/seeds/development/tx_locations.csv'), header: true)
csv.each do |line, line_count|
csv.location_id
csv.location_name
... your code ...
end
325 326 327 328 329 330 331 332 333 334 335 |
# File 'lib/import_csv.rb', line 325 def each if self.has_header while self.next yield self, self.line_count end else while self.next yield self.current, self.line_count end end end |
#each_preload(return_hash = self.has_header) ⇒ Object
return preload data, not a single line but preload data. Size of preload data is depend on preload attribute, default is 1000
this method can be usefull for parallel processing
219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 |
# File 'lib/import_csv.rb', line 219 def each_preload(return_hash = self.has_header) if self.query.empty? if return_hash while self.perform_preload yield create_hash end else while self.perform_preload yield self.current_preload end end else if return_hash while self.perform_filter yield create_hash end else while self.perform_filter yield self.current_preload end end end end |
#file_name ⇒ Object
return file name
210 211 212 |
# File 'lib/import_csv.rb', line 210 def file_name File.basename self.file_object end |
#get_header_index(header) ⇒ Object
275 276 277 278 |
# File 'lib/import_csv.rb', line 275 def get_header_index(header) return header if header.kind_of?(Integer) return self.header.index(header.to_s.downcase) end |
#header_generator ⇒ Object
create atribute based on header. you don’t have to call this method in your code.
284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 |
# File 'lib/import_csv.rb', line 284 def header_generator if self.has_header if self.header.kind_of?(Array) header_line = self.header else self.header = CSV.parse(self.file_object.readline).first.map(&:downcase) header_line = self.header end # create atribute based on csv header header_line.each_with_index do |header, index| self.define_singleton_method(header.downcase.gsub(/[^A-Za-z]/, '_')) do return self.current[index] end end self.define_singleton_method(:[]) do |key| return self.current[header_line.index(key.downcase)] end return self.header else return false end end |
#next ⇒ Object
Get next line from CSV. This method actualy return data from preload variable, if preload empty this method will call perform_preload or filter -if query is not empty- to fill data to preload variable.
After call this method, data will store on object (in current variable or in method with same name with header).
Example :
csv = ImportCSV.new(Rails.root.join('db/seeds/development/tx_locations.csv'), header: true)
csv.next
p csv.location_id => `return first line`
csv.next
p csv.location_id => `return second line`
Example using while :
csv = ImportCSV.new(Rails.root.join('db/seeds/development/tx_locations.csv'), header: true)
while csv.next
p csv.location_id
end
357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 |
# File 'lib/import_csv.rb', line 357 def next # if current_preload is empty, this method will call `perform_preload` or # `filter` to fill current_preload with the data. if self.current_preload.empty? # if cursor reach end of file, then will return false. This is usefull if # you call this method in `while`. See example above. if self.file_object.eof? self.current = [] return false else # determine which preload method will call, perform_preload which is get # data without any filter. # # to perfom `filter preload` you must set filter in `where` method. See # example in that method. if self.query.empty? self.perform_preload else self.perform_filter end end end # if preload method above return empty data, return false. if !self.current_preload.empty? # set current parsed line from first element in `current` attribute self.current = self.current_preload.first # delete first element in current_preload atribute self.current_preload.shift self.line_count += 1 return true else return false end end |
#perform_filter(query = self.query) ⇒ Object
Perform preload with filter data. Call where with query first before call this method.
589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 |
# File 'lib/import_csv.rb', line 589 def perform_filter(query = self.query) # list of operator that can be used. filter_operation = ['>', '<', '!', '%'] col_sep = self.[:col_sep] || ',' row_sep = self.[:row_sep] || "\n" # temporary preload data. row = [] # first loop to make sure temporary preload size is same as defined preload # size loop do row_tmp = [] # preload data for i in 1.upto(self.preload) if self.file_object.eof? # break the loop if reach end of line break else # get line and split to array elament by column separator, _row_tmp = self.file_object.readline.split(col_sep) is_insert = false # loop throug query difined in `where` query.each do |key, values| # if value is Range, (integer)1..2 or (date)01-01-2015..01-01-2016 if values.kind_of?(Range) # scan for data type insert brackets. # if values.first.scan(/\(([^\)]+)\)/)[0][0].downcase == "date" if values.first.kind_of?(Date) || values.first.kind_of?(Time) # tmp_range_first = Date.parse(values.first[6..values.first.size]) tmp_range_first = values.first tmp_range_last = values.last # remove quote \" and `new line` from string tmp_value = Date.parse(_row_tmp[self.get_header_index(key)].gsub(/\A"|"\Z/, '').gsub(row_sep, '')) # elsif values.first.scan(/\(([^\)]+)\)/)[0][0].downcase == "integer" elsif values.first.kind_of?(Integer) || values.first.kind_of?(Float) # value with type integer will convert to float # tmp_range_first = (values.first[9..values.first.size]).to_f tmp_range_first = (values.first).to_f tmp_range_last = (values.last).to_f tmp_value = (_row_tmp[self.get_header_index(key)].gsub(/\A"|"\Z/, '').gsub(row_sep, '')).to_f end # comparation if (tmp_value >= tmp_range_first) && (tmp_value <= tmp_range_last) is_insert = true break else is_insert = false end else # if value from query is not array, then will be conver to array # with only one element if !values.kind_of?(Array) values = [values] end #loop throug value from query values.each do |value| # check operator from first caracter in value, is any operator # define or not. if not, then will goto `equal` if filter_operation.include?(value[0]) if value[0] === '>' || value[0] === '<' # scan for data type insert brackets. if value.scan(/\(([^\)]+)\)/)[0][0].downcase == "date" tmp_filter = Date.parse(value[7..value.size]) # remove quote \" and `new line` from string tmp_value = Date.parse(_row_tmp[self.get_header_index(key)].gsub(/\A"|"\Z/, '').gsub(row_sep, '')) elsif value.scan(/\(([^\)]+)\)/)[0][0].downcase == "integer" tmp_filter = value[10..value.size].to_f tmp_value = _row_tmp[self.get_header_index(key)].gsub(/\A"|"\Z/, '').gsub(row_sep, '').to_f end # comparation if value[0] === '>' if tmp_value > tmp_filter is_insert = true break else is_insert = false end elsif value[0] === '<' if tmp_value < tmp_filter is_insert = true break else is_insert = false end end elsif value[0] === '!' # remove quote \" and `new line` from string if _row_tmp[self.get_header_index(key)].gsub(/\A"|"\Z/, '').gsub(row_sep, '') != value[1..value.size] is_insert = true else is_insert = false break end elsif value[0] === '%' # remove quote \" and `new line` from string if _row_tmp[self.get_header_index(key)].gsub(/\A"|"\Z/, '').gsub(row_sep, '').include?(value[1..value.size]) is_insert = true break else is_insert = false end else # raise an ArgumentError (Exception) if opertor is not one # of which has been defined raise ArgumentError, 'Operator not allowed. Use one of this [>, <, !, %].' end else # remove quote \" and `new line` from string if _row_tmp[self.get_header_index(key)].gsub(/\A"|"\Z/, '').gsub(row_sep, '') === (value) is_insert = true break else is_insert = false end end end end # go to next line if query return false break if !is_insert end # insert to temporary accepted row if all query return true row_tmp << _row_tmp.join(col_sep) if is_insert end end row.push(*row_tmp) if row_tmp.size > 0 break if row.size >= self.preload || self.file_object.eof? end self.current_preload = CSV.parse(row.join(col_sep).gsub("#{row_sep},", row_sep), self.) end |
#perform_preload ⇒ Object
Get n line from csv file and parse. n is preload attribute. Default value for preload is 1000, you can change this value in this class constructor. See initialize method for more example.
This method will return false if cursor has been reach end of line in csv file. Otherwise, return true.
399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 |
# File 'lib/import_csv.rb', line 399 def perform_preload _preload # if self.background_task.nil? # if _preload # _background_preload # return true # else # return false # end # else # return false if self.file_eof # ThreadsWait.join(self.background_task) # self.current_preload = self.next_preload # _background_preload # return true # end end |
#reopen ⇒ Object
if object from this class will reuse, call this method to reopen file so you can read file again.
725 726 727 728 729 730 731 732 733 734 735 736 |
# File 'lib/import_csv.rb', line 725 def reopen begin self.file_object = File.open(self.file_path, 'r') self.line_count = 1 self.current = [] self.current_preload = [] self.file_object.readline if self.has_header && !self.define_header_by_your_self true rescue => e raise e end end |
#set_header(header) ⇒ Object Also known as: define_header
set header, so you can call atribute based on header. Example:
csv = ImportCSV.new(Rails.root.join('db/seeds/development/tx_locations.csv'))
csv.set_header ['location_id', 'location_date', 'departure_date']
csv.each do
p csv.location_id
p csv.location_date
... your code ...
end
Parameter must be an array, if not, will raise an ArgumentError
263 264 265 266 267 268 269 270 271 272 273 |
# File 'lib/import_csv.rb', line 263 def set_header(header) if header.kind_of?(Array) # set header & has_header self.header = header.map(&:downcase) self.has_header = true self.define_header_by_your_self = true self.header_generator else raise ArgumentError, "header must be an array" end end |
#where(query = Hash.new) ⇒ Object
Set filter. You can use this operator [‘>’, ‘<’, ‘!’, ‘%’] and Range to perform filter.
Before set filter, you must set header true or define header by yourself. For set header true and define header, see example above.
Example:
CSV data:
___________________________
|id | name | birth |
|1 | shania | 27-06-1998|
|2 | jessica | 19-08-1993|
|3 | michelle| 28-10-1999|
|___|__________|___________|
- Equal.
* csv = ImportCSV.new('member.csv'), header: true)
csv.where(name: 'michelle')
... use csv.each or while csv.next ...
=> will return [3, 'michelle', '28-10-1999']
* csv = ImportCSV.new('member.csv'), header: true)
csv.where(name: ['shania', 'jessica'])
... use csv.each or while csv.next ...
=> will return [[1, 'shania', '27-06-1998'], [2, 'jessica', '19-08-1993']]
- Range. Only for Date, Integer and Float data type. Define datatype in
first range. Use `integer` for Integer & Float, use `date` for Date.
See Example below.
* csv = ImportCSV.new('member.csv'), header: true)
csv.where(id: '(integer)1'..'2')
... use csv.each or while csv.next ...
=> will return [[1, 'shania', '27-06-1998'], [2, 'jessica', '19-08-1993']]
* csv = ImportCSV.new('member.csv'), header: true)
csv.where(birth: '(date)01-01-1993'..'01-01-1999')
... use csv.each or while csv.next ...
=> will return [[1, 'shania', '27-06-1998'], [2, 'jessica', '19-08-1993'], [3, 'michelle', '28-10-1999']]
- Operator '>' & '<'. Only for column with data type Integer, Float or Date
Like `id` or `birth` in example csv data above.
- Data type must defined in filter, use `integer` for Integer or Float
and use `date` for Date. Put operator & data type together without
space. See example below.
- For filter with Date data type (in csv or in filter), any value that
can be parse using `Date.parse` are acceptable.
* csv = ImportCSV.new('member.csv'), header: true)
csv.where(id: '>(integer)1')
... use csv.each or while csv.next ...
=> will return [[2, 'jessica', '19-08-1993'], [1, 'michelle', '28-10-1999']]
* csv = ImportCSV.new('member.csv'), header: true)
csv.where(birth: '<(date)01-01-1997')
... use csv.each or while csv.next ...
=> will return [2, 'jessica', '19-08-1993']
- Operator '!'. Put this operator in first character
and folow with query without space.
* csv = ImportCSV.new('member.csv'), header: true)
csv.where(name: '!michelle')
... use csv.each or while csv.next ...
=> will return [[1, 'shania', '27-06-1998'], [2, 'jessica', '19-08-1993']]
* csv = ImportCSV.new('member.csv'), header: true)
csv.where(name: ['!shania', '!jessica'])
... use csv.each or while csv.next ...
=> will return [3, 'michelle', '28-10-1999']
- Operator '%'. `Like` Operator. Put this operator in first character and
folow with query without space.
* csv = ImportCSV.new('member.csv'), header: true)
csv.where(name: '%jes')
... use csv.each or while csv.next ...
=> will return [2, 'jessica', '19-08-1993']
* csv = ImportCSV.new('member.csv'), header: true)
csv.where(name: ['%jes', '%shan'])
... use csv.each or while csv.next ...
=> will return [[1, 'shania', '27-06-1998'], [2, 'jessica', '19-08-1993']]
Note :
- Data type must define if you use `<` or `>` or in Range. Don't define
data type in other operator.
559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 |
# File 'lib/import_csv.rb', line 559 def where(query = Hash.new) # if !self.has_header # raise ArgumentError, 'Header not detected.' # end query.each do |key, values| if values.kind_of?(Range) if values.first.kind_of?(String) || values.last.kind_of?(String) raise ArgumentError, "Range filter only accept Date, Time, Integer or Float data type." end if values.first > values.last raise ArgumentError, "First value is larger than last value." end end end self.query = self.query.merge(query) # for chaining self end |