Module: SmarterCSV

Defined in: lib/smarter_csv.rb,
lib/smarter_csv/version.rb,
ext/smarter_csv/smarter_csv.c

Defined Under Namespace

Classes: DuplicateHeaders, HeaderSizeMismatch, IncorrectOption, KeyMappingError, MissingKeys, NoColSepDetected, SmarterCSVException, ValidationError

Constant Summary collapse

VERSION =

"1.9.0"

Class Method Summary collapse

.count_quote_chars(line, quote_char) ⇒ Object

Counts the number of quote characters in a line, excluding escaped quotes.
.has_acceleration? ⇒ Boolean
.headers ⇒ Object
.parse_csv_line_c(line, col_sep, quote_char, max_size) ⇒ Object

max_size: pass nil if no limit is specified.
.process(input, options = {}, &block) ⇒ Object

First parameter: a filename, or an input object that responds to the `readline` method.
.raw_header ⇒ Object

Class Method Details

.count_quote_chars(line, quote_char) ⇒ `Object`

Counts the number of quote characters in a line, excluding escaped quotes.

# File 'lib/smarter_csv.rb', line 199

# Tallies the quote characters in +line+ that are not escaped.
#
# A quote counts as escaped when the character immediately before it is a
# backslash; only that single preceding character is inspected.
#
# @param line [String, nil] text to scan
# @param quote_char [String, nil] the quote character to look for
# @return [Integer] number of unescaped quote characters; 0 when either argument is nil
def count_quote_chars(line, quote_char)
  return 0 if line.nil? || quote_char.nil?

  preceding = ''

  line.each_char.count do |current|
    unescaped_quote = current == quote_char && preceding != '\\'
    preceding = current # remembered so the next character can tell whether it is escaped
    unescaped_quote
  end
end

.has_acceleration? ⇒ `Boolean`

Returns:

(Boolean)



186
187
188

# File 'lib/smarter_csv.rb', line 186

# Reports whether a `parse_csv_line_c` method is visible from here.
#
# A truthy answer is remembered in @has_acceleration. A false answer is stored
# too, but because it is falsy the check is repeated on the next call.
#
# @return [Boolean]
def has_acceleration?
  return @has_acceleration if @has_acceleration

  @has_acceleration = defined?(parse_csv_line_c) ? true : false
end

.headers ⇒ `Object`



194
195
196

# File 'lib/smarter_csv.rb', line 194

# Reader for the @headers instance variable; returns nil while it is unset.
#
# NOTE(review): presumably @headers is filled in while .process handles the
# header line -- the assignment is not visible in this section; confirm.
#
# @return [Object] whatever is currently stored in @headers
def headers
  @headers
end

.parse_csv_line_c(line, col_sep, quote_char, max_size) ⇒ `Object`

max_size: pass nil if no limit is specified

# File 'ext/smarter_csv/smarter_csv.c', line 15

/*
 * Splits one CSV line into a Ruby Array of String fields.
 *
 *   line       - String to split, or nil (nil yields an empty Array)
 *   col_sep    - String column separator; may be longer than one byte
 *   quote_char - String whose first byte is treated as the quote character
 *   max_size   - nil for "no limit", otherwise an Integer cap on the number of fields
 *
 * A separator only ends a field while the number of quote characters seen so far is even,
 * i.e. outside of a quoted section. A quote directly preceded by a backslash is not counted
 * (only the single preceding byte is inspected). Fields are returned verbatim, quotes
 * included, in the encoding of `line`.
 *
 * An empty col_sep never matches (the whole line becomes one field) and an empty quote_char
 * disables quote tracking.
 *
 * Raises TypeError when line is neither String nor nil, or when col_sep / quote_char is not
 * a String.
 */
static VALUE rb_parse_csv_line(VALUE self, VALUE line, VALUE col_sep, VALUE quote_char, VALUE max_size) {
  if (RB_TYPE_P(line, T_NIL) == 1) {
    return rb_ary_new();
  }

  if (RB_TYPE_P(line, T_STRING) != 1) {
    rb_raise(rb_eTypeError, "ERROR in SmarterCSV.parse_line: line has to be a string or nil");
  }

  /* RSTRING_PTR / RSTRING_LEN are only valid on Strings, so reject anything else explicitly */
  Check_Type(col_sep, T_STRING);
  Check_Type(quote_char, T_STRING);

  rb_encoding *encoding = rb_enc_get(line); /* get the encoding from the input line */
  char *startP = RSTRING_PTR(line); /* may not be null terminated */
  long line_len = RSTRING_LEN(line);
  char *endP = startP + line_len; /* points behind the string */
  char *p = startP;

  char *col_sepP = RSTRING_PTR(col_sep);
  long col_sep_len = RSTRING_LEN(col_sep);

  char *quoteP = RSTRING_PTR(quote_char);
  long quote_len = RSTRING_LEN(quote_char);
  long quote_count = 0;

  /* convert the limit once instead of on every separator */
  bool has_limit = (max_size != Qnil);
  long limit = has_limit ? NUM2INT(max_size) : 0;

  bool col_sep_found = true;

  VALUE elements = rb_ary_new();
  VALUE field;
  long i;

  char prev_char = '\0'; /* previous byte, used to detect a backslash-escaped quote */

  while (p < endP) {
    /* does the remaining string start with the complete col_sep ?
       a separator that does not fit into the remaining bytes cannot match; accepting a
       partial match here would move p past endP and produce a negative field length below */
    col_sep_found = (col_sep_len > 0) && ((endP - p) >= col_sep_len);
    for (i = 0; col_sep_found && (i < col_sep_len); i++) {
      col_sep_found = (*(p + i) == *(col_sepP + i));
    }

    /* if col_sep was found and we have even quotes */
    if (col_sep_found && (quote_count % 2 == 0)) {
      /* if max_size != nil && elements.size >= max_size: stop, the row is already full */
      if (has_limit && RARRAY_LEN(elements) >= limit) {
        break;
      } else {
        /* push that field with original encoding onto the results */
        field = rb_enc_str_new(startP, p - startP, encoding);
        rb_ary_push(elements, field);

        p += col_sep_len;
        startP = p;
      }
    } else {
      /* quote_len guard: never read the first byte of an empty quote_char */
      if ((quote_len > 0) && (*p == *quoteP) && (prev_char != '\\')) {
        quote_count += 1;
      }
      p++;
    }

    prev_char = *(p - 1); /* p has advanced by at least one byte in either branch */
  } /* while */

  /* check if the last part of the line needs to be processed */
  if (!has_limit || RARRAY_LEN(elements) < limit) {
    /* copy the remaining line as a field with original encoding onto the results */
    field = rb_enc_str_new(startP, endP - startP, encoding);
    rb_ary_push(elements, field);
  }

  return elements;
}

.process(input, options = {}, &block) ⇒ `Object`

First parameter: a filename, or an input object that responds to the `readline` method.

# File 'lib/smarter_csv.rb', line 20

# Reads CSV data from +input+ and converts every data line into a Hash keyed by the headers.
#
# @param input [String, #readline] a filename (opened with options[:file_encoding]), or an
#   object which responds to readline; such an object must also respond to eof?.
#   The handle is closed before returning whenever it responds to close -- this includes
#   an object passed in by the caller.
# @param options [Hash] settings merged on top of default_options
# @yield [Array<Hash>] with :chunk_size > 0 each chunk of hashes; otherwise a one-element
#   array holding the hash of a single line
# @return [Array, Integer, nil] without a block: an Array of Hashes, or an Array of Arrays of
#   Hashes in chunked mode. With a block: the number of chunks processed, which is nil when
#   chunking is not in use.
def SmarterCSV.process(input, options = {}, &block) # rubocop:disable Lint/UnusedMethodArgument
  options = default_options.merge(options)
  options[:invalid_byte_sequence] = '' if options[:invalid_byte_sequence].nil?
  puts "SmarterCSV OPTIONS: #{options.inspect}" if options[:verbose]
  validate_options!(options)

  headerA = []
  result = []
  @file_line_count = 0
  @csv_line_count = 0
  has_rails = !!defined?(Rails)
  begin
    fh = input.respond_to?(:readline) ? input : File.open(input, "r:#{options[:file_encoding]}")

    # auto-detect the row separator
    options[:row_sep] = guess_line_ending(fh, options) if options[:row_sep]&.to_sym == :auto
    # attempt to auto-detect column separator
    options[:col_sep] = guess_column_separator(fh, options) if options[:col_sep]&.to_sym == :auto

    if (options[:force_utf8] || options[:file_encoding] =~ /utf-8/i) && (fh.respond_to?(:external_encoding) && fh.external_encoding != Encoding.find('UTF-8') || fh.respond_to?(:encoding) && fh.encoding != Encoding.find('UTF-8'))
      puts 'WARNING: you are trying to process UTF-8 input, but did not open the input with "b:utf-8" option. See README file "NOTES about File Encodings".'
    end

    skip_lines(fh, options)

    headerA, header_size = process_headers(fh, options)

    # in case we use chunking.. we'll need to set it up..
    if !options[:chunk_size].nil? && options[:chunk_size].to_i > 0
      use_chunks = true
      chunk_size = options[:chunk_size].to_i
      chunk_count = 0
      chunk = []
    else
      use_chunks = false
    end

    # now on to processing all the rest of the lines in the CSV file:
    until fh.eof? # we can't use fh.readlines() here, because this would read the whole file into memory at once, and eof => true
      line = readline_with_counts(fh, options)

      # replace invalid byte sequence in UTF-8 with question mark to avoid errors
      line = line.force_encoding('utf-8').encode('utf-8', invalid: :replace, undef: :replace, replace: options[:invalid_byte_sequence]) if options[:force_utf8] || options[:file_encoding] !~ /utf-8/i

      print "processing file line %10d, csv line %10d\r" % [@file_line_count, @csv_line_count] if options[:verbose]

      next if options[:comment_regexp] && line =~ options[:comment_regexp] # ignore all comment lines if there are any

      # cater for the quoted csv data containing the row separator carriage return character
      # in which case the row data will be split across multiple lines (see the sample content in spec/fixtures/carriage_returns_rn.csv)
      # by detecting the existence of an uneven number of quote characters

      multiline = count_quote_chars(line, options[:quote_char]).odd? # should handle quote_char nil
      inside_quotes = multiline
      # stop at EOF: readline would raise EOFError when the input ends inside an unterminated
      # quoted field; in that case the line is parsed with what has been read so far
      while inside_quotes && !fh.eof?
        next_line = fh.readline(options[:row_sep])
        next_line = next_line.force_encoding('utf-8').encode('utf-8', invalid: :replace, undef: :replace, replace: options[:invalid_byte_sequence]) if options[:force_utf8] || options[:file_encoding] !~ /utf-8/i
        line += next_line
        @file_line_count += 1
        inside_quotes = count_quote_chars(line, options[:quote_char]).odd? # should handle quote_char nil
      end
      print "\nline contains uneven number of quote chars so including content through file line %d\n" % @file_line_count if options[:verbose] && multiline

      line.chomp!(options[:row_sep])

      dataA, _data_size = parse(line, options, header_size)

      dataA.map!{|x| x.strip} if options[:strip_whitespace]

      # if all values are blank, then ignore this line
      next if options[:remove_empty_hashes] && (dataA.empty? || blank?(dataA))

      hash = Hash.zip(headerA, dataA) # from Facets of Ruby library

      # make sure we delete any key/value pairs from the hash, which the user wanted to delete:
      hash.delete(nil)
      hash.delete('')
      hash.delete(:"")

      if options[:remove_empty_values] == true
        hash.delete_if{|_k, v| has_rails ? v.blank? : blank?(v)}
      end

      hash.delete_if{|_k, v| !v.nil? && v =~ /^(0+|0+\.0+)$/} if options[:remove_zero_values] # values are Strings
      hash.delete_if{|_k, v| v =~ options[:remove_values_matching]} if options[:remove_values_matching]

      if options[:convert_values_to_numeric]
        hash.each do |k, v|
          # deal with the :only / :except options to :convert_values_to_numeric
          next if only_or_except_limit_execution(options, :convert_values_to_numeric, k)

          # convert if it's a numeric value:
          case v
          when /^[+-]?\d+\.\d+$/
            hash[k] = v.to_f
          when /^[+-]?\d+$/
            hash[k] = v.to_i
          end
        end
      end

      if options[:value_converters]
        hash.each do |k, v|
          converter = options[:value_converters][k]
          next unless converter

          hash[k] = converter.convert(v)
        end
      end

      next if options[:remove_empty_hashes] && hash.empty?

      hash[:csv_line_number] = @csv_line_count if options[:with_line_numbers]

      if use_chunks
        chunk << hash # append temp result to chunk

        if chunk.size >= chunk_size || fh.eof? # if chunk if full, or EOF reached
          # do something with the chunk
          if block_given?
            yield chunk # do something with the hashes in the chunk in the block
          else
            result << chunk # not sure yet, why anybody would want to do this without a block
          end
          chunk_count += 1
          chunk = [] # initialize for next chunk of data
        end

        # while a chunk is being filled up we don't need to do anything else here;
        # a partially filled last chunk is handed over after the loop (BUG / ISSUE-18)

      else # no chunk handling
        if block_given?
          yield [hash] # do something with the hash in the block (better to use chunking here)
        else
          result << hash
        end
      end
    end

    # print new line to retain last processing line message
    print "\n" if options[:verbose]

    # last chunk:
    if !chunk.nil? && chunk.size > 0
      # do something with the chunk
      if block_given?
        yield chunk # do something with the hashes in the chunk in the block
      else
        result << chunk # not sure yet, why anybody would want to do this without a block
      end
      chunk_count += 1
      # chunk = [] # initialize for next chunk of data
    end
  ensure
    fh.close if fh.respond_to?(:close)
  end
  if block_given?
    chunk_count # when we do processing through a block we only care how many chunks we processed
  else
    result # returns either an Array of Hashes, or an Array of Arrays of Hashes (if in chunked mode)
  end
end

.raw_header ⇒ `Object`



190
191
192

# File 'lib/smarter_csv.rb', line 190

# Reader for the @raw_header instance variable; returns nil while it is unset.
#
# NOTE(review): presumably this holds the header line as read from the input,
# before any header processing -- the assignment is not visible in this section; confirm.
#
# @return [Object] whatever is currently stored in @raw_header
def raw_header
  @raw_header
end

Module: SmarterCSV

Defined Under Namespace

Constant Summary collapse

Class Method Summary collapse

Class Method Details

.count_quote_chars(line, quote_char) ⇒ Object

.has_acceleration? ⇒ Boolean

.headers ⇒ Object

.parse_csv_line_c(line, col_sep, quote_char, max_size) ⇒ Object