Module: SmarterCSV

Defined in: lib/smarter_csv/parse.rb,
lib/smarter_csv/file_io.rb,
lib/smarter_csv/headers.rb,
lib/smarter_csv/version.rb,
lib/smarter_csv/variables.rb,
lib/smarter_csv/smarter_csv.rb,
lib/smarter_csv/auto_detection.rb,
lib/smarter_csv/header_validations.rb,
lib/smarter_csv/options_processing.rb,
lib/smarter_csv/hash_transformations.rb,
lib/smarter_csv/header_transformations.rb,
ext/smarter_csv/smarter_csv.c

Defined Under Namespace

Classes: DuplicateHeaders, HeaderSizeMismatch, IncorrectOption, KeyMappingError, MissingKeys, NoColSepDetected, SmarterCSVException, ValidationError

Constant Summary collapse

VERSION =

"1.10.3"

# Default value for every option understood by SmarterCSV.process.
# process_options merges the caller's options over a copy of this Hash, and
# default_options returns it as-is. The Hash is frozen, so it cannot be
# modified in place.
DEFAULT_OPTIONS =

{
  acceleration: true,
  auto_row_sep_chars: 500,
  chunk_size: nil,
  col_sep: :auto, # was: ',',
  comment_regexp: nil, # was: /\A#/,
  convert_values_to_numeric: true,
  downcase_header: true,
  duplicate_header_suffix: '', # was: nil,
  file_encoding: 'utf-8',
  force_simple_split: false,
  force_utf8: false,
  headers_in_file: true,
  invalid_byte_sequence: '',
  keep_original_headers: false,
  key_mapping: nil,
  quote_char: '"',
  remove_empty_hashes: true,
  remove_empty_values: true,
  remove_unmapped_keys: false,
  remove_values_matching: nil,
  remove_zero_values: false,
  required_headers: nil,
  required_keys: nil,
  row_sep: :auto, # was: $/,
  silence_missing_keys: false,
  skip_lines: nil,
  strings_as_keys: false,
  strip_chars_from_headers: nil,
  strip_whitespace: true,
  user_provided_headers: nil,
  value_converters: nil,
  verbose: false,
  with_line_numbers: false,
}.freeze

Class Attribute Summary collapse

.chunk_count ⇒ Object readonly

Returns the value of attribute chunk_count.
.csv_line_count ⇒ Object readonly

Returns the value of attribute csv_line_count.
.errors ⇒ Object readonly

Returns the value of attribute errors.
.file_line_count ⇒ Object readonly

Returns the value of attribute file_line_count.
.has_rails ⇒ Object readonly

Returns the value of attribute has_rails.
.headers ⇒ Object readonly

Returns the value of attribute headers.
.raw_header ⇒ Object readonly

Returns the value of attribute raw_header.
.result ⇒ Object readonly

Returns the value of attribute result.
.warnings ⇒ Object readonly

Returns the value of attribute warnings.

Class Method Summary collapse

.check_duplicate_headers(headers, _options) ⇒ Object
.check_required_headers(headers, options) ⇒ Object
.count_quote_chars(line, quote_char) ⇒ Object
.default_options ⇒ Object

NOTE: this is not called when “parse” methods are tested by themselves.
.disambiguate_headers(headers, options) ⇒ Object
.has_acceleration? ⇒ Boolean
.hash_transformations(hash, options) ⇒ Object
.header_transformations(header_array, options) ⇒ Object

Transforms the headers that were read from the file.
.header_validations(headers, options) ⇒ Object
.headerA ⇒ Object

:nocov: rubocop:disable Naming/MethodName.
.initialize_variables ⇒ Object
.parse_csv_line_c(line, col_sep, quote_char, max_size) ⇒ Object

max_size: pass nil if no limit is specified.
.process(input, given_options = {}, &block) ⇒ Object

first parameter: filename or input object which responds to readline method.
.process_headers(filehandle, options) ⇒ Object
.process_options(given_options = {}) ⇒ Object

NOTE: this is not called when “parse” methods are tested by themselves.
.remap_headers(headers, options) ⇒ Object

Applies the key mapping to the keys in the file header. If you want to completely delete a key, map it to nil or to ''.

Class Attribute Details

.chunk_count ⇒ `Object` (readonly)

Returns the value of attribute chunk_count.



5
6
7

# File 'lib/smarter_csv/variables.rb', line 5

# Number of chunks handled by the most recent call to `process`.
# Reset to 0 by `initialize_variables` and incremented by `process` each time
# a chunk is yielded to the block or appended to the result.
def chunk_count
  @chunk_count
end

.csv_line_count ⇒ `Object` (readonly)

Returns the value of attribute csv_line_count.



5
6
7

# File 'lib/smarter_csv/variables.rb', line 5

# Counter of CSV lines, reset to 0 by `initialize_variables`.
# `process` stores its current value as :csv_line_number in each row hash when
# the :with_line_numbers option is set.
# NOTE(review): the increment is not visible here; presumably done by
# `readline_with_counts` - confirm.
def csv_line_count
  @csv_line_count
end

.errors ⇒ `Object` (readonly)

Returns the value of attribute errors.



5
6
7

# File 'lib/smarter_csv/variables.rb', line 5

# Hash of errors; reset to an empty Hash by `initialize_variables`.
# NOTE(review): nothing in the visible code writes to it - confirm where
# entries are added.
def errors
  @errors
end

.file_line_count ⇒ `Object` (readonly)

Returns the value of attribute file_line_count.



5
6
7

# File 'lib/smarter_csv/variables.rb', line 5

# Counter of physical lines read from the input, reset to 0 by
# `initialize_variables`. `process` increments it for every continuation line
# it appends to a multi-line row.
# NOTE(review): presumably `readline_with_counts` also increments it - confirm.
def file_line_count
  @file_line_count
end

.has_rails ⇒ `Object` (readonly)

Returns the value of attribute has_rails.



5
6
7

# File 'lib/smarter_csv/variables.rb', line 5

# true when the `Rails` constant is defined, false otherwise (computed by
# `initialize_variables`). `hash_transformations` uses it to choose between
# `v.blank?` and the module's own `blank?(v)` helper.
def has_rails
  @has_rails
end

.headers ⇒ `Object` (readonly)

Returns the value of attribute headers.



5
6
7

# File 'lib/smarter_csv/variables.rb', line 5

# The effective headers of the most recent `process` call, i.e. the first
# element returned by `process_headers`. nil until headers have been processed.
def headers
  @headers
end

.raw_header ⇒ `Object` (readonly)

Returns the value of attribute raw_header.



5
6
7

# File 'lib/smarter_csv/variables.rb', line 5

# The header line as it was read from the input, before any transformation.
# Set by `process_headers` when :headers_in_file is truthy; nil otherwise.
def raw_header
  @raw_header
end

.result ⇒ `Object` (readonly)

Returns the value of attribute result.



5
6
7

# File 'lib/smarter_csv/variables.rb', line 5

# Accumulated output of the most recent `process` call when no block was given:
# an Array of Hashes, or an Array of chunks (Arrays of Hashes) in chunked mode.
# Reset to an empty Array by `initialize_variables`.
def result
  @result
end

.warnings ⇒ `Object` (readonly)

Returns the value of attribute warnings.



5
6
7

# File 'lib/smarter_csv/variables.rb', line 5

# Hash of warnings; reset to an empty Hash by `initialize_variables`.
# NOTE(review): nothing in the visible code writes to it - confirm where
# entries are added.
def warnings
  @warnings
end

Class Method Details

.check_duplicate_headers(headers, _options) ⇒ `Object`

# File 'lib/smarter_csv/header_validations.rb', line 10

# Ensures that no header name occurs more than once.
#
# headers  - Array of header names; nil entries are ignored
# _options - unused
#
# Returns nil when all non-nil headers are unique.
# Raises SmarterCSV::DuplicateHeaders listing each repeated header and its count.
def check_duplicate_headers(headers, _options)
  occurrences = headers.each_with_object(Hash.new(0)) do |name, seen|
    seen[name] += 1 unless name.nil?
  end

  repeated = occurrences.reject { |_name, times| times <= 1 }
  return if repeated.empty?

  raise(SmarterCSV::DuplicateHeaders, "Duplicate Headers in CSV: #{repeated.inspect}")
end

.check_required_headers(headers, options) ⇒ `Object`

# File 'lib/smarter_csv/header_validations.rb', line 23

# Verifies that every key listed in options[:required_keys] is among the headers.
#
# headers - the effective headers
# options - Hash; only :required_keys is read, and only when it is an Array
#
# Returns nil when nothing is required or nothing is missing.
# Raises SmarterCSV::MissingKeys naming the missing keys.
def check_required_headers(headers, options)
  required = options[:required_keys]
  return unless required.is_a?(Array)

  available = headers.to_set
  absent = required.reject { |key| available.include?(key) }
  return if absent.empty?

  raise SmarterCSV::MissingKeys, "ERROR: missing attributes: #{absent.join(',')}. Check `SmarterCSV.headers` for original headers."
end

.count_quote_chars(line, quote_char) ⇒ `Object`

# File 'lib/smarter_csv/smarter_csv.rb', line 168

# Counts the occurrences of quote_char in line that are not escaped.
# A backslash escapes the single character that follows it (including another
# backslash), so an escaped quote is not counted.
#
# line       - String to scan, or nil
# quote_char - the quote character, or nil / empty
#
# Returns an Integer; 0 when line is nil or quote_char is nil or empty.
def count_quote_chars(line, quote_char)
  return 0 if line.nil? || quote_char.nil? || quote_char.empty?

  total = 0
  skip_next = false

  line.each_char do |ch|
    if skip_next
      # the character right after an escaping backslash is never counted
      skip_next = false
    elsif ch == '\\'
      skip_next = true
    elsif ch == quote_char
      total += 1
    end
  end

  total
end

.default_options ⇒ `Object`

NOTE: this is not called when “parse” methods are tested by themselves

ONLY FOR BACKWARDS-COMPATIBILITY



59
60
61

# File 'lib/smarter_csv/options_processing.rb', line 59

# Returns the frozen DEFAULT_OPTIONS Hash itself (not a copy).
# Kept only for backwards-compatibility.
def default_options
  DEFAULT_OPTIONS
end

.disambiguate_headers(headers, options) ⇒ `Object`

# File 'lib/smarter_csv/header_transformations.rb', line 25

# Makes repeated header names unique. The first occurrence of a name is kept
# unchanged; each later occurrence gets options[:duplicate_header_suffix]
# followed by its occurrence number (2, 3, ...) appended and becomes a String.
#
# headers - Array of header names
# options - Hash; :duplicate_header_suffix is read
#
# Returns a new Array; the input Array is not modified.
def disambiguate_headers(headers, options)
  seen = Hash.new(0)

  headers.map do |name|
    occurrence = (seen[name] += 1)
    next name if occurrence == 1

    "#{name}#{options[:duplicate_header_suffix]}#{occurrence}"
  end
end

.has_acceleration? ⇒ `Boolean`

Returns:

(Boolean)



186
187
188

# File 'lib/smarter_csv/smarter_csv.rb', line 186

# Reports whether the accelerated parser method `parse_csv_line_c` is callable.
# A positive answer is cached in @has_acceleration; a negative answer is
# re-checked on every call.
#
# Returns true or false.
def has_acceleration?
  return @has_acceleration if @has_acceleration

  @has_acceleration = defined?(parse_csv_line_c) ? true : false
end

.hash_transformations(hash, options) ⇒ `Object`

# File 'lib/smarter_csv/hash_transformations.rb', line 5

# Builds the final row hash from the raw header => value pairs.
#
# Pairs are dropped when:
#   * the key is nil, '' or :"" (unmapped keys, or keys deliberately mapped to nil / empty)
#   * :remove_empty_values is exactly true and the value is blank
#   * :remove_zero_values is set and the value is a String consisting only of zeros ("0", "000", "0.00")
#   * :remove_values_matching is set and the value matches that Regexp
#
# Remaining String values that are entirely an integer or a decimal number are
# converted with to_i / to_f when :convert_values_to_numeric is set, unless
# limit_execution_for_only_or_except returns a truthy value for the key
# (the :only / :except handling lives in that helper).
# Finally, options[:value_converters][key], if present, is applied via #convert.
#
# The numeric and zero checks are anchored with \A and \z so they look at the
# whole value. Line anchors (^ / $) would match a single line of a multi-line
# value and e.g. turn "12\nabc" into 12, silently discarding the rest.
#
# hash    - Hash of header => String (or nil) for one CSV row
# options - the processed options Hash
#
# Returns a new Hash; the input hash is not modified.
def hash_transformations(hash, options)
  # there may be unmapped keys, or keys purposely mapped to nil or an empty key..
  # make sure we delete any key/value pairs from the hash, which the user wanted to delete:
  remove_empty_values = options[:remove_empty_values] == true
  remove_zero_values = options[:remove_zero_values]
  remove_values_matching = options[:remove_values_matching]
  convert_to_numeric = options[:convert_values_to_numeric]
  value_converters = options[:value_converters]

  hash.each_with_object({}) do |(k, v), new_hash|
    next if k.nil? || k == '' || k == :""
    next if remove_empty_values && (has_rails ? v.blank? : blank?(v))
    next if remove_zero_values && v.is_a?(String) && v =~ /\A(0+|0+\.0+)\z/ # values are Strings
    next if remove_values_matching && v =~ remove_values_matching

    # deal with the :only / :except options to :convert_values_to_numeric
    if convert_to_numeric && !limit_execution_for_only_or_except(options, :convert_values_to_numeric, k)
      if v =~ /\A[+-]?\d+\.\d+\z/
        v = v.to_f
      elsif v =~ /\A[+-]?\d+\z/
        v = v.to_i
      end
    end

    converter = value_converters[k] if value_converters
    v = converter.convert(v) if converter

    new_hash[k] = v
  end
end

.header_transformations(header_array, options) ⇒ `Object`

Transforms the headers that were read from the file.

# File 'lib/smarter_csv/header_transformations.rb', line 6

# Normalizes the headers that were read from the file.
#
# Steps, in order:
#   1. remove every occurrence of options[:quote_char]
#   2. strip surrounding whitespace (if :strip_whitespace)
#   3. unless :keep_original_headers: replace runs of whitespace or dashes with
#      '_' and downcase (if :downcase_header)
#   4. disambiguate duplicate headers (if :duplicate_header_suffix is set)
#   5. convert to Symbols, unless :strings_as_keys or :keep_original_headers
#   6. apply :key_mapping via remap_headers (if set)
#
# The quote character is removed as a literal String. Interpolating it into a
# Regexp would treat characters such as '.', '|' or '(' as regex syntax and
# either strip the wrong characters or raise RegexpError.
#
# header_array - Array of header Strings; steps 1-3 modify it in place
# options      - the processed options Hash
#
# Returns the Array of transformed headers.
def header_transformations(header_array, options)
  quote_char = options[:quote_char].to_s
  header_array.map! { |x| x.gsub(quote_char, '') } unless quote_char.empty?
  header_array.map!(&:strip) if options[:strip_whitespace]

  unless options[:keep_original_headers]
    header_array.map! { |x| x.gsub(/\s+|-+/, '_') }
    header_array.map!(&:downcase) if options[:downcase_header]
  end

  # detect duplicate headers and disambiguate
  header_array = disambiguate_headers(header_array, options) if options[:duplicate_header_suffix]
  # symbolize headers
  header_array = header_array.map(&:to_sym) unless options[:strings_as_keys] || options[:keep_original_headers]
  # doesn't make sense to re-map when we have user_provided_headers
  header_array = remap_headers(header_array, options) if options[:key_mapping]

  header_array
end

.header_validations(headers, options) ⇒ `Object`

# File 'lib/smarter_csv/header_validations.rb', line 5

# Runs all header checks in order: first check_duplicate_headers, then
# check_required_headers. Either check may raise; see those methods.
#
# headers - the effective headers
# options - the processed options Hash
def header_validations(headers, options)
  check_duplicate_headers(headers, options)
  check_required_headers(headers, options)
end

.headerA ⇒ `Object`

:nocov: rubocop:disable Naming/MethodName

# File 'lib/smarter_csv/variables.rb', line 23

# Deprecated reader for the processed headers; use `headers` instead.
# Emits a deprecation warning via `warn` on every call and returns @headerA,
# which `process` keeps in sync with @headers.
#
# :nocov: rubocop:disable Naming/MethodName
def headerA
  warn "Deprecation Warning: 'headerA' will be removed in future versions. Use 'headers'"
  @headerA
end

.initialize_variables ⇒ `Object`

# File 'lib/smarter_csv/variables.rb', line 7

# Resets all module-level state before a new `process` run.
# Returns false (the value of the last assignment).
def initialize_variables
  # environment
  @has_rails = defined?(Rails) ? true : false

  # counters
  @file_line_count = @csv_line_count = @chunk_count = 0

  # headers
  @raw_header = nil # header as it appears in the file
  @headers = nil
  @headerA = []

  # collected output
  @result = []
  @errors = {}
  @warnings = {}

  # only set to true if needed (after options parsing)
  @enforce_utf8 = false
end

.parse_csv_line_c(line, col_sep, quote_char, max_size) ⇒ `Object`

max_size: pass nil if no limit is specified

# File 'ext/smarter_csv/smarter_csv.c', line 15

/*
 * Splits one CSV line into an Array of field Strings.
 *
 *   line       - String to split, or nil (nil yields an empty Array)
 *   col_sep    - column separator String (may be longer than one byte)
 *   quote_char - String whose first byte is the quote character
 *   max_size   - nil for no limit, otherwise the maximum number of fields to return
 *
 * A separator only splits the line when the number of quote characters seen so
 * far is even, i.e. when it is not inside a quoted section. A quote character
 * preceded by an odd number of consecutive backslashes is not counted.
 * Fields are returned as raw substrings in the encoding of the input line;
 * quote characters are not removed here.
 *
 * Raises TypeError when line is neither a String nor nil.
 */
static VALUE rb_parse_csv_line(VALUE self, VALUE line, VALUE col_sep, VALUE quote_char, VALUE max_size) {
  if (RB_TYPE_P(line, T_NIL) == 1) {
    return rb_ary_new();
  }

  if (RB_TYPE_P(line, T_STRING) != 1) {
    rb_raise(rb_eTypeError, "ERROR in SmarterCSV.parse_line: line has to be a string or nil");
  }

  rb_encoding *encoding = rb_enc_get(line); /* get the encoding from the input line */
  char *startP = RSTRING_PTR(line); /* may not be null terminated */
  long line_len = RSTRING_LEN(line);
  char *endP = startP + line_len ; /* points behind the string */
  char *p = startP;

  char *col_sepP = RSTRING_PTR(col_sep);
  long col_sep_len = RSTRING_LEN(col_sep);

  char *quoteP = RSTRING_PTR(quote_char);
  long quote_count = 0;

  bool col_sep_found = true;

  VALUE elements = rb_ary_new();
  VALUE field;
  long i;

  long backslash_count = 0; /* to count consecutive backslash characters */

  while (p < endP) {
    /* does the remaining string start with col_sep ?
     * Only a complete separator counts: a partial match at the very end of the
     * line would otherwise move p past endP and produce a negative field length.
     * An empty separator never matches, otherwise p would never advance. */
    col_sep_found = (col_sep_len > 0) && ((endP - p) >= col_sep_len);
    for(i=0; col_sep_found && (i < col_sep_len) ; i++) {
      col_sep_found = (*(p+i) == *(col_sepP+i));
    }
    /* if col_sep was found and we have even quotes */
    if (col_sep_found && (quote_count % 2 == 0)) {
      /* if max_size != nil && elements.size >= max_size: stop, extra columns are dropped */
      if ((max_size != Qnil) && RARRAY_LEN(elements) >= NUM2INT(max_size)) {
        break;
      } else {
        /* push that field with original encoding onto the results */
        field = rb_enc_str_new(startP, p - startP, encoding);
        rb_ary_push(elements, field);

        p += col_sep_len;
        startP = p;
      }
    } else {
      if (*p == '\\') {
        backslash_count++;
      } else {
        /* a quote only counts when it is not escaped by an odd number of backslashes */
        if (*p == *quoteP && (backslash_count % 2 == 0)) {
          quote_count++;
        }
        backslash_count = 0; /* no more consecutive backslash characters */
      }
      p++;
    }
  } /* while */

  /* check if the last part of the line needs to be processed */
  if ((max_size == Qnil) || RARRAY_LEN(elements) < NUM2INT(max_size)) {
    /* copy the remaining line as a field with original encoding onto the results */
    field = rb_enc_str_new(startP, endP - startP, encoding);
    rb_ary_push(elements, field);
  }

  return elements;
}

.process(input, given_options = {}, &block) ⇒ `Object`

first parameter: filename or input object which responds to readline method

# File 'lib/smarter_csv/smarter_csv.rb', line 14

# Reads CSV data from +input+ and turns every data row into a Hash keyed by the
# effective headers.
#
# input         - a filename, or an object that responds to +readline+ (used
#                 directly as the file handle)
# given_options - Hash of options; merged over the defaults by process_options
# block         - optional; receives an Array of row hashes: one chunk when
#                 :chunk_size > 0, otherwise a one-element Array per row
#
# Returns @chunk_count when a block is given, otherwise @result (an Array of
# Hashes, or an Array of Arrays of Hashes in chunked mode).
#
# The file handle is closed in the +ensure+ clause whenever it responds to
# +close+, including a handle that was passed in by the caller.
def SmarterCSV.process(input, given_options = {}, &block) # rubocop:disable Lint/UnusedMethodArgument
  initialize_variables

  options = process_options(given_options)

  # enforce UTF-8 when explicitly requested, or when the declared file encoding is not UTF-8
  @enforce_utf8 = options[:force_utf8] || options[:file_encoding] !~ /utf-8/i
  @verbose = options[:verbose]

  begin
    fh = input.respond_to?(:readline) ? input : File.open(input, "r:#{options[:file_encoding]}")

    if (options[:force_utf8] || options[:file_encoding] =~ /utf-8/i) && (fh.respond_to?(:external_encoding) && fh.external_encoding != Encoding.find('UTF-8') || fh.respond_to?(:encoding) && fh.encoding != Encoding.find('UTF-8'))
      puts 'WARNING: you are trying to process UTF-8 input, but did not open the input with "b:utf-8" option. See README file "NOTES about File Encodings".'
    end

    # auto-detect the row separator
    options[:row_sep] = guess_line_ending(fh, options) if options[:row_sep]&.to_sym == :auto
    # attempt to auto-detect column separator
    options[:col_sep] = guess_column_separator(fh, options) if options[:col_sep]&.to_sym == :auto

    skip_lines(fh, options)

    @headers, header_size = process_headers(fh, options)
    @headerA = @headers # @headerA is deprecated, use @headers

    puts "Effective headers:\n#{pp(@headers)}\n" if @verbose

    header_validations(@headers, options)

    # in case we use chunking.. we'll need to set it up..
    if options[:chunk_size].to_i > 0
      use_chunks = true
      chunk_size = options[:chunk_size].to_i
      @chunk_count = 0
      chunk = []
    else
      use_chunks = false
    end

    # now on to processing all the rest of the lines in the CSV file:
    # fh.each_line |line|
    until fh.eof? # we can't use fh.readlines() here, because this would read the whole file into memory at once, and eof => true
      line = readline_with_counts(fh, options)

      # replace invalid byte sequence in UTF-8 with question mark to avoid errors
      line = enforce_utf8_encoding(line, options) if @enforce_utf8

      print "processing file line %10d, csv line %10d\r" % [@file_line_count, @csv_line_count] if @verbose

      next if options[:comment_regexp] && line =~ options[:comment_regexp] # ignore all comment lines if there are any

      # cater for the quoted csv data containing the row separator carriage return character
      # in which case the row data will be split across multiple lines (see the sample content in spec/fixtures/carriage_returns_rn.csv)
      # by detecting the existence of an uneven number of quote characters
      multiline = count_quote_chars(line, options[:quote_char]).odd?

      # keep appending physical lines until the quotes are balanced or the input ends
      # NOTE(review): if the unbalanced line was already the last line of the input,
      # fh.readline is called at EOF here and presumably raises EOFError - confirm.
      while multiline
        next_line = fh.readline(options[:row_sep])
        next_line = enforce_utf8_encoding(next_line, options) if @enforce_utf8
        line += next_line
        @file_line_count += 1

        break if fh.eof? # Exit loop if end of file is reached

        multiline = count_quote_chars(line, options[:quote_char]).odd?
      end

      # multiline can only still be true here when the loop above was left via `break` at EOF
      # :nocov:
      if multiline && @verbose
        print "\nline contains uneven number of quote chars so including content through file line %d\n" % @file_line_count
      end
      # :nocov:

      line.chomp!(options[:row_sep])

      # --- SPLIT LINE & DATA TRANSFORMATIONS ------------------------------------------------------------
      dataA, _data_size = parse(line, options, header_size)

      dataA.map!{|x| x.strip} if options[:strip_whitespace]

      # if all values are blank, then ignore this line
      next if options[:remove_empty_hashes] && (dataA.empty? || blank?(dataA))

      # --- HASH TRANSFORMATIONS ------------------------------------------------------------
      # pair each header with the value in the same column position
      hash = @headers.zip(dataA).to_h

      hash = hash_transformations(hash, options)

      # --- HASH VALIDATIONS ----------------------------------------------------------------
      # will go here, and be able to:
      #  - validate correct format of the values for fields
      #  - required fields to be non-empty
      #  - ...
      # -------------------------------------------------------------------------------------

      next if options[:remove_empty_hashes] && hash.empty?

      puts "CSV Line #{@file_line_count}: #{pp(hash)}" if @verbose == '2' # very verbose setting
      # optional adding of csv_line_number to the hash to help debugging
      hash[:csv_line_number] = @csv_line_count if options[:with_line_numbers]

      # process the chunks or the resulting hash
      if use_chunks
        chunk << hash # append temp result to chunk

        if chunk.size >= chunk_size || fh.eof? # if chunk if full, or EOF reached
          # do something with the chunk
          if block_given?
            yield chunk # do something with the hashes in the chunk in the block
          else
            @result << chunk.dup # Append chunk to result (use .dup to keep a copy after we do chunk.clear)
          end
          @chunk_count += 1
          chunk.clear # re-initialize for next chunk of data
        else
          # the last chunk may contain partial data, which is handled below
        end
        # while a chunk is being filled up we don't need to do anything else here

      else # no chunk handling
        if block_given?
          yield [hash] # do something with the hash in the block (better to use chunking here)
        else
          @result << hash
        end
      end
    end

    # print new line to retain last processing line message
    print "\n" if @verbose

    # handling of last chunk:
    # (chunk is nil when chunking is off; it can be non-empty here when the final rows were skipped by a `next` above)
    if !chunk.nil? && chunk.size > 0
      # do something with the chunk
      if block_given?
        yield chunk # do something with the hashes in the chunk in the block
      else
        @result << chunk.dup # Append chunk to result (use .dup to keep a copy after we do chunk.clear)
      end
      @chunk_count += 1
      # chunk = [] # initialize for next chunk of data
    end
  ensure
    fh.close if fh.respond_to?(:close)
  end

  if block_given?
    @chunk_count # when we do processing through a block we only care how many chunks we processed
  else
    @result # returns either an Array of Hashes, or an Array of Arrays of Hashes (if in chunked mode)
  end
end

.process_headers(filehandle, options) ⇒ `Object`

# File 'lib/smarter_csv/headers.rb', line 5

# Determines the effective headers for a CSV input.
#
# When :headers_in_file is set, the header line is read from the file handle
# (and remembered untouched in @raw_header), pre-processed, parsed and run
# through header_transformations. When :user_provided_headers is given, those
# headers are used instead; if a header line was also read, both must have the
# same number of columns.
#
# filehandle - the input to read the header line from
# options    - the processed options Hash
#
# Returns a two-element Array: [headers, number_of_headers].
# Raises SmarterCSV::IncorrectOption when neither source of headers is
# available, or when :user_provided_headers is not a non-empty Array.
# Raises SmarterCSV::HeaderSizeMismatch when user-provided and file headers
# differ in size.
def process_headers(filehandle, options)
  @raw_header = nil # header as it appears in the file
  @headers = nil # the processed headers
  from_file = nil
  columns_in_file = nil

  if options[:headers_in_file]
    # the first line of a CSV file contains the header .. it might be commented out, so we need to read it anyhow
    @raw_header = readline_with_counts(filehandle, options)
    prepared_line = preprocess_header_line(@raw_header, options)

    # the column count is recorded even when the user supplies the header names
    from_file, columns_in_file = parse(prepared_line, options)
    from_file = header_transformations(from_file, options)
  elsif !options[:user_provided_headers]
    raise SmarterCSV::IncorrectOption, "ERROR: If :headers_in_file is set to false, you have to provide :user_provided_headers"
  end

  supplied = options[:user_provided_headers]

  effective =
    if supplied
      if !supplied.is_a?(Array) || supplied.empty?
        raise(SmarterCSV::IncorrectOption, "ERROR: incorrect format for user_provided_headers! Expecting array with headers.")
      end

      # user_provided_headers: their count should match the headers_in_file if any
      if !columns_in_file.nil? && supplied.size != columns_in_file
        raise SmarterCSV::HeaderSizeMismatch, "ERROR: :user_provided_headers defines #{supplied.size} headers !=  CSV-file has #{columns_in_file} headers"
      end

      supplied
    else
      from_file
    end

  [effective, effective.size]
end

.process_options(given_options = {}) ⇒ `Object`

NOTE: this is not called when “parse” methods are tested by themselves

# File 'lib/smarter_csv/options_processing.rb', line 42

# Combines the caller's options with DEFAULT_OPTIONS (caller's values win),
# stores the result in @options, validates it via validate_options! and
# returns it. A nil / false :invalid_byte_sequence is replaced by ''.
# With :verbose set, the given and the computed options are printed.
#
# NOTE: this is not called when "parse" methods are tested by themselves
#
# given_options - Hash of user-supplied options
#
# Returns the merged options Hash.
def process_options(given_options = {})
  if given_options[:verbose]
    puts "User provided options:\n#{pp(given_options)}\n"
  end

  @options = DEFAULT_OPTIONS.merge(given_options)

  # fix invalid input
  @options[:invalid_byte_sequence] = '' unless @options[:invalid_byte_sequence]

  if @options[:verbose]
    puts "Computed options:\n#{pp(@options)}\n"
  end

  validate_options!(@options)
  @options
end

.remap_headers(headers, options) ⇒ `Object`

Applies the key mapping to the keys in the file header. If you want to completely delete a key, map it to nil or to ''.

# File 'lib/smarter_csv/header_transformations.rb', line 35

# Renames headers according to options[:key_mapping] (a Hash of from => to).
#
# A header that is a key of the mapping is replaced by the mapped value;
# mapping a header to nil marks it for removal. Headers without a mapping are
# kept, or replaced by nil when :remove_unmapped_keys is set.
#
# Mapping keys that do not occur in the headers raise an error unless
# :silence_missing_keys is true, or is an Array containing those keys.
#
# headers - Array of headers; modified in place
# options - the processed options Hash
#
# Returns the (modified) headers Array.
# Raises SmarterCSV::IncorrectOption when :key_mapping is not a non-empty Hash.
# Raises SmarterCSV::KeyMappingError when mapped keys are missing from the headers.
def remap_headers(headers, options)
  key_mapping = options[:key_mapping]
  # check the type first, so a non-Hash value is reported as an incorrect option
  # instead of failing with NoMethodError on #empty?
  unless key_mapping.is_a?(Hash) && !key_mapping.empty?
    raise(SmarterCSV::IncorrectOption, "ERROR: incorrect format for key_mapping! Expecting hash with from -> to mappings")
  end

  # if silence_missing_keys are not set, raise error if missing header
  missing_keys = key_mapping.keys - headers
  # if the user passes a list of specific mapped keys that are optional
  missing_keys -= options[:silence_missing_keys] if options[:silence_missing_keys].is_a?(Array)

  unless missing_keys.empty? || options[:silence_missing_keys] == true
    raise SmarterCSV::KeyMappingError, "ERROR: can not map headers: #{missing_keys.join(', ')}"
  end

  headers.map! do |header|
    if key_mapping.key?(header)
      key_mapping[header]
    elsif options[:remove_unmapped_keys]
      nil
    else
      header
    end
  end
end

Module: SmarterCSV

Defined Under Namespace

Constant Summary collapse

Class Attribute Summary collapse

Class Method Summary collapse

Class Attribute Details

.chunk_count ⇒ Object (readonly)

.csv_line_count ⇒ Object (readonly)

.errors ⇒ Object (readonly)

.file_line_count ⇒ Object (readonly)

.has_rails ⇒ Object (readonly)

.headers ⇒ Object (readonly)

.raw_header ⇒ Object (readonly)

.result ⇒ Object (readonly)

.warnings ⇒ Object (readonly)

Class Method Details

.check_duplicate_headers(headers, _options) ⇒ Object

.check_required_headers(headers, options) ⇒ Object

.count_quote_chars(line, quote_char) ⇒ Object

.default_options ⇒ Object

.disambiguate_headers(headers, options) ⇒ Object

.has_acceleration? ⇒ Boolean

.hash_transformations(hash, options) ⇒ Object

.header_transformations(header_array, options) ⇒ Object

.header_validations(headers, options) ⇒ Object

.headerA ⇒ Object

.initialize_variables ⇒ Object

.parse_csv_line_c(line, col_sep, quote_char, max_size) ⇒ Object

.process(input, given_options = {}, &block) ⇒ Object

.process_headers(filehandle, options) ⇒ Object

.process_options(given_options = {}) ⇒ Object

.remap_headers(headers, options) ⇒ Object