Class: Embulk::Guess::CsvGuessPlugin

Inherits:

LineGuessPlugin

Object
Embulk::GuessPlugin
LineGuessPlugin
Embulk::Guess::CsvGuessPlugin

show all

Defined in: lib/embulk/guess/csv.rb

Constant Summary collapse

# Candidate delimiter characters: comma, tab and pipe.
DELIMITER_CANDIDATES = [
  ",", "\t", "|"
].freeze

# Candidate quote characters: double quote and single quote.
QUOTE_CANDIDATES = [
  "\"", "'"
].freeze

# Candidate escape characters: backslash only.
ESCAPE_CANDIDATES = [
  "\\"
].freeze

# Candidate literal strings for a null value.
NULL_STRING_CANDIDATES = [
  "null",
  "NULL",
  "#N/A",
  "\\N",  # MySQL LOAD, Hive STORED AS TEXTFILE
].freeze

MAX_SKIP_LINES =

NO_SKIP_DETECT_LINES =

Instance Method Summary collapse

#guess_lines(config, sample_lines) ⇒ Object

Methods inherited from LineGuessPlugin

#guess

Methods inherited from Embulk::GuessPlugin

from_java, #guess, new_java

Instance Method Details

#guess_lines(config, sample_lines) ⇒ `Object`

# File 'lib/embulk/guess/csv.rb', line 30

# Guesses CSV parser settings from sample lines.
#
# Fills in "type", "delimiter", "quote", "escape", optionally "null_string",
# "skip_header_lines" and, unless the given parser config already has a
# "columns" key, "columns".
#
# @param config [Hash] guess configuration; only its "parser" entry is read.
#   A missing or nil "parser" entry is treated as an empty parser config.
# @param sample_lines [Array] sample lines handed on to the guess_* helpers
# @return [Hash] {"parser" => guessed settings}, or {} when the parser type is
#   not "csv", no delimiter can be guessed, or either the first record or the
#   remaining records yield one column type or fewer.
def guess_lines(config, sample_lines)
  # The "parser" key may exist with a nil value; chaining fetch on that nil
  # would raise NoMethodError, so normalize it to an empty config first.
  parser_config = config["parser"] || {}
  return {} unless parser_config.fetch("type", "csv") == "csv"

  delim = guess_delimiter(sample_lines)
  # No delimiter means this is not a CSV file.
  return {} unless delim

  parser_guessed = DataSource.new.merge({"type" => "csv", "delimiter" => delim})

  quote = guess_quote(sample_lines, delim)
  parser_guessed["quote"] = quote || ''

  escape = guess_escape(sample_lines, delim, quote)
  parser_guessed["escape"] = escape || ''

  # Leave null_string unset when nothing was guessed, to avoid confusion of
  # null and 'null' in YAML format.
  null_string = guess_null_string(sample_lines, delim)
  parser_guessed["null_string"] = null_string if null_string

  sample_records = split_lines(parser_guessed, sample_lines, delim)
  skip_header_lines = guess_skip_header_lines(sample_records)
  sample_records = sample_records[skip_header_lines..-1]

  first_types = SchemaGuess.types_from_array_records(sample_records[0, 1])
  other_types = SchemaGuess.types_from_array_records(sample_records[1..-1])

  # Guess failed: a single column (or none) on either side.
  return {} if first_types.size <= 1 || other_types.size <= 1

  # The first record is taken as a header when its types differ from the
  # remaining records' types and every one of its columns is typed "string".
  header_line = first_types != other_types && first_types.all? {|t| t == "string" }

  parser_guessed["skip_header_lines"] = header_line ? skip_header_lines + 1 : skip_header_lines

  unless parser_config.has_key?("columns")
    column_names =
      if header_line
        sample_records.first
      else
        # Exactly one default name ("c0", "c1", ...) per guessed type.
        Array.new(other_types.size) {|i| "c#{i}" }
      end

    schema = []
    column_names.zip(other_types) do |name, type|
      # Skip positions where either the name or the type is missing.
      next unless name && type

      column = {"name" => name, "type" => type}
      column["format"] = type.format if type.is_a?(SchemaGuess::TimestampTypeMatch)
      schema << column
    end
    parser_guessed["columns"] = schema
  end

  {"parser" => parser_guessed}
end

Class: Embulk::Guess::CsvGuessPlugin

Constant Summary collapse

Instance Method Summary collapse

Methods inherited from LineGuessPlugin

Methods inherited from Embulk::GuessPlugin

Instance Method Details

#guess_lines(config, sample_lines) ⇒ Object

#guess_lines(config, sample_lines) ⇒ `Object`