Class: Embulk::Guess::CsvGuessPlugin

Inherits:
LineGuessPlugin
Defined in:
lib/embulk/guess/csv.rb

Constant Summary

# Candidate lists are frozen so they cannot be mutated in place by accident.

# Delimiter candidates: comma, tab and pipe.
DELIMITER_CANDIDATES = [
  ",", "\t", "|"
].freeze

# Quote character candidates: double quote and single quote.
QUOTE_CANDIDATES = [
  "\"", "'"
].freeze

# Escape character candidates: a single backslash.
ESCAPE_CANDIDATES = [
  "\\"
].freeze

# Literal strings considered as a null marker.
NULL_STRING_CANDIDATES = [
  "null",
  "NULL",
  "#N/A",
  "\\N",  # MySQL LOAD, Hive STORED AS TEXTFILE
].freeze

# NOTE(review): presumably the upper bound on leading lines that header-skip
# detection may skip -- confirm against guess_skip_header_lines.
MAX_SKIP_LINES = 10

# NOTE(review): presumably a line-count threshold used by header-skip
# detection -- confirm against guess_skip_header_lines.
NO_SKIP_DETECT_LINES = 10

Instance Method Summary

Methods inherited from LineGuessPlugin

#guess

Methods inherited from Embulk::GuessPlugin

from_java, #guess, new_java

Instance Method Details

#guess_lines(config, sample_lines) ⇒ Object



30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
# File 'lib/embulk/guess/csv.rb', line 30

# Guesses CSV parser settings (delimiter, quote, escape, null string, header
# lines to skip and column schema) from a sample of input lines.
#
# @param config [Hash] current configuration; only its "parser" entry is read
#   (its "type" to decide whether to guess at all, and "columns" to decide
#   whether a schema still needs to be guessed)
# @param sample_lines [Object] sample lines, passed through to the guess_* and
#   split_lines helpers (presumably an Array of Strings -- confirm with caller)
# @return [Hash] {} when the parser type is not "csv", when no delimiter can be
#   guessed, or when the first record or the remaining records yield at most
#   one column type; otherwise {"parser" => guessed_settings}
def guess_lines(config, sample_lines)
  return {} unless config.fetch("parser", {}).fetch("type", "csv") == "csv"

  delim = guess_delimiter(sample_lines)
  # no delimiter could be guessed: not a CSV file
  return {} unless delim

  parser_config = config["parser"] || {}
  parser_guessed = DataSource.new.merge({"type" => "csv", "delimiter" => delim})

  quote = guess_quote(sample_lines, delim)
  parser_guessed["quote"] = quote || ''

  escape = guess_escape(sample_lines, delim, quote)
  parser_guessed["escape"] = escape || ''

  # when nothing is guessed, don't even set null_string to avoid confusion
  # of null and 'null' in YAML format
  null_string = guess_null_string(sample_lines, delim)
  parser_guessed["null_string"] = null_string if null_string

  sample_records = split_lines(parser_guessed, sample_lines, delim)
  skip_header_lines = guess_skip_header_lines(sample_records)
  # A range slice that starts past the end of the array returns nil, so fall
  # back to an empty list instead of passing nil further down.
  sample_records = sample_records[skip_header_lines..-1] || []

  first_types = SchemaGuess.types_from_array_records(sample_records[0, 1])
  other_types = SchemaGuess.types_from_array_records(sample_records[1..-1] || [])

  if first_types.size <= 1 || other_types.size <= 1
    # guess failed
    return {}
  end

  # The first record is taken as a header when all of its values are strings
  # while the remaining records have a different type signature.
  header_line = first_types != other_types && first_types.all? {|t| t == "string" }

  parser_guessed["skip_header_lines"] = header_line ? skip_header_lines + 1 : skip_header_lines

  unless parser_config.has_key?("columns")
    column_names =
      if header_line
        sample_records.first
      else
        # exclusive range: exactly one generated name (c0, c1, ...) per guessed type
        (0...other_types.size).map {|i| "c#{i}" }
      end

    schema = []
    column_names.zip(other_types).each do |name, type|
      # a header may have more or fewer cells than there are guessed types;
      # unmatched (nil) names or types are dropped
      next unless name && type

      column = {"name" => name, "type" => type}
      column["format"] = type.format if type.is_a?(SchemaGuess::TimestampTypeMatch)
      schema << column
    end
    parser_guessed["columns"] = schema
  end

  {"parser" => parser_guessed}
end