30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
|
# File 'lib/embulk/guess/csv.rb', line 30
# Guesses CSV parser settings (delimiter, quote, escape, null string, header
# rows and column schema) from a sample of input lines.
#
# NOTE(review): this listing had several identifiers stripped during
# extraction. `skip_header_lines` and `header_line` are reconstructed from
# how the surrounding code uses them; the helper name
# `guess_skip_header_lines` follows the sibling `guess_*` helpers and should
# be confirmed against the original file.
#
# @param config [Hash] current configuration; only config["parser"] is read
# @param sample_lines [Array<String>] sample lines passed to the guess helpers
# @return [Hash] {"parser" => guessed_settings}, or {} when the parser type is
#   not "csv", no delimiter is found, or fewer than two columns are detected
def guess_lines(config, sample_lines)
  return {} unless config.fetch("parser", {}).fetch("type", "csv") == "csv"

  delim = guess_delimiter(sample_lines)
  # No delimiter could be guessed: give up.
  return {} unless delim

  parser_config = config["parser"] || {}
  parser_guessed = DataSource.new.merge({"type" => "csv", "delimiter" => delim})

  quote = guess_quote(sample_lines, delim)
  parser_guessed["quote"] = quote || ''

  escape = guess_escape(sample_lines, delim, quote)
  parser_guessed["escape"] = escape || ''

  null_string = guess_null_string(sample_lines, delim)
  parser_guessed["null_string"] = null_string if null_string

  sample_records = split_lines(parser_guessed, sample_lines, delim)
  # Drop the leading rows the helper says to skip before guessing types.
  skip_header_lines = guess_skip_header_lines(sample_records)
  sample_records = sample_records[skip_header_lines..-1]

  first_types = SchemaGuess.types_from_array_records(sample_records[0, 1])
  other_types = SchemaGuess.types_from_array_records(sample_records[1..-1])

  # A single column (or none) on either side means the guess failed.
  return {} if first_types.size <= 1 || other_types.size <= 1

  # The first remaining row is treated as a header when it is all strings
  # while the rows after it have a different type signature.
  header_line = (first_types != other_types && first_types.none? { |t| t != "string" })

  parser_guessed["skip_header_lines"] =
    header_line ? skip_header_lines + 1 : skip_header_lines

  # Only guess the schema when the user has not supplied one.
  unless parser_config.key?("columns")
    column_names =
      if header_line
        sample_records.first
      else
        # Exclusive range: exactly one generated name per guessed type.
        (0...other_types.size).map { |i| "c#{i}" }
      end

    schema = []
    column_names.zip(other_types).each do |name, type|
      # Skip header cells that have no guessed type, and vice versa.
      next unless name && type

      if type.is_a?(SchemaGuess::TimestampTypeMatch)
        schema << {"name" => name, "type" => type, "format" => type.format}
      else
        schema << {"name" => name, "type" => type}
      end
    end
    parser_guessed["columns"] = schema
  end

  {"parser" => parser_guessed}
end
|