Class: DataKit::CSV::SchemaAnalysis

Inherits:
Object
  • Object
show all
Defined in:
lib/data_kit/csv/schema_analysis.rb

Instance Attribute Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(fields, options = {}) ⇒ SchemaAnalysis

Returns a new instance of SchemaAnalysis.



12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
# File 'lib/data_kit/csv/schema_analysis.rb', line 12

# Builds an empty analysis for the given field names.
#
# Every field starts with a zero counter for each entry of
# Dataset::Field::Types and with a :string type hint.
#
# @param fields [#each] the field names to track
# @param options [Hash] :use_type_hints enables hinting when it is absent
#   (nil) or exactly +true+; any other value disables it
def initialize(fields, options = {})
  @fields = fields
  @row_count = 0
  @sample_count = 0

  hint_option = options[:use_type_hints]
  @use_type_hints = hint_option.nil? || hint_option == true

  @types = {}
  @type_hints = {}

  fields.each do |field_name|
    @type_hints[field_name] = :string

    # One zeroed counter per known type for this field.
    counters = {}
    Dataset::Field::Types.each { |type| counters[type] = 0 }
    @types[field_name] = counters
  end
end

Instance Attribute Details

#fields ⇒ Object (readonly)

Returns the value of attribute fields.



4
5
6
# File 'lib/data_kit/csv/schema_analysis.rb', line 4

# Reader for the +fields+ argument the constructor stored in @fields.
#
# @return [Object] the field names this analysis iterates over
def fields
  @fields
end

#row_count ⇒ Object (readonly)

Returns the value of attribute row_count.



6
7
8
# File 'lib/data_kit/csv/schema_analysis.rb', line 6

# Reader for the @row_count counter, which starts at 0 and grows by one on
# each call to #increment_total.
#
# @return [Integer] the current value of the counter
def row_count
  @row_count
end

#sample_count ⇒ Object (readonly)

Returns the value of attribute sample_count.



7
8
9
# File 'lib/data_kit/csv/schema_analysis.rb', line 7

# Reader for the @sample_count counter, which starts at 0 and grows by one on
# each call to #increment_sample.
#
# @return [Integer] the current value of the counter
def sample_count
  @sample_count
end

#type_hints ⇒ Object (readonly)

Returns the value of attribute type_hints.



9
10
11
# File 'lib/data_kit/csv/schema_analysis.rb', line 9

# Reader for the per-field type hints. Each field starts at :string; when
# hinting is enabled, #insert overwrites the entry with the type it most
# recently detected for that field.
#
# @return [Hash] field name => last detected type
def type_hints
  @type_hints
end

#types ⇒ Object (readonly)

Returns the value of attribute types.



5
6
7
# File 'lib/data_kit/csv/schema_analysis.rb', line 5

# Reader for the per-field type counters. The constructor seeds every field
# with a zero count for each entry of Dataset::Field::Types, and #insert
# bumps the count of the detected type.
#
# @return [Hash] field name => { type => number of inserted values }
def types
  @types
end

#use_type_hints ⇒ Object (readonly)

Returns the value of attribute use_type_hints.



10
11
12
# File 'lib/data_kit/csv/schema_analysis.rb', line 10

# Reader for the flag that decides whether #insert passes the cached type
# hint to Dataset::Field.type?. The constructor sets it to +true+ unless the
# :use_type_hints option was given a value other than nil or +true+.
#
# @return [Boolean]
def use_type_hints
  @use_type_hints
end

Instance Method Details

#field_types ⇒ Object



52
53
54
55
56
57
# File 'lib/data_kit/csv/schema_analysis.rb', line 52

# Resolves a type for every tracked field.
#
# @return [Hash] field name => result of #type? for that field
def field_types
  fields.each_with_object({}) do |field_name, resolved|
    resolved[field_name] = type?(field_name)
  end
end

#has_only_numeric_types?(field) ⇒ Boolean

Returns:

  • (Boolean)


83
84
85
# File 'lib/data_kit/csv/schema_analysis.rb', line 83

# Tells whether every type observed for the field is :integer, :number or
# :null. A field with no observed types at all also satisfies this.
#
# @param field [Object] the field name to inspect
# @return [Boolean]
def has_only_numeric_types?(field)
  numeric_or_null = [:integer, :number, :null]
  type_list(field).all? { |observed| numeric_or_null.include?(observed) }
end

#has_single_type?(field) ⇒ Boolean

Returns:

  • (Boolean)


79
80
81
# File 'lib/data_kit/csv/schema_analysis.rb', line 79

# Tells whether exactly one type other than :null was observed for the field.
#
# @param field [Object] the field name to inspect
# @return [Boolean]
def has_single_type?(field)
  type_list(field).count { |observed| observed != :null } == 1
end

#increment_sample ⇒ Object



37
38
39
# File 'lib/data_kit/csv/schema_analysis.rb', line 37

# Advances the @sample_count counter by one.
#
# @return [Integer] the counter value after the increment
def increment_sample
  @sample_count = @sample_count.succ
end

#increment_total ⇒ Object



33
34
35
# File 'lib/data_kit/csv/schema_analysis.rb', line 33

# Advances the @row_count counter by one.
#
# @return [Integer] the counter value after the increment
def increment_total
  @row_count = @row_count.succ
end

#insert(field_name, value) ⇒ Object



41
42
43
44
45
46
47
48
49
50
# File 'lib/data_kit/csv/schema_analysis.rb', line 41

# Records one value for a field by bumping the counter of its detected type.
#
# With hinting enabled, the field's cached hint is handed to
# Dataset::Field.type? and then replaced by the type that call returns.
#
# @param field_name [Object] a field name passed to the constructor
# @param value [Object] the value to classify
# @return [Integer] the updated count for the detected type
def insert(field_name, value)
  detected =
    if use_type_hints
      # Remember the latest detection so it becomes the next call's hint.
      @type_hints[field_name] = Dataset::Field.type?(value, type_hints[field_name])
    else
      Dataset::Field.type?(value)
    end

  @types[field_name][detected] += 1
end

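#type?(field) ⇒ Symbol

Returns:

  • (Symbol) — the type inferred for the field: its single observed type, :number when only numeric types were observed, otherwise :string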


59
60
61
62
63
64
65
66
67
# File 'lib/data_kit/csv/schema_analysis.rb', line 59

# Infers a single type for the field from the types observed so far.
#
# * exactly one non-null type observed => that type
# * only :integer / :number / :null observed => :number
# * anything else => :string
#
# @param field [Object] the field name to resolve
# @return [Symbol] the inferred type
def type?(field)
  if has_single_type?(field)
    # has_single_type? ignores :null, so :null must be dropped here as well;
    # otherwise a leading :null would be returned instead of the real type.
    (type_list(field) - [:null]).first
  elsif has_only_numeric_types?(field)
    :number
  else
    :string
  end
end

#type_count(field, type) ⇒ Object



69
70
71
# File 'lib/data_kit/csv/schema_analysis.rb', line 69

# Looks up how many values of the given type were recorded for a field.
#
# @param field [Object] the field name
# @param type [Object] the type key to look up
# @return [Integer] the recorded count, or 0 when nothing is stored for the type
def type_count(field, type)
  counters = types[field]
  counters[type] || 0
end

#type_list(field) ⇒ Object



73
74
75
76
77
# File 'lib/data_kit/csv/schema_analysis.rb', line 73

# Lists the types that have a count above zero for the field, in the key
# order of the field's counter hash.
#
# @param field [Object] the field name
# @return [Array] the observed types
def type_list(field)
  types[field].each_key.select { |candidate| type_count(field, candidate) > 0 }
end