Class: DataKit::CSV::SchemaAnalyzer

Inherits:
Object
  • Object
show all
Defined in:
lib/data_kit/csv/schema_analyzer.rb

Instance Attribute Summary collapse

Class Method Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(csv, options = {}) ⇒ SchemaAnalyzer

Returns a new instance of SchemaAnalyzer.



8
9
10
11
12
# File 'lib/data_kit/csv/schema_analyzer.rb', line 8

# Builds an analyzer for the given CSV source.
#
# @param csv [Object] the source later read by #execute through
#   +headers+ and +each_row+
# @param options [Hash] optional settings
# @option options [Array] :keys stored as-is; falls back to an empty array
# @option options [Float] :sampling_rate falls back to 0.1
def initialize(csv, options = {})
  @csv = csv

  requested_keys = options[:keys]
  requested_rate = options[:sampling_rate]

  # `||` (rather than Hash#fetch) so that an explicit nil also gets the default.
  @keys = requested_keys || []
  @sampling_rate = requested_rate || 0.1
end

Instance Attribute Details

#csv ⇒ Object

Returns the value of attribute csv.



4
5
6
# File 'lib/data_kit/csv/schema_analyzer.rb', line 4

# Reader for the CSV source given to the constructor. #execute takes the
# headers from it and iterates its rows.
#
# @return [Object] the value of @csv
def csv
  @csv
end

#keys ⇒ Object

Returns the value of attribute keys.



5
6
7
# File 'lib/data_kit/csv/schema_analyzer.rb', line 5

# Reader for the :keys option given to the constructor, which stores an
# empty array when the option is missing or nil.
#
# @return [Object] the value of @keys
def keys
  @keys
end

#sampling_rate ⇒ Object

Returns the value of attribute sampling_rate.



6
7
8
# File 'lib/data_kit/csv/schema_analyzer.rb', line 6

# Reader for the :sampling_rate option given to the constructor, which
# stores 0.1 when the option is missing or nil. #execute compares a random
# draw against this value to decide whether a row is sampled.
#
# @return [Object] the value of @sampling_rate
def sampling_rate
  @sampling_rate
end

Class Method Details

.analyze(csv, options = {}) ⇒ Object



33
34
35
36
37
38
39
40
# File 'lib/data_kit/csv/schema_analyzer.rb', line 33

# Convenience entry point: constructs an analyzer and runs it immediately.
#
# Only the :keys and :sampling_rate entries of +options+ are forwarded to
# the constructor; any other entries are ignored.
#
# @param csv [Object] the CSV source to analyze
# @param options [Hash] optional :keys and :sampling_rate settings
# @return [Object] whatever #execute returns
def analyze(csv, options = {})
  forwarded = {
    :keys => options[:keys],
    :sampling_rate => options[:sampling_rate]
  }

  new(csv, forwarded).execute
end

.sampling_rate(file_size) ⇒ Object



42
43
44
45
46
47
48
49
# File 'lib/data_kit/csv/schema_analyzer.rb', line 42

# Suggests a sampling rate for a file of the given size.
#
# Anything below 1024 * 1024 is sampled in full (1.0). From that size up
# the rate is 500 / sqrt(file_size), rounded to four decimal places, so it
# shrinks as the file grows.
#
# @param file_size [Numeric] size of the file (presumably in bytes, given
#   the 1024 * 1024 threshold -- confirm against callers)
# @return [Float] the suggested sampling rate
def sampling_rate(file_size)
  return 1.0 if file_size < 1024 * 1024

  scale_factor = 500
  (scale_factor / Math.sqrt(file_size)).round(4)
end

Instance Method Details

#execute ⇒ Object



14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
# File 'lib/data_kit/csv/schema_analyzer.rb', line 14

# Walks every row of the CSV, counting all rows and feeding a random sample
# of them into a SchemaAnalysis.
#
# Each row is sampled when a uniform draw from [0, 1) is strictly below
# #sampling_rate, so a rate of 1.0 samples every row and a rate of 0.0
# samples none. For a sampled row, every field is handed to
# SchemaAnalysis#insert under its stringified name.
#
# String values are tagged as UTF-8 before insertion. Values that cannot be
# re-tagged (nil, numbers, ...) are passed through untouched, and frozen
# strings are duplicated first instead of raising.
#
# @return [SchemaAnalysis] the populated analysis
def execute
  random = Random.new
  analysis = SchemaAnalysis.new(csv.headers)

  csv.each_row do |row|
    analysis.increment_total
    # Strict comparison: rand never returns 1.0 but can return 0.0.
    next unless random.rand < sampling_rate

    analysis.increment_sample
    row.keys.each do |field_name|
      value = row[field_name]
      if value.respond_to?(:force_encoding)
        # force_encoding mutates its receiver, which a frozen string forbids.
        value = value.dup if value.frozen?
        value.force_encoding('UTF-8')
      end
      analysis.insert(field_name.to_s, value)
    end
  end

  analysis
end