Class: FlatKit::FieldStats

Inherits:
Object
  • Object
show all
Defined in:
lib/flat_kit/field_stats.rb

Overview

Collect stats on a single field. We may not know what the field data type is to start with, so collect a bunch of values until we reach the guess threshold, and then calculate stats based upon the data type determined by the guess.

Constant Summary collapse

# Default for the constructor's guess_threshold: keyword; #update calls
# resolve_guess once this many values have been buffered.
DEFAULT_GUESS_THRESHOLD =
1000
# Default stats collection set used by the constructor.
CORE_STATS =
:core
# Collection set that turns on the frequency-based stats
# (see #collecting_frequencies?).
CARDINALITY_STATS =
:cardinality
# Every collection set the constructor accepts. Frozen so the shared list
# cannot be mutated at runtime.
ALL_STATS =
[ CORE_STATS, CARDINALITY_STATS ].freeze
# Names of the methods #to_hash sends to self, in output order. Frozen so the
# export layout cannot be mutated at runtime.
EXPORT_FIELDS =
%w[
  name
  type
  count
  max
  mean
  min
  stddev
  sum
  mode
  unique_count

  max_length
  mean_length
  min_length
  stddev_length
  mode_length
  unique_count_lengths

  null_count
  unknown_count
  out_of_type_count
  total_count
  null_percent
  unknown_percent
].freeze

Instance Attribute Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(name:, stats_to_collect: CORE_STATS, type: ::FlatKit::FieldType::GuessType, guess_threshold: DEFAULT_GUESS_THRESHOLD) ⇒ FieldStats

Returns a new instance of FieldStats.

Raises:

  • (ArgumentError)


46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
# File 'lib/flat_kit/field_stats.rb', line 46

# Set up an empty collector for one field.
#
# @param name [Object] stored as the field name
# @param stats_to_collect [Symbol, Array<Symbol>] one or more members of ALL_STATS
# @param type [Class] a class whose direct superclass is ::FlatKit::FieldType
# @param guess_threshold [Integer] number of buffered values that triggers resolve_guess in #update
# @raise [ArgumentError] when a collection set is not in ALL_STATS, or when
#   type is not a Class whose superclass is ::FlatKit::FieldType
def initialize(name:,
               stats_to_collect: CORE_STATS,
               type: ::FlatKit::FieldType::GuessType,
               guess_threshold: DEFAULT_GUESS_THRESHOLD)
  @name              = name
  @field_type        = type
  @guess_threshold   = guess_threshold
  @stats_to_collect  = [stats_to_collect].flatten
  @type_counts       = Hash.new(0)
  @out_of_type_count = 0
  @values            = []
  @stats             = nil
  @length_stats      = nil

  # Report the first unrecognised collection set, if there is one.
  unrecognised = @stats_to_collect.reject { |collection_set| ALL_STATS.include?(collection_set) }
  unless unrecognised.empty?
    allowed = ALL_STATS.map(&:to_s).join(", ")
    raise ArgumentError, "#{unrecognised.first} is not a valid stats collection set, must be one of #{allowed}"
  end

  direct_subclass = type.is_a?(Class) && type.superclass == ::FlatKit::FieldType
  raise ArgumentError, "type: must be FieldType subclasses - not #{type}" unless direct_subclass
end

Instance Attribute Details

#field_typeObject (readonly)

Returns the value of attribute field_type.



43
44
45
# File 'lib/flat_kit/field_stats.rb', line 43

# Read-only accessor for the @field_type instance variable, which the
# constructor sets from its type: keyword.
#
# @return [Object] the current value of @field_type
def field_type
  @field_type
end

#nameObject (readonly)

Returns the value of attribute name.



44
45
46
# File 'lib/flat_kit/field_stats.rb', line 44

# Read-only accessor for the @name instance variable, which the constructor
# sets from its name: keyword.
#
# @return [Object] the current value of @name
def name
  @name
end

#type_countsObject (readonly)

Returns the value of attribute type_counts.



42
43
44
# File 'lib/flat_kit/field_stats.rb', line 42

# Read-only accessor for the @type_counts instance variable, which the
# constructor creates as Hash.new(0) (missing keys read as 0).
#
# @return [Hash] the current value of @type_counts
def type_counts
  @type_counts
end

Instance Method Details

#collecting_frequencies?Boolean

Returns:

  • (Boolean)


82
83
84
# File 'lib/flat_kit/field_stats.rb', line 82

# Whether the cardinality collection set was requested for this field.
#
# @return [Boolean] true when @stats_to_collect contains CARDINALITY_STATS
def collecting_frequencies?
  @stats_to_collect.any? { |collection_set| collection_set == CARDINALITY_STATS }
end

#countObject



90
91
92
# File 'lib/flat_kit/field_stats.rb', line 90

# Delegates to the stats collector (the `stats` helper is defined elsewhere
# in the class).
#
# @return [Object] whatever stats.count returns
def count
  collector = stats
  collector.count
end

#field_type_determined?Boolean

Returns:

  • (Boolean)


64
65
66
# File 'lib/flat_kit/field_stats.rb', line 64

# Whether the field type has moved past the GuessType placeholder.
#
# @return [Boolean] false while @field_type is ::FlatKit::FieldType::GuessType
def field_type_determined?
  placeholder = ::FlatKit::FieldType::GuessType
  @field_type != placeholder
end

#frequenciesObject



126
127
128
# File 'lib/flat_kit/field_stats.rb', line 126

# Frequencies reported by the stats collector, available only when
# frequency collection is enabled.
#
# @return [Object, nil] stats.frequencies, or nil unless #collecting_frequencies?
def frequencies
  return unless collecting_frequencies?

  stats.frequencies
end

#length_frequenciesObject



158
159
160
# File 'lib/flat_kit/field_stats.rb', line 158

# Frequencies reported by the length stats collector.
#
# @return [Object, nil] length_stats.frequencies, or nil when @length_stats
#   is unset or frequency collection is disabled
def length_frequencies
  return unless @length_stats
  return unless collecting_frequencies?

  length_stats.frequencies
end

#maxObject



94
95
96
# File 'lib/flat_kit/field_stats.rb', line 94

# Maximum reported by the stats collector, when it offers one.
#
# @return [Object, nil] stats.max, or nil if the collector does not respond to :max
def max
  collector = stats
  return unless collector.respond_to?(:max)

  collector.max
end

#max_lengthObject



134
135
136
# File 'lib/flat_kit/field_stats.rb', line 134

# Maximum reported by the length stats collector.
#
# @return [Object, nil] length_stats.max, or nil when @length_stats is unset
def max_length
  return unless @length_stats

  length_stats.max
end

#meanObject



98
99
100
# File 'lib/flat_kit/field_stats.rb', line 98

# Mean reported by the stats collector, when it offers one.
#
# @return [Object, nil] stats.mean, or nil if the collector does not respond to :mean
def mean
  collector = stats
  return unless collector.respond_to?(:mean)

  collector.mean
end

#mean_lengthObject



138
139
140
# File 'lib/flat_kit/field_stats.rb', line 138

# Mean reported by the length stats collector.
#
# @return [Object, nil] length_stats.mean, or nil when @length_stats is unset
def mean_length
  return unless @length_stats

  length_stats.mean
end

#minObject



102
103
104
# File 'lib/flat_kit/field_stats.rb', line 102

# Minimum reported by the stats collector, when it offers one.
#
# @return [Object, nil] stats.min, or nil if the collector does not respond to :min
def min
  collector = stats
  return unless collector.respond_to?(:min)

  collector.min
end

#min_lengthObject



130
131
132
# File 'lib/flat_kit/field_stats.rb', line 130

# Minimum reported by the length stats collector.
#
# @return [Object, nil] length_stats.min, or nil when @length_stats is unset
def min_length
  return unless @length_stats

  length_stats.min
end

#modeObject



114
115
116
# File 'lib/flat_kit/field_stats.rb', line 114

# Mode reported by the stats collector, available only when frequency
# collection is enabled.
#
# @return [Object, nil] stats.mode, or nil unless #collecting_frequencies?
def mode
  return unless collecting_frequencies?

  stats.mode
end

#mode_lengthObject



146
147
148
# File 'lib/flat_kit/field_stats.rb', line 146

# Mode reported by the length stats collector.
#
# @return [Object, nil] length_stats.mode, or nil when @length_stats is unset
#   or frequency collection is disabled
def mode_length
  return unless @length_stats
  return unless collecting_frequencies?

  length_stats.mode
end

#null_countObject



162
163
164
# File 'lib/flat_kit/field_stats.rb', line 162

# Count recorded in #type_counts under FieldType::NullType.
#
# @return [Object] the type_counts entry for FieldType::NullType
def null_count
  null_key = FieldType::NullType
  type_counts[null_key]
end

#null_percentObject



174
175
176
177
# File 'lib/flat_kit/field_stats.rb', line 174

# Share of #total_count that is #null_count, as a percentage truncated to
# two decimal places.
#
# @return [Integer, Float] the Integer 0 when total_count is zero, otherwise a Float
def null_percent
  total = total_count
  return 0 if total.zero?

  fraction = null_count.to_f / total
  (fraction * 100.0).truncate(2)
end

#out_of_type_countObject



170
171
172
# File 'lib/flat_kit/field_stats.rb', line 170

# Reader for the @out_of_type_count instance variable, which the constructor
# initializes to 0.
#
# @return [Object] the current value of @out_of_type_count
def out_of_type_count
  @out_of_type_count
end

#stddevObject



106
107
108
# File 'lib/flat_kit/field_stats.rb', line 106

# Standard deviation reported by the stats collector, when it offers one.
#
# @return [Object, nil] stats.stddev, or nil if the collector does not respond to :stddev
def stddev
  collector = stats
  return unless collector.respond_to?(:stddev)

  collector.stddev
end

#stddev_lengthObject



142
143
144
# File 'lib/flat_kit/field_stats.rb', line 142

# Standard deviation reported by the length stats collector.
#
# @return [Object, nil] length_stats.stddev, or nil when @length_stats is unset
def stddev_length
  return unless @length_stats

  length_stats.stddev
end

#sumObject



110
111
112
# File 'lib/flat_kit/field_stats.rb', line 110

# Sum reported by the stats collector, when it offers one.
#
# @return [Object, nil] stats.sum, or nil if the collector does not respond to :sum
def sum
  collector = stats
  return unless collector.respond_to?(:sum)

  collector.sum
end

#to_hashObject



188
189
190
191
192
193
194
195
196
# File 'lib/flat_kit/field_stats.rb', line 188

# Export every EXPORT_FIELDS entry as a Hash. resolve_guess is called first,
# then each field name is sent to self and its result stored under that name.
#
# @return [Hash{String => Object}] field name => value, in EXPORT_FIELDS order
def to_hash
  resolve_guess

  EXPORT_FIELDS.each_with_object({}) do |field, exported|
    exported[field] = send(field)
  end
end

#total_countObject



166
167
168
# File 'lib/flat_kit/field_stats.rb', line 166

# Count from the stats collector plus the out-of-type count.
#
# @return [Object] stats.count + @out_of_type_count
def total_count
  in_type = stats.count
  in_type + @out_of_type_count
end

#typeObject



86
87
88
# File 'lib/flat_kit/field_stats.rb', line 86

# Name of the current field type, as reported by the type itself.
#
# @return [Object] the result of @field_type.type_name
def type
  current_type = @field_type
  current_type.type_name
end

#unique_countObject



118
119
120
# File 'lib/flat_kit/field_stats.rb', line 118

# Unique-value count reported by the stats collector, available only when
# frequency collection is enabled.
#
# @return [Object, nil] stats.unique_count, or nil unless #collecting_frequencies?
def unique_count
  return unless collecting_frequencies?

  stats.unique_count
end

#unique_count_lengthsObject



150
151
152
# File 'lib/flat_kit/field_stats.rb', line 150

# Unique-value count reported by the length stats collector.
#
# @return [Object, nil] length_stats.unique_count, or nil when @length_stats
#   is unset or frequency collection is disabled
def unique_count_lengths
  return unless @length_stats
  return unless collecting_frequencies?

  length_stats.unique_count
end

#unique_valuesObject



122
123
124
# File 'lib/flat_kit/field_stats.rb', line 122

# Unique values reported by the stats collector, available only when
# frequency collection is enabled.
#
# @return [Object, nil] stats.unique_values, or nil unless #collecting_frequencies?
def unique_values
  return unless collecting_frequencies?

  stats.unique_values
end

#unique_values_lengthsObject



154
155
156
# File 'lib/flat_kit/field_stats.rb', line 154

# Unique values reported by the length stats collector.
#
# @return [Object, nil] length_stats.unique_values, or nil when @length_stats
#   is unset or frequency collection is disabled
def unique_values_lengths
  return unless @length_stats
  return unless collecting_frequencies?

  length_stats.unique_values
end

#unknown_countObject



179
180
181
# File 'lib/flat_kit/field_stats.rb', line 179

# Count recorded in #type_counts under FieldType::UnknownType.
#
# @return [Object] the type_counts entry for FieldType::UnknownType
def unknown_count
  unknown_key = FieldType::UnknownType
  type_counts[unknown_key]
end

#unknown_percentObject



183
184
185
186
# File 'lib/flat_kit/field_stats.rb', line 183

# Share of #total_count that is #unknown_count, as a percentage truncated to
# two decimal places.
#
# @return [Integer, Float] the Integer 0 when total_count is zero, otherwise a Float
def unknown_percent
  total = total_count
  return 0 if total.zero?

  fraction = unknown_count.to_f / total
  (fraction * 100.0).truncate(2)
end

#update(value) ⇒ Object



68
69
70
71
72
73
74
75
76
77
78
79
80
# File 'lib/flat_kit/field_stats.rb', line 68

# Record one observed value. The per-type tally is always updated; the value
# then goes straight to the stats when the field type is already determined,
# otherwise it is buffered and resolve_guess runs once the buffer holds at
# least @guess_threshold values.
#
# @param value [Object] the value to record
# @return [Object, nil] the result of update_stats or resolve_guess, or nil
#   when the value was only buffered
def update(value)
  update_type_count(value)
  return update_stats(value) if field_type_determined?

  @values << value
  resolve_guess if @values.size >= @guess_threshold
end