Module: Yannitor::Broom

Defined in:
lib/yannitor/cleaner.rb

Instance Attribute Summary collapse

Instance Method Summary collapse

Instance Attribute Details

#yannitor_features ⇒ Object

Returns the value of attribute yannitor_features.



7
8
9
# File 'lib/yannitor/cleaner.rb', line 7

# Reader for the feature configuration held in +@yannitor_features+.
#
# @return [Object, nil] the stored value, or nil when nothing has been assigned
def yannitor_features
  instance_variable_get(:@yannitor_features)
end

Instance Method Details

#build_linear_features ⇒ Object



36
37
38
39
40
41
42
# File 'lib/yannitor/cleaner.rb', line 36

# Collects the linear feature values of every record in the current scope.
#
# For each record returned by +all+, reads the +:linear+ list from the
# record class's +yannitor_features+ and fetches the matching +n<feature>+
# value from the record, coercing it with +to_f+.
#
# @return [Array<Array<Float>>] one row of floats per record
def build_linear_features
  all.map do |record|
    linear_names = record.class.yannitor_features[:linear]
    linear_names.map { |name| record.send("n#{name}").to_f }
  end
end

#linear_feature_select ⇒ Object



44
45
46
47
48
# File 'lib/yannitor/cleaner.rb', line 44

# Builds the SELECT fragment that exposes every configured linear feature as
# a float column named +n<feature>+.
#
# The scaling expression for each feature comes from +min_val+, the
# normalization helper defined in this module (the module defines no
# +min_max+ method).
#
# @return [String] comma-separated +CAST(... AS float) as n<feature>+ entries;
#   an empty string when no linear features are configured
def linear_feature_select
  columns = yannitor_features[:linear].map do |feature|
    "CAST(#{min_val(feature)} AS float) as n#{feature}"
  end
  columns.join(', ')
end

#min_val(feature) ⇒ Object



50
51
52
53
54
55
# File 'lib/yannitor/cleaner.rb', line 50

# Builds the SQL expression that min-max scales +feature+:
# (value - min) / (max - min), with min and max taken from the current scope.
#
# @param feature [Symbol, String] column to scale
# @return [String] SQL fragment using PostgreSQL +::float+ casts
def min_val(feature)
  scope = all
  # An empty scope yields nil bounds; emit NULL so the SQL stays well-formed
  # instead of interpolating an empty string before "::float".
  lower = scope.minimum(feature) || 'NULL'
  upper = scope.maximum(feature) || 'NULL'

  # NULLIF turns a zero range (max == min) into NULL so the division yields
  # NULL rather than raising a division-by-zero error.
  "(#{table_name}.#{feature}::float - #{lower}::float) / " \
    "NULLIF(#{upper}::float - #{lower}::float, 0)"
end

#nelect(feature) ⇒ Object



57
58
59
# File 'lib/yannitor/cleaner.rb', line 57

# Selects every column plus the scaled value of +feature+, exposed as a
# float column named +n<feature>+.
#
# The scaling expression comes from +min_val+, the normalization helper
# defined in this module (the module defines no +min_max+ method).
#
# @param feature [Symbol, String] column to normalize
# @return [Object] whatever +select+ returns for the built projection
def nelect(feature)
  scaled = min_val(feature)
  select("*, #{scaled}::float as n#{feature}")
end

#normalize(feature) ⇒ Object



61
62
63
64
65
66
67
68
69
# File 'lib/yannitor/cleaner.rb', line 61

# Returns the scaled values of +feature+ together with the bounds of the
# current scope.
#
# @param feature [Symbol, String] column to normalize
# @return [Array] three elements: the +n<feature>+ values read from the
#   records returned by +all.nelect(feature)+, the scope minimum, and the
#   scope maximum
def normalize(feature)
  lower = all.minimum(feature)
  upper = all.maximum(feature)
  reader = "n#{feature}".to_sym
  scaled = all.nelect(feature).map { |record| record.send(reader) }

  [scaled, lower, upper]
end

#to_file(file_name = 'data.csv', separator = ' ') ⇒ Object



71
72
73
74
75
# File 'lib/yannitor/cleaner.rb', line 71

# Writes the vectorized rows of the current scope to a delimited file.
#
# @param file_name [String] path of the file to write (opened in 'wb' mode)
# @param separator [String] column separator handed to CSV as +col_sep+
# @return [Object] the collection returned by +all.vectorize+
def to_file(file_name = 'data.csv', separator = ' ')
  CSV.open(file_name, 'wb', col_sep: separator) do |out|
    all.vectorize.each do |row|
      out.add_row(row)
    end
  end
end

#to_one_hot(target_column, type = 'text') ⇒ Object



13
14
15
16
17
18
19
20
21
22
23
24
25
# File 'lib/yannitor/cleaner.rb', line 13

# Builds a query that returns, per row id, an array of 0/1 flags named
# +o<target_column>+: 1 where the row's +target_column+ equals a value from
# +values_for_select+, 0 otherwise. Both sides of the comparison are cast
# to +type+.
#
# NOTE(review): ARRAY_AGG has no ORDER BY here, so the flag order presumably
# follows the joined value list — confirm before relying on positions.
#
# @param target_column [Symbol, String] column to encode
# @param type [String] SQL type used for the comparison casts
# @return [Object] the relation produced by select / joins / group
def to_one_hot(target_column, type = 'text')
  flag_projection = %(
    #{table_name}.id,
    ARRAY_AGG(CASE
      WHEN sorted_value_table.value::#{type} = #{table_name}.#{target_column}::#{type}
      THEN 1
      ELSE 0
      END
    ) AS o#{target_column}
  )
  flagged = select(flag_projection)

  # "ON 1=1" pairs every row with every distinct value.
  value_join = %(
    LEFT JOIN (#{values_for_select(target_column)}) AS sorted_value_table ON 1=1
  )
  flagged.joins(value_join).group("#{table_name}.id")
end

#values_for_select(target_column) ⇒ Object



27
28
29
30
# File 'lib/yannitor/cleaner.rb', line 27

# Builds a subquery that lists every distinct value of +target_column+ as a
# one-column VALUES table named +s(value)+.
#
# @param target_column [Symbol, String] column whose distinct values are listed
# @return [String] SQL of the form
#   +SELECT value FROM (values ('a'), ('b')) s(value)+
def values_for_select(target_column)
  # Double embedded single quotes so a value such as "O'Brien" stays inside
  # its SQL string literal instead of terminating it early.
  escaped_values = pluck("distinct(#{target_column})").map do |value|
    value.to_s.gsub("'", "''")
  end
  "SELECT value FROM (values ('#{escaped_values.join("'), ('")}')) s(value)"
end

#vectorize ⇒ Object



32
33
34
# File 'lib/yannitor/cleaner.rb', line 32

# Selects every column plus the columns from +linear_feature_select+, then
# turns the resulting records into rows via +build_linear_features+.
#
# @return [Object] the result of +build_linear_features+ on the selection
def vectorize
  projection = "*, #{linear_feature_select}"
  select(projection).build_linear_features
end

#yannitor_is_cleaning(features = {}) ⇒ Object



9
10
11
# File 'lib/yannitor/cleaner.rb', line 9

# Stores the feature configuration through the +yannitor_features=+ writer.
#
# @param features [Hash] feature configuration; other methods in this module
#   read its +:linear+ entry
# @return [Hash] the +features+ argument
def yannitor_is_cleaning(features = {})
  send(:yannitor_features=, features)
  features
end