Module: Yannitor::Broom
- Defined in:
- lib/yannitor/cleaner.rb
Instance Attribute Summary collapse
-
#yannitor_features ⇒ Object
Returns the value of attribute yannitor_features.
Instance Method Summary collapse
- #build_linear_features ⇒ Object
- #linear_feature_select ⇒ Object
- #min_val(feature) ⇒ Object
- #nelect(feature) ⇒ Object
- #normalize(feature) ⇒ Object
- #to_file(file_name = 'data.csv', separator = ' ') ⇒ Object
- #to_one_hot(target_column, type = 'text') ⇒ Object
- #values_for_select(target_column) ⇒ Object
- #vectorize ⇒ Object
- #yannitor_is_cleaning(features = {}) ⇒ Object
Instance Attribute Details
#yannitor_features ⇒ Object
Returns the value of attribute yannitor_features.
7 8 9 |
# File 'lib/yannitor/cleaner.rb', line 7
#
# Reader for the feature configuration held by the receiver.
#
# @return [Object] whatever is currently stored in +@yannitor_features+
#   (+nil+ when nothing has been assigned yet)
def yannitor_features
  @yannitor_features
end
Instance Method Details
#build_linear_features ⇒ Object
36 37 38 39 40 41 42 |
# File 'lib/yannitor/cleaner.rb', line 36
#
# Builds one numeric vector per element of +all+.
#
# For every element, the names listed under +:linear+ in its class's
# +yannitor_features+ are looked up, the method "n<name>" is invoked on the
# element, and the result is coerced with +to_f+ (so +nil+ becomes 0.0).
#
# @return [Array<Array<Float>>] one inner array per element, in the order
#   of the +:linear+ feature list
def build_linear_features
  all.map do |record|
    feature_names = record.class.yannitor_features[:linear]
    feature_names.map { |name| record.send("n#{name}").to_f }
  end
end
#linear_feature_select ⇒ Object
44 45 46 47 48 |
# File 'lib/yannitor/cleaner.rb', line 44
#
# Builds the SQL select-list fragment that exposes every +:linear+ feature
# as a float column named "n<feature>".
#
# The per-feature expression comes from +min_val+, the normalisation helper
# this module defines (the previous call targeted +min_max+, which the
# module does not provide).
#
# @return [String] comma-separated "CAST(<expr> AS float) as n<feature>"
#   entries; an empty string when no +:linear+ features are configured
def linear_feature_select
  # Array() tolerates a missing :linear key (e.g. features left at {}).
  Array(yannitor_features[:linear])
    .map { |feature| "CAST(#{min_val(feature)} AS float) as n#{feature}" }
    .join(', ')
end
#min_val(feature) ⇒ Object
50 51 52 53 54 55 |
# File 'lib/yannitor/cleaner.rb', line 50
#
# Returns a SQL expression that rescales +feature+ to
# (value - min) / (max - min), using the minimum and maximum of +feature+
# over +all+.
#
# The denominator is wrapped in NULLIF(..., 0) so a column whose minimum
# equals its maximum evaluates to NULL instead of raising a
# division-by-zero error in the database. When the bounds are +nil+
# (presumably an empty relation), the literal NULL is interpolated so the
# expression stays syntactically valid.
#
# @param feature [Symbol, String] column name, interpolated verbatim into
#   the SQL; it must not come from untrusted input
# @return [String] SQL expression using the "::float" cast syntax
def min_val(feature)
  scope = all
  min = scope.minimum(feature) || 'NULL'
  max = scope.maximum(feature) || 'NULL'
  column = "#{table_name}.#{feature}::float"
  range = "NULLIF(#{max}::float - #{min}::float, 0)"
  "(#{column} - #{min}::float) / #{range}"
end
#nelect(feature) ⇒ Object
57 58 59 |
# File 'lib/yannitor/cleaner.rb', line 57
#
# Selects every column plus the normalised value of +feature+, exposed as
# a float column named "n<feature>".
#
# The expression is produced by +min_val+, the normalisation helper this
# module defines (the previous call targeted +min_max+, which the module
# does not provide). It is parenthesised so the "::float" cast applies to
# the whole expression rather than only to its last operand.
#
# @param feature [Symbol, String] column name to normalise
# @return [Object] whatever +select+ returns for the receiver
def nelect(feature)
  select("*, (#{min_val(feature)})::float as n#{feature}")
end
#normalize(feature) ⇒ Object
61 62 63 64 65 66 67 68 69 |
# File 'lib/yannitor/cleaner.rb', line 61
#
# Collects the "n<feature>" value of every element returned by
# +all.nelect(feature)+, together with the minimum and maximum of
# +feature+ over +all+.
#
# @param feature [Symbol, String] column name to normalise
# @return [Array] three elements: the collected values, the minimum, the
#   maximum
def normalize(feature)
  lower = all.minimum(feature)
  upper = all.maximum(feature)
  reader = "n#{feature}".to_sym
  scaled = all.nelect(feature).map { |row| row.send(reader) }
  [scaled, lower, upper]
end
#to_file(file_name = 'data.csv', separator = ' ') ⇒ Object
71 72 73 74 75 |
# File 'lib/yannitor/cleaner.rb', line 71
#
# Writes every vector produced by +all.vectorize+ as one row of a
# delimited text file.
#
# @param file_name [String] path of the file to write (opened in 'wb' mode)
# @param separator [String] column separator handed to CSV as +col_sep+
# @return [Object] the value of the CSV.open block, i.e. the result of
#   iterating the vectors with +each+
def to_file(file_name = 'data.csv', separator = ' ')
  CSV.open(file_name, 'wb', col_sep: separator) do |out|
    # Vectors are computed only after the file has been opened.
    rows = all.vectorize
    rows.each { |row| out << row }
  end
end
#to_one_hot(target_column, type = 'text') ⇒ Object
13 14 15 16 17 18 19 20 21 22 23 24 25 |
# File 'lib/yannitor/cleaner.rb', line 13
#
# Builds a query returning, for each id, an array of 0/1 flags (aliased
# "o<target_column>") with one flag per value produced by
# +values_for_select(target_column)+.
#
# Every row is paired with every distinct value (LEFT JOIN ... ON 1=1) and
# the flags are aggregated per id. The aggregate carries an explicit
# ORDER BY on the value; without it the database does not guarantee the
# order of the aggregated elements, so the position of a category inside
# the array could differ between rows or executions.
#
# @param target_column [Symbol, String] column to encode; interpolated
#   verbatim into SQL, so it must not come from untrusted input
# @param type [String] SQL type both sides of the comparison are cast to;
#   also interpolated verbatim
# @return [Object] the result of the select/joins/group chain
def to_one_hot(target_column, type = 'text')
  select(%(
    #{table_name}.id,
    ARRAY_AGG(
      CASE WHEN sorted_value_table.value::#{type} = #{table_name}.#{target_column}::#{type}
      THEN 1 ELSE 0 END
      ORDER BY sorted_value_table.value
    ) AS o#{target_column}
  )).joins(%(
    LEFT JOIN (#{values_for_select(target_column)}) AS sorted_value_table ON 1=1
  )).group("#{table_name}.id")
end
#values_for_select(target_column) ⇒ Object
27 28 29 30 |
# File 'lib/yannitor/cleaner.rb', line 27
#
# Builds a "SELECT value FROM (values (...), (...)) s(value)" statement
# listing every distinct value of +target_column+ as a string literal.
#
# Each value is converted with +to_s+ and quoted through
# +connection.quote+, so values containing a single quote no longer break
# the generated SQL.
# NOTE(review): assumes the receiver exposes an ActiveRecord-style
# +connection+ — confirm against the including class.
#
# @param target_column [Symbol, String] column whose distinct values are
#   listed; interpolated verbatim into the pluck expression
# @return [String] SQL statement; when there are no values it lists a
#   single empty string so the VALUES clause stays valid
def values_for_select(target_column)
  rows = pluck("distinct(#{target_column})").map do |value|
    "(#{connection.quote(value.to_s)})"
  end
  # An empty VALUES list is a syntax error; fall back to one empty string.
  rows << "('')" if rows.empty?
  "SELECT value FROM (values #{rows.join(', ')}) s(value)"
end
#vectorize ⇒ Object
32 33 34 |
# File 'lib/yannitor/cleaner.rb', line 32
#
# Selects every column plus the fragment from +linear_feature_select+,
# then turns the result into numeric vectors via +build_linear_features+.
#
# A blank fragment (no linear features) is dropped, so the select list is
# "*" rather than the invalid "*, ".
#
# @return [Object] whatever +build_linear_features+ returns for the
#   selected relation
def vectorize
  columns = ['*', linear_feature_select].reject(&:empty?)
  select(columns.join(', ')).build_linear_features
end
#yannitor_is_cleaning(features = {}) ⇒ Object
9 10 11 |
# File 'lib/yannitor/cleaner.rb', line 9
#
# Stores the feature configuration on the receiver through its
# +yannitor_features=+ writer.
#
# @param features [Hash] feature configuration; other methods in this
#   module read its +:linear+ entry
# @return [Object] the +features+ argument
def yannitor_is_cleaning(features = {})
  self.yannitor_features = features
end