Class: DataDuck::Table
- Inherits:
-
Object
- Object
- DataDuck::Table
- Defined in:
- lib/dataduck/table.rb
Class Attribute Summary collapse
-
.actions ⇒ Object
Returns the value of attribute actions.
-
.output_schema ⇒ Object
Returns the value of attribute output_schema.
-
.sources ⇒ Object
Returns the value of attribute sources.
Instance Attribute Summary collapse
-
#data ⇒ Object
Returns the value of attribute data.
-
#errors ⇒ Object
Returns the value of attribute errors.
Class Method Summary collapse
- .output(schema) ⇒ Object
- .source(source_name, source_table_or_query = nil, source_columns = nil) ⇒ Object
- .transforms(transformation_name) ⇒ Object
- .validates(validation_name) ⇒ Object
Instance Method Summary collapse
- #actions ⇒ Object
- #batch_size ⇒ Object
- #building_name ⇒ Object
- #check_table_valid! ⇒ Object
- #distribution_key ⇒ Object
- #etl!(destinations) ⇒ Object
- #extract!(destination = nil) ⇒ Object
- #extract_by_column ⇒ Object
- #extract_query(source_spec, destination = nil) ⇒ Object
- #indexes ⇒ Object
- #name ⇒ Object
- #output_column_names ⇒ Object
- #output_schema ⇒ Object
- #should_fully_reload? ⇒ Boolean
- #show ⇒ Object
- #staging_name ⇒ Object
- #transform! ⇒ Object
Class Attribute Details
.actions ⇒ Object
Returns the value of attribute actions.
8 9 10 |
# File 'lib/dataduck/table.rb', line 8
# Class-level reader for the registered actions.
#
# @return [Object] whatever is currently stored in @actions
#   (.transforms and .validates append [type, method_name] pairs to it)
def actions
  @actions
end
.output_schema ⇒ Object
Returns the value of attribute output_schema.
7 8 9 |
# File 'lib/dataduck/table.rb', line 7
# Class-level reader for the declared output schema.
#
# @return [Object] whatever is currently stored in @output_schema
#   (.output initialises it to a Hash and merges entries into it)
def output_schema
  @output_schema
end
.sources ⇒ Object
Returns the value of attribute sources.
6 7 8 |
# File 'lib/dataduck/table.rb', line 6
# Class-level reader for the registered source specs.
#
# @return [Object] whatever is currently stored in @sources
#   (.source initialises it to an Array and appends one spec Hash per call)
def sources
  @sources
end
Instance Attribute Details
#data ⇒ Object
Returns the value of attribute data.
11 12 13 |
# File 'lib/dataduck/table.rb', line 11
# Instance reader for the rows currently held by this table.
#
# @return [Object] whatever is currently stored in @data
#   (#extract! fills it and #etl! resets it to [] when finished)
def data
  @data
end
#errors ⇒ Object
Returns the value of attribute errors.
12 13 14 |
# File 'lib/dataduck/table.rb', line 12
# Instance reader for the collected validation errors.
#
# @return [Object] whatever is currently stored in @errors
#   (#extract! and #transform! default it to [], and #transform! appends to it)
def errors
  @errors
end
Class Method Details
.output(schema) ⇒ Object
42 43 44 45 |
# File 'lib/dataduck/table.rb', line 42
# Declares (or extends) the output schema of this table class.
# Repeated calls merge into the same Hash, later keys overwriting earlier ones.
#
# @param schema [Hash] entries to merge into the class-level output_schema
# @return [Hash] the class-level output_schema after merging
def self.output(schema)
  # Lazily create the schema hash the first time the macro is used.
  self.output_schema = {} unless self.output_schema
  self.output_schema.merge!(schema)
end
.source(source_name, source_table_or_query = nil, source_columns = nil) ⇒ Object
26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 |
# File 'lib/dataduck/table.rb', line 26
# Registers a source for this table class. Three call shapes are recognised:
#
# * second argument whose string form contains "select " (case-insensitive):
#   stored as a raw :query
# * second argument responds to #each and no third argument is given:
#   treated as the column list; the table name is derived from the class name
# * otherwise: second argument is the table name, third is the column list
#
# @param source_name [Object] passed to DataDuck::Source.source to look up the source
# @param source_table_or_query [Object, nil] query text, column list, or table name
# @param source_columns [Object, nil] column list when a table name is given
# @return [Array<Hash>] the class-level list of source specs
def self.source(source_name, source_table_or_query = nil, source_columns = nil)
  self.sources ||= []

  looks_like_query = source_table_or_query.respond_to?(:to_s) &&
                     source_table_or_query.to_s.downcase.include?('select ')

  source_spec =
    if looks_like_query
      { query: source_table_or_query }
    elsif source_columns.nil? && source_table_or_query.respond_to?(:each)
      { columns: source_table_or_query, table_name: DataDuck::Util.camelcase_to_underscore(self.name) }
    else
      { columns: source_columns, table_name: source_table_or_query.to_s }
    end

  source_spec[:source] = DataDuck::Source.source(source_name)
  self.sources << source_spec
end
.transforms(transformation_name) ⇒ Object
14 15 16 17 |
# File 'lib/dataduck/table.rb', line 14
# Registers a transformation to run during #transform!.
#
# @param transformation_name [Object] name of the instance method that
#   #transform! will invoke (via public_send) with each row
# @return [Array] the class-level actions list
def self.transforms(transformation_name)
  self.actions = [] unless self.actions
  self.actions.push([:transform, transformation_name])
end
.validates(validation_name) ⇒ Object
20 21 22 23 |
# File 'lib/dataduck/table.rb', line 20
# Registers a validation to run during #transform!.
#
# @param validation_name [Object] name of the instance method that
#   #transform! will invoke (via public_send) with each row
# @return [Array] the class-level actions list
def self.validates(validation_name)
  self.actions = [] unless self.actions
  self.actions.push([:validate, validation_name])
end
Instance Method Details
#actions ⇒ Object
47 48 49 |
# File 'lib/dataduck/table.rb', line 47
# Instance-level shortcut to the actions registered on the table's class.
#
# @return [Object] the value of the class-level actions attribute
def actions
  table_class = self.class
  table_class.actions
end
#batch_size ⇒ Object
149 150 151 |
# File 'lib/dataduck/table.rb', line 149
# Number of rows to extract per batch.
# This implementation returns nil; #check_table_valid! skips its checks and
# #etl! stops after a single pass when the value is nil.
#
# @return [nil]
def batch_size
  nil
end
#building_name ⇒ Object
163 164 165 |
# File 'lib/dataduck/table.rb', line 163
# Name of the table that is being built: the staging name when the table is
# fully reloaded, otherwise the table's own name.
#
# @return [Object] the value of #staging_name or #name
def building_name
  if self.should_fully_reload?
    self.staging_name
  else
    self.name
  end
end
#check_table_valid! ⇒ Object
51 52 53 54 55 56 |
# File 'lib/dataduck/table.rb', line 51
# Validates the table's batching configuration. Nothing is checked when
# #batch_size is nil.
#
# Raises ArgumentError (a StandardError) rather than a bare Exception so that
# ordinary `rescue => e` handlers can catch a misconfigured table; callers
# that rescue Exception still catch it.
#
# @return [nil]
# @raise [ArgumentError] if batch_size is not greater than 0
# @raise [ArgumentError] if batch_size is set but #extract_by_column is nil
def check_table_valid!
  return if self.batch_size.nil?

  unless self.batch_size > 0
    raise ArgumentError, "Table #{ self.name }'s batch_size must be > 0"
  end

  if self.extract_by_column.nil?
    raise ArgumentError, "Table #{ self.name } has batch_size defined but no extract_by_column"
  end
end
#distribution_key ⇒ Object
58 59 60 61 62 63 64 |
# File 'lib/dataduck/table.rb', line 58
# Column to use as the distribution key.
#
# @return [String, nil] "id" when the output columns include "id", otherwise nil
def distribution_key
  "id" if self.output_column_names.include?("id")
end
#etl!(destinations) ⇒ Object
66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 |
# File 'lib/dataduck/table.rb', line 66
# Runs extract -> transform -> load for this table against a single destination.
#
# When #batch_size is nil one pass is made. Otherwise passes repeat while each
# extract returns exactly batch_size rows, up to a hard cap of 1,000 batches.
# When #should_fully_reload? is true the destination is asked to drop the
# staging table first and to finish the full reload afterwards.
#
# @param destinations [#length, #first] must contain exactly one destination
# @return [Object, nil] result of finish_fully_reloading_table! on a full
#   reload, otherwise nil
# @raise [ArgumentError] if destinations does not contain exactly one element
def etl!(destinations)
  unless destinations.length == 1
    raise ArgumentError, "DataDuck can only etl to one destination at a time for now."
  end

  self.check_table_valid!
  destination = destinations.first

  destination.drop_staging_table!(self) if self.should_fully_reload?

  1.upto(1_000) do |batch_number|
    self.extract!(destination)
    self.transform!
    destination.load_table!(self)

    # No batching configured: a single pass loads everything.
    break if self.batch_size.nil?

    # A short batch means the source has been exhausted.
    unless self.batch_size == self.data.length
      DataDuck::Logs.info "Finished batch #{ batch_number } (last batch)"
      break
    end

    DataDuck::Logs.info "Finished batch #{ batch_number }, continuing with the next batch"
  end

  self.data = []

  destination.finish_fully_reloading_table!(self) if self.should_fully_reload?
end
#extract!(destination = nil) ⇒ Object
103 104 105 106 107 108 109 110 111 112 113 114 115 |
# File 'lib/dataduck/table.rb', line 103
# Runs the extract query of every registered source and collects the rows
# into #data.
#
# Rows from all sources are accumulated. Previously each source's result
# replaced #data, so with more than one source only the last source's rows
# survived.
#
# @param destination [Object, nil] forwarded to #extract_query
# @return [Array] the extracted rows (also stored in #data)
def extract!(destination = nil)
  DataDuck::Logs.info "Extracting table #{ self.name }"

  self.errors ||= []
  self.data = []

  self.class.sources.each do |source_spec|
    source = source_spec[:source]
    my_query = self.extract_query(source_spec, destination)
    results = source.query(my_query)
    # to_a is a no-op for Arrays and lets any enumerable result be appended.
    self.data.concat(results.to_a)
  end

  self.data
end
#extract_by_column ⇒ Object
153 154 155 156 157 |
# File 'lib/dataduck/table.rb', line 153
# Column used by #extract_query to extract incrementally.
#
# @return [String, nil] 'updated_at' when the output columns include
#   "updated_at", otherwise nil
def extract_by_column
  self.output_column_names.include?("updated_at") ? 'updated_at' : nil
end
#extract_query(source_spec, destination = nil) ⇒ Object
117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 |
# File 'lib/dataduck/table.rb', line 117
# Builds the SQL text used to extract rows from one source.
#
# The base query is the spec's :query when present, otherwise a SELECT of the
# spec's sorted :columns (wrapped in the source's escape character) from its
# :table_name. When #extract_by_column is set:
#
# * if a destination is given and already has the table being built, a WHERE
#   clause restricts rows to those at or after the destination's MAX value
#   of that column;
# * if #batch_size is set, an ORDER BY / LIMIT clause is appended.
#
# `destination` is optional (it defaults to nil, and #extract! may pass nil),
# so it is nil-checked before use; previously a nil destination raised
# NoMethodError whenever #extract_by_column was set.
#
# NOTE(review): the WHERE clause is appended verbatim, so a custom :query that
# already has its own WHERE/ORDER BY presumably yields invalid SQL - confirm.
#
# @param source_spec [Hash] a spec created by .source (:source plus :query, or
#   :columns and :table_name)
# @param destination [Object, nil] object responding to table_names and query
# @return [String] the assembled query
def extract_query(source_spec, destination = nil)
  escape_char = source_spec[:source].escape_char

  base_query = source_spec.has_key?(:query) ? source_spec[:query] :
      "SELECT #{ escape_char }#{ source_spec[:columns].sort.join(escape_char + ',' + escape_char) }#{ escape_char } FROM #{ source_spec[:table_name] }"

  extract_by_clause = ""
  limit_clause = ""

  if self.extract_by_column
    # Only look up the high-water mark when there is a destination to ask.
    if destination && destination.table_names.include?(self.building_name)
      extract_by_value = destination.query("SELECT MAX(#{ self.extract_by_column }) AS val FROM #{ self.building_name }").first
      extract_by_value = extract_by_value.nil? ? nil : extract_by_value[:val]

      if extract_by_value
        extract_by_clause = "WHERE #{ self.extract_by_column } >= '#{ extract_by_value }'"
      end
    end

    limit_clause = self.batch_size ? "ORDER BY #{ self.extract_by_column } LIMIT #{ self.batch_size }" : ""
  end

  [base_query, extract_by_clause, limit_clause].join(' ').strip
end
#indexes ⇒ Object
142 143 144 145 146 147 |
# File 'lib/dataduck/table.rb', line 142
# Columns that should be indexed: "id" and "created_at", each included only
# when it is one of the output columns, in that order.
#
# @return [Array<String>] the subset of ["id", "created_at"] present in the output
def indexes
  ["id", "created_at"].select do |candidate|
    self.output_column_names.include?(candidate)
  end
end
#name ⇒ Object
214 215 216 |
# File 'lib/dataduck/table.rb', line 214
# Table name, derived from the class name.
#
# @return [Object] the result of DataDuck::Util.camelcase_to_underscore
#   applied to this object's class name
def name
  class_name = self.class.name
  DataDuck::Util.camelcase_to_underscore(class_name)
end
#output_column_names ⇒ Object
175 176 177 |
# File 'lib/dataduck/table.rb', line 175
# Sorted names of the output columns, as strings.
#
# Keys are converted to strings before sorting so that a schema mixing String
# and Symbol keys does not raise (Symbols and Strings cannot be compared with
# each other). For all-Symbol or all-String schemas the result is unchanged.
#
# @return [Array<String>] the output_schema keys as sorted strings
def output_column_names
  self.output_schema.keys.map(&:to_s).sort
end
#output_schema ⇒ Object
171 172 173 |
# File 'lib/dataduck/table.rb', line 171
# Output schema declared on the table's class.
#
# @return [Object] the class-level output_schema, or an empty Hash when the
#   class has none
def output_schema
  declared = self.class.output_schema
  declared ? declared : {}
end
#should_fully_reload? ⇒ Boolean
159 160 161 |
# File 'lib/dataduck/table.rb', line 159
# Whether the table is rebuilt from scratch on each ETL run.
# Set to true if you want to fully reload a table with each ETL.
#
# @return [Boolean] false in this implementation
def should_fully_reload?
  false
end
#show ⇒ Object
179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 |
# File 'lib/dataduck/table.rb', line 179
# Prints a human-readable summary of the table to standard output: its name,
# each source (table name or query, plus its columns), and the output schema
# with datatypes aligned in a column.
#
# Specs that .source registers for raw queries have no :columns entry, so a
# missing column list is treated as empty instead of raising on nil.
#
# @return [Object] the output_schema (the return value of each_pair)
def show
  puts "Table #{ self.name }"

  self.class.sources.each do |source_spec|
    puts "\nSources from #{ source_spec[:table_name] || source_spec[:query] } on #{ source_spec[:source].name }"
    (source_spec[:columns] || []).each do |col_name|
      puts " #{ col_name }"
    end
  end

  puts "\nOutputs "
  # Width of the longest column name, used to pad names so datatypes line up.
  num_separators = self.output_schema.keys.map { |key| key.length }.max
  self.output_schema.each_pair do |name, datatype|
    puts " #{ name }#{ ' ' * (num_separators + 2 - name.length) }#{ datatype }"
  end
end
#staging_name ⇒ Object
167 168 169 |
# File 'lib/dataduck/table.rb', line 167
# Name of the staging table: the table's name prefixed with "zz_dataduck_".
#
# @return [String] the staging table name
def staging_name
  table_name = self.name
  "zz_dataduck_#{ table_name }"
end
#transform! ⇒ Object
195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 |
# File 'lib/dataduck/table.rb', line 195
# Applies every registered action, in registration order, to #data.
#
# * :transform actions replace each row in place with the result of calling
#   the named method with that row.
# * :validate actions call the named method with each row and append any
#   non-blank result to #errors.
#
# Actions are invoked with public_send, so the named methods must be public.
# NOTE(review): relies on #blank? being defined for the validation results
# (presumably provided by ActiveSupport) - confirm.
#
# @return [Array] the class-level actions list
def transform!
  DataDuck::Logs.info "Transforming table #{ self.name }"

  self.errors ||= []
  self.class.actions ||= []

  self.class.actions.each do |action_type, action_method_name|
    case action_type
    when :transform
      self.data.map! { |row| self.public_send(action_method_name, row) }
    when :validate
      self.data.each do |row|
        error = self.public_send(action_method_name, row)
        self.errors << error unless error.blank?
      end
    end
  end
end