Class: DataDuck::Table

Inherits:

Object

Object
DataDuck::Table

show all

Defined in: lib/dataduck/table.rb

Direct Known Subclasses

IntegrationTable

Class Attribute Summary collapse

.actions ⇒ Object

Returns the value of attribute actions.
.output_schema ⇒ Object

Returns the value of attribute output_schema.
.sources ⇒ Object

Returns the value of attribute sources.

Instance Attribute Summary collapse

#data ⇒ Object

Returns the value of attribute data.
#errors ⇒ Object

Returns the value of attribute errors.

Class Method Summary collapse

Instance Method Summary collapse

Class Attribute Details

.actions ⇒ `Object`

Returns the value of attribute actions.



8
9
10

# File 'lib/dataduck/table.rb', line 8

# Class-level reader for @actions.
# .transforms and .validates append [:transform, name] / [:validate, name]
# pairs to this list. Callers treat it as possibly nil (`||= []`, `|| []`).
def actions
  @actions
end

.output_schema ⇒ `Object`

Returns the value of attribute output_schema.



7
8
9

# File 'lib/dataduck/table.rb', line 7

# Class-level reader for @output_schema.
# .output initialises it to a Hash on first use and merges column
# definitions into it; it may be nil when .output has not been called.
def output_schema
  @output_schema
end

.sources ⇒ `Object`

Returns the value of attribute sources.



6
7
8

# File 'lib/dataduck/table.rb', line 6

# Class-level reader for @sources.
# .source appends one spec Hash per declaration (keys seen in this file:
# :query or :columns/:table_name, plus :source).
def sources
  @sources
end

Instance Attribute Details

#data ⇒ `Object`

Returns the value of attribute data.



11
12
13

# File 'lib/dataduck/table.rb', line 11

# Reader for @data, the rows this table instance is currently holding.
# #extract! resets it to an Array and appends query results; #transform!
# maps over it in place; #etl! clears it when finished.
def data
  @data
end

#errors ⇒ `Object`

Returns the value of attribute errors.



12
13
14

# File 'lib/dataduck/table.rb', line 12

# Reader for @errors.
# #extract! and #transform! default it to [] when unset, and validation
# actions in #transform! append their non-blank results to it.
def errors
  @errors
end

Class Method Details

.output(schema) ⇒ `Object`

# File 'lib/dataduck/table.rb', line 42

# Declares output columns for this table class.
#
# The class-level schema Hash is created on first use and the given entries
# are merged into it, overwriting entries with the same key.
#
# @param schema [Hash] column name => datatype
# @return [Hash] the class-level schema after merging
def self.output(schema)
  self.output_schema = {} unless self.output_schema
  self.output_schema.update(schema)
end

.source(source_name, source_table_or_query = nil, source_columns = nil) ⇒ `Object`

# File 'lib/dataduck/table.rb', line 26

# Declares a source for this table class and appends its spec to .sources.
#
# The second argument is interpreted in this order:
# * its string form contains "select " (case-insensitive) => a raw query
# * it responds to #each and no source_columns were given => a column list,
#   with the table name derived from the class name
# * otherwise => a table name, with source_columns as the column list
#
# @param source_name the name handed to DataDuck::Source.source
# @param source_table_or_query [String, Symbol, #each, nil]
# @param source_columns [Array, nil]
# @return [Array<Hash>] the class-level list of source specs
def self.source(source_name, source_table_or_query = nil, source_columns = nil)
  self.sources = [] unless self.sources

  looks_like_query = source_table_or_query.respond_to?(:to_s) &&
                     source_table_or_query.to_s.downcase.include?('select ')

  source_spec =
    if looks_like_query
      { query: source_table_or_query }
    elsif source_columns.nil? && source_table_or_query.respond_to?(:each)
      { columns: source_table_or_query, table_name: DataDuck::Util.camelcase_to_underscore(self.name) }
    else
      { columns: source_columns, table_name: source_table_or_query.to_s }
    end

  source_spec[:source] = DataDuck::Source.source(source_name)
  self.sources.push(source_spec)
end

.transforms(transformation_name) ⇒ `Object`

# File 'lib/dataduck/table.rb', line 14

# Registers a transform action on this table class.
#
# @param transformation_name [Symbol] method that #transform! will call with
#   each row, replacing the row with the return value
# @return [Array] the class-level action list
def self.transforms(transformation_name)
  self.actions = [] unless self.actions
  self.actions.push([:transform, transformation_name])
end

.validates(validation_name) ⇒ `Object`

# File 'lib/dataduck/table.rb', line 20

# Registers a validation action on this table class.
#
# @param validation_name [Symbol] method that #transform! will call with each
#   row; a non-blank return value is recorded in #errors
# @return [Array] the class-level action list
def self.validates(validation_name)
  self.actions = [] unless self.actions
  self.actions.push([:validate, validation_name])
end

Instance Method Details

#actions ⇒ `Object`

# File 'lib/dataduck/table.rb', line 47

# Collects the actions declared on this instance's class and on every
# ancestor class strictly below Table, most-derived class first.
#
# @return [Array<Array>] a new array of [type, method_name] pairs
def actions
  self.class.ancestors
      .select { |ancestor| ancestor.is_a?(Class) && ancestor < Table }
      .flat_map { |klass| klass.actions || [] }
end

#batch_size ⇒ `Object`



198
199
200

# File 'lib/dataduck/table.rb', line 198

# Rows per extract batch; nil (this default) means no batching.
# With nil, #etl! does a single extract/transform/load pass and #limit_clause
# emits no ORDER BY/LIMIT. When a non-nil value is returned,
# #check_table_valid! requires it to be > 0 and #extract_by_column to be set.
def batch_size
  nil
end

#building_name ⇒ `Object`



218
219
220

# File 'lib/dataduck/table.rb', line 218

# Name of the table that data is built into: the staging name when the table
# is fully reloaded, otherwise the regular table name.
#
# @return [String]
def building_name
  return self.staging_name if self.should_fully_reload?

  self.name
end

#check_table_valid! ⇒ `Object`

# File 'lib/dataduck/table.rb', line 58

# Validates the batching configuration; a table without a batch size is
# always valid.
#
# @raise [RuntimeError] when batch_size is not > 0, or when batch_size is set
#   but extract_by_column is nil
# @return [nil]
def check_table_valid!
  return if self.batch_size.nil?

  unless self.batch_size > 0
    raise "Table #{ self.name }'s batch_size must be > 0"
  end

  if self.extract_by_column.nil?
    raise "Table #{ self.name } has batch_size defined but no extract_by_column"
  end
end

#distribution_key ⇒ `Object`

# File 'lib/dataduck/table.rb', line 65

# Default distribution key: "id" when the output schema has an "id" column,
# otherwise nil.
#
# @return [String, nil]
def distribution_key
  self.output_column_names.include?("id") ? "id" : nil
end

#etl!(destinations, options = {}) ⇒ `Object`

# File 'lib/dataduck/table.rb', line 73

# Runs extract -> transform -> load for this table against one destination,
# repeating in batches while #batch_size is set and the source keeps
# returning full batches (capped at 1,000 batches).
#
# @param destinations [Array] must contain exactly one destination
# @param options [Hash] :dates is set to [Date.today] in place when absent;
#   the hash is passed on to #extract! and #postprocess!
# @raise [ArgumentError] unless exactly one destination is given
# @raise [RuntimeError] from #check_table_valid! on bad batch configuration
# @return the result of #postprocess! when data was processed, otherwise the
#   result of the "No data extracted" log call
def etl!(destinations, options = {})
  if destinations.length != 1
    raise ArgumentError, "DataDuck can only etl to one destination at a time for now."
  end

  options[:dates] = [Date.today] if options[:dates].nil?

  self.check_table_valid!

  destination = destinations.first
  destination.drop_staging_table!(self) if self.should_fully_reload?

  data_processed = false
  batch_number = 0
  while batch_number < 1_000
    batch_number += 1
    self.extract!(destination, options)

    # Remember how many rows the source returned BEFORE transforms run.
    # Transforms may turn rows into nil (removed by compact! below), so the
    # post-transform size says nothing about whether the source has more rows.
    extracted_count = self.data.length

    if extracted_count > 0
      self.transform!
      self.data.compact!
      self.load!(destination) if self.data.length > 0
      data_processed = true
    end

    break if self.batch_size.nil?

    # Continue only when the extract filled a whole batch AND something was
    # loaded. With the default #extract_query the next lower bound comes from
    # the destination, so a batch that loaded nothing would just be fetched
    # again.
    if self.batch_size == extracted_count && self.data.length > 0
      DataDuck::Logs.info "Finished batch #{ batch_number }, continuing with the next batch"
    else
      DataDuck::Logs.info "Finished batch #{ batch_number } (last batch)"
      break
    end
  end

  self.data = []

  if data_processed
    destination.finish_fully_reloading_table!(self) if self.should_fully_reload?

    self.postprocess!(destination, options)
  else
    DataDuck::Logs.info "No data extracted for table #{ self.name }"
  end
end

#extract!(destination = nil, options = {}) ⇒ `Object`

# File 'lib/dataduck/table.rb', line 127

# Replaces #data with the rows returned by every source declared on this
# table's class, in declaration order.
#
# @param destination passed through to #extract_query (may be nil)
# @param options [Hash] accepted for interface compatibility; not used here
# @return [Array] the freshly populated #data
def extract!(destination = nil, options = {})
  DataDuck::Logs.info "Extracting table #{ self.name }"

  self.errors = [] unless self.errors
  self.data = []
  self.class.sources.each do |spec|
    rows = spec[:source].query(self.extract_query(spec, destination))
    self.data.concat(rows)
  end
  self.data
end

#extract_by_clause(value) ⇒ `Object`

# File 'lib/dataduck/table.rb', line 167

# Builds the incremental-extract filter on #extract_by_column.
#
# @param value the lower bound (inclusive); nil/false means "no filter"
# @return [String] a WHERE clause, or "" when no value is given
def extract_by_clause(value)
  return "" unless value

  "WHERE #{ self.extract_by_column } >= '#{ value }'"
end

#extract_by_column ⇒ `Object`

# File 'lib/dataduck/table.rb', line 202

# Column used for incremental extraction: "updated_at" when the output schema
# has such a column, otherwise nil.
#
# @return [String, nil]
def extract_by_column
  self.output_column_names.include?("updated_at") ? 'updated_at' : nil
end

#extract_query(source_spec, destination = nil) ⇒ `Object`

# File 'lib/dataduck/table.rb', line 141

# Builds the SQL used to extract rows for one source spec.
#
# The base query is either the spec's raw :query or a SELECT of its sorted,
# escaped :columns from :table_name. When #extract_by_column is set and the
# destination already contains the table being built, the destination's
# current MAX of that column becomes a lower-bound filter. #limit_clause is
# appended last.
#
# @param source_spec [Hash] needs :source (responding to #escape_char) and
#   either :query or :columns + :table_name
# @param destination object responding to #table_names and #query, or nil;
#   with nil no incremental filter is added
# @return [String] the query text
def extract_query(source_spec, destination = nil)
  escape_char = source_spec[:source].escape_char

  base_query = source_spec.has_key?(:query) ? source_spec[:query] :
     "SELECT #{ escape_char }#{ source_spec[:columns].sort.join(escape_char + ',' + escape_char) }#{ escape_char } FROM #{ source_spec[:table_name] }"

  extract_part = ""
  limit_part = self.limit_clause

  extract_column = self.extract_by_column
  # destination is optional (defaults to nil here and in #extract!), so it
  # must be checked before it is asked for its tables.
  if extract_column && destination && destination.table_names.include?(self.building_name)
    # The destination column is unqualified even if the source column is "t.col".
    bare_column = extract_column.include?(".") ? extract_column.split(".").last : extract_column
    max_row = destination.query("SELECT MAX(#{ bare_column }) AS val FROM #{ self.building_name }").first
    extract_part = self.extract_by_clause(max_row.nil? ? nil : max_row[:val])
  end

  # If the text after the last "from" already has a WHERE, chain with AND.
  # Non-mutating gsub: the clause may come from an overridden
  # #extract_by_clause and should not be modified in place.
  if base_query.downcase.split("from").last.to_s.include?(' where ')
    extract_part = extract_part.gsub('WHERE ', 'AND ')
  end

  [base_query, extract_part, limit_part].join(' ').strip
end

#identify_by_columns ⇒ `Object`

# File 'lib/dataduck/table.rb', line 208

# Columns that identify a row: ["id"] when the output schema has an "id"
# column, otherwise an empty list.
#
# @return [Array<String>]
def identify_by_columns
  if self.output_column_names.include?("id")
    ["id"]
  else
    []
  end
end

#include_with_all? ⇒ `Boolean`

Returns:

(Boolean)



187
188
189

# File 'lib/dataduck/table.rb', line 187

# Always true in this base implementation.
# NOTE(review): presumably decides whether this table takes part when all
# tables are processed together; the caller is not visible here, so confirm.
def include_with_all?
  true
end

#indexes ⇒ `Object`

# File 'lib/dataduck/table.rb', line 191

# Default index columns: whichever of "id" and "created_at" exist in the
# output schema, in that order.
#
# @return [Array<String>]
def indexes
  %w[id created_at].select { |column| self.output_column_names.include?(column) }
end

#limit_clause ⇒ `Object`

# File 'lib/dataduck/table.rb', line 175

# ORDER BY/LIMIT suffix used for batched extraction. Only produced when both
# an extract-by column and a batch size are configured.
#
# @return [String] the clause, or "" when batching does not apply
def limit_clause
  return "" unless self.extract_by_column && self.batch_size

  "ORDER BY #{ self.extract_by_column } LIMIT #{ self.batch_size }"
end

#load!(destination) ⇒ `Object`



183
184
185

# File 'lib/dataduck/table.rb', line 183

# Loads this table's current data by delegating to the destination.
#
# @param destination object responding to #load_table!
# @return whatever destination.load_table!(self) returns
def load!(destination)
  destination.load_table!(self)
end

#name ⇒ `Object`

# File 'lib/dataduck/table.rb', line 276

# Table name derived from the class name: the class name is underscored, a
# leading "data_duck/" namespace is reduced to its last path segment, and
# #prefix is prepended.
#
# @return [String]
def name
  underscored = DataDuck::Util.camelcase_to_underscore(self.class.name)
  underscored = underscored.split("/").last if underscored.start_with?("data_duck/")

  self.prefix + underscored
end

#output_column_names ⇒ `Object`



230
231
232

# File 'lib/dataduck/table.rb', line 230

# Names of the output columns, sorted by schema key and converted to strings.
#
# @return [Array<String>]
def output_column_names
  sorted_keys = self.output_schema.keys.sort
  sorted_keys.map { |key| key.to_s }
end

#output_schema ⇒ `Object`



226
227
228

# File 'lib/dataduck/table.rb', line 226

# Output schema for this instance: the schema declared on its own class or,
# failing that, on the nearest ancestor class that declares one.
#
# The whole superclass chain is searched (as #actions does), not just the
# direct parent, and the search stops at the first class that does not expose
# a public output_schema instead of raising NoMethodError.
#
# @return [Hash] column name => datatype; {} when nothing is declared
def output_schema
  klass = self.class
  while klass.respond_to?(:output_schema)
    declared = klass.output_schema
    return declared if declared

    klass = klass.superclass
  end

  {}
end

#postprocess!(destination, options = {}) ⇒ `Object`



234
235
236

# File 'lib/dataduck/table.rb', line 234

# Post-load hook, called by #etl! after data has been processed.
# Delegates to destination.postprocess!(self); the options argument is
# accepted but not used by this implementation.
#
# @param destination object responding to #postprocess!
# @param options [Hash] unused here
def postprocess!(destination, options = {})
  destination.postprocess!(self)
end

#prefix ⇒ `Object`



285
286
287

# File 'lib/dataduck/table.rb', line 285

# String that #name prepends to the underscored class name.
# Empty by default, so the table name is just the underscored class name.
def prefix
  ""
end

#recreate!(destination) ⇒ `Object`



238
239
240

# File 'lib/dataduck/table.rb', line 238

# Recreates this table by delegating to the destination.
#
# @param destination object responding to #recreate_table!
# @return whatever destination.recreate_table!(self) returns
def recreate!(destination)
  destination.recreate_table!(self)
end

#should_fully_reload? ⇒ `Boolean`

Returns:

(Boolean)



214
215
216

# File 'lib/dataduck/table.rb', line 214

# Whether each ETL run rebuilds the table from scratch (false by default).
# When true, #etl! drops the staging table first, #building_name returns
# #staging_name, and destination.finish_fully_reloading_table! is called once
# data has been processed.
def should_fully_reload?
  false # Set to true if you want to fully reload a table with each ETL
end

#show ⇒ `Object`

# File 'lib/dataduck/table.rb', line 242

# Prints a human-readable summary of this table to standard output: its
# name, each declared source with its columns, and the output schema with
# datatypes aligned in a column.
#
# Query-based sources have no :columns, a class may declare no sources, and
# the schema may be empty; each of those prints its heading with nothing
# under it instead of raising.
#
# @return [Hash] the output schema (result of each_pair)
def show
  puts "Table #{ self.name }"
  (self.class.sources || []).each do |source_spec|
    puts "\nSources from #{ source_spec[:table_name] || source_spec[:query] } on #{ source_spec[:source].name }"
    # :columns is nil for sources declared with a raw query.
    (source_spec[:columns] || []).each do |col_name|
      puts "  #{ col_name }"
    end
  end

  puts "\nOutputs "
  schema = self.output_schema
  # max is nil for an empty schema; fall back to 0 so the padding math works.
  widest_name = schema.keys.map { |key| key.length }.max || 0
  schema.each_pair do |name, datatype|
    puts "  #{ name }#{ ' ' * (widest_name + 2 - name.length) }#{ datatype }"
  end
end

#staging_name ⇒ `Object`



222
223
224

# File 'lib/dataduck/table.rb', line 222

# Name of the staging table: #name with a "zz_dataduck_" prefix.
# #building_name returns this when #should_fully_reload? is true.
def staging_name
  "zz_dataduck_#{ self.name }"
end

#transform! ⇒ `Object`

# File 'lib/dataduck/table.rb', line 258

# Applies every action returned by #actions to #data, in order.
#
# * :transform actions replace each row in place with the result of calling
#   the named public method with the row.
# * :validate actions call the named public method with each row and append
#   any non-blank result to #errors.
# Actions of any other type are ignored.
#
# @return [Array] the list of actions that was iterated
def transform!
  DataDuck::Logs.info "Transforming table #{ self.name }"

  self.errors = [] unless self.errors
  self.actions.each do |action|
    kind = action[0]
    method_name = action[1]

    case kind
    when :transform
      self.data.map! { |row| self.public_send(method_name, row) }
    when :validate
      self.data.each do |row|
        problem = self.public_send(method_name, row)
        self.errors << problem unless problem.blank?
      end
    end
  end
end

Class: DataDuck::Table

Direct Known Subclasses

Class Attribute Summary collapse

Instance Attribute Summary collapse

Class Method Summary collapse

Instance Method Summary collapse

Class Attribute Details

.actions ⇒ Object

.output_schema ⇒ Object

.sources ⇒ Object

Instance Attribute Details

#data ⇒ Object

#errors ⇒ Object

Class Method Details

.output(schema) ⇒ Object

.source(source_name, source_table_or_query = nil, source_columns = nil) ⇒ Object

.transforms(transformation_name) ⇒ Object

.validates(validation_name) ⇒ Object

Instance Method Details

#actions ⇒ Object

#batch_size ⇒ Object

#building_name ⇒ Object

#check_table_valid! ⇒ Object

#distribution_key ⇒ Object

#etl!(destinations, options = {}) ⇒ Object

#extract!(destination = nil, options = {}) ⇒ Object

#extract_by_clause(value) ⇒ Object

#extract_by_column ⇒ Object

#extract_query(source_spec, destination = nil) ⇒ Object

#identify_by_columns ⇒ Object

#include_with_all? ⇒ Boolean

#indexes ⇒ Object

#limit_clause ⇒ Object

#load!(destination) ⇒ Object

#name ⇒ Object

#output_column_names ⇒ Object

#output_schema ⇒ Object

#postprocess!(destination, options = {}) ⇒ Object

#prefix ⇒ Object

#recreate!(destination) ⇒ Object

#should_fully_reload? ⇒ Boolean

#show ⇒ Object

#staging_name ⇒ Object

#transform! ⇒ Object