Class: DataDuck::Table

Inherits:

Object

Object
DataDuck::Table

show all

Defined in: lib/dataduck/table.rb

Class Attribute Summary collapse

.actions ⇒ Object

Returns the value of attribute actions.
.output_schema ⇒ Object

Returns the value of attribute output_schema.
.sources ⇒ Object

Returns the value of attribute sources.

Instance Attribute Summary collapse

#data ⇒ Object

Returns the value of attribute data.
#errors ⇒ Object

Returns the value of attribute errors.

Class Method Summary collapse

Instance Method Summary collapse

Class Attribute Details

.actions ⇒ `Object`

Returns the value of attribute actions.



8
9
10

# File 'lib/dataduck/table.rb', line 8

# Reader for the class-level +actions+ attribute.
#
# The +transforms+ and +validates+ class methods append
# +[:transform, name]+ / +[:validate, name]+ pairs to this list.
#
# @return [Object] whatever is currently stored in +@actions+
def actions
  @actions
end

.output_schema ⇒ `Object`

Returns the value of attribute output_schema.



7
8
9

# File 'lib/dataduck/table.rb', line 7

# Reader for the class-level +output_schema+ attribute.
#
# The +output+ class method merges schema hashes into this value.
#
# @return [Object] whatever is currently stored in +@output_schema+
def output_schema
  @output_schema
end

.sources ⇒ `Object`

Returns the value of attribute sources.



6
7
8

# File 'lib/dataduck/table.rb', line 6

# Reader for the class-level +sources+ attribute.
#
# The +source+ class method appends one source-spec hash per declaration.
#
# @return [Object] whatever is currently stored in +@sources+
def sources
  @sources
end

Instance Attribute Details

#data ⇒ `Object`

Returns the value of attribute data.



11
12
13

# File 'lib/dataduck/table.rb', line 11

# Reader for the instance's +data+ attribute.
#
# +extract!+ assigns the query results here and +transform!+ mutates them.
#
# @return [Object] whatever is currently stored in +@data+
def data
  @data
end

#errors ⇒ `Object`

Returns the value of attribute errors.



12
13
14

# File 'lib/dataduck/table.rb', line 12

# Reader for the instance's +errors+ attribute.
#
# +transform!+ appends the non-blank results of validation actions here.
#
# @return [Object] whatever is currently stored in +@errors+
def errors
  @errors
end

Class Method Details

.output(schema) ⇒ `Object`

# File 'lib/dataduck/table.rb', line 42

# Declares (part of) the table's output schema.
#
# The class-level schema is created on first use and every call merges
# +schema+ into it, so later declarations override earlier keys.
#
# @param schema [Hash] entries to merge into the output schema
# @return [Hash] the accumulated output schema
def self.output(schema)
  declared = (self.output_schema ||= {})
  declared.merge!(schema)
end

.source(source_name, source_table_or_query = nil, source_columns = nil) ⇒ `Object`

# File 'lib/dataduck/table.rb', line 26

# Declares a source for this table and appends its spec to +sources+.
#
# The second argument is interpreted in this order:
# 1. its string form contains "select " (case-insensitive): a raw query,
#    stored under +:query+;
# 2. +source_columns+ is nil and it responds to +each+: a column list, with
#    the table name derived from this class's name;
# 3. otherwise: a table name, with +source_columns+ as the column list.
#
# @param source_name [Object] passed to +DataDuck::Source.source+
# @param source_table_or_query [Object, nil] query, column list or table name
# @param source_columns [Object, nil] columns when a table name is given
# @return [Array<Hash>] the accumulated list of source specs
def self.source(source_name, source_table_or_query = nil, source_columns = nil)
  self.sources ||= []

  spec =
    if source_table_or_query.respond_to?(:to_s) && source_table_or_query.to_s.downcase.include?('select ')
      { query: source_table_or_query }
    elsif source_columns.nil? && source_table_or_query.respond_to?(:each)
      { columns: source_table_or_query, table_name: DataDuck::Util.camelcase_to_underscore(self.name) }
    else
      { columns: source_columns, table_name: source_table_or_query.to_s }
    end

  self.sources << spec.merge(source: DataDuck::Source.source(source_name))
end

.transforms(transformation_name) ⇒ `Object`

# File 'lib/dataduck/table.rb', line 14

# Registers a transformation step for this table.
#
# Appends +[:transform, transformation_name]+ to the class-level actions
# list, creating the list on first use.
#
# @param transformation_name [Object] name of the method +transform!+ will call per row
# @return [Array] the accumulated actions list
def self.transforms(transformation_name)
  (self.actions ||= []) << [:transform, transformation_name]
end

.validates(validation_name) ⇒ `Object`

# File 'lib/dataduck/table.rb', line 20

# Registers a validation step for this table.
#
# Appends +[:validate, validation_name]+ to the class-level actions list,
# creating the list on first use.
#
# @param validation_name [Object] name of the method +transform!+ will call per row
# @return [Array] the accumulated actions list
def self.validates(validation_name)
  (self.actions ||= []) << [:validate, validation_name]
end

Instance Method Details

#actions ⇒ `Object`



47
48
49

# File 'lib/dataduck/table.rb', line 47

# Instance-level shortcut to the actions registered on the table's class.
#
# @return [Object] the value of the class-level +actions+ attribute
def actions
  table_class = self.class
  table_class.actions
end

#batch_size ⇒ `Object`



166
167
168

# File 'lib/dataduck/table.rb', line 166

# Number of rows to extract per batch.
#
# The base implementation returns nil, which +etl!+ treats as "run a single
# pass" and +limit_clause+ treats as "no ORDER BY / LIMIT".
# NOTE(review): presumably meant to be overridden by subclasses — confirm.
#
# @return [nil]
def batch_size
  nil
end

#building_name ⇒ `Object`



180
181
182

# File 'lib/dataduck/table.rb', line 180

# Name of the table being written to during an ETL run.
#
# @return [Object] +staging_name+ when the table is fully reloaded,
#   otherwise +name+
def building_name
  return staging_name if should_fully_reload?

  name
end

#check_table_valid! ⇒ `Object`

# File 'lib/dataduck/table.rb', line 51

# Sanity-checks the batching configuration.
#
# Nothing is checked when +batch_size+ is nil. Otherwise +batch_size+ must
# be greater than zero and +extract_by_column+ must not be nil.
#
# @raise [Exception] when either batching requirement is violated
# @return [nil]
def check_table_valid!
  return if batch_size.nil?

  raise Exception, "Table #{ name }'s batch_size must be > 0" unless batch_size > 0
  raise Exception, "Table #{ name } has batch_size defined but no extract_by_column" if extract_by_column.nil?
end

#distribution_key ⇒ `Object`

# File 'lib/dataduck/table.rb', line 58

# Column to use as the distribution key.
#
# @return [String, nil] "id" when the output schema has an "id" column,
#   nil otherwise
def distribution_key
  "id" if output_column_names.include?("id")
end

#etl!(destinations) ⇒ `Object`

# File 'lib/dataduck/table.rb', line 66

# Runs extract -> transform -> load for this table against one destination.
#
# When +batch_size+ is nil a single pass is made. Otherwise passes repeat
# while a pass yields exactly +batch_size+ rows, up to 1,000 passes; the
# loop then ends without raising. For fully-reloaded tables the staging
# table is dropped first and the reload is finalized at the end.
#
# @param destinations [#length, #first] must contain exactly one destination
# @raise [ArgumentError] when +destinations+ does not hold exactly one entry
# @return [Object, nil] result of +finish_fully_reloading_table!+ for
#   fully-reloaded tables, nil otherwise
def etl!(destinations)
  unless destinations.length == 1
    raise ArgumentError, "DataDuck can only etl to one destination at a time for now."
  end

  check_table_valid!
  destination = destinations.first
  destination.drop_staging_table!(self) if should_fully_reload?

  1.upto(1_000) do |batch_number|
    extract!(destination)
    transform!
    load!(destination)

    # Unbatched tables are done after a single pass.
    break if batch_size.nil?

    # A short batch means the source has been exhausted.
    unless batch_size == data.length
      DataDuck::Logs.info "Finished batch #{ batch_number } (last batch)"
      break
    end

    DataDuck::Logs.info "Finished batch #{ batch_number }, continuing with the next batch"
  end

  self.data = []
  destination.finish_fully_reloading_table!(self) if should_fully_reload?
end

#extract!(destination = nil) ⇒ `Object`

# File 'lib/dataduck/table.rb', line 103

# Pulls rows from the table's declared sources into +data+.
#
# Initializes +errors+ if unset, clears +data+, then queries every source
# with the SQL built by +extract_query+. Each source's result replaces the
# previous one, so with several sources only the last result is kept.
#
# @param destination [Object, nil] forwarded to +extract_query+
# @return [Object] the extracted data (an empty array when there are no sources)
def extract!(destination = nil)
  DataDuck::Logs.info "Extracting table #{ name }"

  self.errors ||= []
  self.data = []
  self.class.sources.each do |spec|
    self.data = spec[:source].query(extract_query(spec, destination))
  end
  data
end

#extract_by_clause(value) ⇒ `Object`

# File 'lib/dataduck/table.rb', line 139

# Builds the WHERE clause that restricts extraction to rows at or after
# +value+ in the +extract_by_column+.
#
# +value+ is interpolated into the SQL as a single-quoted literal without
# any escaping.
#
# @param value [Object, nil] lower bound; a falsy value disables the filter
# @return [String] the WHERE clause, or "" when +value+ is falsy
def extract_by_clause(value)
  return "" unless value

  "WHERE #{ extract_by_column } >= '#{ value }'"
end

#extract_by_column ⇒ `Object`

# File 'lib/dataduck/table.rb', line 170

# Column used to extract only rows newer than what was already loaded.
#
# @return [String, nil] 'updated_at' when the output schema has that
#   column, nil otherwise
def extract_by_column
  'updated_at' if output_column_names.include?("updated_at")
end

#extract_query(source_spec, destination = nil) ⇒ `Object`

# File 'lib/dataduck/table.rb', line 117

# Builds the SQL used to extract rows from one source.
#
# The base query is the spec's +:query+ when present; otherwise a SELECT of
# the spec's columns (sorted, wrapped in the source's escape character)
# from the spec's +:table_name+.
#
# When +extract_by_column+ is set and the destination already contains the
# table being built, the destination is asked for the current MAX of that
# column (any "table." prefix is stripped) and the result is handed to
# +extract_by_clause+. +limit_clause+ is always appended last.
#
# @param source_spec [Hash] a spec as built by the +source+ class method;
#   must contain +:source+ and either +:query+ or +:columns+/+:table_name+
# @param destination [Object, nil] destination to inspect for already
#   loaded rows; when nil no incremental filter is added
# @return [String] the assembled query, stripped of surrounding whitespace
def extract_query(source_spec, destination = nil)
  escape_char = source_spec[:source].escape_char

  base_query =
    if source_spec.has_key?(:query)
      source_spec[:query]
    else
      "SELECT #{ escape_char }#{ source_spec[:columns].sort.join(escape_char + ',' + escape_char) }#{ escape_char } FROM #{ source_spec[:table_name] }"
    end

  extract_part = ""
  limit_part = self.limit_clause

  # destination is optional (defaults to nil); without one there is nothing
  # to compare against, so the incremental filter is simply skipped instead
  # of failing with NoMethodError on nil.
  if self.extract_by_column && destination && destination.table_names.include?(self.building_name)
    bare_column = self.extract_by_column.split(".").last
    newest_row = destination.query("SELECT MAX(#{ bare_column }) AS val FROM #{ self.building_name }").first
    newest_value = newest_row.nil? ? nil : newest_row[:val]

    extract_part = self.extract_by_clause(newest_value)
  end

  [base_query, extract_part, limit_part].join(' ').strip
end

#indexes ⇒ `Object`

# File 'lib/dataduck/table.rb', line 159

# Columns that should be indexed.
#
# @return [Array<String>] those of "id" and "created_at" (in that order)
#   that are present in the output schema
def indexes
  %w[id created_at].select { |column| output_column_names.include?(column) }
end

#limit_clause ⇒ `Object`

# File 'lib/dataduck/table.rb', line 147

# Builds the ORDER BY / LIMIT suffix used for batched extraction.
#
# @return [String] the clause when both +extract_by_column+ and
#   +batch_size+ are set, "" otherwise
def limit_clause
  return "" unless extract_by_column && batch_size

  "ORDER BY #{ extract_by_column } LIMIT #{ batch_size }"
end

#load!(destination) ⇒ `Object`



155
156
157

# File 'lib/dataduck/table.rb', line 155

# Loads this table into the destination by delegating to the
# destination's +load_table!+, passing the table itself.
#
# @param destination [#load_table!] where the table is written
# @return [Object] whatever +destination.load_table!+ returns
def load!(destination)
  destination.load_table!(self)
end

#name ⇒ `Object`



235
236
237

# File 'lib/dataduck/table.rb', line 235

# Table name, derived from the class name via
# +DataDuck::Util.camelcase_to_underscore+.
#
# @return [Object] the converted class name
def name
  class_name = self.class.name
  DataDuck::Util.camelcase_to_underscore(class_name)
end

#output_column_names ⇒ `Object`



192
193
194

# File 'lib/dataduck/table.rb', line 192

# Names of the output schema's columns.
#
# Keys are sorted first and converted to strings afterwards.
#
# @return [Array<String>] sorted column names
def output_column_names
  sorted_keys = output_schema.keys.sort
  sorted_keys.map { |key| key.to_s }
end

#output_schema ⇒ `Object`



188
189
190

# File 'lib/dataduck/table.rb', line 188

# Output schema declared on the table's class.
#
# @return [Object] the class-level +output_schema+, or a new empty hash
#   when none has been declared
def output_schema
  declared = self.class.output_schema
  return declared if declared

  {}
end

#recreate!(destination) ⇒ `Object`



196
197
198

# File 'lib/dataduck/table.rb', line 196

# Recreates this table in the destination by delegating to the
# destination's +recreate_table!+, passing the table itself.
#
# @param destination [#recreate_table!] where the table lives
# @return [Object] whatever +destination.recreate_table!+ returns
def recreate!(destination)
  destination.recreate_table!(self)
end

#should_fully_reload? ⇒ `Boolean`

Returns:

(Boolean)



176
177
178

# File 'lib/dataduck/table.rb', line 176

# Whether every ETL run rebuilds the table from scratch.
#
# When true, +building_name+ switches to the staging table and +etl!+ drops
# the staging table first and finalizes the reload at the end.
#
# @return [Boolean] false in this base implementation
def should_fully_reload?
  false # Return true instead to fully reload the table on every ETL run
end

#show ⇒ `Object`

# File 'lib/dataduck/table.rb', line 200

# Prints a human-readable summary of the table to standard output: its
# name, every declared source with its columns, and the output schema with
# the datatypes aligned in one column.
#
# Sources declared as raw queries carry no +:columns+ entry and a table
# may have no sources at all; both cases are printed without column lines
# rather than failing.
#
# @return [Object] the result of iterating the output schema
def show
  puts "Table #{ self.name }"
  (self.class.sources || []).each do |source_spec|
    puts "\nSources from #{ source_spec[:table_name] || source_spec[:query] } on #{ source_spec[:source].name }"
    # Query-style sources have no :columns key.
    (source_spec[:columns] || []).each do |col_name|
      puts "  #{ col_name }"
    end
  end

  puts "\nOutputs "
  # Width of the longest column name; nil for an empty schema, in which
  # case the loop below does not run.
  num_separators = self.output_schema.keys.map { |key| key.length }.max
  self.output_schema.each_pair do |column, datatype|
    puts "  #{ column }#{ ' ' * (num_separators + 2 - column.length) }#{ datatype }"
  end
end

#staging_name ⇒ `Object`



184
185
186

# File 'lib/dataduck/table.rb', line 184

# Name of the staging table used while fully reloading this table.
#
# @return [String] the table name prefixed with "zz_dataduck_"
def staging_name
  table_name = name
  "zz_dataduck_#{ table_name }"
end

#transform! ⇒ `Object`

# File 'lib/dataduck/table.rb', line 216

# Applies the class's registered actions to +data+, in registration order.
#
# * +:transform+ actions replace every row, in place, with the result of
#   calling the named public method with the row.
# * +:validate+ actions call the named public method with every row and
#   collect each non-blank result in +errors+.
#
# +errors+ and the class-level actions list are initialized when unset.
# Relies on +blank?+ being defined on the validation results.
#
# @return [Array] the class-level actions list
def transform!
  DataDuck::Logs.info "Transforming table #{ name }"

  self.errors ||= []
  self.class.actions ||= []
  self.class.actions.each do |(kind, method_name)|
    case kind
    when :transform
      data.map! { |row| public_send(method_name, row) }
    when :validate
      data.each do |row|
        problem = public_send(method_name, row)
        errors << problem unless problem.blank?
      end
    end
  end
end

Class: DataDuck::Table

Class Attribute Summary collapse

Instance Attribute Summary collapse

Class Method Summary collapse

Instance Method Summary collapse

Class Attribute Details

.actions ⇒ Object

.output_schema ⇒ Object

.sources ⇒ Object

Instance Attribute Details

#data ⇒ Object

#errors ⇒ Object

Class Method Details

.output(schema) ⇒ Object

.source(source_name, source_table_or_query = nil, source_columns = nil) ⇒ Object

.transforms(transformation_name) ⇒ Object

.validates(validation_name) ⇒ Object

Instance Method Details

#actions ⇒ Object

#batch_size ⇒ Object

#building_name ⇒ Object

#check_table_valid! ⇒ Object

#distribution_key ⇒ Object

#etl!(destinations) ⇒ Object

#extract!(destination = nil) ⇒ Object

#extract_by_clause(value) ⇒ Object

#extract_by_column ⇒ Object

#extract_query(source_spec, destination = nil) ⇒ Object

#indexes ⇒ Object

#limit_clause ⇒ Object

#load!(destination) ⇒ Object

#name ⇒ Object

#output_column_names ⇒ Object

#output_schema ⇒ Object

#recreate!(destination) ⇒ Object

#should_fully_reload? ⇒ Boolean

#show ⇒ Object

#staging_name ⇒ Object

#transform! ⇒ Object