Module: Datamancer

Defined in:
lib/datamancer.rb,
lib/datamancer/load.rb,
lib/datamancer/extract.rb,
lib/datamancer/version.rb,
lib/datamancer/transform.rb

Defined Under Namespace

Classes: ExistingField, MissingField

Constant Summary collapse

VERSION =
'0.1.0'

Instance Method Summary collapse

Instance Method Details

#add(left, right) ⇒ Object



3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
# File 'lib/datamancer/transform.rb', line 3

def add left, right
  first_row = left.first.merge right.first

  keys = first_row.keys

  valores_por_defecto = {}

  keys.each do |key|
    valores_por_defecto[key] = case first_row[key]
                               when String then ''
                               when Numeric then 0
                               else nil end
  end

  output = []

  (left + right).each do |input_row|

    output_row = {}

    keys.each do |key|
      output_row[key] = input_row[key] || valores_por_defecto[key]
    end

    output << output_row
  end

  output
end

#aggregate(input) ⇒ Object



195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
# File 'lib/datamancer/transform.rb', line 195

def aggregate input

  define_singleton_method :dim do |name|
    name = name.to_sym
    @dimensions[name] = @row[name]
  end
  
  define_singleton_method :fact do |name|
    name = name.to_sym
    @facts[name] = @row[name]
  end

  aggregated_input = Hash.new { |hash, key| hash[key] = Hash.new }

  input.each_with_index do |row, row_number|
    @row = row
    @dimensions = {}
    @facts = {}

    yield if block_given?

    aggregated_input[@dimensions].merge!(@facts) { |_, fact, other_fact| fact + other_fact }
  end

  aggregated_input.map do |dimensions, facts|
    dimensions.merge(facts)
  end
end

#extract(args) ⇒ Object

Raises:

  • (ArgumentError)


3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
# File 'lib/datamancer/extract.rb', line 3

def extract args
  raise ArgumentError,
    'Extract requires a source, i.e. extract(from: source)' unless
      args.is_a?(Hash) && args[:from]

  headers = case args[:from]
  when String

    # TODO: The first row (headers row) is dropped; to drop more initial rows should be an option.
   
    # TODO: Test the separator option.

    csv = CSV.read args[:from], col_sep: (args[:separator] || ',')
    csv.shift
  when Hash
    ::ActiveRecord::Base.establish_connection args[:from]
    db = ::ActiveRecord::Base.connection

    # TODO: Test this.

    table = args[:table] || args[:from][:table]
    
    raise ArgumentError,
      'Extract requires a database table, i.e. extract(from: source, table: table_name)' unless table

    db.columns(table).map(&:name)
  end

  @fields  = {}
  @actions = {}
  
  headers.each do |header|
    field   = header.to_sym
    mapping = header
    @fields[field] = mapping
  end unless args[:exclude]

  # The reason behind default_actions is the possibility to
  # write reject_if: nil with the DSL.
  default_actions = {reject_if: :nil, reject_unless: :nil}

  define_singleton_method :field do |name, actions = {}, &block|
    actions[:type] ||= actions[:type_default]
    actions[:default] ||= actions[:empty_default]
    actions = default_actions.merge(actions)
    mapping = actions[:map] || name.to_s

    raise MissingField,
      "Required field '#{mapping}' was not found in '#{args[:from]}'" unless headers.include? mapping
    
    field = name.to_sym
    @fields.delete(actions[:map].to_sym) if actions[:map]
    @fields[field]  = mapping
    @actions[field] = actions
    @actions[field][:block] = block
  end

  yield if block_given?
  
  output = case args[:from]
  when String
    @fields.each do |field, mapping|
      index = headers.find_index(mapping)
      @fields[field] = index
    end

    csv

  when Hash

    #TODO: Test for column names with spaces.
    #TODO: Implement all the SQL escaping cases.

    columns = @fields.map { |field, mapping| "[#{mapping}] AS [#{field}]" }.join(', ')

    @fields.keys.each_with_index do |field, index|
      @fields[field] = index
    end

    # TODO: Test top, distinct, where.
    # TODO: Top for CSV.
    # TODO: Top for support several databases.

    if args[:distinct]
      db.select_rows(
        "SELECT DISTINCT #{columns} FROM #{table}")
    elsif args[:top]
      db.select_rows(
        "SELECT TOP #{args[:top]} #{columns} FROM #{table} #{'WHERE ' + args[:where] if args[:where]}")
    else
      db.select_rows(
        "SELECT #{columns} FROM #{table} #{'WHERE ' + args[:where] if args[:where]}")
    end
  end

  output.map! do |array_row|
    hash_row = {}

    @fields.each do |field, index|
      value = array_row[index]
      hash_row[field] = field_actions field, value, @actions[field]
    end

    if hash_row.has_value?(:reject)
      nil
    else
      hash_row
    end
  end.compact!

  output
end

#field_actions(field, value, actions) ⇒ Object



116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
# File 'lib/datamancer/extract.rb', line 116

def field_actions field, value, actions
  return value unless actions

  # TODO: Revisit the order of actions.

  ## Block-passing ##
  
  # TODO: Test this.

  if actions[:block]
    value = actions[:block].call(value)
  end

  ## Stripping ##

  # TODO: Test this.

  if actions[:strip]
    value.strip! if value.respond_to?(:strip!)
  end

  ## Casting ##

  # Indexes and :type_default are not good friends.
  # (Because of join while transforming.)

  # TODO: Test this.

  if value || actions[:type_default]

    # TODO: Better data types support. From Mongoid:

    # [ ] Array
    # [ ] BigDecimal
    # [ ] Boolean
    # [x] Float
    # [ ] Hash
    # [x] Integer
    # [ ] Range
    # [ ] Regexp
    # [x] String
    # [x] Symbol
    # [x] Date
    # [ ] DateTime
    # [ ] Time
    # [ ] TimeWithZone

    case actions[:type].to_s
    when 'Complex'
      value = value.to_c
    when 'Float'
      value = value.to_f
    when 'Integer'
      value = value.to_i
    when 'Rational'
      value = value.to_r
    when 'String'
      value = value.to_s
    when 'Symbol'
      value = value.to_sym
    when 'Date'
      # From "1900-01-01 00:00:00.000" to "19000101".
      value = value.gsub('-', '')[0..7] 
    end
  end

  ## Default value ##

  # TODO: Test this.

  if value.nil? || (actions[:empty_default] && value.empty?)
    value = actions[:default]
  end

  ## Validation ##

  # TODO: Test this. Test to not reject nil by default.

  if actions[:reject_if].is_a? Array
    actions[:reject_if].each do |reject_value|
      value = :reject if reject_value == value
    end
  else
    value = :reject if actions[:reject_if] == value
  end

  if actions[:reject_unless].is_a? Array
    actions[:reject_unless].each do |reject_value|
      value = :reject if reject_value != value
    end
  else
    value = :reject if actions[:reject_unless] != :nil &&
                       actions[:reject_unless] != value
  end

  value
end

#join(left, right, attribute) ⇒ Object

Raises:

  • (StandardError)


33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
# File 'lib/datamancer/transform.rb', line 33

def join left, right, attribute

  attribute = attribute.to_sym

  left_groups  = Hash.new { |hash, key| hash[key] = [] }
  right_groups = Hash.new { |hash, key| hash[key] = [] }

  left.each do |tuple|
    left_groups[tuple[attribute]] << tuple if tuple[attribute]
  end
  
  right.each do |tuple|
    right_groups[tuple[attribute]] << tuple if tuple[attribute]
  end

  output = Array.new
 
  left_groups.each do |key, left_group|
    
    if right_group = right_groups[key]

      left_group.each do |left_tuple|
        right_group.each do |right_tuple|
          output << left_tuple.merge(right_tuple)
        end
      end

    end
  end

  # TODO: Test this:

  raise StandardError, 'Sadness: null join.' if output.empty?

  output
end

#keyvalue(file) ⇒ Object



11
12
13
# File 'lib/datamancer.rb', line 11

def keyvalue file
  YAML::load_file file
end

#load(input, args) ⇒ Object

Raises:

  • (ArgumentError)


48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
# File 'lib/datamancer/load.rb', line 48

def load input, args

  raise ArgumentError,
    'Load requires a destination, i.e. load(data, to: destination)' unless
      args.is_a?(Hash) && args[:to]

  ## COLUMNS ##

  # define_singleton_method :field do |name, options = {}|
  #   @columns << (options[:map] || name)
  # end

  # @columns = []

  # yield if block_given?

  ## INSERTS ##

  define_singleton_method :field do |name, options = {}|
    name = name.to_sym

    raise MissingField,
      "Required field '#{name}' was not found in '#{args[:to]}'" unless @input_row.include? name

    @output_row[options[:map] || name] = @input_row[name]
    @output_row.delete(name) if !args[:exclude] && options[:map]
  end
 
  inserts = []

  input.each do |row|
    @input_row = row
    @output_row = args[:exclude] ? {} : row.dup

    yield if block_given?

    inserts << @output_row.values
  end

  columns = @output_row.keys

  ## LOAD ##

  # TODO: Set 'w' or 'w+' for CSV writing.
  
  if args[:to].is_a?(String)
    mode = if args[:append] then 'a' else 'w' end
    
    CSV.open(args[:to], mode) do |csv|
      csv << columns

      inserts.each do |insert|
        csv << insert
      end
    end

  else
    ::ActiveRecord::Base.establish_connection(args[:to])

    # TODO: Test this.

    table = args[:table] || args[:to][:table]
    
    raise ArgumentError,
      'Load requires a database table, i.e. load(data, to: destination, table: table_name)' unless table

    ::ActiveRecord::Base.connection.delete("DELETE FROM #{table}") unless args[:append]
    batch_size = args[:batch] || 1000

    pre_query = "INSERT INTO #{table} (#{columns.join(',')}) VALUES "

    # String values must be enclosed by single quotes.
    inserts.map! do |insert|
      insert.map! do |field|
        field.is_a?(String) ? "'#{field}'" : (field ? field : 'NULL')
      end

      "(#{insert.join(',')})"
    end

    until inserts.empty?
      query = pre_query + inserts.pop(batch_size).join(',')
      ::ActiveRecord::Base.connection.execute query
    end
  end
end

#raw(input, args) ⇒ Object

Raises:

  • (ArgumentError)


3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
# File 'lib/datamancer/load.rb', line 3

def raw input, args

  raise ArgumentError,
    'Raw requires a destination, i.e. raw(data, to: destination)' unless
      args.is_a?(Hash) && args[:to]

  ::ActiveRecord::Base.establish_connection(args[:to])

  # TODO: Test this.

  # table = args[:table] || args[:to][:table]
    
  # raise ArgumentError,
  #   'Raw requires a database table, i.e. raw(data, to: destination, table: table_name)' unless table

  # TODO: Method-overriding safeguard.

  input.first.each_key do |key|
    define_singleton_method key.downcase do
    
      # Some methods applied to fields might modify the original fields.
      # Fields could be duplicated in case this be a common problem.

      #@input_row[key].dup

      @input_row[key]
    end
  end

  define_singleton_method :db do
    ::ActiveRecord::Base.connection
  end
  
  define_singleton_method :query do |query|
    ::ActiveRecord::Base.connection.execute query
  end

  input.each do |row|
    @input_row = row
    
    yield if block_given?
  end
end

#transform(input, args = {}) ⇒ Object



90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
# File 'lib/datamancer/transform.rb', line 90

def transform input, args = {}

  # TODO: Mensajes que expliquen mejor los estos errores.

  if args[:join]
    raise ArgumentError unless args[:on]
    raise ArgumentError unless input.first.keys.include?(args[:on].to_sym)
    raise ArgumentError unless args[:join].first.keys.include?(args[:on].to_sym)

    input = join input, args[:join], args[:on]
  end

  if args[:add]
    input = add input, args[:add]
  end

  if args[:unique]
    input = unique input, args[:unique]
  end

  # TODO: Method-overriding safeguard.

  input.first.each_key do |key|
    define_singleton_method key.to_s.gsub(' ', '_').downcase do
    
      # Some methods applied to fields might modify the original fields.
      # Fields could be duplicated in case this be a common problem.

      #@input_row[key].dup
      
      @input_row[key]
    end
  end

  define_singleton_method :row_number do
    @row_number
  end

  define_singleton_method :row do
    @supplementary_row
  end

  define_singleton_method :count do
    @count += 1
  end

  define_singleton_method :output do
    @output
  end

  define_singleton_method :switch do |slot|
    @slot = slot
  end

  define_singleton_method :field do |name, value = nil, *args|
    raise MissingField,
      "Required field '#{name}' was not found" unless @input_row.include?(name.to_sym)

    @output_row[name.to_sym] = if value.is_a?(Symbol)
                                 send(name.downcase).send *args.unshift(value)
                               else
                                 value || send(name.downcase)
                               end
  end

  define_singleton_method :del_field do |name|
    raise MissingField,
      "Filtered field '#{name}' was not found" unless @input_row.include?(name.to_sym)
    
    @output_row.delete(name.to_sym)
  end

  define_singleton_method :new_field do |name, value|
    raise ExistingField,
      "New field '#{name}' already exists" if @input_row.include?(name.to_sym)

    @output_row[name.to_sym] = value
  end

  # TODO: Test for count.

  @count = 0
  
  # TODO: Test for slots.
  
  @output = Hash.new { |h, k| h[k] = [] }

  input.each_with_index do |row, row_number|

    # TODO: Test for row_number.
    # TODO: Test for (supplementary) row.

    @row_number = row_number
    @input_row = row
    @supplementary_row = @input_row.dup
    @output_row = args[:exclude] ? {} : @input_row.dup

    yield if block_given?

    @output[@slot] << @output_row
  end

  @output.length == 1? @output[nil] : @output
end

#unique(input, attribute) ⇒ Object



74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
# File 'lib/datamancer/transform.rb', line 74

def unique input, attribute

  attribute = attribute.to_sym
  output = Array.new
  unique_values = Array.new

  input.each do |row|
    unless unique_values.include?(row[attribute])
      output << row
      unique_values << row[attribute]
    end
  end

  output
end

#where(input, attributes) ⇒ Object



70
71
72
# File 'lib/datamancer/transform.rb', line 70

def where input, attributes

end