Module: Datamancer

Defined in:
lib/datamancer.rb,
lib/datamancer/load.rb,
lib/datamancer/extract.rb,
lib/datamancer/version.rb,
lib/datamancer/transform.rb

Defined Under Namespace

Classes: ExistingField, MissingField

Constant Summary collapse

VERSION =
'0.0.4'

Instance Method Summary collapse

Instance Method Details

#aggregate(input) ⇒ Object



159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
# File 'lib/datamancer/transform.rb', line 159

def aggregate input

  define_singleton_method :dim do |name|
    name = name.to_sym
    @dimensions[name] = @row[name]
  end
  
  define_singleton_method :fact do |name|
    name = name.to_sym
    @facts[name] = @row[name]
  end

  aggregated_input = Hash.new { |hash, key| hash[key] = Hash.new }

  input.each_with_index do |row, row_number|
    @row = row
    @dimensions = {}
    @facts = {}

    yield if block_given?

    aggregated_input[@dimensions].merge!(@facts) { |_, fact, other_fact| fact + other_fact }
  end

  aggregated_input.map do |dimensions, facts|
    dimensions.merge(facts)
  end
end

#extract(args) ⇒ Object

Raises:

  • (ArgumentError)


3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
# File 'lib/datamancer/extract.rb', line 3

def extract args
  raise ArgumentError,
    'Extract requires a source, i.e. extract(from: source)' unless
      args.is_a?(Hash) && args[:from]

  headers = case args[:from]
  when String

    # TODO: The first row (headers row) is dropped; to drop more initial rows should be an option.
   
    # TODO: Test the separator option.

    csv = CSV.read args[:from], col_sep: (args[:separator] || ',')
    csv.shift
  when Hash
    ::ActiveRecord::Base.establish_connection args[:from]
    db = ::ActiveRecord::Base.connection

    # TODO: Test this.

    table = args[:table] || args[:from][:table]
    
    raise ArgumentError,
      'Extract requires a database table, i.e. extract(from: source, table: table_name)' unless table

    db.columns(table).map(&:name)
  end

  @fields  = {}
  @actions = {}
  
  headers.each do |header|
    field   = header.to_sym
    mapping = header
    @fields[field] = mapping
  end unless args[:exclude]

  # The reason behind default_actions is the possibility to
  # write reject_if: nil with the DSL.
  default_actions = {reject_if: :ñil, reject_unless: :ñil}

  define_singleton_method :field do |name, actions = {}, &block|
    actions[:type] ||= actions[:type_default]
    actions[:default] ||= actions[:empty_default]
    actions = default_actions.merge(actions)
    mapping = actions[:map] || name.to_s

    raise MissingField,
      "Required field '#{mapping}' was not found in '#{args[:from]}'" unless headers.include? mapping
    
    field = name.to_sym
    @fields.delete(actions[:map].to_sym) if actions[:map]
    @fields[field]  = mapping
    @actions[field] = actions
    @actions[field][:block] = block
  end

  yield if block_given?
  
  output = case args[:from]
  when String
    @fields.each do |field, mapping|
      index = headers.find_index(mapping)
      @fields[field] = index
    end

    csv

  when Hash
    columns = @fields.map { |field, mapping| "#{mapping} AS #{field}" }.join(', ')

    @fields.keys.each_with_index do |field, index|
      @fields[field] = index
    end

    # TODO: Test top.
    # TODO: Top for CSV.
    # TODO: Top for support several databases.

    if args[:top]
      db.select_rows("SELECT TOP #{args[:top]} #{columns} FROM #{table}" )
    else
      db.select_rows("SELECT #{columns} FROM #{table}")
    end
  end

  output.map! do |array_row|
    hash_row = {}

    @fields.each do |field, index|
      value = array_row[index]
      hash_row[field] = field_actions field, value, @actions[field]
    end

    if hash_row.has_value?(:reject)
      nil
    else
      hash_row
    end
  end.compact!

  output
end

#field_actions(field, value, actions) ⇒ Object



107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
# File 'lib/datamancer/extract.rb', line 107

def field_actions field, value, actions
  return value unless actions

  # TODO: Revisit the order of actions.

  ## Block-passing ##
  
  # TODO: Test this.

  if actions[:block]
    value = actions[:block].call(value)
  end

  ## Stripping ##

  # TODO: Test this.

  if actions[:strip]
    value.strip! if value.respond_to?(:strip!)
  end

  ## Casting ##

  # Indexes and :type_default are not good friends.
  # (Because of join while transforming.)

  # TODO: Test this.

  if value || actions[:type_default]

    # TODO: Better data types support. From Mongoid:

    # [ ] Array
    # [ ] BigDecimal
    # [ ] Boolean
    # [x] Float
    # [ ] Hash
    # [x] Integer
    # [ ] Range
    # [ ] Regexp
    # [x] String
    # [x] Symbol
    # [x] Date
    # [ ] DateTime
    # [ ] Time
    # [ ] TimeWithZone

    case actions[:type].to_s
    when 'Complex'
      value = value.to_c
    when 'Float'
      value = value.to_f
    when 'Integer'
      value = value.to_i
    when 'Rational'
      value = value.to_r
    when 'String'
      value = value.to_s
    when 'Symbol'
      value = value.to_sym
    when 'Date'
      # From "1900-01-01 00:00:00.000" to "19000101".
      value = value.gsub('-', '')[0..7] 
    end
  end

  ## Default value ##

  # TODO: Test this.

  if value.nil? || (actions[:empty_default] && value.empty?)
    value = actions[:default]
  end

  ## Validation ##

  # TODO: Test this. Test to not reject nil by default.

  if actions[:reject_if].is_a? Array
    actions[:reject_if].each do |reject_value|
      value = :reject if reject_value == value
    end
  else
    value = :reject if actions[:reject_if] == value
  end

  if actions[:reject_unless].is_a? Array
    actions[:reject_unless].each do |reject_value|
      value = :reject if reject_value != value
    end
  else
    value = :reject if actions[:reject_unless] != :ñil &&
                       actions[:reject_unless] != value
  end

  value
end

#join(left, right, attribute) ⇒ Object

Raises:

  • (StandardError)


3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
# File 'lib/datamancer/transform.rb', line 3

def join left, right, attribute

  attribute = attribute.to_sym

  left_groups  = Hash.new { |hash, key| hash[key] = [] }
  right_groups = Hash.new { |hash, key| hash[key] = [] }

  left.each do |tuple|
    left_groups[tuple[attribute]] << tuple if tuple[attribute]
  end
  
  right.each do |tuple|
    right_groups[tuple[attribute]] << tuple if tuple[attribute]
  end

  output = Array.new
 
  left_groups.each do |key, left_group|
    
    if right_group = right_groups[key]

      left_group.each do |left_tuple|
        right_group.each do |right_tuple|
          output << left_tuple.merge(right_tuple)
        end
      end

    end
  end

  # TODO: Test this:

  raise StandardError, 'Sadness: null join.' if output.empty?

  output
end

#keyvalue(file) ⇒ Object



10
11
12
# File 'lib/datamancer.rb', line 10

def keyvalue file
  YAML::load_file file
end

#load(input, args) ⇒ Object

Raises:

  • (ArgumentError)


3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
# File 'lib/datamancer/load.rb', line 3

def load input, args

  raise ArgumentError,
    'Load requires a destination, i.e. load(data, to: destination)' unless
      args.is_a?(Hash) && args[:to]

  ## COLUMNS ##

  # define_singleton_method :field do |name, options = {}|
  #   @columns << (options[:map] || name)
  # end

  # @columns = []

  # yield if block_given?

  ## INSERTS ##

  define_singleton_method :field do |name, options = {}|
    name = name.to_sym

    raise MissingField,
      "Required field '#{name}' was not found in '#{args[:to]}'" unless @input_row.include? name

    @output_row[options[:map] || name] = @input_row[name]
    @output_row.delete(name) if !args[:exclude] && options[:map]
  end
 
  inserts = []

  input.each do |row|
    @input_row = row
    @output_row = args[:exclude] ? {} : row.dup

    yield if block_given?

    inserts << @output_row.values
  end

  columns = @output_row.keys

  ## LOAD ##

  # TODO: Set 'w' or 'w+' for CSV writing.
  
  if args[:to].is_a?(String)
    mode = if args[:append] then 'a' else 'w' end
    
    CSV.open(args[:to], mode) do |csv|
      csv << columns

      inserts.each do |insert|
        csv << insert
      end
    end

  else
    ::ActiveRecord::Base.establish_connection(args[:to])

    # TODO: Test this.

    table = args[:table] || args[:to][:table]
    
    raise ArgumentError,
      'Load requires a database table, i.e. load(to: destination, table: table_name)' unless table

    ::ActiveRecord::Base.connection.delete("DELETE FROM #{table}") unless args[:append]
    batch_size = args[:batch] || 1000

    pre_query = "INSERT INTO #{table} (#{columns.join(',')}) VALUES "

    # String values must be enclosed by single quotes.
    inserts.map! do |insert|
      insert.map! do |field|
        field.is_a?(String) ? "'#{field}'" : (field ? field : 'NULL')
      end

      "(#{insert.join(',')})"
    end

    until inserts.empty?
      query = pre_query + inserts.pop(batch_size).join(',')
      ::ActiveRecord::Base.connection.execute query
    end
  end
end

#transform(input, args = {}) ⇒ Object



60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
# File 'lib/datamancer/transform.rb', line 60

def transform input, args = {}

  if args[:join]
    raise ArgumentError unless args[:on]
    raise ArgumentError unless input.first.keys.include?(args[:on].to_sym)
    raise ArgumentError unless args[:join].first.keys.include?(args[:on].to_sym)

    input = join input, args[:join], args[:on]
  end

  if args[:unique]
    input = unique input, args[:unique]
  end

  # TODO: Method-overriding safeguard.

  input.first.each_key do |key|
    define_singleton_method key.downcase do
    
      # Some methods applied to fields might modify the original fields.
      # Fields could be duplicated in case this be a common problem.

      #@input_row[key].dup

      @input_row[key]
    end
  end

  define_singleton_method :row_number do
    @row_number
  end

  define_singleton_method :row do
    @supplementary_row
  end

  define_singleton_method :count do
    @count += 1
  end

  define_singleton_method :output do
    @output
  end

  define_singleton_method :switch do |slot|
    @slot = slot
  end

  define_singleton_method :field do |name, value = nil, *args|
    raise MissingField,
      "Required field '#{name}' was not found" unless @input_row.include?(name.to_sym)

    @output_row[name.to_sym] = if value.is_a?(Symbol)
                                 send(name.downcase).send *args.unshift(value)
                               else
                                 value || send(name.downcase)
                               end
  end

  define_singleton_method :del_field do |name|
    raise MissingField,
      "Filtered field '#{name}' was not found" unless @input_row.include?(name.to_sym)
    
    @output_row.delete(name.to_sym)
  end

  define_singleton_method :new_field do |name, value|
    raise ExistingField,
      "New field '#{name}' already exists" if @input_row.include?(name.to_sym)

    @output_row[name.to_sym] = value
  end

  # TODO: Test for count.

  @count = 0
  
  # TODO: Test for slots.
  
  @output = Hash.new { |h, k| h[k] = [] }

  input.each_with_index do |row, row_number|

    # TODO: Test for row_number.
    # TODO: Test for (supplementary) row.

    @row_number = row_number
    @input_row = row
    @supplementary_row = @input_row.dup
    @output_row = args[:exclude] ? {} : @input_row.dup

    yield if block_given?

    @output[@slot] << @output_row
  end

  @output.length == 1? @output[nil] : @output
end

#unique(input, attribute) ⇒ Object



44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
# File 'lib/datamancer/transform.rb', line 44

def unique input, attribute

  attribute = attribute.to_sym
  output = Array.new
  unique_values = Array.new

  input.each do |row|
    unless unique_values.include?(row[attribute])
      output << row
      unique_values << row[attribute]
    end
  end

  output
end

#where(input, attributes) ⇒ Object



40
41
42
# File 'lib/datamancer/transform.rb', line 40

def where input, attributes

end