Module: Datamancer
Defined in:
- lib/datamancer.rb
- lib/datamancer/load.rb
- lib/datamancer/extract.rb
- lib/datamancer/version.rb
- lib/datamancer/transform.rb
Defined Under Namespace
Classes: ExistingField, MissingField
Constant Summary
- VERSION =
"0.0.2"
Instance Method Summary
- #aggregate(input) ⇒ Object
- #extract(args) ⇒ Object
- #field_actions(field, value, actions) ⇒ Object
- #join(left, right, attribute) ⇒ Object
- #keyvalue(file) ⇒ Object
- #load(input, args) ⇒ Object
- #transform(input, args = {}) ⇒ Object
Instance Method Details
#aggregate(input) ⇒ Object
93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 |
# File 'lib/datamancer/transform.rb', line 93

# Groups rows by their dimension values and sums their facts.
#
# Two DSL helpers are defined on the receiver and are meant to be called from
# the given block, which is yielded once per row with no arguments:
#
#   dim  name   # copies @row[name] into the grouping key of the current row
#   fact name   # copies @row[name] into the values accumulated for that key
#
# Both helpers convert +name+ with +to_sym+ before indexing the row.
# Rows whose dimensions are equal are folded into a single output row; facts
# with the same name are combined with +.
#
# A nil fact (for instance an empty cell) is skipped when summing instead of
# raising NoMethodError/TypeError; if every value of a fact is nil the result
# for that fact is nil.
#
# @param input [#each] rows to aggregate; each row is indexed with symbols
# @yield once per row so the caller can declare +dim+ and +fact+ fields
# @return [Array<Hash>] one hash per distinct dimension combination, holding
#   the dimensions merged with the accumulated facts
def aggregate(input)
  define_singleton_method(:dim) do |name|
    name = name.to_sym
    @dimensions[name] = @row[name]
  end

  define_singleton_method(:fact) do |name|
    name = name.to_sym
    @facts[name] = @row[name]
  end

  # Keyed by the dimensions hash of each row; a new group starts empty.
  totals = Hash.new { |groups, dimensions| groups[dimensions] = {} }

  input.each do |row|
    @row = row
    @dimensions = {}
    @facts = {}

    yield if block_given?

    totals[@dimensions].merge!(@facts) do |_name, total, value|
      if total.nil?
        value
      elsif value.nil?
        total
      else
        total + value
      end
    end
  end

  totals.map { |dimensions, facts| dimensions.merge(facts) }
end
#extract(args) ⇒ Object
3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 |
# NOTE(review): this listing is truncated (it stops inside the
# `default_actions = {reject_if: :` literal), so only the visible part is
# described here; the return value and the rest of the body are not shown.
#
# Visible behaviour of extract(args):
# - Raises ArgumentError unless +args+ is a Hash containing a truthy :from.
# - When args[:from] is a String it is read with CSV.read, using
#   args[:separator] (default ',') as col_sep; the first row is removed from
#   the parsed data with +shift+ and used as the headers.
# - When args[:from] is a Hash it is passed to
#   ActiveRecord::Base.establish_connection; the table name comes from
#   args[:table] or args[:from][:table] (ArgumentError if neither is set) and
#   the headers are that table's column names.
# - @fields and @actions are reset to empty hashes; unless args[:exclude] is
#   truthy, every header is registered in @fields as header.to_sym => header.
# - Any other type of args[:from] leaves +headers+ nil, so the following
#   +headers.each+ would fail — presumably unsupported input; confirm.
# File 'lib/datamancer/extract.rb', line 3 def extract args raise ArgumentError, 'Extract requires a source, i.e. extract(from: source)' unless args.is_a?(Hash) && args[:from] headers = case args[:from] when String # TODO: The first row (headers row) is dropped; to drop more initial rows should be an option. # TODO: Test the separator option. csv = CSV.read args[:from], col_sep: (args[:separator] || ',') csv.shift when Hash ::ActiveRecord::Base.establish_connection args[:from] db = ::ActiveRecord::Base.connection # TODO: Test this. table = args[:table] || args[:from][:table] raise ArgumentError, 'Extract requires a database table, i.e. extract(from: source, table: table_name)' unless table db.columns(table).map(&:name) end @fields = {} @actions = {} headers.each do |header| field = header.to_sym mapping = header @fields[field] = mapping end unless args[:exclude] # The reason behind default_actions is the possibility to # write reject_if: nil with the DSL. default_actions = {reject_if: : |
#field_actions(field, value, actions) ⇒ Object
99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 |
# NOTE(review): this listing is truncated (it stops inside the validation
# condition), so the rejection logic and the final return value are not
# visible; only the steps shown below are documented.
#
# Visible behaviour of field_actions(field, value, actions), applied in order:
# 1. Returns +value+ untouched when +actions+ is nil/false.
# 2. Block: when actions[:block] is set, value becomes the result of calling it
#    with the current value.
# 3. Strip: when actions[:strip] is set and the value responds to strip!, the
#    value is stripped in place (the caller's object is mutated).
# 4. Cast: when the value is truthy, or actions[:type_default] is set, the value
#    is converted according to actions[:type].to_s — 'Complex', 'Float',
#    'Integer', 'Rational', 'String' or 'Symbol' via to_c/to_f/to_i/to_r/to_s/
#    to_sym; any other type name leaves it unchanged. With :type_default a nil
#    value is cast too (e.g. nil.to_i is 0).
# 5. Default: when the value is nil, or actions[:empty_default] is set and
#    value.empty? is true, the value becomes actions[:default].
#    NOTE(review): value.empty? is called without a respond_to? check, so a
#    non-nil value without #empty? (e.g. an Integer) would raise — confirm
#    whether :empty_default is only meant for strings.
# 6. Validation: compares the value with actions[:reject_if] /
#    actions[:reject_unless]; the outcome is cut off in this listing.
#
# The +field+ parameter is not used in the visible portion.
# File 'lib/datamancer/extract.rb', line 99 def field_actions field, value, actions return value unless actions # TODO: Revisit the order of actions. ## Block-passing ## # TODO: Test this. if actions[:block] value = actions[:block].call(value) end ## Stripping ## # TODO: Test this. if actions[:strip] value.strip! if value.respond_to?(:strip!) end ## Casting ## # Indexes and :type_default are not good friends. # (Because of join while transforming.) # TODO: Test this. if value || actions[:type_default] # TODO: Better data types support. From Mongoid: # [ ] Array # [ ] BigDecimal # [ ] Boolean # [x] Float # [ ] Hash # [x] Integer # [ ] Range # [ ] Regexp # [x] String # [x] Symbol # [ ] Date # [ ] DateTime # [ ] Time # [ ] TimeWithZone case actions[:type].to_s when 'Complex' value = value.to_c when 'Float' value = value.to_f when 'Integer' value = value.to_i when 'Rational' value = value.to_r when 'String' value = value.to_s when 'Symbol' value = value.to_sym end end ## Default value ## # TODO: Test this. if value.nil? || (actions[:empty_default] && value.empty?) value = actions[:default] end ## Validation ## # TODO: Test this. Test to not reject nil by default. if actions[:reject_if] == value || (actions[:reject_unless] != : |
#join(left, right, attribute) ⇒ Object
3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 |
# File 'lib/datamancer/transform.rb', line 3

# Inner-joins two collections of hashes on a shared attribute.
#
# Tuples whose join attribute is nil or false are ignored on both sides. For
# every key present on both sides, each left tuple is merged with each right
# tuple carrying the same key; on conflicting keys the right tuple's value
# wins (Hash#merge semantics).
#
# Output order follows the first appearance of each key in +left+, then the
# order of the left tuples within that key, then the order of the right
# tuples.
#
# @param left [#each] left-hand tuples (hashes indexed by symbol)
# @param right [#each] right-hand tuples (hashes indexed by symbol)
# @param attribute [#to_sym] name of the attribute to join on
# @return [Array<Hash>] merged tuples; empty when nothing matches
def join(left, right, attribute)
  attribute = attribute.to_sym

  # Plain hashes are used on purpose: looking up a missing key must not
  # create an empty group as a side effect.
  group_by_attribute = lambda do |tuples|
    groups = {}
    tuples.each do |tuple|
      key = tuple[attribute]
      (groups[key] ||= []) << tuple if key
    end
    groups
  end

  right_groups = group_by_attribute.call(right)

  group_by_attribute.call(left).flat_map do |key, left_group|
    matches = right_groups.fetch(key, [])

    left_group.flat_map do |left_tuple|
      matches.map { |right_tuple| left_tuple.merge(right_tuple) }
    end
  end
end
#keyvalue(file) ⇒ Object
10 11 12 |
# File 'lib/datamancer.rb', line 10

# Parses the YAML document stored at +file+ and returns the resulting data.
#
# NOTE(review): which YAML types may be instantiated depends on the installed
# Psych version; only point this at trusted files — confirm against the
# project's supported Ruby versions.
#
# @param file [String] path of the YAML file to read
# @return [Object] whatever YAML.load_file produces for that document
def keyvalue(file)
  YAML.load_file(file)
end
#load(input, args) ⇒ Object
3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 |
# File 'lib/datamancer/load.rb', line 3

# Writes rows to a CSV file or to a database table.
#
# A +field+ DSL helper is defined on the receiver for use inside the given
# block, which is yielded once per row with no arguments:
#
#   field name                 # require +name+ and copy it to the output row
#   field name, map: new_name  # copy it under +new_name+ instead
#
# Unless args[:exclude] is truthy every output row starts as a copy of the
# input row, and a mapped field replaces the original one. With
# args[:exclude] the output row starts empty and only declared fields are
# written. Column names are taken from the keys of the last output row.
#
# Destination:
# - args[:to] is a String: rows are written to that CSV path, preceded by a
#   header row. The file is opened in append mode when args[:append] is
#   truthy, otherwise it is overwritten.
# - otherwise args[:to] is handed to ActiveRecord::Base.establish_connection.
#   The table is args[:table] or args[:to][:table]. Unless args[:append] is
#   truthy the table is emptied first. Rows are inserted in input order, in
#   multi-row INSERT statements of args[:batch] rows (default 1000). Values
#   are escaped by the connection's +quote+, so strings containing quotes and
#   nil values produce valid SQL. Table and column names are interpolated
#   as-is.
#
# Empty +input+ is accepted: no header or rows are written (the CSV file is
# still created/truncated, and the table is still emptied unless appending).
#
# @param input [#each] rows (hashes) to write
# @param args [Hash] options: :to (required), :table, :append, :batch, :exclude
# @yield once per row so the caller can declare fields with +field+
# @raise [ArgumentError] when :to is missing, or when no table can be
#   determined for a database destination
# @raise [MissingField] when a declared field is absent from the input row
def load(input, args)
  raise ArgumentError, 'Load requires a destination, i.e. load(data, to: destination)' unless args.is_a?(Hash) && args[:to]

  ## INSERTS ##

  define_singleton_method(:field) do |name, options = {}|
    name = name.to_sym

    raise MissingField, "Required field '#{name}' was not found in '#{args[:to]}'" unless @input_row.include? name

    @output_row[options[:map] || name] = @input_row[name]
    @output_row.delete(name) if !args[:exclude] && options[:map]
  end

  inserts = []
  columns = []

  input.each do |row|
    @input_row = row
    @output_row = args[:exclude] ? {} : row.dup

    yield if block_given?

    inserts << @output_row.values
    columns = @output_row.keys
  end

  ## LOAD ##

  # TODO: Set 'w' or 'w+' for CSV writing.
  if args[:to].is_a?(String)
    mode = args[:append] ? 'a' : 'w'

    CSV.open(args[:to], mode) do |csv|
      csv << columns unless columns.empty?
      inserts.each { |insert| csv << insert }
    end
  else
    ::ActiveRecord::Base.establish_connection(args[:to])
    connection = ::ActiveRecord::Base.connection

    # TODO: Test this.
    table = args[:table] || args[:to][:table]

    raise ArgumentError, 'Load requires a database table, i.e. load(to: destination, table: table_name)' unless table

    connection.delete("DELETE FROM #{table}") unless args[:append]

    batch_size = args[:batch] || 1000
    pre_query = "INSERT INTO #{table} (#{columns.join(',')}) VALUES "

    # Let the adapter escape every value instead of wrapping strings in bare
    # single quotes.
    tuples = inserts.map do |insert|
      "(#{insert.map { |value| connection.quote(value) }.join(',')})"
    end

    tuples.each_slice(batch_size) do |batch|
      connection.execute(pre_query + batch.join(','))
    end

    nil
  end
end
#transform(input, args = {}) ⇒ Object
36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 |
# File 'lib/datamancer/transform.rb', line 36

# Builds a new list of rows from +input+, optionally joining it with another
# collection first, and letting the given block reshape every row.
#
# For each key of the first row a reader method named after the downcased key
# is defined on the receiver; it returns the current input row's value for
# that key. The following DSL helpers are also defined for use in the block,
# which is yielded once per row with no arguments:
#
#   field name                    # copy the field to the output row
#   field name, value             # store +value+ (falls back to the field's
#                                 # own value when +value+ is nil or false)
#   field name, :method, *params  # store field_value.method(*params)
#   del_field name                # remove the field from the output row
#   new_field name, value         # add a field that does not exist yet
#
# Field existence is checked with respond_to?, so any public method of the
# receiver counts as an existing field.
# NOTE(review): reader methods are never removed, so readers defined by an
# earlier call presumably still count as existing fields — confirm.
#
# Empty +input+ (including a join with no matches) returns [] without
# yielding.
#
# @param input [Enumerable<Hash>] rows to transform
# @param args [Hash] options: :join (rows to join with), :on (join attribute,
#   required with :join), :exclude (start every output row empty instead of
#   as a copy of the input row)
# @yield once per row so the caller can use the DSL helpers
# @return [Array<Hash>] the output rows
# @raise [ArgumentError] when :join is given without :on
# @raise [MissingField] when +field+/+del_field+ names an unknown field
# @raise [ExistingField] when +new_field+ names an existing field
def transform(input, args = {})
  if args[:join]
    raise ArgumentError unless args[:on]

    input = join(input, args[:join], args[:on])
  end

  first_row = input.first
  return [] unless first_row

  # TODO: Method-overriding safeguard.
  first_row.each_key do |key|
    # Some methods applied to fields might modify the original fields.
    # Fields could be duplicated in case this be a common problem.
    define_singleton_method(key.downcase) { @input_row[key] }
  end

  define_singleton_method(:field) do |name, value = nil, *call_args|
    raise MissingField, "Required field '#{name}' was not found" unless respond_to?(name)

    @output_row[name.to_sym] =
      if value.is_a?(Symbol)
        # A Symbol value names a method to call on the field's current value.
        send(name).send(value, *call_args)
      else
        value || send(name)
      end
  end

  define_singleton_method(:del_field) do |name|
    raise MissingField, "Filtered field '#{name}' was not found" unless respond_to?(name)

    @output_row.delete(name.to_sym)
  end

  define_singleton_method(:new_field) do |name, value|
    raise ExistingField, "New field '#{name}' already exists" if respond_to?(name)

    @output_row[name.to_sym] = value
  end

  input.map do |row|
    @input_row = row
    @output_row = args[:exclude] ? {} : @input_row.dup

    yield if block_given?

    @output_row
  end
end