Module: DataTools::ArrayOfHashes
- Defined in:
- lib/data_tools/array_of_hashes.rb
Instance Method Summary collapse
-
#allkeys ⇒ Object
What different keys appear in this collection of hashes?
-
#coalesce ⇒ Object
Combine a set of hashes into one. For each key, find all the distinct values from all the hashes: if there’s one unique value, store the single value in that key of the result; if there are multiple values, store them all as an array.
-
#count_off!(key = :key, start = 0) ⇒ Object
Assign unique IDs to every hash in the array. The argument is the name of the field to use for the generated sequential key.
-
#dumpme(filename) ⇒ Object
Marshal (Ruby-specific binary format) the contents of this structure to a file. Fails if the file exists.
-
#dumpme!(filename) ⇒ Object
same as #dumpme but overwrites existing file.
-
#histogram(*args, &block) ⇒ Object
return histogram of value distribution for the specified key: hash of value/count pairs.
-
#key_on(*keyarray) ⇒ Object
Convert an array of hashes to a hash of the same hashes, where the key values are picked from the hashes. The keys can be single fields, or an array, or a list. Options: :multi (boolean, default false): if true, allow multiple values per key and store values as an array for each key; :first (boolean, default false): if true, when finding multiple values per key, store only the first and ignore the rest; :truncate (integer): see `Hash#key_for`.
- #metrics ⇒ Object
- #nilify!(keyvalue) ⇒ Object
- #numify!(*keyarray) ⇒ Object
-
#pluck(*keys) ⇒ Object
pull out all the named attributes from the hashes in the array (into array-of-arrays).
-
#project(args) ⇒ Object
For each record, output a subset of the values as an array (suitable for passing to `#to_csv`). Supports hierarchical subkeys (e.g. :master:id or "master:name").
- #redundant(*keyarray) ⇒ Object
-
#resolve_all(key, &block) ⇒ Object
apply the same resolution operation to every hash in the list.
-
#subset(*keys) ⇒ Object
hash slice for all the named attributes from each hashes in the array.
-
#tsvme(filename, fields, headers = fields) ⇒ Object
The source comment attached to this method is a commented-out CSV variant: attempt to dump out the contents of this array-of-hashes as CSV to the named file; `fields` is the list of attribute names to write out; the optional `headers` is the public names for the fields. `def csvme(filename, fields, headers = fields) CSV.open(filename, "wb") do |csv| csv << headers unless headers.nil? pluck(fields).each do |ary| csv << ary end end true end`.
-
#unique?(*keyarray) ⇒ Boolean
Are all the values for `key` defined and unique?
- #unique_values_for(*keyarray) ⇒ Object
- #where(conditions) ⇒ Object
Instance Method Details
#allkeys ⇒ Object
What different keys appear in this collection of hashes?
150 151 152 153 154 |
# File 'lib/data_tools/array_of_hashes.rb', line 150
# List every distinct key that appears in any hash of this collection.
#
# Keys are gathered directly rather than counted into a plain `{}` with
# `memo[k] += 1`: that form raises NoMethodError (nil + 1) on the first key.
#
# @return [Array] the distinct keys, in order of first appearance
def allkeys
  flat_map(&:keys).uniq
end
#coalesce ⇒ Object
Combine a set of hashes into one. For each key, find all the distinct values from all the hashes: if there’s one unique value, store the single value in that key of the result; if there are multiple values, store them all as an array.
96 97 98 99 100 101 102 103 |
# File 'lib/data_tools/array_of_hashes.rb', line 96
# Merge all hashes of the collection into a single summary hash.
#
# For each key seen in any hash, the distinct non-nil values are collected.
# A key with at most one distinct value maps to that value (nil when there is
# none); a key with several distinct values maps to an array of them.
#
# @return [Hash] key => single value, or array of distinct values
def coalesce
  distinct_keys = map(&:keys).flatten.uniq
  distinct_keys.each_with_object({}) do |key, merged|
    values = map { |record| record[key] }.compact.uniq
    merged[key] = values.count <= 1 ? values.first : values
  end
end
#count_off!(key = :key, start = 0) ⇒ Object
Assign unique IDs to every hash in the array. The argument is the name of the field to use for the generated sequential key.
80 81 82 83 84 85 86 |
# File 'lib/data_tools/array_of_hashes.rb', line 80
# Number the hashes in place, storing a sequential value under +key+.
#
# @param key [Object] field that receives the generated number
# @param start [Integer] number given to the first hash
# @return [self]
# @raise [RuntimeError] if any hash already holds a truthy value under +key+
def count_off!(key = :key, start = 0)
  raise "Values exist for [#{key}]" if any? { |record| record[key] }

  each_with_index { |record, position| record[key] = position + start }
  self
end
#dumpme(filename) ⇒ Object
marshal (ruby-specific binary format) the contents of this structure to a file fails if file exists
116 117 118 119 |
# File 'lib/data_tools/array_of_hashes.rb', line 116
# Marshal (Ruby-specific binary format) this structure to a new file.
#
# @param filename [String] path of the file to create
# @return [File] the (closed) file handle the data was written to
# @raise [RuntimeError] if +filename+ already exists
def dumpme(filename)
  # File.exists? was removed in Ruby 3.2; File.exist? works on every version.
  raise "#{filename} exists" if File.exist?(filename)

  # Marshal output is binary, so open in "wb" to avoid newline/encoding
  # translation of the byte stream.
  File.open(filename, "wb") { |f| Marshal.dump(self, f) }
end
#dumpme!(filename) ⇒ Object
same as #dumpme but overwrites existing file
121 122 123 124 |
# File 'lib/data_tools/array_of_hashes.rb', line 121
# Marshal (Ruby-specific binary format) this structure to a file, replacing
# any file that already exists at that path.
#
# @param filename [String] path of the file to write
# @return [File] the (closed) file handle the data was written to
def dumpme!(filename)
  # File.exists? was removed in Ruby 3.2; File.exist? works on every version.
  File.unlink(filename) if File.exist?(filename)

  # Marshal output is binary, so open in "wb" to avoid newline/encoding
  # translation of the byte stream.
  File.open(filename, "wb") { |f| Marshal.dump(self, f) }
end
#histogram(*args, &block) ⇒ Object
return histogram of value distribution for the specified key: hash of value/count pairs
180 181 182 183 184 185 186 187 188 189 190 |
# File 'lib/data_tools/array_of_hashes.rb', line 180
# Count how often each value occurs across the collection.
#
# With a block, each hash is passed to the block and the block's result is
# counted; without one, the value stored under the first argument is counted.
#
# @param args [Array] only the first element is used, as the key to look up
# @yield [Hash] each hash in the collection
# @return [Hash] value => occurrence count (missing values read as 0)
def histogram(*args, &block)
  each_with_object(Hash.new(0)) do |record, counts|
    bucket = block_given? ? yield(record) : record[args.first]
    counts[bucket] += 1
  end
end
#key_on(*keyarray) ⇒ Object
convert an array of hashes to a hash of the same hashes where the key values are picked from the hashes the keys can be single fields, or an array, or a list options:
:multi (boolean, default false): if true, allow multiple values per key; store values as an array for each key
:first (boolean, default false): if true, when finding multiple values per key, store only the first and ignore the rest
:truncate (integer): see `Hash#key_for`
10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 |
# File 'lib/data_tools/array_of_hashes.rb', line 10
# Index the collection: build a hash whose values are the original hashes and
# whose keys are computed from each hash by `Hash#key_for`.
#
# The keys may be given as single fields, as an array, or as a list; a
# trailing Hash argument is treated as options:
#   :multi (boolean, default false): allow several hashes per key; every
#     value in the result is then an array of hashes
#   :first (boolean, default false): on a repeated key keep the first hash
#     and ignore the rest
# The options are also passed through to `Hash#key_for` (e.g. :truncate).
#
# @param keyarray [Array] key field(s), optionally followed by an options Hash
# @return [Hash] extended with DataTools::HashOfArrays
# @raise [RuntimeError] if no key fields are given, if a record yields a nil
#   key, or (without :multi/:first) if two records share a key
def key_on(*keyarray)
  opts = keyarray.last.is_a?(Hash) ? keyarray.pop : {}
  keyarray = keyarray.flatten
  # Checked only after the options hash is removed and nested arrays are
  # flattened, so calls such as key_on(:multi => true) or key_on([]) are
  # rejected instead of proceeding with no key fields at all.
  raise "Key(s) required" if keyarray.empty?

  memo = opts[:multi] ? Hash.new { |h, k| h[k] = [] } : {}
  each do |hash|
    this_key = hash.key_for(keyarray, opts)
    raise "Missing value for #{keyarray} in record #{hash}" if this_key.nil?

    if opts[:multi]
      memo[this_key] << hash
    elsif opts[:first]
      # ignore this value if we already have one for this key
      memo[this_key] = hash unless memo.key?(this_key)
    else
      raise "Found duplicate #{keyarray} in #{memo[this_key]} vs #{hash}" if memo.key?(this_key)

      memo[this_key] = hash
    end
  end
  memo.extend DataTools::HashOfArrays
  # Assigning a default clears the default proc, so looking up a missing key
  # in the result returns nil instead of creating an empty array.
  memo.default = nil
  memo
end
#metrics ⇒ Object
156 157 158 159 160 161 162 163 164 165 166 167 168 169 |
# File 'lib/data_tools/array_of_hashes.rb', line 156
# Summarise how every key is populated across the collection.
#
# For each key reported by #allkeys the result holds the number of non-nil
# values (:non_nil), the number of nil or absent values (:nil) and the number
# of distinct values (:unique). Keys with at most 10 distinct values also get
# their value distribution from #histogram under :values.
#
# @return [Hash] key => statistics hash
def metrics
  allkeys.each_with_object({}) do |key, report|
    values = map { |record| record[key] }
    present = values.compact.count
    stats = {
      :non_nil => present,
      :nil => values.count - present,
      :unique => values.uniq.count
    }
    stats[:values] = histogram(key) if stats[:unique] <= 10
    report[key] = stats
  end
end
#nilify!(keyvalue) ⇒ Object
175 176 177 |
# File 'lib/data_tools/array_of_hashes.rb', line 175
# Call `nilify!(keyvalue)` on every element of the collection.
# NOTE(review): the per-hash behaviour lives in Hash#nilify!, defined outside
# this file — presumably it replaces values equal to +keyvalue+ with nil; confirm.
#
# @param keyvalue [Object] forwarded unchanged to each element
# @return [self]
def nilify!(keyvalue)
  each do |record|
    record.nilify!(keyvalue)
  end
end
#numify!(*keyarray) ⇒ Object
171 172 173 |
# File 'lib/data_tools/array_of_hashes.rb', line 171
# Call `numify!(*keyarray)` on every element of the collection.
# NOTE(review): the per-hash behaviour lives in Hash#numify!, defined outside
# this file — presumably it converts the named fields to numbers; confirm.
#
# @param keyarray [Array] forwarded unchanged (splatted) to each element
# @return [self]
def numify!(*keyarray)
  each do |record|
    record.numify!(*keyarray)
  end
end
#pluck(*keys) ⇒ Object
pull out all the named attributes from the hashes in the array (into array-of-arrays)
199 200 201 202 203 204 205 206 207 |
# File 'lib/data_tools/array_of_hashes.rb', line 199
# Extract the named attributes from every hash by delegating to Hash#pluck
# (defined outside this file), giving one result per hash.
#
# @param keys [Array] attribute names; nested arrays are flattened
# @return [Array] one Hash#pluck result per element
def pluck(*keys)
  wanted = keys.flatten
  map { |record| record.pluck(wanted) }
  # Earlier inline implementation, kept for reference:
  # if keys.count > 1
  #   map {|h| keys.map {|k| h[k]}}
  # else
  #   map {|h| h[keys.first]}
  # end
end
#project(args) ⇒ Object
For each record, output a subset of the values as an array (suitable for passing to ‘#to_csv`) supports hierarchical subkeys (e.g. :master:id or “master:name”)
211 212 213 214 215 216 217 218 |
# File 'lib/data_tools/array_of_hashes.rb', line 211
# For each record, produce an array holding one value per requested key.
#
# Each value is the first truthy candidate among: the nested lookup
# record[key.superkey][key.subkey] (tried only when key.splitkey? is true),
# record[key], args[:defaults][key], and finally args[:nilvalue].
# The key objects must respond to #splitkey?, #superkey and #subkey, which
# are defined outside this file.
#
# @param args [Hash] :keys (required list of keys), :defaults (optional hash
#   of per-key fallbacks), :nilvalue (optional last-resort value)
# @return [Array<Array>] one row of values per record
def project(args)
  fallbacks = args[:defaults] || {}
  map do |record|
    args[:keys].map do |field|
      nested = record[field.superkey] if field.splitkey?
      (nested && nested[field.subkey]) || record[field] || fallbacks[field] || args[:nilvalue]
    end
  end
end
#redundant(*keyarray) ⇒ Object
88 89 90 |
# File 'lib/data_tools/array_of_hashes.rb', line 88
# Find the key values shared by more than one hash: groups the collection
# with #key_on in :multi mode and keeps only groups holding several hashes.
#
# @param keyarray [Array] key field(s), passed to #key_on
# @return [Hash] key => array of the hashes sharing that key
def redundant(*keyarray)
  grouped = key_on(keyarray, :multi => true)
  grouped.select { |_key, records| records.count > 1 }
end
#resolve_all(key, &block) ⇒ Object
apply the same resolution operation to every hash in the list
106 107 108 109 110 111 112 |
# File 'lib/data_tools/array_of_hashes.rb', line 106
# Build a new list in which each hash is a shallow copy of the original with
# the value under +key+ replaced by the result of calling `resolve` on it
# with the given block. The original hashes are left untouched.
# NOTE(review): `resolve` is defined outside this file; its semantics are not
# visible here.
#
# @param key [Object] field whose value is resolved
# @return [Array<Hash>] the copied and updated hashes
def resolve_all(key, &block)
  map do |record|
    copy = record.dup
    copy[key] = copy[key].resolve(&block)
    copy
  end
end
#subset(*keys) ⇒ Object
hash slice for all the named attributes from each hashes in the array
193 194 195 196 |
# File 'lib/data_tools/array_of_hashes.rb', line 193
# Take a slice of every hash by delegating to Hash#subset (defined outside
# this file) with the flattened list of attribute names.
#
# @param keys [Array] attribute names; nested arrays are flattened
# @return [Array] one Hash#subset result per element
def subset(*keys)
  wanted = keys.flatten
  map do |record|
    record.subset(wanted)
  end
end
#tsvme(filename, fields, headers = fields) ⇒ Object
# attempt to dump out contents of this array-of-hashes as CSV to named file # fields is list of attribute names to write out # options headers is public names for the fields def csvme(filename, fields, headers = fields)
CSV.open(filename, "wb") do |csv|
csv << headers unless headers.nil?
pluck(fields).each do |ary|
csv << ary
end
end
true
end
139 140 141 142 143 144 145 146 147 |
# File 'lib/data_tools/array_of_hashes.rb', line 139
# Write the collection to a file as tab-separated values: an optional header
# line followed by one line per hash, with the values taken from #pluck.
#
# @param filename [String] path of the file to write (created or truncated)
# @param fields [Array] attribute names to write out, in column order
# @param headers [Array, nil] column titles; defaults to +fields+, and nil
#   suppresses the header line
# @return [true]
def tsvme(filename, fields, headers = fields)
  # Open the +filename+ argument for writing: the undefined local `target`
  # raised NameError, and a mode-less File.open is read-only anyway.
  File.open(filename, "w") do |output|
    output.puts headers.join("\t") unless headers.nil?
    pluck(fields).each do |ary|
      output.puts ary.join("\t")
    end
  end
  true
end
#unique?(*keyarray) ⇒ Boolean
are all the values for ‘key` defined and unique?
64 65 66 67 68 69 70 |
# File 'lib/data_tools/array_of_hashes.rb', line 64
# Check that every hash yields a non-nil key for the given field(s) and that
# no two hashes yield the same key. Keys are computed by Hash#key_for
# (defined outside this file).
#
# @param keyarray [Array] key field(s); nested arrays are flattened
# @return [Boolean]
# @raise [RuntimeError] if no key fields are given
def unique?(*keyarray)
  raise "Key(s) required" if keyarray.empty?

  fields = keyarray.flatten
  identifiers = map { |record| record.key_for(fields) }
  identifiers.none?(&:nil?) && identifiers.uniq.count == count
end
#unique_values_for(*keyarray) ⇒ Object
72 73 74 75 76 |
# File 'lib/data_tools/array_of_hashes.rb', line 72
# Collect the distinct keys that the hashes yield for the given field(s).
# Keys are computed by Hash#key_for (defined outside this file).
#
# @param keyarray [Array] key field(s); nested arrays are flattened
# @return [Set] the distinct key values
# @raise [RuntimeError] if no key fields are given
def unique_values_for(*keyarray)
  raise "Key(s) required" if keyarray.empty?

  fields = keyarray.flatten
  identifiers = map { |record| record.key_for(fields) }
  identifiers.to_set
end
#where(conditions) ⇒ Object
39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 |
# File 'lib/data_tools/array_of_hashes.rb', line 39
# Select the records matching +conditions+.
#
# With a Hash, a record matches when every key => test pair passes
# (an empty Hash therefore matches every record):
#   Regexp - the value under the key matches the pattern
#   true   - the value under the key is not nil
#   false  - the value under the key is nil
#   other  - the value under the key == the given value
# With a String or Symbol, a record matches when it holds a truthy value
# under that key.
#
# @param conditions [Hash, String, Symbol]
# @return [Array<Hash>] matching records, extended with DataTools::ArrayOfHashes
# @raise [ArgumentError] if +conditions+ is of any other type
def where(conditions)
  matches =
    case conditions
    when Hash
      select do |record|
        # all? keeps every test a plain boolean. Folding the raw results with
        # reduce(:&) raised TypeError when a Regexp match (an Integer
        # position) came first, and gave nil for empty conditions.
        conditions.all? do |key, expected|
          actual = record[key]
          case expected
          when Regexp
            # Regexp#=== is false for non-string values instead of calling
            # =~ on them, which is undefined for most objects on Ruby 3.2+.
            expected === actual
          when TrueClass
            !actual.nil?
          when FalseClass
            actual.nil?
          else
            actual == expected
          end
        end
      end
    when String, Symbol
      # just check for presence & non-nil value of specified key
      select { |record| record[conditions] }
    else
      raise ArgumentError, "unsupported conditions: #{conditions.inspect}"
    end
  matches.extend DataTools::ArrayOfHashes
end