Module: DataTools::ArrayOfHashes

Defined in:
lib/data_tools/array_of_hashes.rb

Instance Method Summary collapse

Instance Method Details

#allkeys ⇒ Object

What different keys appear in this collection of hashes?



150
151
152
153
154
# File 'lib/data_tools/array_of_hashes.rb', line 150

# List every distinct key that appears in any hash of this collection.
#
# The previous implementation accumulated counts into a plain `{}`, so the
# first `memo[k] += 1` raised NoMethodError (nil + 1) on any non-empty input.
#
# @return [Array] distinct keys, in order of first appearance
def allkeys
  flat_map(&:keys).uniq
end

#coalesce ⇒ Object

Combine a set of hashes into one. For each key, find all the distinct values across all the hashes: if there is a single unique value, store that value under the key in the result; if there are multiple values, store them all as an array.



96
97
98
99
100
101
102
103
# File 'lib/data_tools/array_of_hashes.rb', line 96

# Merge all hashes in the collection into a single hash.
#
# For each key, the distinct non-nil values are gathered from every hash.
# Zero or one distinct value is stored as a scalar (nil when there are none);
# two or more are stored together as an array.
#
# @return [Hash] one entry per key seen in the collection
def coalesce
  fields = map(&:keys).flatten.uniq
  fields.each_with_object({}) do |field, merged|
    distinct = map { |record| record[field] }.compact.uniq
    merged[field] = distinct.count <= 1 ? distinct.first : distinct
  end
end

#count_off!(key = :key, start = 0) ⇒ Object

Assign unique sequential IDs to every hash in the array. The first argument is the name of the field to use for the generated sequential key; the second is the number to start counting from.



80
81
82
83
84
85
86
# File 'lib/data_tools/array_of_hashes.rb', line 80

# Stamp each hash with a sequential number, in collection order.
#
# @param key [Object] field that receives the generated number
# @param start [Integer] number given to the first hash
# @return [self]
# @raise [RuntimeError] if any hash already holds a truthy value under `key`
def count_off!(key = :key, start = 0)
  occupied = any? { |record| record[key] }
  raise "Values exist for [#{key}]" if occupied

  each_with_index { |record, position| record[key] = position + start }
  self
end

#dumpme(filename) ⇒ Object

Marshal (Ruby-specific binary format) the contents of this structure to a file. Fails if the file already exists.



116
117
118
119
# File 'lib/data_tools/array_of_hashes.rb', line 116

# Serialize this structure with Marshal into a new file.
#
# Uses `File.exist?` (the `File.exists?` alias was removed in Ruby 3.2) and
# opens the file in binary mode, since Marshal output is binary data.
#
# @param filename [String] path of the file to create
# @return [File] the (closed) file handle, as returned by the write block
# @raise [RuntimeError] if `filename` already exists
def dumpme(filename)
  raise "#{filename} exists" if File.exist?(filename)
  File.open(filename, "wb") {|f| f << Marshal.dump(self)}
end

#dumpme!(filename) ⇒ Object

same as #dumpme but overwrites existing file



121
122
123
124
# File 'lib/data_tools/array_of_hashes.rb', line 121

# Serialize this structure with Marshal into a file, replacing any existing
# file of the same name.
#
# Uses `File.exist?` (the `File.exists?` alias was removed in Ruby 3.2) and
# opens the file in binary mode, since Marshal output is binary data.
#
# @param filename [String] path of the file to write
# @return [File] the (closed) file handle, as returned by the write block
def dumpme!(filename)
  # Remove the old file first so the dump lands in a freshly created file.
  File.unlink(filename) if File.exist?(filename)
  File.open(filename, "wb") {|f| f << Marshal.dump(self)}
end

#histogram(*args, &block) ⇒ Object

return histogram of value distribution for the specified key: hash of value/count pairs



180
181
182
183
184
185
186
187
188
189
190
# File 'lib/data_tools/array_of_hashes.rb', line 180

# Count how often each value occurs across the collection.
#
# The value for a record comes from the block when one is given; otherwise
# it is looked up in the record under the first positional argument.
#
# @param args [Array] only the first element is used, as the lookup key
# @yield [record] optional; returns the value to count for that record
# @return [Hash] value => occurrence count (missing values read as 0)
def histogram(*args, &block)
  field = args.first
  each_with_object(Hash.new(0)) do |record, counts|
    bucket = block_given? ? yield(record) : record[field]
    counts[bucket] += 1
  end
end

#key_on(*keyarray) ⇒ Object

Convert an array of hashes to a hash of the same hashes, where the key values are picked from the hashes. The keys can be given as a single field, an array, or a list. Options:

:multi (boolean, default false): if true, allow multiple values per key; store values as an array for each key
:first (boolean, default false): if true, when finding multiple values per key, store only the first and ignore the rest
:truncate (integer): see `Hash#key_for`


10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
# File 'lib/data_tools/array_of_hashes.rb', line 10

# Index the collection: build a hash whose values are the original records
# and whose keys are derived from each record via `record.key_for`.
#
# The "Key(s) required" check now runs after the trailing options hash has
# been removed and the keys flattened, so calls such as
# `key_on(:multi => true)` or `key_on([])` are rejected instead of
# proceeding with an empty key list.
#
# @param keyarray [Array] field name(s), optionally followed by an options hash:
#   :multi — collect every record for a key into an array;
#   :first — keep only the first record seen for a key;
#   any other options are handed to `key_for` untouched
# @return [Hash] key => record (or key => array of records with :multi)
# @raise [RuntimeError] when no keys are given, when a record yields a nil
#   key, or when a duplicate key is found and neither :multi nor :first is set
def key_on(*keyarray)
  opts = keyarray.last.is_a?(Hash) ? keyarray.pop : {}
  keyarray = keyarray.flatten
  raise "Key(s) required" if keyarray.empty?

  memo = opts[:multi] ? Hash.new {|index, k| index[k] = []} : Hash.new
  each do |hash|
    # NOTE(review): key_for is defined outside this file; presumably it
    # returns nil when the record lacks the requested field(s) — confirm.
    this_key = hash.key_for(keyarray, opts)
    raise "Missing value for #{keyarray} in record #{hash}" if this_key.nil?
    if opts[:multi]
      memo[this_key] << hash
    elsif opts[:first]
      # ignore this value if we already have one for this key
      memo[this_key] = hash unless memo.has_key?(this_key)
    else
      raise "Found duplicate #{keyarray} in #{memo[this_key]} vs #{hash}" if memo.has_key?(this_key)
      memo[this_key] = hash
    end
  end
  memo.extend DataTools::HashOfArrays
  # Assigning a default discards the auto-vivifying block used while
  # building, so later lookups of unknown keys return nil.
  memo.default = nil
  memo
end

#metrics ⇒ Object



156
157
158
159
160
161
162
163
164
165
166
167
168
169
# File 'lib/data_tools/array_of_hashes.rb', line 156

# Summarize every key in the collection.
#
# For each key returned by `allkeys`, reports how many records hold a
# non-nil value, how many hold nil (or lack the key), and how many distinct
# values occur. When there are at most 10 distinct values, a histogram of
# them is included under :values.
#
# @return [Hash] key => {:non_nil, :nil, :unique[, :values]}
def metrics
  allkeys.each_with_object({}) do |field, report|
    column = map { |record| record[field] }
    present = column.compact.count
    stats = {
      :non_nil => present,
      :nil => column.count - present,
      :unique => column.uniq.count
    }
    stats[:values] = histogram(field) if stats[:unique] <= 10
    report[field] = stats
  end
end

#nilify!(keyvalue) ⇒ Object



175
176
177
# File 'lib/data_tools/array_of_hashes.rb', line 175

# Forward `nilify!(keyvalue)` to every element of the collection.
# NOTE(review): the element-level nilify! is defined outside this file;
# presumably it replaces values equal to `keyvalue` with nil — confirm.
#
# @param keyvalue [Object] passed through unchanged to each element
# @return [self]
def nilify!(keyvalue)
  each do |record|
    record.nilify!(keyvalue)
  end
end

#numify!(*keyarray) ⇒ Object



171
172
173
# File 'lib/data_tools/array_of_hashes.rb', line 171

# Forward `numify!(*keyarray)` to every element of the collection.
# NOTE(review): the element-level numify! is defined outside this file;
# presumably it converts the named fields to numbers — confirm.
#
# @param keyarray [Array] passed through (splatted) to each element
# @return [self]
def numify!(*keyarray)
  each do |record|
    record.numify!(*keyarray)
  end
end

#pluck(*keys) ⇒ Object

pull out all the named attributes from the hashes in the array (into array-of-arrays)



199
200
201
202
203
204
205
206
207
# File 'lib/data_tools/array_of_hashes.rb', line 199

# Extract the named attributes from every record.
#
# The key list is flattened, then each record's own `pluck` is called with
# it; the per-record results are returned in collection order.
# NOTE(review): the shape of each entry is decided by the element-level
# pluck, which is defined outside this file.
#
# @param keys [Array] attribute names, given as a list and/or nested arrays
# @return [Array] one entry per record
def pluck(*keys)
  fields = keys.flatten
  map { |record| record.pluck(fields) }
end

#project(args) ⇒ Object

For each record, output a subset of the values as an array (suitable for passing to `#to_csv`). Supports hierarchical subkeys (e.g. `:master:id` or `"master:name"`).



211
212
213
214
215
216
217
218
# File 'lib/data_tools/array_of_hashes.rb', line 211

# Turn each record into an array of values, one per requested key.
#
# For every key the first truthy candidate wins, tried in this order:
#   1. for a split key, the sub-key looked up inside the record's super-key value
#   2. the record's own value for the key
#   3. the entry for the key in args[:defaults]
#   4. args[:nilvalue]
# Because the chain uses ||, a stored `false` is treated like a missing value.
#
# @param args [Hash] :keys (required list of keys), :defaults (optional
#   per-key fallbacks), :nilvalue (optional final fallback)
# @return [Array<Array>] one row of values per record
def project(args)
  defaults = args[:defaults] || {}
  map do |record|
    args[:keys].map do |k|
      nested = k.splitkey? && (parent = record[k.superkey]) && parent[k.subkey]
      nested || record[k] || defaults[k] || args[:nilvalue]
    end
  end
end

#redundant(*keyarray) ⇒ Object



88
89
90
# File 'lib/data_tools/array_of_hashes.rb', line 88

# Find the key values shared by more than one record.
#
# Groups the records with `key_on(..., :multi => true)` and keeps only the
# groups holding two or more records.
#
# @param keyarray [Array] field name(s) forwarded to `key_on`
# @return [Hash] key => array of the records sharing that key
def redundant(*keyarray)
  grouped = key_on(keyarray, :multi => true)
  grouped.select { |_key, records| records.count > 1 }
end

#resolve_all(key, &block) ⇒ Object

apply the same resolution operation to every hash in the list



106
107
108
109
110
111
112
# File 'lib/data_tools/array_of_hashes.rb', line 106

# Apply the same resolution to one field of every record.
#
# Each record is shallow-copied (`dup`), and in the copy the value under
# `key` is replaced by the result of calling `resolve` on it with the given
# block. The original records are left untouched.
# NOTE(review): `resolve` is defined outside this file — confirm its contract.
#
# @param key [Object] field whose value gets resolved
# @yield forwarded unchanged to each value's `resolve`
# @return [Array] the modified copies, in collection order
def resolve_all(key, &block)
  map do |record|
    record.dup.tap do |copy|
      copy[key] = copy[key].resolve(&block)
    end
  end
end

#subset(*keys) ⇒ Object

Hash slice of all the named attributes from each hash in the array.



193
194
195
196
# File 'lib/data_tools/array_of_hashes.rb', line 193

# Slice every record down to the named attributes.
#
# The key list is flattened, then each record's own `subset` is called with
# it; the per-record results are returned in collection order.
# NOTE(review): the element-level subset is defined outside this file.
#
# @param keys [Array] attribute names, given as a list and/or nested arrays
# @return [Array] one entry per record
def subset(*keys)
  fields = keys.flatten
  map do |record|
    record.subset(fields)
  end
end

#tsvme(filename, fields, headers = fields) ⇒ Object

# attempt to dump out contents of this array-of-hashes as CSV to named file # fields is list of attribute names to write out # options headers is public names for the fields def csvme(filename, fields, headers = fields)

CSV.open(filename, "wb") do |csv|
  csv << headers unless headers.nil?
  pluck(fields).each do |ary|
    csv << ary
  end
end
true

end



139
140
141
142
143
144
145
146
147
# File 'lib/data_tools/array_of_hashes.rb', line 139

# Write the collection to a file as tab-separated values.
#
# Fixes two defects in the previous version: it opened the undefined local
# `target` rather than `filename`, and it opened the file read-only, so
# nothing could ever be written. A nil `headers` now suppresses the header
# line instead of raising.
#
# @param filename [String] path of the file to write (truncated if present)
# @param fields [Array] attribute names, passed to `pluck` to build each row
# @param headers [Array, nil] column titles for the first line; defaults to
#   `fields`, pass nil for no header line
# @return [true]
def tsvme(filename, fields, headers = fields)
  File.open(filename, "w") do |output|
    output.puts headers.join("\t") unless headers.nil?
    pluck(fields).each do |ary|
      output.puts ary.join("\t")
    end
  end
  true
end

#unique?(*keyarray) ⇒ Boolean

Are all the values for `key` defined and unique?

Returns:

  • (Boolean)


64
65
66
67
68
69
70
# File 'lib/data_tools/array_of_hashes.rb', line 64

# Check whether the given field(s) identify every record unambiguously.
#
# A key is computed for each record via `record.key_for`; the answer is true
# only when no key is nil and no two records share a key.
#
# @param keyarray [Array] field name(s), given as a list and/or nested arrays
# @return [Boolean]
# @raise [RuntimeError] when called without any keys
def unique?(*keyarray)
  raise "Key(s) required" if keyarray.empty?

  fields = keyarray.flatten
  identities = map { |record| record.key_for(fields) }
  identities.none?(&:nil?) && identities.uniq.count == count
end

#unique_values_for(*keyarray) ⇒ Object



72
73
74
75
76
# File 'lib/data_tools/array_of_hashes.rb', line 72

# Collect the distinct keys that the given field(s) produce across records.
#
# A key is computed for each record via `record.key_for` and the results are
# gathered into a Set, so duplicates collapse.
#
# @param keyarray [Array] field name(s), given as a list and/or nested arrays
# @return [Set]
# @raise [RuntimeError] when called without any keys
def unique_values_for(*keyarray)
  raise "Key(s) required" if keyarray.empty?

  fields = keyarray.flatten
  identities = map do |record|
    record.key_for(fields)
  end
  identities.to_set
end

#where(conditions) ⇒ Object

Shorthand for `Array#select {|hash| hash && hash …}`: find all the members of the array where all the specified criteria are true.



39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
# File 'lib/data_tools/array_of_hashes.rb', line 39

# Select the records that satisfy every given criterion.
#
# `conditions` may be:
#   * a Hash of field => test, where the test is
#       - a Regexp: the field's value must match it,
#       - true:     the field must be non-nil,
#       - false:    the field must be nil,
#       - anything else: the field must == that value;
#   * a String or Symbol: the record must hold a truthy value under it.
#
# Changes from the previous version:
#   * The tests are combined with `all?` instead of `reduce(:&)`. The old
#     fold applied `&` to raw results, so a matching Regexp test listed first
#     (an Integer index) followed by another test raised on `Integer & true`.
#     As a consequence an empty Hash now selects every record rather than none.
#   * Regexp tests use `Regexp#===`, which yields a plain boolean and simply
#     returns false for values that are not strings or symbols.
#   * An unsupported `conditions` type raises ArgumentError. Previously the
#     case fell through to nil, and `nil.extend` mixed this module into
#     NilClass.
#
# @param conditions [Hash, String, Symbol]
# @return [Array] matching records, extended with DataTools::ArrayOfHashes
# @raise [ArgumentError] for any other kind of `conditions`
def where(conditions)
  matches = case conditions
  when Hash
    select do |record|
      conditions.all? do |k, v|
        case v
        when Regexp
          v === record[k]
        when TrueClass
          !record[k].nil?
        when FalseClass
          record[k].nil?
        else
          record[k] == v
        end
      end
    end
  when String, Symbol
    # just check for presence & non-nil value of specified key
    select {|record| record[conditions]}
  else
    raise ArgumentError, "Unsupported conditions: #{conditions.inspect}"
  end
  matches.extend DataTools::ArrayOfHashes
end