Class: IMW::Tools::Summarizer

Inherits:
Object
  • Object
show all
Defined in:
lib/imw/tools/summarizer.rb

Overview

A class for producing summary data about a collection of resources.

This summary data includes the directory tree, file sizes, file formats, record counts, &c.

Instance Attribute Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(*inputs) ⇒ IMW::Tools::Summarizer

Initialize a new Summarizer with the given inputs.

Parameters:



18
19
20
# File 'lib/imw/tools/summarizer.rb', line 18

# Build a new Summarizer over the given inputs.
#
# Inputs may be given one by one or as (nested) arrays; the argument
# list is flattened before being handed to the +inputs=+ writer.
#
# @param inputs [Array] the inputs to summarize
def initialize(*inputs)
  flattened = inputs.flatten
  self.inputs = flattened
end

Instance Attribute Details

#inputs ⇒ Object

The inputs to this Summarizer.



12
13
14
# File 'lib/imw/tools/summarizer.rb', line 12

# Reader for the inputs of this Summarizer.
#
# @return [Object] the current value of @inputs
def inputs
  @inputs
end

Instance Method Details

#clear_cached_statistics! ⇒ Object

Reset all the cached statistics of this summarizer to nil.



37
38
39
40
41
42
43
44
45
46
47
48
49
# File 'lib/imw/tools/summarizer.rb', line 37

# Reset every memoized statistic of this summarizer to nil so that it
# is recomputed on next access.
#
# @return [Array<Symbol>] the names of the statistics that were reset
def clear_cached_statistics!
  [:num_files,
   :num_directories, # previously misspelled, which left @num_directories stale
   :total_size,
   :extension_counts,
   :most_common_extension_by_count,
   :normalized_extension_counts,
   :extension_sizes,
   :most_common_extension_by_size,
   :normalized_extension_sizes].each do |statistic|
    instance_variable_set("@#{statistic}", nil)
  end
end

#extension_counts ⇒ Hash

Return the file counts of each extension.

Returns:



75
76
77
78
79
80
81
82
83
# File 'lib/imw/tools/summarizer.rb', line 75

# Tally how many non-directory inputs carry each extension.
#
# The tally is built once and memoized in @extension_counts.
#
# @return [Hash] extension => number of inputs with that extension
def extension_counts
  return @extension_counts if @extension_counts

  tally = {}
  inputs.each do |input|
    unless input.is_directory?
      ext = input.extension
      tally[ext] = tally.fetch(ext, 0) + 1
    end
  end
  @extension_counts = tally
end

#extension_sizes ⇒ Hash

Return the amount of data corresponding to each extension.

Returns:



111
112
113
114
115
116
117
118
119
# File 'lib/imw/tools/summarizer.rb', line 111

# Add up the sizes of all non-directory inputs, grouped by extension.
#
# The totals are built once and memoized in @extension_sizes.
#
# @return [Hash] extension => summed size of inputs with that extension
def extension_sizes
  return @extension_sizes if @extension_sizes

  totals = {}
  inputs.each do |input|
    unless input.is_directory?
      ext = input.extension
      totals[ext] = totals.fetch(ext, 0) + input.size
    end
  end
  @extension_sizes = totals
end

#most_common_data_format ⇒ String

Returns a guess as to the most common data format for this Summarizer’s inputs.

Returns:



162
163
164
165
# File 'lib/imw/tools/summarizer.rb', line 162

# Guess the most common data format among this Summarizer's inputs.
#
# The guess is the most common extension, except that any of the known
# archive extensions is reported under the single label 'archive'.
#
# @return [String] 'archive' or the most common extension
def most_common_data_format
  archive_extensions = ['tar', 'tar.bz2', 'tar.gz', 'tgz', 'tbz2', 'zip', 'rar']
  winner = most_common_extension
  if archive_extensions.include?(winner)
    'archive'
  else
    winner
  end
end

#most_common_extension ⇒ String

Return a guess as to the most common extension format for this Summarizer’s inputs.

Returns:



149
150
151
152
153
154
155
156
# File 'lib/imw/tools/summarizer.rb', line 149

# Guess the most common extension among this Summarizer's inputs.
#
# When the winner by file count and the winner by data size agree, that
# extension is returned. Otherwise the winner by count is chosen only if
# it holds more than half of the files while the winner by size holds
# less than half of the data; in every other case size wins.
#
# @return [String] the chosen extension
def most_common_extension
  by_size  = most_common_extension_by_size
  by_count = most_common_extension_by_count
  return by_size if by_size == by_count # no contest

  # The by_count/by_size winners report blank extensions under the
  # placeholder 'flat', which is not a key of the normalized tables.
  # A plain lookup would then yield nil and the comparisons below would
  # raise, so fall back to the largest share held by a blank key.
  share_of = lambda do |shares, extension|
    shares.fetch(extension) do
      blank_shares = shares.keys.select { |key| key.to_s.strip.empty? }.map { |key| shares[key] }
      blank_shares.reject { |share| share.respond_to?(:nan?) && share.nan? }.max || 0.0
    end
  end

  count_fraction = share_of.call(normalized_extension_counts, by_count)
  size_fraction  = share_of.call(normalized_extension_sizes, by_size)

  # choose the winner based on differential
  return by_count if count_fraction > 0.5 && size_fraction < 0.5

  by_size # default to size
end

#most_common_extension_by_count ⇒ Object

Return the most common extension by count of files.



86
87
88
89
90
91
92
93
94
# File 'lib/imw/tools/summarizer.rb', line 86

# Return the extension carried by the largest number of files.
#
# On a tie the extension encountered first in extension_counts wins.
# A blank or missing winner (including the case of no counted files at
# all) is reported under the placeholder 'flat'. The result is memoized
# in @most_common_extension_by_count.
#
# @return [String] the most common extension by count, or 'flat'
def most_common_extension_by_count
  return @most_common_extension_by_count if @most_common_extension_by_count

  best_count, best_extension = 0, nil
  extension_counts.each_pair do |extension, count|
    # Track the running maximum; without updating best_count the last
    # extension with a positive count would always win.
    best_count, best_extension = count, extension if count > best_count
  end
  # to_s guards against a nil winner, which has no #strip.
  best_extension = 'flat' if best_extension.to_s.strip.empty?
  @most_common_extension_by_count = best_extension
end

#most_common_extension_by_size ⇒ String

Return the most common extension by amount of data.

Returns:



124
125
126
127
128
129
130
131
132
# File 'lib/imw/tools/summarizer.rb', line 124

# Return the extension that accounts for the largest amount of data.
#
# On a tie the extension encountered first in extension_sizes wins.
# A blank or missing winner (including the case where no extension has
# a size above zero) is reported under the placeholder 'flat'. The
# result is memoized in @most_common_extension_by_size.
#
# @return [String] the most common extension by size, or 'flat'
def most_common_extension_by_size
  return @most_common_extension_by_size if @most_common_extension_by_size

  best_size, best_extension = 0, nil
  extension_sizes.each_pair do |extension, size|
    # Track the running maximum; without updating best_size the last
    # extension with a positive size would always win.
    best_size, best_extension = size, extension if size > best_size
  end
  # to_s guards against a nil winner, which has no #strip.
  best_extension = 'flat' if best_extension.to_s.strip.empty?
  @most_common_extension_by_size = best_extension
end

#normalized_extension_counts ⇒ Hash

Return the file counts of each extension, normalized by the total number of files.

Returns:



100
101
102
103
104
105
106
# File 'lib/imw/tools/summarizer.rb', line 100

# Express each extension's file count as a fraction of num_files.
#
# The table is built once and memoized in @normalized_extension_counts.
#
# @return [Hash] extension => Float share of the file count
def normalized_extension_counts
  return @normalized_extension_counts if @normalized_extension_counts

  fractions = {}
  extension_counts.each_pair do |ext, how_many|
    fractions[ext] = how_many.to_f / num_files.to_f
  end
  @normalized_extension_counts = fractions
end

#normalized_extension_sizes ⇒ Hash

Return the fractional share of each extension by file size.

Returns:



137
138
139
140
141
142
143
# File 'lib/imw/tools/summarizer.rb', line 137

# Express each extension's amount of data as a fraction of total_size.
#
# The table is built once and memoized in @normalized_extension_sizes.
#
# @return [Hash] extension => Float share of the total size
def normalized_extension_sizes
  return @normalized_extension_sizes if @normalized_extension_sizes

  fractions = {}
  extension_sizes.each_pair do |ext, bytes|
    fractions[ext] = bytes.to_f / total_size.to_f
  end
  @normalized_extension_sizes = fractions
end

#num_directories ⇒ Integer

Return the number of directories.

Returns:

  • (Integer)


61
62
63
# File 'lib/imw/tools/summarizer.rb', line 61

# Return the number of inputs that are directories.
#
# The count is memoized in @num_directories.
#
# @return [Integer] how many inputs answer true to is_directory?
def num_directories
  # count, not collect: collect produced an Array of booleans rather
  # than the documented Integer.
  @num_directories ||= inputs.count { |input| input.is_directory? }
end

#num_files ⇒ Integer

Return the number of files.

Returns:

  • (Integer)


54
55
56
# File 'lib/imw/tools/summarizer.rb', line 54

# Return the number of inputs that are files, i.e. not directories.
#
# Directories are left out so that this count agrees with
# extension_counts, which skips directories as well; otherwise the
# per-extension fractions derived from it would not add up to one.
# The count is memoized in @num_files.
#
# @return [Integer] how many inputs answer false to is_directory?
def num_files
  @num_files ||= inputs.count { |input| !input.is_directory? }
end

#total_size ⇒ Integer

Return the total size.

Returns:

  • (Integer)


68
69
70
# File 'lib/imw/tools/summarizer.rb', line 68

# Return the sum of the sizes reported by all inputs.
#
# The sum starts from 0 and is memoized in @total_size.
#
# @return [Integer] the combined size of the inputs
def total_size
  return @total_size if @total_size

  sizes = inputs.map { |input| input.size }
  @total_size = sizes.inject(0) { |running_total, size| size + running_total }
end