Module: IMW::Tools::ExtensionAnalyzer

Included in:
Summarizer
Defined in:
lib/imw/tools/extension_analyzer.rb

Overview

Mixin with some heuristic methods for identifying common extensions and likely data formats for a collection of files.

Requires the including class to define a method `resources`, which returns an array of IMW::Resource objects, as well as a method `total_size`, which gives the total size of the resources (used for weighting extensions by size).

Instance Method Summary

Instance Method Details

#extension_counts ⇒ Hash

Return the file counts of each extension.

Returns:



16
17
18
19
20
21
22
23
24
# File 'lib/imw/tools/extension_analyzer.rb', line 16

# Return the file counts of each extension.
#
# Directories are skipped. The result is memoized in @extension_counts, so
# later changes to +resources+ are not reflected.
#
# @return [Hash] maps each extension to the number of non-directory
#   resources carrying it
def extension_counts
  # Object#tap is the core-Ruby replacement for ActiveSupport's
  # Object#returning, which was deprecated and later removed.
  @extension_counts ||= {}.tap do |counts|
    resources.each do |resource|
      next if resource.is_directory?
      extension = resource.extension
      counts[extension] = (counts[extension] || 0) + 1
    end
  end
end

#extension_sizes ⇒ Hash

Return the amount of data corresponding to each extension.

Returns:



53
54
55
56
57
58
59
60
61
# File 'lib/imw/tools/extension_analyzer.rb', line 53

# Return the amount of data corresponding to each extension.
#
# Directories are skipped. The result is memoized in @extension_sizes, so
# later changes to +resources+ are not reflected.
#
# @return [Hash] maps each extension to the summed +size+ of the
#   non-directory resources carrying it
def extension_sizes
  # Object#tap is the core-Ruby replacement for ActiveSupport's
  # Object#returning, which was deprecated and later removed.
  @extension_sizes ||= {}.tap do |sizes|
    resources.each do |resource|
      next if resource.is_directory?
      extension = resource.extension
      sizes[extension] = (sizes[extension] || 0) + resource.size
    end
  end
end

#most_common_data_format ⇒ String

Returns a guess as to the most common data format for this Summarizer’s resources.

Returns:



107
108
109
110
# File 'lib/imw/tools/extension_analyzer.rb', line 107

# Guess the dominant data format of the resources.
#
# Takes the result of +most_common_extension+ and reports any of the known
# archive extensions as 'archive'; every other value is passed through
# unchanged.
#
# @return [String] 'archive' or the most common extension
def most_common_data_format
  dominant = most_common_extension
  archive_extensions = %w[tar tar.bz2 tar.gz tgz tbz2 zip rar]
  if archive_extensions.include?(dominant)
    'archive'
  else
    dominant
  end
end

#most_common_extension ⇒ String

Return a guess as to the most common extension for this Summarizer’s resources.

Returns:



94
95
96
97
98
99
100
101
# File 'lib/imw/tools/extension_analyzer.rb', line 94

# Return a guess as to the most common extension for the resources.
#
# When the most common extension by file count and by data size agree, that
# extension is returned. Otherwise the by-count winner is chosen only if it
# covers at least half of the files while the by-size winner covers less
# than half of the data; in every other case the by-size winner is returned.
#
# @return [String] the winning extension ('flat' stands for a blank extension)
def most_common_extension
  by_count = most_common_extension_by_count
  by_size  = most_common_extension_by_size
  return by_size if by_size == by_count # no contest

  # The by_count/by_size helpers report a blank extension as 'flat', but the
  # normalized tables are still keyed by the raw blank extension. Without this
  # fallback the share of extension-less files would always read as 0.0.
  share_of = lambda do |shares, extension|
    share = shares[extension]
    if share.nil? && extension == 'flat'
      blank_shares = shares.map do |key, value|
        value if key.respond_to?(:strip) && key.strip.empty?
      end
      share = blank_shares.compact.max
    end
    share || 0.0
  end

  count_fraction = share_of.call(normalized_extension_counts, by_count)
  size_fraction  = share_of.call(normalized_extension_sizes, by_size)

  return by_count if count_fraction >= 0.5 && size_fraction < 0.5 # FIXME arbitrary
  by_size # default to size
end

#most_common_extension_by_count ⇒ Object

Return the most common extension by count of files.



27
28
29
30
31
32
33
34
35
# File 'lib/imw/tools/extension_analyzer.rb', line 27

# Return the most common extension by count of files.
#
# Ties go to the extension encountered first in +extension_counts+. A blank
# winner (or no files at all) is reported as 'flat'. The result is memoized
# in @most_common_extension_by_count.
#
# @return [String] the extension with the highest file count, or 'flat'
def most_common_extension_by_count
  return @most_common_extension_by_count if @most_common_extension_by_count
  current_count, current_extension = 0, ''
  extension_counts.each_pair do |extension, count|
    if count > current_count
      current_extension = extension
      # Track the running maximum; without this update every extension beats
      # the initial 0 and the last one iterated would win.
      current_count     = count
    end
  end
  current_extension = 'flat' if current_extension.strip.blank?
  @most_common_extension_by_count = current_extension
end

#most_common_extension_by_size ⇒ String

Return the most common extension by amount of data.

Returns:



66
67
68
69
70
71
72
73
74
75
76
77
# File 'lib/imw/tools/extension_analyzer.rb', line 66

# Return the most common extension by amount of data.
#
# Walks +extension_sizes+ and keeps the extension with the strictly largest
# size, so ties go to the one encountered first. A blank winner (or no data
# at all) is reported as 'flat'. The result is memoized in
# @most_common_extension_by_size.
#
# @return [String] the extension holding the most data, or 'flat'
def most_common_extension_by_size
  @most_common_extension_by_size ||= begin
    winner  = ''
    largest = 0
    extension_sizes.each_pair do |extension, size|
      next unless size > largest
      winner  = extension
      largest = size
    end
    winner.strip.blank? ? 'flat' : winner
  end
end

#normalized_extension_counts ⇒ Hash

Return the file counts of each extension, normalized by the total number of files.

Returns:



41
42
43
44
45
46
47
48
# File 'lib/imw/tools/extension_analyzer.rb', line 41

# Return the file counts of each extension, normalized by the total number
# of files.
#
# The denominator is the number of non-directory entries in +resources+.
# The result is memoized in @normalized_extension_counts.
#
# @return [Hash] maps each extension to its Float share of the file count
def normalized_extension_counts
  # Object#tap is the core-Ruby replacement for ActiveSupport's
  # Object#returning, which was deprecated and later removed.
  @normalized_extension_counts ||= {}.tap do |weighted|
    num_files = resources.reject(&:is_directory?).length.to_f
    extension_counts.each_pair do |extension, count|
      weighted[extension] = count.to_f / num_files
    end
  end
end

#normalized_extension_sizes ⇒ Hash

Return the fractional share of each extension by file size.

Returns:



82
83
84
85
86
87
88
# File 'lib/imw/tools/extension_analyzer.rb', line 82

# Return the fractional share of each extension by file size.
#
# Each entry of +extension_sizes+ is divided by +total_size+. When the total
# is zero every share is reported as 0.0 rather than NaN. The result is
# memoized in @normalized_extension_sizes.
#
# @return [Hash] maps each extension to its Float share of the total size
def normalized_extension_sizes
  # Object#tap is the core-Ruby replacement for ActiveSupport's
  # Object#returning, which was deprecated and later removed.
  @normalized_extension_sizes ||= {}.tap do |weighted|
    total = total_size.to_f # loop-invariant, so evaluate it only once
    extension_sizes.each_pair do |extension, size|
      # Guard against 0.0 / 0.0, which would otherwise produce NaN.
      weighted[extension] = total.zero? ? 0.0 : size.to_f / total
    end
  end
end