Module: RightData

Defined in:
lib/main.rb,
lib/right_data.rb,
lib/FileSystemItem.rb,
lib/FileSystemTree.rb

Defined Under Namespace

Classes: FileSystemItem, FileSystemTree

Constant Summary collapse

BLOCK_SIZE =
1024*8
IGNORE_FILES =
[".DS_Store", ".typeAttributes.dict", "empty-file"]

Class Method Summary collapse

Class Method Details

.cache_not_working_on_write(master, master_cache, indexing_function) ⇒ Object



113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
# File 'lib/main.rb', line 113

# Returns the index for +master+, using a YAML file as an on-disk cache.
#
# When +master_cache+ exists it is loaded and returned. Otherwise the index
# is built by calling +indexing_function+ with +master+, dumped as YAML to
# +master_cache+, and returned.
#
# master            - value handed to +indexing_function+ on a cache miss.
# master_cache      - path of the YAML cache file.
# indexing_function - object responding to +call+ that builds the index.
def self.cache_not_working_on_write(master, master_cache, indexing_function)
  unless File.exist?(master_cache)
    puts "# Master cache not found at #{master_cache}."
    fresh_index = indexing_function.call(master)
    puts "# Writing #{master_cache}."
    File.open(master_cache, "w") { |out| YAML.dump(fresh_index, out) }
    puts "# Wrote #{master_cache}."
    return fresh_index
  end

  puts "# Master cache FOUND at #{master_cache}."
  File.open(master_cache) { |cached| YAML::load(cached) }
end

.cache_serializing_on_write(master) ⇒ Object



132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
# File 'lib/main.rb', line 132

# Returns the size index for +master+, cached in
# "<master>/.rightPruneCache" as a sequence of Marshal-dumped [key, value]
# pairs.
#
# On a cache hit the pairs are read back one Marshal record at a time; on a
# miss the index is built with index_by_size, written to the cache file and
# returned.
#
# The file is opened in binary mode and parsed record by record: Marshal
# output is binary and may contain newline bytes, so it cannot be split into
# lines. Both branches return the index.
#
# NOTE(review): Marshal.load must only be used on trusted data; the cache
# file lives inside +master+, so anyone who can write there controls what is
# deserialized.
#
# master - directory whose files are indexed and which holds the cache file.
def self.cache_serializing_on_write(master)
  master_cache = File.join(master, ".rightPruneCache")
  if File.exist?(master_cache)
    puts "# Master cache FOUND at #{master_cache}."
    master_index = {}
    File.open(master_cache, "rb") do |f|
      until f.eof?
        key, value = Marshal.load(f)
        master_index[key] = value
      end
    end
  else
    puts "# Master cache not found at #{master_cache}."
    master_index = index_by_size(master)
    puts "# Writing #{master_cache}."
    File.open(master_cache, "wb") do |f|
      master_index.each_pair do |k, v|
        Marshal.dump([k, v], f)
      end
    end
    puts "# Wrote #{master_cache}."
  end
  master_index
end

.check_file_in_image_index(master_index, file_to_check) ⇒ Object



164
165
166
167
168
169
170
# File 'lib/main.rb', line 164

# Looks for a master image that is visually identical to +file_to_check+.
#
# Candidates are the entries stored in +master_index+ under the lower-cased
# base name of +file_to_check+; each is compared with identical_images?
# until the first match.
#
# Returns a one-element array holding the first matching master path, or an
# empty array when the file is empty or nothing matches (callers expect an
# array).
def self.check_file_in_image_index(master_index, file_to_check)
  return [] if File.size(file_to_check) == 0 # Ignore empty files

  candidates = master_index[File.basename(file_to_check).downcase] || []
  candidates.each do |candidate|
    return [candidate] if self.identical_images?(candidate, file_to_check)
  end
  []
end

.check_file_in_index(master_index, file_to_check) ⇒ Object



172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
# File 'lib/main.rb', line 172

# Finds the master files whose content equals that of +file_to_check+.
#
# Candidates are the entries stored in +master_index+ under the size of
# +file_to_check+. They are filtered block by block (BLOCK_SIZE bytes at a
# time, read with get_block): a candidate survives a round only if its block
# at the current offset equals the block of +file_to_check+.
#
# Returns the array of surviving master paths; empty for an empty file or
# when nothing matches.
def self.check_file_in_index(master_index, file_to_check)
  size = File.size(file_to_check)
  return [] if size == 0 # Ignore empty files

  survivors = master_index[size] || []
  offset = 0
  until survivors.empty? || offset > size
    reference_block = get_block(file_to_check, offset)
    survivors = survivors.select do |candidate|
      get_block(candidate, offset) == reference_block
    end
    offset += BLOCK_SIZE
  end
  survivors
end

.dup_report(prunable) ⇒ Object

Run this in a directory that is suspected of containing self-duplicate files. Compare to: fdupes -r -n prunable



20
21
22
# File 'lib/right_data.rb', line 20

# Reports duplicate files found under +prunable+ by delegating to
# RightData.scan_for_dup and returning its result.
def self.dup_report(prunable)
  RightData.scan_for_dup(prunable)
end

.each_set_of_duplicates(*paths, &block) ⇒ Object



48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
# File 'lib/main.rb', line 48

# Finds sets of files with identical content under +paths+ and yields each
# set (an array of paths) to the given block.
#
# Regular files are first grouped by size. Every group with more than one
# member is then narrowed with eliminate_non_duplicates, BLOCK_SIZE bytes at
# a time, until no candidates remain or the offset passes the file size. The
# block is forwarded to eliminate_non_duplicates, which does the yielding.
#
# Returns the size => paths hash that was built.
def self.each_set_of_duplicates(*paths, &block)
  files_by_size = Hash.new { |hash, key| hash[key] = [] }
  Find.find(*paths) do |path|
    next unless File.file?(path)
    files_by_size[File.size(path)] << path
  end

  files_by_size.each_pair do |size, same_sized|
    next if same_sized.size <= 1
    partition = [same_sized]
    offset = 0
    until partition.empty? || offset > size
      partition = eliminate_non_duplicates(partition, size, offset, &block)
      offset += BLOCK_SIZE
    end
  end
end

.eliminate_non_duplicates(partition, size, offset) ⇒ Object



64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
# File 'lib/main.rb', line 64

# Performs one block-comparison round over sets of possibly identical files.
#
# For every set in +partition+ the BLOCK_SIZE bytes at +offset+ are read
# from each file and the files are regrouped by that block. Groups with more
# than one file are either yielded (when this was the last block of a file
# of +size+ bytes, so the files are known duplicates) or collected for the
# next round.
#
# Files are opened with File.open rather than Kernel#open so that a path
# beginning with "|" is read as a file and never run as a command.
#
# partition - array of arrays of file paths, all of the same size.
# size      - size in bytes shared by the files.
# offset    - byte offset of the block to compare in this round.
#
# Yields each array of confirmed duplicate paths.
# Returns the array of sets that still need further comparison.
def self.eliminate_non_duplicates(partition, size, offset)
  possible_duplicates = []
  partition.each do |possible_duplicate_set|
    blocks = Hash.new { |h, k| h[k] = [] }
    possible_duplicate_set.each do |path|
      block = File.open(path, 'rb') do |file|
        file.seek(offset)
        file.read(BLOCK_SIZE)
      end
      # read returns nil at end of file; use '' so such files group together.
      blocks[block || ''] << path
    end
    blocks.each_value do |files|
      next unless files.size > 1
      if offset + BLOCK_SIZE >= size
        # We know these are duplicates.
        yield files
      else
        # We suspect these are duplicates, but we need to compare
        # more blocks of data.
        possible_duplicates << files
      end
    end
  end
  possible_duplicates
end

.get_block(file, offset) ⇒ Object



158
159
160
161
162
# File 'lib/main.rb', line 158

# Reads up to BLOCK_SIZE bytes from +file+ starting at byte +offset+.
#
# The file is opened with File.open in binary mode: block contents are
# compared byte for byte, and File.open never interprets a path beginning
# with "|" as a command the way Kernel#open can.
#
# Returns the bytes read as a String, or nil when +offset+ is at or past the
# end of the file.
def self.get_block(file, offset)
  File.open(file, 'rb') do |f|
    f.seek(offset)
    f.read(BLOCK_SIZE)
  end
end

.git?(path) ⇒ Boolean

Returns:

  • (Boolean)


318
319
320
# File 'lib/main.rb', line 318

# Reports whether +path+ contains a ".git" directory.
def self.git?(path)
  git_dir = File.join(path, ".git")
  File.directory?(git_dir)
end

.hello ⇒ Object



4
# File 'lib/right_data.rb', line 4

# Returns a fixed greeting string.
def self.hello
  "Hi!"
end

.identical_images?(a, b) ⇒ Boolean

Returns:

  • (Boolean)


29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
# File 'lib/main.rb', line 29

# Reports whether images +a+ and +b+ are visually identical according to
# ImageMagick's `compare -metric AE -fuzz 5%`: fewer than 1000 differing
# pixels counts as identical.
#
# The command is run without a shell (argument array passed to IO.popen), so
# characters such as $, backticks or quotes in file names cannot be
# interpreted as shell syntax. stderr is merged into the captured output
# because the original command captured it with 2>&1.
#
# Any failure is treated as "not identical" rather than parsed as a pixel
# count of 0: `compare` cannot be started, it exits with a status other than
# 0 or 1 (its documented "similar"/"dissimilar" statuses), or its output does
# not start with a number. The number is read as a float so that a count in
# scientific notation is not truncated to its first digits.
#
# a, b - paths of the two files to compare.
#
# Returns true or false; false when either path is not visual media per
# is_visual_media?.
def self.identical_images?(a, b)
  return false unless self.is_visual_media?(a) && self.is_visual_media?(b)
  # rmagick1.signature <=> rmagick2.signature
  # rmagick1.compare_channel(rmagick2, MeanAbsoluteErrorMetric).last == 0

  command = ["compare", "-metric", "AE", "-fuzz", "5%", "-compose", "src", a, b, "/dev/null"]
  output = begin
    IO.popen(command, :err => [:child, :out]) { |io| io.read }
  rescue SystemCallError => e
    warn "# Could not run compare: #{e.message}"
    return false
  end
  return false unless [0, 1].include?($?.exitstatus)

  differing_pixels = output[/\A\s*(\d+(?:\.\d+)?(?:[eE][-+]?\d+)?)/, 1]
  return false if differing_pixels.nil?

  differing_pixels.to_f < 1000

  # TODO Consider checking rotated 90,180,270 degrees and scaled to other image...
end

.ignore_test(f) ⇒ Object



15
16
17
18
19
20
21
# File 'lib/main.rb', line 15

# Decides whether the file at path +f+ should be skipped.
#
# A file is ignored when its base name is listed in IGNORE_FILES, when it is
# a symlink, when it is empty, or when its lower-cased base name ends in
# ".tmp" or ".swp".
#
# Returns a truthy value (true, or the match offset for the extension
# checks) when the file is to be ignored, nil otherwise.
def self.ignore_test(f)
  name = File.basename(f)
  return true if IGNORE_FILES.include?(name)
  return true if File.symlink?(f)
  return true if File.size(f) == 0 # Ignore empty files

  lowered = name.downcase
  lowered =~ /\.tmp$/ || lowered =~ /\.swp$/
end

.index_by_name(*paths) ⇒ Object



91
92
93
94
95
96
97
98
99
100
# File 'lib/main.rb', line 91

# Builds an index of the regular files under +paths+, keyed by lower-cased
# base name.
#
# Files for which ignore_test is truthy are left out. The printed count is
# the number of files actually added to the index; directories and ignored
# files are not counted.
#
# Returns a Hash mapping each lower-cased base name to the array of paths
# with that name.
def self.index_by_name(*paths)
  names = Hash.new { |h, k| h[k] = [] }
  count = 0
  Find.find(*paths) do |f|
    next unless File.file?(f) && !ignore_test(f)
    names[File.basename(f).downcase] << f
    count += 1
  end
  puts "# Indexed #{count} files by name."
  names
end

.index_by_size(*paths) ⇒ Object



102
103
104
105
106
107
108
109
110
111
# File 'lib/main.rb', line 102

# Builds an index of the regular files under +paths+, keyed by file size in
# bytes.
#
# Files for which ignore_test is truthy are left out. The printed count is
# the number of files actually added to the index; directories and ignored
# files are not counted.
#
# Returns a Hash mapping each size to the array of paths with that size.
def self.index_by_size(*paths)
  sizes = Hash.new { |h, k| h[k] = [] }
  count = 0
  Find.find(*paths) do |f|
    next unless File.file?(f) && !ignore_test(f)
    sizes[File.size(f)] << f
    count += 1
  end
  puts "# Indexed #{count} files by size."
  sizes
end

.is_visual_media?(f) ⇒ Boolean

Is this a picture? If so, ImageMagick’s compare tool will be used on it later on.

Returns:

  • (Boolean)


24
25
26
27
# File 'lib/main.rb', line 24

# Reports whether path +f+ names a picture, judged only by its file
# extension (jpg, jpeg, gif, bmp or png, case-insensitive).
#
# The extension is taken with File.extname, so a file that merely happens to
# be called "jpg" or "png" with no extension is not treated as an image.
#
# Returns true or false.
def self.is_visual_media?(f)
  ext = File.extname(f).downcase
  [".jpg", ".jpeg", ".gif", ".bmp", ".png"].include?(ext)
end

.prune_image_report(master, prunable) ⇒ Object



13
14
15
16
# File 'lib/right_data.rb', line 13

# Scans +prunable+ for images already present in +master+ (via
# RightData.scan_for_prunable_images) and asks the resulting tree to report
# using the 'rm -rf' command prefix.
#
# Always returns nil.
def self.prune_image_report(master, prunable)
  RightData.scan_for_prunable_images(master, prunable).report('rm -rf')
  nil
end

.prune_report(master, prunable) ⇒ Object

Run this in a directory (prunable) that is suspected of containing duplicate files that already exist in master. E.g. check a discovered backup drive and whether anything on it is valid



8
9
10
11
# File 'lib/right_data.rb', line 8

# Scans +prunable+ for files already present in +master+ (via
# RightData.scan_for_prunable) and asks the resulting tree to report using
# the 'rm -rf' command prefix.
#
# Always returns nil.
def self.prune_report(master, prunable)
  RightData.scan_for_prunable(master, prunable).report('rm -rf')
  nil
end

.repo_report(search_dir) ⇒ Object

Run this on a directory that is suspected of containing Git or SVN repositories with changes that have not been checked in. Returns every repository found, together with its version information and its status output showing whether any files are not checked in.



26
27
28
# File 'lib/right_data.rb', line 26

# Scans +search_dir+ for repositories by delegating to
# RightData.scan_for_repos and returns that method's result.
def self.repo_report(search_dir)
  RightData.scan_for_repos(search_dir)
end

.scan_for_dup(prunable) ⇒ Object



200
201
202
203
204
205
206
207
# File 'lib/main.rb', line 200

# Prints a shell-style report of duplicate files found under +prunable+.
#
# For every set yielded by each_set_of_duplicates, the first path is printed
# as a "#" comment line and each remaining path is printed as an
# "rm -rf" command built with Escape.shell_command.
#
# NOTE(review): " # dup" is passed to Escape.shell_command as an ordinary
# argument, so it is presumably quoted into the command rather than emitted
# as a trailing shell comment - confirm against the Escape library.
def self.scan_for_dup(prunable)
  each_set_of_duplicates(prunable) do |dups|
    kept = dups.shift
    puts "# #{Escape.shell_command(kept)}"
    dups.each { |extra| puts Escape.shell_command(["rm", "-rf", extra, " # dup"]) }
  end
end

.scan_for_prunable(master, prune, &block) ⇒ Object



215
216
217
218
219
# File 'lib/main.rb', line 215

# Scans +prune+ for files whose content already exists in +master+.
#
# Wires the size-based strategy into scan_for_prunable_base: the index is
# built with index_by_size, files are checked with check_file_in_index, and
# the cache kind is "size". The block, if any, is forwarded unchanged.
#
# Returns whatever scan_for_prunable_base returns.
def self.scan_for_prunable(master, prune, &block)
  build_index = Proc.new do |root|
    self.index_by_size(root)
  end
  find_duplicates = Proc.new do |index, path|
    self.check_file_in_index(index, path)
  end
  scan_for_prunable_base(master, prune, build_index, find_duplicates, "size", &block)
end

.scan_for_prunable_base(master, prune, indexing_function, check_index_function, kind, &block) ⇒ Object

Example: tree = scan_for_prunable(master, prune) { |a, b| puts "#{b.size} : #{a}" }; nil



222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
# File 'lib/main.rb', line 222

# Builds a FileSystemItem tree for +prune+ and marks every node that is
# ignorable or that has duplicates in the index of +master+.
#
# The master index is obtained through cache_not_working_on_write, using the
# cache file "<master>/.rightPruneCache-<kind>". While traversing:
#
# * an empty directory is marked ignorable and not descended into;
# * a file matching ignore_test is marked ignorable;
# * any other file is passed to +check_index_function+; when duplicates are
#   returned they are stored on the node.
#
# Each marking also bumps the matching counter on the node's parent, when
# the node has one. Progress lines are printed at roughly 4% steps.
#
# master               - directory holding the reference files.
# prune                - directory suspected of containing duplicates.
# indexing_function    - callable(master) returning the master index.
# check_index_function - callable(master_index, path) returning an array of
#                        duplicate master paths (empty when none).
# kind                 - label used in the cache file name and in messages.
# block                - optional; called with (path, duplicates) for every
#                        file that has duplicates in master.
#
# Returns the marked FileSystemItem tree.
def self.scan_for_prunable_base(master, prune, indexing_function, check_index_function, kind, &block)
  puts "# Ignoring: #{IGNORE_FILES.inspect}"

  master_cache = File.join(master, ".rightPruneCache-#{kind}")
  master_index = cache_not_working_on_write(master, master_cache, indexing_function)

  puts "# Found #{master_index.size} unique #{kind}s."

  # Get prune count for progress updates. Counting with Find avoids building
  # a shell command from the path; a missing path simply counts as 0.
  prune_count = 0
  begin
    Find.find(prune) { |_entry| prune_count += 1 }
  rescue Errno::ENOENT
    prune_count = 0
  end
  step = prune_count / 25
  prune_updates = {}
  (0..25).each { |i| prune_updates[i * step] = true }

  count = 0

  # Recursively compare the files in the filesystem.
  # When a parent node gets a response from all its children
  # that they are dups OR ignorable, that NODE becomes dup_or_ignorable too.
  # This propagates.
  # Then, there is a traversal that grabs all base nodes that are non_dup like:
  # rm -rf /a_path_duped/here     # 14 dups / 9 ignores
  # rm -rf /b_path_duped/way/here # 1 dup
  tree = FileSystemItem.new(prune, :parent => nil)
  # Mark the nodes:
  tree.traverse do |n|
    # delete (rather than a lookup) so a milestone is printed only once even
    # though count does not advance while directories are visited.
    if prune_updates.delete(count)
      percent = prune_count.zero? ? 0 : (count * 100) / prune_count
      puts "# [#{count} / #{prune_count}] #{percent}%"
    end
    # Could keep track of empty dirs too...
    if File.directory?(n.path)
      next true unless n.leaf?
      # Empty dir: nothing below it to inspect.
      n.ignorable = true
      n.parent.increment_ignorable_children if n.parent
      next false # Don't bother, no kids
    end
    count += 1
    if ignore_test(n.path)
      n.ignorable = true
      n.parent.increment_ignorable_children if n.parent
    else
      duplicates = check_index_function.call(master_index, n.path)
      unless duplicates.empty?
        n.duplicates = duplicates
        n.parent.increment_duplicate_children if n.parent
        block.call(n.path, duplicates) if block
      end
    end
    true
  end
  puts "# We counted #{count} files. Tree thinks it has #{tree.files}."
  tree
end

.scan_for_prunable_images(master, prune, &block) ⇒ Object



209
210
211
212
213
# File 'lib/main.rb', line 209

# Scans +prune+ for images that are visually identical to images in
# +master+.
#
# Wires the image strategy into scan_for_prunable_base: the index is built
# with index_by_name, files are checked with check_file_in_image_index, and
# the cache kind is "image". The block, if any, is forwarded unchanged.
#
# Returns whatever scan_for_prunable_base returns.
def self.scan_for_prunable_images(master, prune, &block)
  build_index = Proc.new do |root|
    self.index_by_name(root)
  end
  find_duplicates = Proc.new do |index, path|
    self.check_file_in_image_index(index, path)
  end
  self.scan_for_prunable_base(master, prune, build_index, find_duplicates, "image", &block)
end

.scan_for_repos(prune, &block) ⇒ Object



321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
# File 'lib/main.rb', line 321

# Walks the tree under +prune+ looking for SVN and Git working copies.
#
# For every directory that svn? or git? recognises, the output of the
# status and info commands (svn status / svn info, git status / git show)
# is captured. A directory recognised as both is recorded as "git", because
# the git entry is stored last. Traversal does not descend into a directory
# once a repository has been found there.
#
# The commands are chained to the `cd` with && so that they never run in the
# current directory when changing into the repository fails.
#
# prune - root directory to search.
# block - accepted but not used.
#
# Prints one line per repository found and returns a Hash mapping each
# repository path to { :kind, :status, :info }.
def self.scan_for_repos(prune, &block)
  tree = FileSystemTree.new(prune, :parent => nil)
  repos = {}
  # Mark the nodes:
  tree.traverse do |n|
    next unless File.directory?(n.path)

    if svn?(n.path)
      cd_cmd = Escape.shell_command(["cd", n.path])
      status = `#{cd_cmd} && svn status`
      info   = `#{cd_cmd} && svn info`
      repos[n.path] = { :kind => "svn", :status => status, :info => info }
    end
    if git?(n.path)
      cd_cmd = Escape.shell_command(["cd", n.path])
      status = `#{cd_cmd} && git status`
      info   = `#{cd_cmd} && git show`
      repos[n.path] = { :kind => "git", :status => status, :info => info }
    end
    !repos[n.path] # recurse only if we DID NOT find a repo
  end
  repos.keys.sort.each do |k|
    puts "Found #{repos[k][:kind]} repo at: #{k}. \n\tStatus: #{repos[k][:status]}"
  end
  repos
end

.svn?(path) ⇒ Boolean

This is a weak check! Also does nothing to check one svn in another.

Returns:

  • (Boolean)


315
316
317
# File 'lib/main.rb', line 315

# Reports whether +path+ contains a ".svn" directory.
def self.svn?(path)
  svn_dir = File.join(path, ".svn")
  File.directory?(svn_dir)
end

.test ⇒ Object



193
194
195
196
197
198
# File 'lib/main.rb', line 193

# Manual smoke test: runs scan_for_prunable on two hard-coded directories
# and passes a block that prints "<duplicate count> : <path>".
def self.test
  master_dir = "/Users/jonathan/Dropbox"
  prune_dir  = "/Users/jonathan/Desktop/Old"
  scan_for_prunable(master_dir, prune_dir) do |path, duplicates|
    puts "#{duplicates.size} : #{path}"
  end
  # each_set_of_duplicates(prune)
end