Class: GitFastClone::Runner

Inherits:
Object
  • Object
show all
Includes:
UrlHelper, RunnerExecution
Defined in:
lib/git-fastclone.rb

Overview

Spawns one thread per submodule, and updates them in parallel. They will be cached in the reference directory (see DEFAULT_REFERENCE_REPO_DIR), and their index will be incrementally updated. This prevents a large amount of data copying.

Constant Summary collapse

DEFAULT_REFERENCE_REPO_DIR =
'/var/tmp/git-fastclone/reference'
DEFAULT_GIT_ALLOW_PROTOCOL =
'file:git:http:https:ssh'

Instance Attribute Summary collapse

Instance Method Summary collapse

Methods included from RunnerExecution

check_status, debug_print_cmd_list, exit_on_status, fail_on_error, logger, popen2e_wrapper, print_command, shell_safe, tee

Methods included from UrlHelper

parse_update_info, path_from_git_url, reference_repo_dir, reference_repo_lock_file, reference_repo_name, reference_repo_submodule_file

Constructor Details

#initializeRunner

Returns a new instance of Runner.



80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
# File 'lib/git-fastclone.rb', line 80

# Sets every Runner attribute to its default. parse_options and parse_inputs
# may later override several of them from the command line.
def initialize
  # Prefetch reference repos for submodules we've seen before
  # Keep our own reference accounting of module dependencies.
  self.prefetch_submodules = true

  # Thread-level locking for reference repos: one Mutex per key, created
  # lazily on first lookup (with_reference_repo_thread_lock keys it by
  # reference repo name). Locking against other git-fastclone processes is
  # done separately with flock in with_reference_repo_lock.
  self.reference_mutex = Hash.new { |hash, key| hash[key] = Mutex.new }

  # Only update each reference repo once per run.
  # TODO: May want to update this so we don't duplicate work with other copies
  # of ourself. Perhaps a last-updated-time and a timeout per reference repo.
  self.reference_updated = Hash.new { |hash, key| hash[key] = false }

  # Parsed command-line values; parse_options stores :branch and :config here.
  self.options = {}

  # Clone destinations are joined onto the directory we were started in.
  self.abs_clone_path = Dir.pwd

  # Switched on by parse_inputs when the clone source is a local directory.
  self.using_local_repo = false

  self.verbose = false

  self.print_git_errors = false

  self.color = false

  # 0 waits indefinitely for a reference repo lock (see --lock-timeout).
  self.flock_timeout_secs = 0
end

Instance Attribute Details

#abs_clone_pathObject

Returns the value of attribute abs_clone_path.



76
77
78
# File 'lib/git-fastclone.rb', line 76

# Base directory that clone destinations are joined onto (see #clone);
# #initialize sets it to Dir.pwd.
def abs_clone_path
  @abs_clone_path
end

#colorObject

Returns the value of attribute color.



76
77
78
# File 'lib/git-fastclone.rb', line 76

# Whether status messages are colorized (callers invoke .green/.yellow/.red
# on them when this is truthy). False by default; enabled by -c/--color.
def color
  @color
end

#flock_timeout_secsObject

Returns the value of attribute flock_timeout_secs.



76
77
78
# File 'lib/git-fastclone.rb', line 76

# Seconds handed to Timeout.timeout while acquiring a reference repo file
# lock. Defaults to 0 (wait indefinitely); set by --lock-timeout.
def flock_timeout_secs
  @flock_timeout_secs
end

#optionsObject

Returns the value of attribute options.



76
77
78
# File 'lib/git-fastclone.rb', line 76

# Hash of parsed command-line values; parse_options fills in :branch and
# :config.
def options
  @options
end

#prefetch_submodulesObject

Returns the value of attribute prefetch_submodules.



76
77
78
# File 'lib/git-fastclone.rb', line 76

# When truthy, update_reference_repo prefetches the submodule URLs recorded
# for a repo before updating it. #initialize sets it to true.
def prefetch_submodules
  @prefetch_submodules
end

#print_git_errorsObject

Returns the value of attribute print_git_errors.



76
77
78
# File 'lib/git-fastclone.rb', line 76

# Passed to fail_on_error as print_on_failure:. False by default; enabled by
# --print_git_errors.
def print_git_errors
  @print_git_errors
end

#reference_dirObject

Returns the value of attribute reference_dir.



76
77
78
# File 'lib/git-fastclone.rb', line 76

# Directory holding the reference repos; parse_inputs sets it from
# ENV['REFERENCE_REPO_DIR'], falling back to DEFAULT_REFERENCE_REPO_DIR.
def reference_dir
  @reference_dir
end

#reference_mutexObject

Returns the value of attribute reference_mutex.



76
77
78
# File 'lib/git-fastclone.rb', line 76

# Hash of Mutexes, looked up by reference repo name; a missing key gets a new
# Mutex on first access (see #initialize).
def reference_mutex
  @reference_mutex
end

#reference_updatedObject

Returns the value of attribute reference_updated.



76
77
78
# File 'lib/git-fastclone.rb', line 76

# Hash recording, per reference repo name, whether that repo has already been
# updated during this run; a missing key defaults to false.
def reference_updated
  @reference_updated
end

#using_local_repoObject

Returns the value of attribute using_local_repo.



76
77
78
# File 'lib/git-fastclone.rb', line 76

# True once parse_inputs has found that the clone source given on the command
# line is an existing local directory; false by default.
def using_local_repo
  @using_local_repo
end

#verboseObject

Returns the value of attribute verbose.



76
77
78
# File 'lib/git-fastclone.rb', line 76

# Verbose flag: selects --verbose over --quiet for git commands and is negated
# into fail_on_error's quiet: option. False by default; enabled by -v/--verbose.
def verbose
  @verbose
end

Instance Method Details

#clear_cache(dir, url) ⇒ Object

To avoid corruption of the cache, if we failed to update or check out we remove the cache directory entirely. This may cause the current clone to fail, but if the underlying error from git is transient it will not affect future clones.



382
383
384
385
386
# File 'lib/git-fastclone.rb', line 382

# Throws away the cached mirror at +dir+ and drops the "already updated"
# record for the repo behind +url+, so a later update is not skipped.
#
# @param dir [String] path of the cache directory to delete
# @param url [String] git URL the cache belongs to
# @return [Object, nil] the value previously stored in reference_updated for
#   the repo, or nil when nothing was stored
def clear_cache(dir, url)
  puts "[WARN] Removing the fastclone cache at #{dir}"
  FileUtils.remove_entry_secure(dir, force: true)

  stale_repo = reference_repo_name(url)
  reference_updated.delete(stale_repo)
end

#clear_clone_dest(dest_files) ⇒ Object



202
203
204
205
# File 'lib/git-fastclone.rb', line 202

# Announces the cleanup and then removes every path in +dest_files+ with
# FileUtils.rm_rf. Called by clear_clone_dest_if_needed with the entries found
# inside the clone destination.
def clear_clone_dest(dest_files)
  puts 'Non-empty clone directory found, clearing its content now.'
  FileUtils.rm_rf(dest_files)
end

#clear_clone_dest_if_needed(attempt_number, clone_dest) ⇒ Object



192
193
194
195
196
197
198
199
200
# File 'lib/git-fastclone.rb', line 192

# On a retry (attempt_number greater than zero) removes whatever is inside
# +clone_dest+, dotfiles included. Does nothing on the first attempt or when
# the directory has no entries.
#
# @param attempt_number [Integer] zero-based attempt counter
# @param clone_dest [String] directory the clone is written to
# @return [Object, nil] result of clear_clone_dest, or nil when nothing was cleared
def clear_clone_dest_if_needed(attempt_number, clone_dest)
  return unless attempt_number.positive?

  # FNM_DOTMATCH also yields the "." / ".." pseudo-entries, which must never
  # be handed to the removal step.
  leftovers = Dir.glob("#{clone_dest}/*", File::FNM_DOTMATCH).reject do |entry|
    name = File.basename(entry)
    name == '.' || name == '..'
  end

  clear_clone_dest(leftovers) unless leftovers.empty?
end

#clone(url, rev, src_dir, config) ⇒ Object

Checkout to SOURCE_DIR. Update all submodules recursively. Use reference repos everywhere for speed.



209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
# File 'lib/git-fastclone.rb', line 209

# Clones +url+ into +src_dir+ (relative to abs_clone_path) using the reference
# mirror, optionally checks out +rev+, updates submodules, and prints how long
# the whole checkout took.
#
# @param url [String] repository to clone
# @param rev [String, nil] revision to check out; nil keeps the default branch
# @param src_dir [String] destination directory, relative to abs_clone_path
# @param config [String, nil] value passed to `git clone --config`, if any
# @raise [RuntimeError] when the destination exists and is not empty
def clone(url, rev, src_dir, config)
  clone_dest = File.join(abs_clone_path, src_dir).to_s
  started_at = Time.now

  dest_occupied = Dir.exist?(clone_dest) && !Dir.empty?(clone_dest)
  raise "Can't clone into an existing non-empty path: #{clone_dest}" if dest_occupied

  with_git_mirror(url) do |mirror, attempt_number|
    # A failed earlier attempt may have left files behind in the destination.
    clear_clone_dest_if_needed(attempt_number, clone_dest)

    verbosity_flag = verbose ? '--verbose' : '--quiet'
    git_clone = ['git', 'clone', verbosity_flag, '--reference', mirror.to_s, url.to_s, clone_dest]
    git_clone.push('--config', config.to_s) unless config.nil?
    fail_on_error(*git_clone, quiet: !verbose, print_on_failure: print_git_errors)
  end

  # Only checkout if we're changing branches to a non-default branch
  if rev
    fail_on_error('git', 'checkout', '--quiet', rev.to_s,
                  quiet: !verbose,
                  print_on_failure: print_git_errors,
                  chdir: File.join(abs_clone_path, src_dir))
  end

  update_submodules(src_dir, url)

  elapsed = Time.now - started_at
  summary = "Checkout of #{src_dir} took #{elapsed}s"
  puts(color ? summary.green : summary)
end

#parse_inputsObject



162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
# File 'lib/git-fastclone.rb', line 162

# Parses the command line and resolves what to clone and where.
#
# Exits with status 129 after printing the usage line when no source is given.
# Marks the run as using a local repo when the source is an existing directory,
# and makes sure the reference repo directory exists.
#
# @return [Array] the clone URL, the destination path and the options hash
# @raise [RuntimeError] when the destination path already exists
def parse_inputs
  parse_options

  source = ARGV[0]
  unless source
    warn usage
    exit(129)
  end

  url = source
  if Dir.exist?(source)
    # A local directory is cloned via its absolute path.
    self.using_local_repo = true
    url = File.expand_path(source)
  end

  path = ARGV[1] || path_from_git_url(url)

  if Dir.exist?(path)
    msg = "Clone destination #{File.join(abs_clone_path, path)} already exists!"
    raise(color ? msg.red : msg)
  end

  self.reference_dir = ENV['REFERENCE_REPO_DIR'] || DEFAULT_REFERENCE_REPO_DIR
  FileUtils.mkdir_p(reference_dir)

  [url, path, options]
end

#parse_optionsObject



126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
# File 'lib/git-fastclone.rb', line 126

# Parses the command-line switches out of ARGV (destructively, via parse!) and
# records them on the runner: :branch and :config go into options, the rest set
# verbose, print_git_errors, color and flock_timeout_secs.
#
# The value given to --branch may also be a sha or tag.
#
# @return [Array<String>] the arguments left in ARGV after the switches
def parse_options
  options[:branch] = nil

  parser = OptionParser.new
  parser.banner = usage

  parser.on('-b', '--branch BRANCH', 'Checkout this branch rather than the default') do |branch|
    options[:branch] = branch
  end

  parser.on('-v', '--verbose', 'Verbose mode') do
    puts '--print_git_errors is redundant when using --verbose' if print_git_errors
    self.verbose = true
  end

  parser.on('--print_git_errors', 'Print git output if a command fails') do
    puts '--print_git_errors is redundant when using --verbose' if verbose
    self.print_git_errors = true
  end

  parser.on('-c', '--color', 'Display colored output') { self.color = true }

  parser.on('--config CONFIG', 'Git config applied to the cloned repo') do |config|
    options[:config] = config
  end

  parser.on('--lock-timeout N', 'Timeout in seconds to acquire a lock on any reference repo.
            Default is 0 which waits indefinitely.') do |timeout_secs|
    self.flock_timeout_secs = timeout_secs.to_i
  end

  parser.parse!
end

#prefetch(submodule_file) ⇒ Object

Grab the children in the event of a prefetch



334
335
336
337
338
339
# File 'lib/git-fastclone.rb', line 334

# Starts one background update per submodule URL listed in +submodule_file+
# (one URL per line, surrounding whitespace ignored). The updates run with
# fail_hard = false.
#
# @param submodule_file [String] path of the recorded submodule list
# @return [Array<String>] the raw lines read from the file
def prefetch(submodule_file)
  File.readlines(submodule_file).each do |entry|
    submodule_url = entry.strip
    # Fire and forget: these threads are intentionally never joined here.
    Thread.new { update_reference_repo(submodule_url, false) }
  end
end


#print_formatted_error(error) ⇒ Object



374
375
376
377
# File 'lib/git-fastclone.rb', line 374

# Prints +error+ under an "[INFO] Encountered a retriable error:" heading,
# with every line of the error quoted by a leading ">  ".
#
# @param error [#to_s] the error output to display
# @return [nil]
def print_formatted_error(error)
  quoted_lines = error.to_s.split("\n").map { |line| '>  ' + line + "\n" }
  puts "[INFO] Encountered a retriable error:\n#{quoted_lines.join}\n"
end

#retriable_error?(error) ⇒ Boolean

Returns:

  • (Boolean)


361
362
363
364
365
366
367
368
369
370
371
372
# File 'lib/git-fastclone.rb', line 361

# Tells whether git output looks like one of the known failures that are worth
# retrying after the cache has been cleared.
#
# Each pattern is anchored with ^, which in Ruby matches at the start of any
# line, so the message may appear anywhere in multi-line output as long as it
# starts a line.
#
# @param error [#to_s] output captured from the failed git command
# @return [Boolean] true when any retriable pattern matches, false otherwise
def retriable_error?(error)
  retriable_patterns = [
    /^fatal: missing blob object/,
    /^fatal: remote did not send all necessary objects/,
    /^fatal: packed object [a-z0-9]+ \(stored in .*?\) is corrupt/,
    /^fatal: pack has \d+ unresolved delta/,
    /^error: unable to read sha1 file of /,
    /^fatal: did not receive expected object/,
    /^fatal: unable to read tree [a-z0-9]+\n^warning: Clone succeeded, but checkout failed/
  ]
  # match? yields a real boolean (the predicate used to leak 0/nil from =~) and
  # searches the whole string, so no leading ".*" wrapper is needed.
  Regexp.union(retriable_patterns).match?(error.to_s)
end

#runObject



110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
# File 'lib/git-fastclone.rb', line 110

# Entry point: parses the command line, prints the version banner and the
# clone summary, makes sure GIT_ALLOW_PROTOCOL has a value, then performs the
# clone.
def run
  url, path, parsed_options = parse_inputs

  # Loaded here rather than at file load time, as in the original.
  require_relative './git-fastclone/version'
  banner = "git-fastclone #{GitFastCloneVersion::VERSION}"
  puts(color ? banner.yellow : banner)

  source_name = path_from_git_url(url)
  destination = File.join(abs_clone_path, path)
  puts "Cloning #{source_name} to #{destination}"

  # Respect a value the caller already exported.
  ENV['GIT_ALLOW_PROTOCOL'] ||= DEFAULT_GIT_ALLOW_PROTOCOL
  clone(url, parsed_options[:branch], path, parsed_options[:config])
end

#store_updated_repo(url, mirror, repo_name, fail_hard) ⇒ Object

Creates or updates the mirror repo then stores an indication that this repo has been updated on this run of fastclone



343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
# File 'lib/git-fastclone.rb', line 343

# Creates the mirror for +url+ when it does not exist yet, brings it up to date
# with `git remote update --prune`, and records in reference_updated that the
# repo has been refreshed during this run.
#
# @param url [String] repository the mirror tracks
# @param mirror [String] path of the mirror directory
# @param repo_name [String] key used in reference_updated
# @param fail_hard [Boolean] re-raise a git failure instead of swallowing it
# @return [true, nil] true on success; nil when a failure was swallowed
# @raise [RunnerExecutionRuntimeError] on git failure when fail_hard is truthy
def store_updated_repo(url, mirror, repo_name, fail_hard)
  git_opts = { quiet: !verbose, print_on_failure: print_git_errors }

  unless Dir.exist?(mirror)
    verbosity_flag = verbose ? '--verbose' : '--quiet'
    fail_on_error('git', 'clone', verbosity_flag, '--mirror', url.to_s, mirror.to_s, **git_opts)
  end

  update_cmd = %w[git remote]
  update_cmd << '--verbose' if verbose
  update_cmd.push('update', '--prune')
  fail_on_error(*update_cmd, **git_opts, chdir: mirror)

  reference_updated[repo_name] = true
rescue RunnerExecutionRuntimeError => e
  # A half-written mirror could corrupt later clones, so drop the cache
  # entirely. The current clone may fail, but a transient git error will not
  # poison future runs.
  clear_cache(mirror, url)
  raise e if fail_hard
end

#thread_update_submodule(submodule_url, submodule_path, threads, pwd) ⇒ Object



267
268
269
270
271
272
273
274
275
276
277
278
# File 'lib/git-fastclone.rb', line 267

# Starts a thread that updates one submodule against its reference mirror and
# then recurses into that submodule's own submodules. The thread is appended
# to +threads+ so the caller can join it.
#
# @param submodule_url [String] URL of the submodule repository
# @param submodule_path [String] submodule path inside the parent checkout
# @param threads [Array<Thread>] collection the new thread is appended to
# @param pwd [String] parent checkout directory, relative to abs_clone_path
# @return [Array<Thread>] +threads+
def thread_update_submodule(submodule_url, submodule_path, threads, pwd)
  worker = Thread.new do
    with_git_mirror(submodule_url) do |mirror, _attempt|
      cmd = %w[git submodule]
      cmd << '--quiet' unless verbose
      cmd.push('update', '--reference', mirror.to_s, submodule_path.to_s)

      fail_on_error(*cmd, quiet: !verbose,
                          print_on_failure: print_git_errors,
                          chdir: File.join(abs_clone_path, pwd))
    end

    update_submodules(File.join(pwd, submodule_path), submodule_url)
  end

  threads << worker
end

#update_reference_repo(url, fail_hard) ⇒ Object

Fail_hard indicates whether the update is considered a failure of the overall checkout or not. When we pre-fetch based off of cached information, fail_hard is false. When we fetch based off info in a repository directly, fail_hard is true.



317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
# File 'lib/git-fastclone.rb', line 317

# Under the reference repo lock: kicks off a prefetch of the submodules
# recorded for +url+ (when prefetching is enabled and a record exists), then
# updates the mirror unless it was already updated during this run.
#
# @param url [String] repository whose reference repo should be refreshed
# @param fail_hard [Boolean] forwarded to store_updated_repo
# @return [Object, nil] result of store_updated_repo, or nil when it was skipped
def update_reference_repo(url, fail_hard)
  repo_name = reference_repo_name(url)
  mirror = reference_repo_dir(url, reference_dir, using_local_repo)

  with_reference_repo_lock(url) do
    # File in which the submodule URLs seen for this repo are recorded.
    submodule_file = reference_repo_submodule_file(url, reference_dir, using_local_repo)

    # Frontload network requests by fetching the known children right away.
    prefetch(submodule_file) if prefetch_submodules && File.exist?(submodule_file)

    next if reference_updated[repo_name]

    store_updated_repo(url, mirror, repo_name, fail_hard)
  end
end

#update_submodule_reference(url, submodule_url_list) ⇒ Object



302
303
304
305
306
307
308
309
310
311
# File 'lib/git-fastclone.rb', line 302

# Rewrites the submodule record for +url+ with the given URLs, one per line,
# while holding the reference repo lock. Skipped when the list is empty or
# prefetch_submodules is nil.
#
# @param url [String] parent repository the submodules belong to
# @param submodule_url_list [Array<String>] submodule URLs to record
# @return [Array<String>, nil] the list when the file was written, otherwise nil
def update_submodule_reference(url, submodule_url_list)
  return if submodule_url_list.empty?
  return if prefetch_submodules.nil?

  with_reference_repo_lock(url) do
    dependency_file = reference_repo_submodule_file(url, reference_dir, using_local_repo)

    # Mode 'w' truncates, so the record always mirrors the latest list.
    File.open(dependency_file, 'w') do |file|
      submodule_url_list.each { |submodule_url| file << "#{submodule_url}\n" }
    end
  end
end

#update_submodules(pwd, url) ⇒ Object



245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
# File 'lib/git-fastclone.rb', line 245

# Initializes the submodules of the checkout at +pwd+ (relative to
# abs_clone_path), starts one update thread per submodule reported by
# `git submodule init`, records the submodule URLs for +url+, and waits for
# all threads. Does nothing when the checkout has no .gitmodules file.
#
# @param pwd [String] checkout directory, relative to abs_clone_path
# @param url [String] repository the checkout was cloned from
# @return [Array<Thread>, nil] the joined threads, or nil when there is no .gitmodules
def update_submodules(pwd, url)
  checkout_dir = File.join(abs_clone_path, pwd)
  return unless File.exist?(File.join(checkout_dir, '.gitmodules'))

  puts 'Updating submodules...' if verbose

  init_output = fail_on_error('git', 'submodule', 'init',
                              quiet: !verbose,
                              print_on_failure: print_git_errors,
                              chdir: checkout_dir)

  threads = []
  submodule_url_list = init_output.split("\n").map do |line|
    submodule_path, submodule_url = parse_update_info(line)
    thread_update_submodule(submodule_url, submodule_path, threads, pwd)
    submodule_url
  end

  update_submodule_reference(url, submodule_url_list)
  threads.each(&:join)
end

#usageObject



423
424
425
# File 'lib/git-fastclone.rb', line 423

# One-line usage string; used as the OptionParser banner in parse_options and
# printed by parse_inputs when no git URL is given.
def usage
  'Usage: git fastclone [options] <git-url> [path]'
end

#with_git_mirror(url) ⇒ Object

This command will create and bring the mirror up-to-date on-demand, blocking any code passed in while the mirror is brought up-to-date

In future we may need to synchronize with flock here if we run multiple builds at once against the same reference repos. One build per slave at the moment means we only need to synchronize our own threads in case a single submodule url is included twice via multiple dependency paths



395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
# File 'lib/git-fastclone.rb', line 395

# Brings the reference mirror for +url+ up to date, then yields the mirror
# directory and the current attempt number while holding the reference repo
# lock.
#
# If a RunnerExecutionRuntimeError is raised and its output matches
# retriable_error?, the cache is cleared and the whole method body is run once
# more; the error is re-raised when it is not retriable or the retry has
# already been used.
def with_git_mirror(url)
  # `||=` rather than `=` on purpose: `retry` below restarts the method body,
  # and these locals must keep their values across that restart.
  retries_allowed ||= 1
  attempt_number ||= 0

  # fail_hard = true: a failure to update this mirror fails the checkout.
  update_reference_repo(url, true)
  dir = reference_repo_dir(url, reference_dir, using_local_repo)

  # Sometimes remote updates involve re-packing objects on a different thread
  # We grab the reference repo lock here just to make sure whatever thread
  # ended up doing the update is done with its housekeeping.
  # This makes sure we have control and unlock when the block returns:
  with_reference_repo_lock(url) do
    yield dir, attempt_number
  end
rescue RunnerExecutionRuntimeError => e
  if retriable_error?(e.output)
    print_formatted_error(e.output)
    # NOTE(review): on the first attempt dir is still nil here if
    # update_reference_repo raised before the assignment above - confirm
    # clear_cache copes with a nil directory.
    clear_cache(dir, url)

    if attempt_number < retries_allowed
      attempt_number += 1
      retry
    end
  end

  raise e
end

#with_reference_repo_lock(url, &block) ⇒ Object



280
281
282
283
284
285
286
287
288
289
290
291
292
# File 'lib/git-fastclone.rb', line 280

# Runs the block while holding both the exclusive file lock and the in-process
# thread lock for the reference repo of +url+.
#
# @param url [String] repository whose reference repo is being locked
# @yield the work to perform while the locks are held
# @return [Object] whatever the block returns
# @raise [Timeout::Error] when the file lock is not acquired within
#   flock_timeout_secs (a value of 0 disables the timeout)
def with_reference_repo_lock(url, &block)
  # Sane POSIX implementations remove exclusive flocks when a process is terminated or killed.
  # We wait here for other git-fastclone processes to release the lock. With the default
  # timeout of 0 we wait forever; this can be overridden on the command line.
  lockfile = reference_repo_lock_file(url, reference_dir, using_local_repo)
  Timeout.timeout(flock_timeout_secs) { lockfile.flock(File::LOCK_EX) }
  with_reference_repo_thread_lock(url, &block)
ensure
  # lockfile is nil when obtaining the lock file itself raised. Without this
  # guard the cleanup would fail with NoMethodError and hide the real error.
  if lockfile
    begin
      lockfile.flock(File::LOCK_UN)
    ensure
      # Always close the handle, even if the explicit unlock fails.
      lockfile.close
    end
  end
end

#with_reference_repo_thread_lock(url, &block) ⇒ Object



294
295
296
297
298
299
300
# File 'lib/git-fastclone.rb', line 294

# Runs the block while holding the Mutex registered for the reference repo of
# +url+.
#
# File locks alone are not enough: POSIX tracks them per process, not per
# thread, and prefetching means several threads of one git-fastclone process
# can try to update the same repository.
#
# @param url [String] repository whose reference repo is being guarded
# @return [Object] whatever the block returns
def with_reference_repo_thread_lock(url, &block)
  repo_mutex = reference_mutex[reference_repo_name(url)]
  repo_mutex.synchronize(&block)
end