Class: RightScraper::Retrievers::Git

Inherits:
CheckoutBasedRetriever show all
Defined in:
lib/right_scraper/retrievers/git.rb

Overview

Retriever for resources stored in a git repository.

Constant Summary collapse

@@available =
false

Instance Attribute Summary

Attributes inherited from Base

#max_bytes, #max_seconds, #repo_dir, #repository

Instance Method Summary collapse

Methods inherited from Base

#initialize, repo_dir

Constructor Details

This class inherits a constructor from RightScraper::Retrievers::Base

Instance Method Details

#available? ⇒ Boolean

Determines if downloader is available.

Returns:

  • (Boolean)


78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
# File 'lib/right_scraper/retrievers/git.rb', line 78

# Determines whether the git command-line client can be run.
#
# A successful probe is remembered in the class variable @@available, so
# "git --version" is not executed again afterwards. A failed probe is
# reported through @logger and will be retried on the next call.
#
# @return [Boolean] the value of @@available after probing
def available?
  return @@available if @@available

  begin
    # note that require 'git' does the same version check on load but
    # we don't want to assume any particular implementation.
    #
    # FIX: we might want to parse the result and require a minimum git
    # client version.
    probe = "git --version"
    `#{probe}`
    unless $?.success?
      raise RetrieverError, "\"#{probe}\" exited with #{$?.exitstatus}"
    end
    @@available = true
  rescue
    # Any failure (including the non-zero exit raised above) is logged
    # rather than propagated; the caller only sees a falsy result.
    @logger.note_error($!, :available, "git retriever is unavailable")
  end
  @@available
end

#branch?(git, name) ⇒ Boolean

Returns:

  • (Boolean)


247
248
249
# File 'lib/right_scraper/retrievers/git.rb', line 247

# Looks for a branch called +name+ among all branches reported by +git+.
#
# @param git [Object] repository handle responding to #branches
# @param name [String] branch name to look for
# @return [Object, nil] the first branch whose #name equals +name+
#   (truthy), or nil when there is none
def branch?(git, name)
  all_branches = git.branches
  all_branches.find { |branch| branch.name == name }
end

#do_checkout ⇒ Object

Clone the remote repository. The operations are as follows:

  • clone repository to @repo_dir

  • checkout #tag

  • update @repository#tag



211
212
213
214
215
216
217
218
219
220
221
# File 'lib/right_scraper/retrievers/git.rb', line 211

# Performs a fresh checkout: runs the inherited behaviour, clones
# @repository.url into @repo_dir, then fetches, checks out the requested
# revision and records the resulting HEAD sha on @repository.
#
# @return [Object] whatever do_update_tag returns
def do_checkout
  super
  git = @logger.operation(:cloning, "to #{@repo_dir}") do
    # The clone runs with SSH strict host-key checking disabled.
    without_host_key_checking { ::Git.clone(@repository.url, @repo_dir) }
  end
  do_fetch(git)
  do_checkout_revision(git)
  do_update_tag(git)
end

#do_checkout_revision(git) ⇒ Object



223
224
225
226
227
228
229
230
231
232
233
234
235
236
# File 'lib/right_scraper/retrievers/git.rb', line 223

# Checks out the revision named by repo_tag inside a logged
# :checkout_revision operation.
#
# Resolution order: a name that is both a tag and a branch is rejected;
# otherwise a matching remote branch is preferred, then a matching local
# branch, and finally the name is handed to git.checkout as-is.
#
# @param git [Object] repository handle
# @return [Object, nil] result of the logged operation, or nil when
#   repo_tag is falsy
# @raise [RuntimeError] when the name denotes both a branch and a tag
def do_checkout_revision(git)
  return unless repo_tag

  @logger.operation(:checkout_revision) do
    if tag?(git, repo_tag) && branch?(git, repo_tag)
      raise "Ambiguous reference: '#{repo_tag}' denotes both a branch and a tag"
    elsif (remote_branch = find_remote_branch(git, repo_tag))
      remote_branch.checkout
    elsif (local_branch = find_local_branch(git, repo_tag))
      local_branch.checkout
    else
      git.checkout(repo_tag)
    end
  end
end

#do_fetch(git) ⇒ Object



172
173
174
175
176
177
# File 'lib/right_scraper/retrievers/git.rb', line 172

# Refreshes the repository inside a logged :fetch operation: every tag
# currently listed by +git+ is deleted, then a fetch is issued with
# --all, --prune and --tags.
#
# @param git [Object] repository handle
# @return [Object] result of the logged operation
def do_fetch(git)
  @logger.operation(:fetch) do
    # NOTE(review): deleting local tags first presumably lets the fetch
    # recreate them from the remotes so stale tags do not linger - confirm.
    git.tags.each do |existing_tag|
      git.lib.tag(['-d', existing_tag.name])
    end
    git.fetch(%w[--all --prune --tags])
  end
end

#do_update ⇒ Object

Incrementally update the checkout. The operations are as follows:

  • checkout #tag

  • if #tag is the head of a branch:

    • find that branch’s remote

    • fetch it

    • merge changes

    • update @repository#tag

Note that if #tag is a SHA revision or a tag that exists in the current repository, no fetching is done.



188
189
190
191
192
193
194
195
196
197
198
199
200
# File 'lib/right_scraper/retrievers/git.rb', line 188

# Incrementally updates the existing checkout in @repo_dir: opens it,
# fetches, discards local modifications and untracked files, checks out
# the requested revision and records the resulting HEAD sha.
#
# @return [Object] whatever do_update_tag returns
def do_update
  git = ::Git.open(@repo_dir)
  do_fetch(git)
  cleanup_note = "ensure no untracked files in #{@repo_dir}"
  @logger.operation(:cleanup, cleanup_note) do
    git.reset_hard
    # ignore outcome; there is no way to record 'warnings'
    Dir.chdir(@repo_dir) { system("git clean -f") }
  end
  do_checkout_revision(git)
  do_update_tag(git)
end

#do_update_tag(git) ⇒ Object



202
203
204
205
# File 'lib/right_scraper/retrievers/git.rb', line 202

# Replaces @repository with a clone of itself whose tag is the sha of the
# HEAD tree reported by +git+. The previously referenced repository object
# is left untouched.
#
# @param git [Object] repository handle responding to #gtree
# @return [Object] the sha that was assigned
def do_update_tag(git)
  updated_repository = @repository.clone
  @repository = updated_repository
  updated_repository.tag = git.gtree("HEAD").sha
end

#exists? ⇒ Boolean

Return true if a checkout exists. Currently tests for .git in the checkout.

Returns

Boolean

true if the checkout already exists (and thus incremental updating can occur).

Returns:

  • (Boolean)


168
169
170
# File 'lib/right_scraper/retrievers/git.rb', line 168

# Reports whether a checkout is already present, judged by the presence of
# a '.git' entry directly under @repo_dir.
#
# Uses File.exist? because the File.exists? alias was deprecated and has
# been removed from Ruby (3.2 and later), where calling it raises
# NoMethodError.
#
# @return [Boolean] true if '.git' exists under @repo_dir
def exists?
  File.exist?(File.join(@repo_dir, '.git'))
end

#find_branch(git, tag) ⇒ Object



257
258
259
# File 'lib/right_scraper/retrievers/git.rb', line 257

# Finds a branch called +tag+, preferring a local branch over a remote one.
#
# @param git [Object] repository handle
# @param tag [String] branch name to look for
# @return [Object, nil] the local match if any, otherwise the result of
#   the remote lookup
def find_branch(git, tag)
  local_match = find_local_branch(git, tag)
  return local_match if local_match

  find_remote_branch(git, tag)
end

#find_local_branch(git, name) ⇒ Object



261
262
263
# File 'lib/right_scraper/retrievers/git.rb', line 261

# Finds the first local branch of +git+ whose name equals +name+.
#
# @param git [Object] repository handle responding to #branches
# @param name [String] branch name to look for
# @return [Object, nil] the matching branch, or nil when there is none
def find_local_branch(git, name)
  local_branches = git.branches.local
  local_branches.find { |branch| branch.name == name }
end

#find_remote_branch(git, name) ⇒ Object



265
266
267
# File 'lib/right_scraper/retrievers/git.rb', line 265

# Finds the first remote branch of +git+ whose name equals +name+.
#
# @param git [Object] repository handle responding to #branches
# @param name [String] branch name to look for
# @return [Object, nil] the matching branch, or nil when there is none
def find_remote_branch(git, name)
  remote_branches = git.branches.remote
  remote_branches.find { |branch| branch.name == name }
end

#ignorable_paths ⇒ Object

Ignore .git directories.



239
240
241
# File 'lib/right_scraper/retrievers/git.rb', line 239

# Lists the path names to ignore; for git this is only the '.git'
# directory.
#
# @return [Array<String>] a new single-element array on every call
def ignorable_paths
  %w[.git]
end

#repo_tag ⇒ Object



251
252
253
254
255
# File 'lib/right_scraper/retrievers/git.rb', line 251

# Returns the revision name to check out: @repository.tag with a trailing
# line terminator removed, falling back to "master" when the tag is nil
# (or false) or is empty after that removal.
#
# @return [String] the non-empty revision name
def repo_tag
  requested = (@repository.tag || "master").chomp
  requested.empty? ? "master" : requested
end

#retrieve ⇒ Object

In addition to normal retriever initialization, if the underlying repository has a credential we need to initialize a fresh SSHAgent and add the credential to it.

Raises:



103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
# File 'lib/right_scraper/retrievers/git.rb', line 103

# Runs the inherited retrieval inside a fresh SSH agent, adding the
# repository's first credential to the agent when one is present, and
# afterwards logs warnings if the soft size or time limits were exceeded.
#
# @return [true] always true when no exception is raised
# @raise [RetrieverError] when available? reports the retriever unusable
def retrieve
  raise RetrieverError.new("git retriever is unavailable") unless available?

  started_at = nil
  finished_at = nil
  RightScraper::Processes::SSHAgent.with do |agent|
    credential = @repository.first_credential
    agent.add_key(credential) unless credential.nil? || credential.empty?
    # Only the inherited retrieval itself is timed.
    started_at = ::Time.now
    super
    finished_at = ::Time.now
  end

  # TEAL FIX: the use of blackwinter-git has defeated the logic that
  # ensured the max bytes was not exceeded during checkout. we will need
  # to replace blackwinter-git in future but in the interim our only
  # solution is to warn the user after the checkout has completed that we
  # are going to restrict their repo size/time in an upcoming release.
  if size_limit_exceeded?
    soft_limit_mb = @max_bytes / (1024 * 1024)
    @logger.note_warning(
      "The size of the downloaded repository exceeded a soft limit of" \
      " #{soft_limit_mb} MB. This will become a hard limit" \
      " in an upcoming release. You may avoid retrieval failure by" \
      " moving some of your files to seperate repositories."
    )
  end

  if @max_seconds && (finished_at >= started_at + @max_seconds)
    @logger.note_warning(
      "The time to download the repository exceeded a soft limit of" \
      " #{@max_seconds} seconds. This will become a hard limit" \
      " in an upcoming release. You may avoid retrieval failure by" \
      " moving some of your files to seperate repositories."
    )
  end

  true
end

#size_limit_exceeded? ⇒ TrueClass|FalseClass

Determines if total size of files created by child process has exceeded the limit specified, if any.

Return

Returns:

  • (TrueClass|FalseClass)

    true if size limit exceeded



146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
# File 'lib/right_scraper/retrievers/git.rb', line 146

# Determines whether the combined size of the regular files matched by
# '**/*' under @repo_dir is greater than @max_bytes. Scanning stops as soon
# as the running total passes the limit.
#
# @return [TrueClass, FalseClass] true if the limit is exceeded; false
#   otherwise, including when @max_bytes is not set
def size_limit_exceeded?
  return false unless @max_bytes

  total_bytes = 0
  ::Dir.glob(::File.join(@repo_dir, '**/*')) do |path|
    if ::File.file?(path)
      # A file that cannot be stat'ed contributes nothing to the total.
      total_bytes += begin
        ::File.stat(path).size
      rescue
        0
      end
    end
    return true if total_bytes > @max_bytes
  end
  false
end

#tag?(git, name) ⇒ Boolean

Returns:

  • (Boolean)


243
244
245
# File 'lib/right_scraper/retrievers/git.rb', line 243

# Looks for a tag called +name+ among the tags reported by +git+.
#
# @param git [Object] repository handle responding to #tags
# @param name [String] tag name to look for
# @return [Object, nil] the first tag whose #name equals +name+ (truthy),
#   or nil when there is none
def tag?(git, name)
  known_tags = git.tags
  known_tags.find { |candidate| candidate.name == name }
end

#without_host_key_checking { ... } ⇒ Object

Temporarily disable SSH host-key checking for SSH clients invoked by Git, for the duration of the block that is passed to this method.

Yields:

  • after disabling strict host key checking, yields to caller



273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
# File 'lib/right_scraper/retrievers/git.rb', line 273

# Temporarily disables SSH strict host-key checking for SSH clients invoked
# by git, for the duration of the given block.
#
# A throw-away wrapper script is written to a temporary directory and
# GIT_SSH is pointed at it. Afterwards the previous GIT_SSH value is
# restored (or the variable removed if it was unset) and the temporary
# directory is deleted, whether or not the block raised.
#
# The previous GIT_SSH value is captured before anything else happens and
# cleanup only covers what was actually set up, so a failure while
# creating the temporary directory neither clobbers GIT_SSH nor masks the
# original error.
#
# @yield after GIT_SSH points at the wrapper script
# @return [Object] the value returned by the block
def without_host_key_checking
  old_env = ENV['GIT_SSH']
  tmpdir = Dir.mktmpdir

  begin
    ssh_cmd = File.join(tmpdir, 'ssh')
    File.open(ssh_cmd, 'w') do |cmd|
      cmd.puts "#!/bin/bash"
      # "$@" forwards every argument unchanged; an unquoted ${@} would
      # word-split arguments that contain whitespace.
      cmd.puts 'exec ssh -o StrictHostKeyChecking=no "$@"'
    end
    FileUtils.chmod(0700, ssh_cmd)

    ENV['GIT_SSH'] = ssh_cmd
    yield
  ensure
    # Restore the environment first so it is reset even if removing the
    # temporary directory were to fail.
    ENV['GIT_SSH'] = old_env
    FileUtils.rm_rf(tmpdir)
  end
end