Class: SwitchmanInstJobs::JobsMigrator

Inherits:

Object

Object
SwitchmanInstJobs::JobsMigrator

show all

Defined in: lib/switchman_inst_jobs/jobs_migrator.rb

Class Method Summary collapse

.add_before_move_callback(proc) ⇒ Object
.clear_shard_cache(debug_message = nil) ⇒ Object
.ensure_unblock_stranded_for(shards) ⇒ Object

If :migrate_strands ran on any shards that fell into scenario 1, then block_stranded was never flipped back, so do that now.
.migrate_everything ⇒ Object
.migrate_shards(shard_map) ⇒ Object
.migrate_strands ⇒ Object
.run ⇒ Object

This method expects that all relevant shards already have block_stranded: true but otherwise jobs can be running normally.
.transaction_on(shards, &block) ⇒ Object
.unblock_strands(target_shard) ⇒ Object

Class Method Details

.add_before_move_callback(proc) ⇒ `Object`

# File 'lib/switchman_inst_jobs/jobs_migrator.rb', line 7

# Registers a callback in the list kept in @before_move_callbacks.
#
# NOTE(review): the code that invokes these callbacks is not visible here;
# judging by the name they presumably run before jobs are moved — confirm.
#
# @param proc [#call] the callback to register
# @return [Array] every callback registered so far, in registration order
def add_before_move_callback(proc)
  (@before_move_callbacks ||= []).push(proc)
end

.clear_shard_cache(debug_message = nil) ⇒ `Object`

# File 'lib/switchman_inst_jobs/jobs_migrator.rb', line 73

# Clears the Switchman cache and then (normally) blocks for 65 seconds.
#
# The pause is a little longer than the 60 second in-process shard cache
# clearing threshold, so that by the time this returns all new stranded jobs
# are being enqueued with next_in_strand: false.
#
# @param debug_message [String, nil] extra text appended to the debug log line
# @return [Integer, nil] whatever +sleep+ returns, or nil when the wait is skipped
def clear_shard_cache(debug_message = nil)
  ::Switchman.cache.clear
  Rails.logger.debug("Waiting for caches to clear #{debug_message}")

  # @skip_cache_wait exists for spec usage only.
  return if @skip_cache_wait

  sleep(65)
end

.ensure_unblock_stranded_for(shards) ⇒ `Object`

If :migrate_strands ran on any shards that fell into scenario 1, then block_stranded was never flipped back, so do that now.

# File 'lib/switchman_inst_jobs/jobs_migrator.rb', line 60

# Clears block_stranded on whichever of the given shards still have it set.
#
# If migrate_strands ran on shards that fell into its scenario 1 (nothing to
# move), block_stranded never got flipped back for them, so do it here.
#
# @param shards [Array] shard ids or shard records to check
# @return [Array, nil] nil when no shard was still blocked; otherwise the list
#   of distinct jobs shards whose strands were unblocked
def ensure_unblock_stranded_for(shards)
  still_blocked = ::Switchman::Shard.where(id: shards, block_stranded: true).to_a
  return if still_blocked.empty?

  ::Switchman::Shard.where(id: still_blocked).update_all(block_stranded: false)
  clear_shard_cache

  # The records in still_blocked are stale now that block_stranded has been
  # updated in the database; only delayed_jobs_shard is read from them below.
  jobs_shards = still_blocked.map(&:delayed_jobs_shard).uniq
  jobs_shards.each { |dj_shard| unblock_strands(dj_shard) }
end

.migrate_everything ⇒ `Object`

# File 'lib/switchman_inst_jobs/jobs_migrator.rb', line 188

# Moves the unstranded, unlocked jobs off the currently active delayed_jobs
# shard.
#
# build_shard_map and batch_move_jobs are defined elsewhere in this class;
# from their use here, the map yields (target shard, source shard ids) pairs
# and batch_move_jobs relocates the jobs in the given scope.
#
# @return [Object] the shard map returned by build_shard_map
def migrate_everything
  jobs_shard = ::Switchman::Shard.current(:delayed_jobs)
  unstranded = ::Delayed::Job.shard(jobs_shard).where('strand IS NULL')

  build_shard_map(unstranded, jobs_shard).each do |target_shard, source_shard_ids|
    # Jobs with a locked_by value are excluded from the move.
    movable = unstranded.where(shard_id: source_shard_ids).where(locked_by: nil)
    batch_move_jobs(target_shard: target_shard, source_shard: jobs_shard, scope: movable)
  end
end

.migrate_shards(shard_map) ⇒ `Object`

# File 'lib/switchman_inst_jobs/jobs_migrator.rb', line 26

# Re-points shards at new jobs shards and migrates their jobs.
#
# Steps, in order:
# 1. set delayed_jobs_shard_id and block_stranded: true on every shard in the
#    map (one UPDATE per target), then clear caches;
# 2. run migrate_strands on every affected source jobs shard;
# 3. run migrate_everything on every affected source jobs shard;
# 4. clear block_stranded on any mapped shard that still has it set.
#
# @param shard_map [Enumerable] pairs of (shard or shard id, target jobs shard or its id)
# @return [Object] the result of ensure_unblock_stranded_for
def migrate_shards(shard_map)
  source_shards = Set.new
  # target jobs shard id => ids of the shards that should now point at it
  target_shards = {}
  shard_map.each do |shard, target_shard|
    shard = ::Switchman::Shard.find(shard) unless shard.is_a?(::Switchman::Shard)
    source_shards.add(shard.delayed_jobs_shard.id)
    target_id = target_shard.try(:id) || target_shard
    (target_shards[target_id] ||= []) << shard.id
  end

  # Do the updates in batches and then just clear redis instead of clearing them one at a time
  target_shards.each do |target_id, shard_ids|
    updates = { delayed_jobs_shard_id: target_id, block_stranded: true }
    updates[:updated_at] = Time.zone.now if ::Switchman::Shard.column_names.include?('updated_at')
    ::Switchman::Shard.where(id: shard_ids).update_all(updates)
  end
  clear_shard_cache

  ::Switchman::Shard.clear_cache

  # Two separate passes over the source shards: strands are migrated everywhere
  # first so that strands can stop being blocked before the unstranded jobs are
  # migrated.
  %i[migrate_strands migrate_everything].each do |phase|
    source_shards.each do |jobs_shard_id|
      ::Switchman::Shard.lookup(jobs_shard_id).activate(:delayed_jobs) { send(phase) }
    end
  end

  ensure_unblock_stranded_for(shard_map.map(&:first))
end

.migrate_strands ⇒ `Object`

# File 'lib/switchman_inst_jobs/jobs_migrator.rb', line 92

# Moves stranded jobs (strand is not NULL) off the currently active
# delayed_jobs shard, one strand at a time.
#
# For each (target_shard, source_shard_ids) pair produced by build_shard_map
# (defined elsewhere in this class), every strand is processed inside
# transaction_on([source_shard, target_shard]). Once all strands for a target
# are done, block_stranded is cleared on the source shard ids and
# unblock_strands is run for the target.
#
# @return [Object] the shard map returned by build_shard_map (result of #each)
def migrate_strands
  # there are 4 scenarios to deal with here
  # 1) no running job, no jobs moved: do nothing
  # 2) running job, no jobs moved; create blocker with next_in_strand=false
  #    to prevent new jobs from immediately executing
  # 3) running job, jobs moved; set next_in_strand=false on the first of
  #    those (= do nothing since it should already be false)
  # 4) no running job, jobs moved: set next_in_strand=true on the first of
  #    those (= do nothing since it should already be true)

  source_shard = ::Switchman::Shard.current(:delayed_jobs)
  strand_scope = ::Delayed::Job.shard(source_shard).where.not(strand: nil)
  shard_map = build_shard_map(strand_scope, source_shard)
  shard_map.each do |(target_shard, source_shard_ids)|
    shard_scope = strand_scope.where(shard_id: source_shard_ids)

    # 1) is taken care of because it should not show up here in strands
    strands = shard_scope.distinct.order(:strand).pluck(:strand)

    # Everything below runs with the target as the active delayed_jobs shard.
    # NOTE(review): unscoped ::Delayed::Job calls in this block (create!, where)
    # presumably hit the target shard for that reason — confirm.
    target_shard.activate(:delayed_jobs) do
      strands.each do |strand|
        # A fresh array literal is passed on every iteration.
        transaction_on([source_shard, target_shard]) do
          this_strand_scope = shard_scope.where(strand: strand)
          # we want to copy all the jobs except the one that is still running.
          jobs_scope = this_strand_scope.where(locked_by: nil)

          # 2) and part of 3) are taken care of here by creating a blocker
          # job with next_in_strand = false. as soon as the current
          # running job is finished it should set next_in_strand
          # We lock it to ensure that the jobs worker can't delete it until we are done moving the strand
          # Since we only unlock it on the new jobs queue *after* deleting from the original
          # the lock ensures the blocker always gets unlocked
          first = this_strand_scope.where.not(locked_by: nil).next_in_strand_order.lock.first
          if first
            # The blocker is created first and then filled in and saved again.
            first_job = ::Delayed::Job.create!(strand: strand, next_in_strand: false)
            first_job.payload_object = ::Delayed::PerformableMethod.new(Kernel, :sleep, args: [0])
            first_job.queue = first.queue
            first_job.tag = 'Kernel.sleep'
            # unblock_strands recognises blockers by this exact source value.
            first_job.source = 'JobsMigrator::StrandBlocker'
            first_job.max_attempts = 1
            # If we ever have jobs left over from 9999 jobs moves of a single shard,
            # something has gone terribly wrong
            first_job.strand_order_override = -9999
            first_job.save!
            # the rest of 3) is taken care of here
            # make sure that all the jobs moved over are NOT next in strand
            ::Delayed::Job.where(next_in_strand: true, strand: strand, locked_by: nil).
              update_all(next_in_strand: false)
          end

          # 4) is taken care of here, by leaving next_in_strand alone and
          # it should execute on the new shard
          batch_move_jobs(
            target_shard: target_shard,
            source_shard: source_shard,
            scope: jobs_scope
          ) do |job, new_job|
            # This ensures jobs enqueued on the old jobs shard run before jobs on the new jobs queue
            new_job.strand_order_override = job.strand_order_override - 1
          end
        end
      end

      # update_all returns the number of rows changed; zero means no shard was
      # still flagged as blocked.
      updated = ::Switchman::Shard.where(id: source_shard_ids, block_stranded: true).
        update_all(block_stranded: false)
      # If this is being manually re-run for some reason to clean something up, don't wait for nothing to happen
      clear_shard_cache("(#{source_shard.id} -> #{target_shard.id})") unless updated.zero?

      ::Switchman::Shard.clear_cache
      # At this time, let's unblock all the strands on the target shard that aren't being held by a blocker
      # but actually could have run and we just didn't know it because we didn't know if they had jobs
      # on the source shard
      unblock_strands(target_shard)
    end
  end
end

.run ⇒ `Object`

This method expects that all relevant shards already have block_stranded: true, but otherwise jobs can be running normally.

# File 'lib/switchman_inst_jobs/jobs_migrator.rb', line 85

# Migrates jobs off the currently active delayed_jobs shard: stranded jobs
# first (migrate_strands), then the remaining jobs (migrate_everything).
#
# This method expects that all relevant shards already have
# block_stranded: true, but otherwise jobs can be running normally.
#
# @return [Object] the result of migrate_everything
def run
  # Ensure this is never run with a dirty in-memory shard cache
  ::Switchman::Shard.clear_cache
  migrate_strands
  migrate_everything
end

.transaction_on(shards, &block) ⇒ `Object`

# File 'lib/switchman_inst_jobs/jobs_migrator.rb', line 12

# Runs the block inside nested ::Delayed::Job transactions, one per shard.
#
# Transactions are opened starting with the LAST shard in the list and working
# toward the first. After each transaction is opened, the delayed_jobs shard
# that was current when this method was called is re-activated, so the block
# itself runs on the caller's shard.
#
# The passed array is not modified (the previous implementation popped
# elements off it, emptying the caller's array and failing on frozen arrays).
#
# @param shards [Array] shards to open a transaction on; may be empty
# @yield once, inside all of the opened transactions
# @return [Object] the value returned by the block
def transaction_on(shards, &block)
  return yield if shards.empty?

  # Split without mutating the caller's array.
  *remaining, shard = shards
  current_shard = ::Switchman::Shard.current(:delayed_jobs)
  shard.activate(:delayed_jobs) do
    ::Delayed::Job.transaction do
      current_shard.activate(:delayed_jobs) do
        transaction_on(remaining, &block)
      end
    end
  end
end

.unblock_strands(target_shard) ⇒ `Object`

# File 'lib/switchman_inst_jobs/jobs_migrator.rb', line 169

# Repeatedly sets next_in_strand: true on stranded jobs on the given jobs shard
# until an update pass changes no rows.
#
# Each pass picks at most one job id per strand (DISTINCT ON (strand), ordered
# by strand_order_override then id), restricted to strands that:
# * do not belong to a shard currently flagged block_stranded, and
# * have no job that is already next_in_strand and no JobsMigrator blocker job.
#
# @param target_shard [::Switchman::Shard] the jobs shard to operate on
# @return [nil]
def unblock_strands(target_shard)
  target_shard.activate(:delayed_jobs) do
    loop do
      # We only want to unlock stranded jobs where they don't belong to a blocked shard (if they *do*
      # belong to a blocked shard, they must be part of a concurrent jobs migration from a different
      # source shard to this target shard, so we shouldn't unlock them yet).
      blocked_shard_ids = ::Switchman::Shard.where(block_stranded: true).pluck(:id)

      # EXISTS subquery: the strand already has a runnable job or a migration blocker.
      strand_already_held = ::Delayed::Job.select(1).from("#{::Delayed::Job.quoted_table_name} dj2").
        where("dj2.next_in_strand = true OR dj2.source = 'JobsMigrator::StrandBlocker'").
        where('dj2.strand = delayed_jobs.strand').arel.exists

      # We only ever unlock one job per strand here to keep the logic cleaner; if the job is
      # n-stranded, after the first one runs, the trigger will unlock larger batches.
      first_job_per_strand = ::Delayed::Job.select('DISTINCT ON (strand) id').
        where.not(strand: nil).
        where.not(shard_id: blocked_shard_ids).
        where(strand_already_held.not).
        order(:strand, :strand_order_override, :id)

      released = ::Delayed::Job.where(id: first_job_per_strand).limit(500).update_all(next_in_strand: true)
      break if released.zero?
    end
  end
end

Class: SwitchmanInstJobs::JobsMigrator

Class Method Summary collapse

Class Method Details

.add_before_move_callback(proc) ⇒ Object

.clear_shard_cache(debug_message = nil) ⇒ Object

.ensure_unblock_stranded_for(shards) ⇒ Object

.migrate_everything ⇒ Object

.migrate_shards(shard_map) ⇒ Object

.migrate_strands ⇒ Object

.run ⇒ Object

.transaction_on(shards, &block) ⇒ Object

.unblock_strands(target_shard) ⇒ Object