Module: Einhorn::Command

Defined in:: lib/einhorn/command.rb,
lib/einhorn/command/interface.rb

Defined Under Namespace

Class Method Summary collapse

Class Method Details

.cull ⇒ `Object`

# File 'lib/einhorn/command.rb', line 421

def self.cull
  acked = Einhorn::WorkerPool.ack_count
  unsignaled = Einhorn::WorkerPool.unsignaled_count
  target = Einhorn::WorkerPool.ack_target

  if Einhorn::State.upgrading && acked >= target
    Einhorn::State.upgrading = false
    Einhorn.log_info("Upgraded successfully to version #{Einhorn::State.version} (Einhorn #{Einhorn::VERSION}).", :upgrade)
    Einhorn.send_tagged_message(:upgrade, "Upgrade done", true)
  end

  old_workers = Einhorn::WorkerPool.old_workers
  Einhorn.log_debug("#{acked} acked, #{unsignaled} unsignaled, #{target} target, #{old_workers.length} old workers")
  if !Einhorn::State.upgrading && old_workers.length > 0
    Einhorn.log_info("Killing off #{old_workers.length} old workers.", :upgrade)
    signal_all("USR2", old_workers)
  elsif Einhorn::State.upgrading && Einhorn::State.smooth_upgrade
    # In a smooth upgrade, kill off old workers one by one when we have
    # sufficiently many new workers.
    excess = (old_workers.length + acked) - target
    if excess > 0
      Einhorn.log_info("Smooth upgrade: killing off #{excess} old workers.", :upgrade)
      signal_all("USR2", old_workers.take(excess))
    else
      Einhorn.log_debug("Not killing old workers, as excess is #{excess}.")
    end
  end

  if unsignaled > target
    excess = Einhorn::WorkerPool.unsignaled_modern_workers_with_priority[0...(unsignaled-target)]
    Einhorn.log_info("Have too many workers at the current version, so killing off #{excess.length} of them.")
    signal_all("USR2", excess)
  end
end

.decrement ⇒ `Object`

# File 'lib/einhorn/command.rb', line 153

def self.decrement
  if Einhorn::State.config[:number] <= 1
    output = "Can't decrease number of workers (already at #{Einhorn::State.config[:number]}).  Run kill #{$$} if you really want to kill einhorn."
    $stderr.puts(output)
    return output
  end

  Einhorn::Event.break_loop
  old = Einhorn::State.config[:number]
  new = (Einhorn::State.config[:number] -= 1)
  output = "Decrementing number of workers from #{old} -> #{new}"
  $stderr.puts(output)
  output
end

.dumpable_state ⇒ `Object`

# File 'lib/einhorn/command.rb', line 186

def self.dumpable_state
  global_state = Einhorn::State.dumpable_state
  descriptor_state = Einhorn::Event.persistent_descriptors.map do |descriptor|
    descriptor.to_state
  end

  {
    :state => global_state,
    :persistent_descriptors => descriptor_state,
  }
end

.full_upgrade(options = {}) ⇒ `Object`

Parameters:

options (Hash) (defaults to: {})

Options Hash (options):

:smooth (Boolean) — default: false —

Whether to perform a smooth or fleet upgrade. In a smooth upgrade, bring up new workers and cull old workers one by one as soon as there is a replacement. In a fleet upgrade, bring up all the new workers and don’t cull any old workers until they’re all up.

# File 'lib/einhorn/command.rb', line 378

def self.full_upgrade(options={})
  options = {:smooth => false}.merge(options)

  Einhorn::State.smooth_upgrade = options.fetch(:smooth)
  reload_for_upgrade
end

.full_upgrade_fleet ⇒ `Object`



388
389
390

# File 'lib/einhorn/command.rb', line 388

def self.full_upgrade_fleet
  full_upgrade(:smooth => false)
end

.full_upgrade_smooth ⇒ `Object`



385
386
387

# File 'lib/einhorn/command.rb', line 385

def self.full_upgrade_smooth
  full_upgrade(:smooth => true)
end

.increment ⇒ `Object`

# File 'lib/einhorn/command.rb', line 144

def self.increment
  Einhorn::Event.break_loop
  old = Einhorn::State.config[:number]
  new = (Einhorn::State.config[:number] += 1)
  output = "Incrementing number of workers from #{old} -> #{new}"
  $stderr.puts(output)
  output
end

.louder(log = true) ⇒ `Object`

# File 'lib/einhorn/command.rb', line 538

def self.louder(log=true)
  Einhorn::State.verbosity -= 1 if Einhorn::State.verbosity > 0
  output = "Verbosity set to #{Einhorn::State.verbosity}"
  Einhorn.log_info(output) if log
  output
end

.mourn(pid) ⇒ `Object`

Mourn the death of your child

# File 'lib/einhorn/command.rb', line 24

def self.mourn(pid)
  unless spec = Einhorn::State.children[pid]
    Einhorn.log_error("Could not find any config for exited child #{pid.inspect}! This probably indicates a bug in Einhorn.")
    return
  end

  Einhorn::State.children.delete(pid)

  # Unacked worker
  if spec[:type] == :worker && !spec[:acked]
    Einhorn::State.consecutive_deaths_before_ack += 1
    extra = ' before it was ACKed'
  else
    extra = nil
  end

  case type = spec[:type]
  when :worker
    Einhorn.log_info("===> Exited worker #{pid.inspect}#{extra}", :upgrade)
  else
    Einhorn.log_error("===> Exited process #{pid.inspect} has unrecgonized type #{type.inspect}: #{spec.inspect}", :upgrade)
  end
end

.next_index ⇒ `Object`

# File 'lib/einhorn/command.rb', line 249

def self.next_index
  all_indexes = Set.new(Einhorn::State.children.map { |k, st| st[:index] })
  0.upto(all_indexes.length) do |i|
    return i unless all_indexes.include?(i)
  end
end

.prepare_child_environment(index) ⇒ `Object`

# File 'lib/einhorn/command.rb', line 314

def self.prepare_child_environment(index)
  # This is run from the child
  ENV['EINHORN_MASTER_PID'] = Process.ppid.to_s
  ENV['EINHORN_SOCK_PATH'] = Einhorn::Command::Interface.socket_path
  if Einhorn::State.command_socket_as_fd
    socket = UNIXSocket.open(Einhorn::Command::Interface.socket_path)
    Einhorn::TransientState.socket_handles << socket
    ENV['EINHORN_SOCK_FD'] = socket.fileno.to_s
  end

  ENV['EINHORN_FD_COUNT'] = Einhorn::State.bind_fds.length.to_s
  Einhorn::State.bind_fds.each_with_index {|fd, i| ENV["EINHORN_FD_#{i}"] = fd.to_s}

  ENV['EINHORN_CHILD_INDEX'] = index.to_s

  # EINHORN_FDS is deprecated. It was originally an attempt to
  # match Upstart's nominal internal support for space-separated
  # FD lists, but nobody uses that in practice, and it makes
  # finding individual FDs more difficult
  ENV['EINHORN_FDS'] = Einhorn::State.bind_fds.map(&:to_s).join(' ')
end

.prepare_child_process ⇒ `Object`



366
367
368

# File 'lib/einhorn/command.rb', line 366

def self.prepare_child_process
  Einhorn.renice_self
end

.quieter(log = true) ⇒ `Object`

# File 'lib/einhorn/command.rb', line 531

def self.quieter(log=true)
  Einhorn::State.verbosity += 1 if Einhorn::State.verbosity < 2
  output = "Verbosity set to #{Einhorn::State.verbosity}"
  Einhorn.log_info(output) if log
  output
end

.reap ⇒ `Object`

# File 'lib/einhorn/command.rb', line 9

def self.reap
  begin
    while true
      Einhorn.log_debug('Going to reap a child process')

      pid = Process.wait(-1, Process::WNOHANG)
      return unless pid
      mourn(pid)
      Einhorn::Event.break_loop
    end
  rescue Errno::ECHILD
  end
end

.register_ack(pid) ⇒ `Object`

# File 'lib/einhorn/command.rb', line 75

def self.register_ack(pid)
  unless spec = Einhorn::State.children[pid]
    Einhorn.log_error("Could not find state for PID #{pid.inspect}; ignoring ACK.")
    return
  end

  if spec[:acked]
    Einhorn.log_error("Pid #{pid.inspect} already ACKed; ignoring new ACK.")
    return
  end

  if Einhorn::State.consecutive_deaths_before_ack > 0
    extra = ", breaking the streak of #{Einhorn::State.consecutive_deaths_before_ack} consecutive unacked workers dying"
  else
    extra = nil
  end
  Einhorn::State.consecutive_deaths_before_ack = 0

  spec[:acked] = true
  Einhorn.log_info("Up to #{Einhorn::WorkerPool.ack_count} / #{Einhorn::WorkerPool.ack_target} #{Einhorn::State.ack_mode[:type]} ACKs#{extra}")
  # Could call cull here directly instead, I believe.
  Einhorn::Event.break_loop
end

.register_manual_ack(pid) ⇒ `Object`

# File 'lib/einhorn/command.rb', line 48

def self.register_manual_ack(pid)
  ack_mode = Einhorn::State.ack_mode
  unless ack_mode[:type] == :manual
    Einhorn.log_error("Received a manual ACK for #{pid.inspect}, but ack_mode is #{ack_mode.inspect}. Ignoring ACK.")
    return
  end
  Einhorn.log_info("Received a manual ACK from #{pid.inspect}")
  register_ack(pid)
end

.register_timer_ack(time, pid) ⇒ `Object`

# File 'lib/einhorn/command.rb', line 58

def self.register_timer_ack(time, pid)
  ack_mode = Einhorn::State.ack_mode
  unless ack_mode[:type] == :timer
    Einhorn.log_error("Received a timer ACK for #{pid.inspect}, but ack_mode is #{ack_mode.inspect}. Ignoring ACK.")
    return
  end

  unless Einhorn::State.children[pid]
    # TODO: Maybe cancel pending ACK timers upon death?
    Einhorn.log_debug("Worker #{pid.inspect} died before its timer ACK happened.")
    return
  end

  Einhorn.log_info("Worker #{pid.inspect} has been up for #{time}s, so we are considering it alive.")
  register_ack(pid)
end

.reload ⇒ `Object`

# File 'lib/einhorn/command.rb', line 198

def self.reload
  unless Einhorn::State.respawn
    Einhorn.log_info("Not reloading einhorn because we're exiting")
    return
  end

  Einhorn.log_info("Reloading einhorn master (#{Einhorn::TransientState.script_name})...", :reload)

  # In case there's anything lurking
  $stdout.flush

  # Spawn a child to pass the state through the pipe
  read, write = Einhorn::Compat.pipe

  fork do
    Einhorn::TransientState.whatami = :state_passer
    Einhorn::State.generation += 1
    read.close

    begin
      write.write(YAML.dump(dumpable_state))
    rescue Errno::EPIPE => e
      e.message << " (state worker could not write state, which likely means the parent process has died)"
      raise e
    end
    write.close

    exit(0)
  end
  write.close

  unless Einhorn.can_safely_reload?
    Einhorn.log_error("Can not initiate einhorn master reload safely, aborting", :reload)
    Einhorn::State.reloading_for_upgrade = false
    read.close
    return
  end

  begin
    Einhorn.initialize_reload_environment
    respawn_commandline = Einhorn.upgrade_commandline(['--with-state-fd', read.fileno.to_s])
    respawn_commandline << { :close_others => false }
    Einhorn.log_info("About to re-exec einhorn master as #{respawn_commandline.inspect}", :reload)
    Einhorn::Compat.exec(*respawn_commandline)
  rescue SystemCallError => e
    Einhorn.log_error("Could not reload! Attempting to continue. Error was: #{e}", :reload)
    Einhorn::State.reloading_for_upgrade = false
    read.close
  end
end

.reload_for_upgrade ⇒ `Object`

# File 'lib/einhorn/command.rb', line 392

def self.reload_for_upgrade
  Einhorn::State.reloading_for_upgrade = true
  reload
end

.replenish ⇒ `Object`

# File 'lib/einhorn/command.rb', line 461

def self.replenish
  return unless Einhorn::State.respawn

  if !Einhorn::State.last_spinup
    replenish_immediately
  else
    replenish_gradually
  end
end

.replenish_gradually(max_unacked = nil) ⇒ `Object`

# File 'lib/einhorn/command.rb', line 481

def self.replenish_gradually(max_unacked=nil)
  return if Einhorn::TransientState.has_outstanding_spinup_timer
  return unless Einhorn::WorkerPool.missing_worker_count > 0

  # default to spinning up at most NCPU workers at once
  unless max_unacked
    begin
      @processor_count ||= Einhorn::Compat.processor_count
    rescue => err
      Einhorn.log_error(err.inspect)
      @processor_count = 1
    end
    max_unacked = @processor_count
  end

  if max_unacked <= 0
    raise ArgumentError.new("max_unacked must be positive")
  end

  # Exponentially backoff automated spinup if we're just having
  # things die before ACKing
  spinup_interval = Einhorn::State.config[:seconds] * (1.5 ** Einhorn::State.consecutive_deaths_before_ack)
  seconds_ago = (Time.now - Einhorn::State.last_spinup).to_f

  if seconds_ago > spinup_interval
    unacked = Einhorn::WorkerPool.unacked_unsignaled_modern_workers.length
    if unacked >= max_unacked
      Einhorn.log_debug("There are #{unacked} unacked new workers, and max_unacked is #{max_unacked}, so not spinning up a new process")
    else
      msg = "Last spinup was #{seconds_ago}s ago, and spinup_interval is #{spinup_interval}s, so spinning up a new process"

      if Einhorn::State.consecutive_deaths_before_ack > 0
        Einhorn.log_info("#{msg} (there have been #{Einhorn::State.consecutive_deaths_before_ack} consecutive unacked worker deaths)", :upgrade)
      else
        Einhorn.log_debug(msg)
      end

      spinup
    end
  else
    Einhorn.log_debug("Last spinup was #{seconds_ago}s ago, and spinup_interval is #{spinup_interval}s, so not spinning up a new process")
  end

  Einhorn::TransientState.has_outstanding_spinup_timer = true
  Einhorn::Event::Timer.open(spinup_interval) do
    Einhorn::TransientState.has_outstanding_spinup_timer = false
    replenish
  end
end

.replenish_immediately ⇒ `Object`

# File 'lib/einhorn/command.rb', line 471

def self.replenish_immediately
  missing = Einhorn::WorkerPool.missing_worker_count
  if missing <= 0
    Einhorn.log_error("Missing is currently #{missing.inspect}, but should always be > 0 when replenish_immediately is called. This probably indicates a bug in Einhorn.")
    return
  end
  Einhorn.log_info("Launching #{missing} new workers")
  missing.times {spinup}
end

.reseed_random ⇒ `Object`

Reseed common ruby random number generators.

OpenSSL::Random uses the PID to reseed after fork, which means that if a long-lived master process over its lifetime spawns two workers with the same PID, those workers will start with the same OpenSSL seed.

Ruby >= 1.9 has a guard against this in SecureRandom, but any direct users of OpenSSL::Random will still be affected.

Ruby 1.8 didn’t even reseed the default random number generator used by Kernel#rand in certain releases.

bugs.ruby-lang.org/issues/4579

# File 'lib/einhorn/command.rb', line 350

def self.reseed_random
  # reseed Kernel#rand
  srand

  # reseed OpenSSL::Random if it's loaded
  if defined?(OpenSSL::Random)
    if defined?(Random)
      seed = Random.new_seed
    else
      # Ruby 1.8
      seed = rand
    end
    OpenSSL::Random.seed(seed.to_s)
  end
end

.set_workers(new) ⇒ `Object`

# File 'lib/einhorn/command.rb', line 168

def self.set_workers(new)
  if new == Einhorn::State.config[:number]
    return ""
  end

  Einhorn::Event.break_loop
  old = Einhorn::State.config[:number]
  Einhorn::State.config[:number] = new
  output = "Altering worker count, #{old} -> #{new}. Will "
  if old < new
    output << "spin up additional workers."
  else
    output << "gracefully terminate workers."
  end
  $stderr.puts(output)
  output
end

.signal_all(signal, children = nil, record = true) ⇒ `Object`

# File 'lib/einhorn/command.rb', line 99

def self.signal_all(signal, children=nil, record=true)
  children ||= Einhorn::WorkerPool.workers

  signaled = []
  Einhorn.log_info("Sending #{signal} to #{children.inspect}", :upgrade)

  children.each do |child|
    unless spec = Einhorn::State.children[child]
      Einhorn.log_error("Trying to send #{signal} to dead child #{child.inspect}. The fact we tried this probably indicates a bug in Einhorn.", :upgrade)
      next
    end

    if record
      if spec[:signaled].include?(signal)
        Einhorn.log_error("Re-sending #{signal} to already-signaled child #{child.inspect}. It may be slow to spin down, or it may be swallowing #{signal}s.", :upgrade)
      end
      spec[:signaled].add(signal)
    end

    begin
      Process.kill(signal, child)
    rescue Errno::ESRCH
    else
      signaled << child
    end
  end

  if Einhorn::State.signal_timeout
    Einhorn::Event::Timer.open(Einhorn::State.signal_timeout) do
      children.each do |child|
        next unless spec = Einhorn::State.children[child]

        Einhorn.log_info("Child #{child.inspect} is still active after #{Einhorn::State.signal_timeout}. Sending SIGKILL.")
        begin
          Process.kill('KILL', child)
        rescue Errno::ESRCH
        end
        spec[:signaled].add('KILL')
      end
    end
  end

  "Successfully sent #{signal}s to #{signaled.length} processes: #{signaled.inspect}"
end

.spinup(cmd = nil) ⇒ `Object`

# File 'lib/einhorn/command.rb', line 256

def self.spinup(cmd=nil)
  cmd ||= Einhorn::State.cmd
  index = next_index
  if Einhorn::TransientState.preloaded
    pid = fork do
      Einhorn::TransientState.whatami = :worker
      prepare_child_process

      Einhorn.log_info('About to tear down Einhorn state and run einhorn_main')
      Einhorn::Command::Interface.uninit
      Einhorn::Event.close_all_for_worker
      Einhorn.set_argv(cmd, true)

      reseed_random

      prepare_child_environment(index)
      einhorn_main
    end
  else
    pid = fork do
      Einhorn::TransientState.whatami = :worker
      prepare_child_process

      Einhorn.log_info("About to exec #{cmd.inspect}")
      # Here's the only case where cloexec would help. Since we
      # have to track and manually close FDs for other cases, we
      # may as well just reuse close_all rather than also set
      # cloexec on everything.
      #
      # Note that Ruby 1.9's close_others option is useful here.
      Einhorn::Event.close_all_for_worker

      prepare_child_environment(index)
      Einhorn::Compat.exec(cmd[0], cmd[1..-1], :close_others => false)
    end
  end

  Einhorn.log_info("===> Launched #{pid} (index: #{index})", :upgrade)
  Einhorn::State.children[pid] = {
    :type => :worker,
    :version => Einhorn::State.version,
    :acked => false,
    :signaled => Set.new,
    :index => index
  }
  Einhorn::State.last_spinup = Time.now

  # Set up whatever's needed for ACKing
  ack_mode = Einhorn::State.ack_mode
  case type = ack_mode[:type]
  when :timer
    Einhorn::Event::ACKTimer.open(ack_mode[:timeout], pid)
  when :manual
  else
    Einhorn.log_error("Unrecognized ACK mode #{type.inspect}")
  end
end

.stop_respawning ⇒ `Object`

# File 'lib/einhorn/command.rb', line 456

def self.stop_respawning
  Einhorn::State.respawn = false
  Einhorn::Event.break_loop
end

.upgrade_workers ⇒ `Object`

# File 'lib/einhorn/command.rb', line 397

def self.upgrade_workers
  if Einhorn::State.upgrading
    Einhorn.log_info("Currently upgrading (#{Einhorn::WorkerPool.ack_count} / #{Einhorn::WorkerPool.ack_target} ACKs; bumping version and starting over)...", :upgrade)
  else
    Einhorn::State.upgrading = true
    u_type = Einhorn::State.smooth_upgrade ? 'smooth' : 'fleet'
    Einhorn.log_info("Starting #{u_type} upgrade from version" +
                     " #{Einhorn::State.version}...", :upgrade)
  end

  # Reset this, since we've just upgraded to a new universe (I'm
  # not positive this is the right behavior, but it's not
  # obviously wrong.)
  Einhorn::State.consecutive_deaths_before_ack = 0
  Einhorn::State.last_upgraded = Time.now

  Einhorn::State.version += 1
  if Einhorn::State.smooth_upgrade
    replenish_gradually
  else
    replenish_immediately
  end
end

Module: Einhorn::Command

Defined Under Namespace

Class Method Summary collapse

Class Method Details

.cull ⇒ Object

.decrement ⇒ Object

.dumpable_state ⇒ Object

.full_upgrade(options = {}) ⇒ Object

.full_upgrade_fleet ⇒ Object

.full_upgrade_smooth ⇒ Object

.increment ⇒ Object

.louder(log = true) ⇒ Object

.mourn(pid) ⇒ Object

.next_index ⇒ Object

.prepare_child_environment(index) ⇒ Object

.prepare_child_process ⇒ Object

.quieter(log = true) ⇒ Object

.reap ⇒ Object

.register_ack(pid) ⇒ Object

.register_manual_ack(pid) ⇒ Object

.register_timer_ack(time, pid) ⇒ Object

.reload ⇒ Object

.reload_for_upgrade ⇒ Object

.replenish ⇒ Object

.replenish_gradually(max_unacked = nil) ⇒ Object

.replenish_immediately ⇒ Object

.reseed_random ⇒ Object

.set_workers(new) ⇒ Object

.signal_all(signal, children = nil, record = true) ⇒ Object

.spinup(cmd = nil) ⇒ Object

.stop_respawning ⇒ Object

.upgrade_workers ⇒ Object