Class: Bosh::Agent::Monit

Inherits:
Object show all
Defined in:
lib/bosh_agent/monit.rb

Overview

A good chunk of this code is lifted from the implementation of POSIX::Spawn::Child

Constant Summary collapse

BUFSIZE =
(32 * 1024)
NUM_RETRY_MONIT_INCARNATION =
60
NUM_RETRY_MONIT_WAIT_INCARNATION =
15

Class Attribute Summary collapse

Class Method Summary collapse

Instance Method Summary collapse

Constructor Details

#initializeMonit

Returns a new instance of Monit.



289
290
291
# File 'lib/bosh_agent/monit.rb', line 289

# Creates a Monit runner.
#
# Stores the logger obtained from Bosh::Agent::Config in @logger, which the
# instance methods use for their log output.
def initialize
  @logger = Bosh::Agent::Config.logger
end

Class Attribute Details

.enabledObject

Returns the value of attribute enabled.



12
13
14
# File 'lib/bosh_agent/monit.rb', line 12

# Returns the current value of the @enabled flag.
#
# The flag is nil until `enable` sets it to true; the status helpers in this
# class return their fallback values while it is falsy.
def enabled
  @enabled
end

Class Method Details

.base_dirObject



25
26
27
# File 'lib/bosh_agent/monit.rb', line 25

# Returns the agent base directory as reported by Bosh::Agent::Config.base_dir.
# The monit directory, binary and monitrc paths are built relative to it.
def base_dir
  Bosh::Agent::Config.base_dir
end

.enableObject

enable is supposed to be called at the very beginning, as it creates sync primitives. Ideally this class should be refactored to minimize the number of singleton methods that have to keep track of the state.



17
18
19
# File 'lib/bosh_agent/monit.rb', line 17

# Switches monit integration on by setting the @enabled flag to true.
#
# Until this has been called, get_status, get_system_status and get_vitals
# return an empty hash and service_group_state reports "running".
def enable
  @enabled     = true
end

.get_status(num_retries = 10) ⇒ Object



226
227
228
229
230
231
# File 'lib/bosh_agent/monit.rb', line 226

# Asks monit for the status of every service in the BOSH application group.
#
# @param num_retries [Integer] number of attempts handed to retry_monit_request
# @return [Object] whatever the client's status call returns, or an empty
#   hash when monit integration has not been enabled
def get_status(num_retries=10)
  if @enabled
    retry_monit_request(num_retries) { |api| api.status(:group => BOSH_APP_GROUP) }
  else
    {}
  end
end

.get_system_status(num_retries = 10) ⇒ Object



233
234
235
236
237
238
239
240
# File 'lib/bosh_agent/monit.rb', line 233

# Fetches the status entry monit reports for services of type :system.
#
# @param num_retries [Integer] number of attempts handed to retry_monit_request
# @return [Object] the first value of the status hash returned by the client;
#   an empty hash when monit integration is disabled or the client did not
#   return a Hash
def get_system_status(num_retries=10)
  return {} unless @enabled

  retry_monit_request(num_retries) do |api|
    report = api.status(:type => :system)
    if report.is_a?(Hash)
      report.values.first
    else
      # Leaves the whole method, not just the block.
      return {}
    end
  end
end

.get_vitals(num_retries = 10) ⇒ Object



242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
# File 'lib/bosh_agent/monit.rb', line 242

# Extracts load, CPU, memory and swap figures from monit's system status.
#
# Reads the "system" section under the :raw key of get_system_status and
# maps it onto a flat structure; figures that are missing come back as nil.
#
# @param num_retries [Integer] number of attempts passed to get_system_status
# @return [Hash] hash with "load", "cpu", "mem" and "swap" keys, or an empty
#   hash when monit is disabled or no Hash status is available
def get_vitals(num_retries=10)
  return {} unless @enabled

  status = get_system_status(num_retries)
  return {} unless status.is_a?(Hash)

  sys_section = (status[:raw] || {})["system"] || {}
  load_avg, cpu, memory, swap = %w[load cpu memory swap].map { |part| sys_section[part] || {} }
  usage = lambda { |stats| { "percent" => stats["percent"], "kb" => stats["kilobyte"] } }

  {
    "load" => %w[avg01 avg05 avg15].map { |window| load_avg[window] },
    "cpu" => { "user" => cpu["user"], "sys" => cpu["system"], "wait" => cpu["wait"] },
    "mem" => usage.call(memory),
    "swap" => usage.call(swap)
  }
end

.incarnationObject

Raises:



209
210
211
212
213
214
215
216
217
218
219
220
# File 'lib/bosh_agent/monit.rb', line 209

# Polls monit for its incarnation value.
#
# Calls monit_info up to NUM_RETRY_MONIT_INCARNATION times, sleeping one
# second after each unsuccessful attempt.
#
# @return [Integer] the :incarnation entry of monit_info, converted with to_i
# @raise [StateError] when no attempt yielded an incarnation
def incarnation
  remaining = NUM_RETRY_MONIT_INCARNATION
  while remaining > 0
    info = monit_info
    value = info && info[:incarnation]
    return value.to_i if value

    sleep 1
    remaining -= 1
  end

  # If we ever get here we have failed to get incarnation
  raise StateError, "Failed to get incarnation from Monit"
end

.loggerObject



29
30
31
# File 'lib/bosh_agent/monit.rb', line 29

# Returns the logger from Bosh::Agent::Config; the class-level methods write
# their log messages through it.
def logger
  Bosh::Agent::Config.logger
end

.monit_alerts_fileObject



45
46
47
# File 'lib/bosh_agent/monit.rb', line 45

# Path of the alerts.monitrc file inside monit_dir (the file setup_alerts writes).
def monit_alerts_file
  File.join(monit_dir, 'alerts.monitrc')
end

.monit_api_clientObject



59
60
61
62
63
64
65
66
67
# File 'lib/bosh_agent/monit.rb', line 59

# Builds a MonitClient for the monit HTTP interface on 127.0.0.1:2822.
#
# The user name and credential from monit_credentials are embedded in the
# client URI, and the class logger is passed along.
#
# @return [MonitClient] a new client instance
def monit_api_client
  # Primarily for CI - normally done during configure
  setup_monit_user unless Bosh::Agent::Config.configure

  login, secret = monit_credentials
  endpoint = "https://#{login}:#{secret}@127.0.0.1:2822"
  MonitClient.new(endpoint, :logger => logger)
end

.monit_binObject



120
121
122
# File 'lib/bosh_agent/monit.rb', line 120

# Path of the monit executable: <base_dir>/bosh/bin/monit.
def monit_bin
  File.join(base_dir, 'bosh', 'bin', 'monit')
end

.monit_credentialsObject



53
54
55
56
57
# File 'lib/bosh_agent/monit.rb', line 53

# Reads the monit user file and returns the credentials of the BOSH app user.
#
# The file holds "user:credential" lines (see setup_monit_user); the first
# line that starts with BOSH_APP_GROUP is used.
#
# @return [Array<String>] two elements: the user name and the credential with
#   surrounding whitespace removed
# @raise [StateError] when the file has no "user:credential" line for
#   BOSH_APP_GROUP
def monit_credentials
  entry = File.read(monit_user_file).lines.find { |line| line.match(/\A#{BOSH_APP_GROUP}/) }
  unless entry && entry.include?(':')
    raise StateError, "No #{BOSH_APP_GROUP} credentials found in #{monit_user_file}"
  end

  # Split on the first colon only so a credential containing ':' stays intact.
  user, cred = entry.split(':', 2)
  [user, cred.strip]
end

.monit_dirObject



33
34
35
# File 'lib/bosh_agent/monit.rb', line 33

# Path of the monit working directory: <base_dir>/monit. The user file,
# alerts file and events directory all live underneath it.
def monit_dir
  File.join(base_dir, 'monit')
end

.monit_events_dirObject



37
38
39
# File 'lib/bosh_agent/monit.rb', line 37

# Path of the events directory inside monit_dir; setup_alerts creates it and
# configures it as the basedir of monit's event queue.
def monit_events_dir
  File.join(monit_dir, 'events')
end

.monit_infoObject



222
223
224
# File 'lib/bosh_agent/monit.rb', line 222

# Requests monit's info via the API client, using the default retry count of
# retry_monit_request.
#
# @return [Object] the value of the client's monit_info call (incarnation
#   reads the :incarnation key from it)
def monit_info
  retry_monit_request(&:monit_info)
end

.monit_reload_cmdObject



154
155
156
157
# File 'lib/bosh_agent/monit.rb', line 154

# Runs "<monit_bin> reload" in a subshell.
#
# @return [String] the command's standard output; per the original note,
#   neither the exit code nor the output carries usable information
def monit_reload_cmd
  # Exit code and output has no usable output
  %x(#{monit_bin} reload)
end

.monit_user_fileObject



41
42
43
# File 'lib/bosh_agent/monit.rb', line 41

# Path of the monit.user file inside monit_dir; setup_monit_user writes the
# credentials there and monit_credentials reads them back.
def monit_user_file
  File.join(monit_dir, 'monit.user')
end

.monitor_services(attempts = 10) ⇒ Object



169
170
171
172
173
# File 'lib/bosh_agent/monit.rb', line 169

# Tells monit to monitor every service in the BOSH application group.
#
# @param attempts [Integer] number of attempts handed to retry_monit_request
# @return [Object] the result of the client's monitor call
def monitor_services(attempts=10)
  retry_monit_request(attempts) { |api| api.monitor(:group => BOSH_APP_GROUP) }
end

.monitrcObject



124
125
126
# File 'lib/bosh_agent/monit.rb', line 124

# Path of the monit control file: <base_dir>/bosh/etc/monitrc. exec_monit
# passes it to monit with -c.
def monitrc
  File.join(base_dir, 'bosh', 'etc', 'monitrc')
end

.random_credentialObject



69
70
71
# File 'lib/bosh_agent/monit.rb', line 69

# Generates a random credential for the monit user.
#
# @return [String] 16 lowercase hex characters (8 random bytes from
#   OpenSSL::Random, hex-encoded)
def random_credential
  raw = OpenSSL::Random.random_bytes(8)
  raw.unpack("H*").first
end

.reloadObject

Raises:



128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
# File 'lib/bosh_agent/monit.rb', line 128

# Reloads monit and waits until it reports a newer incarnation.
#
# Records the incarnation before running monit_reload_cmd, then polls
# incarnation every reload_incarnation_sleep seconds. Polling stops as soon
# as a larger incarnation is seen, or once more than reload_timeout seconds
# have passed since the reload was issued.
#
# @return [nil] once monit reports an updated incarnation
# @raise [StateError] when no newer incarnation shows up in time (incarnation
#   itself may also raise StateError)
def reload
  previous = incarnation
  logger.info("Monit: old incarnation #{previous}")

  monit_reload_cmd
  logger.info("Monit: reload")

  started_at = Time.now.to_i
  timed_out = false
  until timed_out
    current = incarnation
    if previous < current
      logger.info("Monit: updated incarnation #{current}")
      return
    end

    sleep reload_incarnation_sleep
    timed_out = Time.now.to_i - started_at > reload_timeout
  end

  # If we ever get here we have failed to get the new incarnation
  raise StateError, "Failed to get updated incarnation from Monit"
end

.reload_incarnation_sleepObject



159
160
161
# File 'lib/bosh_agent/monit.rb', line 159

# Number of seconds reload sleeps between two incarnation checks.
def reload_incarnation_sleep
  5
end

.reload_timeoutObject



150
151
152
# File 'lib/bosh_agent/monit.rb', line 150

# Number of seconds reload keeps polling for a new incarnation before it
# gives up and raises.
def reload_timeout
  300
end

.retry_monit_request(attempts = 10) ⇒ Object



187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
# File 'lib/bosh_agent/monit.rb', line 187

# Yields a freshly built monit API client to the block, retrying on
# transient failures.
#
# Connection refusals, timeouts and errors whose message is exactly
# "Connection reset by peer" or "Service Unavailable" are retried after a
# one-second sleep, until the block has been run `attempts` times.
#
# @param attempts [Integer] maximum number of times the block is run
# @yieldparam client the object returned by monit_api_client
# @return [Object, nil] the block's value; nil when no block is given or when
#   the attempts run out on a connection refusal/timeout
# @raise [StandardError] any other error raised by the block, and a
#   "reset"/"unavailable" error once the attempts run out
def retry_monit_request(attempts=10)
  # HACK: Monit becomes unresponsive after reload
  begin
    yield monit_api_client if block_given?
  # Timeout::Error is the class the deprecated top-level TimeoutError alias
  # pointed at; the alias no longer exists on Ruby >= 3.0, where naming it
  # here would raise NameError while an error is being handled.
  rescue Errno::ECONNREFUSED, Timeout::Error
    sleep 1
    logger.info("Monit Service Connection Refused: retrying")
    retry if (attempts -= 1) > 0
    # Out of attempts: deliberately fall through and return nil. Callers such
    # as incarnation check for a nil result and poll again.
  rescue => e
    messages = [
      "Connection reset by peer",
      "Service Unavailable"
    ]
    if messages.include?(e.message)
      logger.info("Monit Service Unavailable (#{e.message}): retrying")
      sleep 1
      retry if (attempts -= 1) > 0
    end
    raise
  end
end

.service_group_state(num_retries = 10) ⇒ Object



262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
# File 'lib/bosh_agent/monit.rb', line 262

# Summarises the state of the BOSH application group as a single string.
#
# @param num_retries [Integer] number of attempts passed to get_status
# @return [String] "starting" if any service has monitor state :init,
#   "running" if every service is monitored (:yes) with status message
#   "running" (or monit is disabled), "failing" otherwise, and "unknown"
#   when determining the state raised an error
def service_group_state(num_retries=10)
  # FIXME: state should be unknown if monit is disabled
  # However right now that would break director interaction
  # (at least in integration tests)
  return "running" unless @enabled

  services = get_status(num_retries)
  return "starting" if services.any? { |_name, job_status| starting?(job_status) }

  # at least with monit_api a stopped services is still running
  unhealthy = services.select do |_name, info|
    info[:monitor] != :yes || info[:status][:message] != "running"
  end

  if unhealthy.empty?
    "running"
  else
    "failing"
  end
rescue => e
  logger.info("Unable to determine job state: #{e}")
  "unknown"
end

.setup_alertsObject

This and other methods could probably be refactored into a separate management class to avoid keeping all this state in a metaclass (as it’s weird to test)



89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
# File 'lib/bosh_agent/monit.rb', line 89

# Writes the monit alert configuration to monit_alerts_file.
#
# Does nothing unless Config.process_alerts is truthy. Otherwise it renders a
# configuration that sends alerts to bosh@localhost through a mail server on
# 127.0.0.1 (port, user and password taken from Config), sets up an event
# queue of 5000 slots under monit_events_dir and defines the mail format.
# The monit directory and the events directory are created first.
#
# Note that the SMTP password is written to the file in clear text.
def setup_alerts
  return unless Config.process_alerts

  # `<<-` only allows the terminator to be indented; the leading whitespace
  # of each line below is part of the generated file.
  alerts_config = <<-CONFIG
  set alert bosh@localhost
  set mailserver 127.0.0.1 port #{Config.smtp_port}
      username "#{Config.smtp_user}" password "#{Config.smtp_password}"

  set eventqueue
      basedir #{monit_events_dir}
      slots 5000

  set mail-format {
    from: monit@localhost
    subject: Monit Alert
    message: Service: $SERVICE
    Event: $EVENT
    Action: $ACTION
    Date: $DATE
    Description: $DESCRIPTION
  }
  CONFIG

  # Make sure both target directories exist before writing the file.
  setup_monit_dir
  FileUtils.mkdir_p(monit_events_dir)

  File.open(monit_alerts_file, 'w') do |f|
    f.puts(alerts_config)
  end
end

.setup_monit_dirObject



73
74
75
76
# File 'lib/bosh_agent/monit.rb', line 73

# Creates the monit directory (including missing parents) and restricts its
# permissions to the owner (mode 0700).
def setup_monit_dir
  dir = monit_dir
  FileUtils.mkdir_p(dir)
  FileUtils.chmod(0700, dir)
end

.setup_monit_userObject



78
79
80
81
82
83
84
85
# File 'lib/bosh_agent/monit.rb', line 78

def setup_monit_user
  unless File.exist?(monit_user_file)
    setup_monit_dir
    File.open(monit_user_file, 'w') do |f|
      f.puts("vcap:#{random_credential}")
    end
  end
end

.smtp_portObject



49
50
51
# File 'lib/bosh_agent/monit.rb', line 49

# Returns the SMTP port configured in Bosh::Agent::Config.
def smtp_port
  Bosh::Agent::Config.smtp_port
end

.startObject



21
22
23
# File 'lib/bosh_agent/monit.rb', line 21

# Creates a Monit instance and starts it.
#
# @return [Object] whatever the instance's run method returns
def start
  runner = new
  runner.run
end

.start_services(attempts = 20) ⇒ Object



175
176
177
178
179
# File 'lib/bosh_agent/monit.rb', line 175

# Tells monit to start every service in the BOSH application group.
#
# @param attempts [Integer] number of attempts handed to retry_monit_request
# @return [Object] the result of the client's start call
def start_services(attempts=20)
  retry_monit_request(attempts) { |api| api.start(:group => BOSH_APP_GROUP) }
end

.starting?(status) ⇒ Boolean

Returns:

  • (Boolean)


283
284
285
# File 'lib/bosh_agent/monit.rb', line 283

# Tells whether a service status entry is still initializing.
#
# @param status [Hash] a status entry with a :monitor key
# @return [Boolean] true when the :monitor value is :init
def starting?(status)
  monitor_state = status[:monitor]
  monitor_state == :init
end

.stop_services(attempts = 20) ⇒ Object



181
182
183
184
185
# File 'lib/bosh_agent/monit.rb', line 181

# Tells monit to stop every service in the BOSH application group.
#
# @param attempts [Integer] number of attempts handed to retry_monit_request
# @return [Object] the result of the client's stop call
def stop_services(attempts=20)
  retry_monit_request(attempts) { |api| api.stop(:group => BOSH_APP_GROUP) }
end

.unmonitor_services(attempts = 10) ⇒ Object



163
164
165
166
167
# File 'lib/bosh_agent/monit.rb', line 163

# Tells monit to stop monitoring every service in the BOSH application group.
#
# @param attempts [Integer] number of attempts handed to retry_monit_request
# @return [Object] the result of the client's unmonitor call
def unmonitor_services(attempts=10)
  retry_monit_request(attempts) { |api| api.unmonitor(:group => BOSH_APP_GROUP) }
end

Instance Method Details

#exec_monitObject



297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
# File 'lib/bosh_agent/monit.rb', line 297

# Runs monit as a child process and blocks until it exits.
#
# Spawns "<monit_bin> -I -c <monitrc>" with stdin closed and stdout/stderr
# connected to pipes, forwards everything monit prints to the logger through
# log_monit_output, then reaps the child. An at_exit hook sends TERM to the
# child when this process exits.
#
# @return [Integer, nil] the pid returned by Process.waitpid, or nil when
#   waiting for the child failed
# @raise [StandardError] any error is logged, the child is terminated, and
#   the error is re-raised
def exec_monit
  status = nil
  pid = nil
  pipes = []

  stdout_rd, stdout_wr = IO.pipe
  pipes << stdout_rd << stdout_wr
  stderr_rd, stderr_wr = IO.pipe
  pipes << stderr_rd << stderr_wr

  pid = Process.spawn("#{Monit.monit_bin} -I -c #{Monit.monitrc}", :in => :close, :out => stdout_wr, :err => stderr_wr)

  # The child holds its own copies of the write ends. Ours must be closed, or
  # the read ends never reach EOF and log_monit_output keeps blocking after
  # monit has exited.
  stdout_wr.close
  stderr_wr.close

  at_exit {
    Process.kill('TERM', pid) rescue nil
    Process.waitpid(pid)      rescue nil
  }

  log_monit_output(stdout_rd, stderr_rd)

  status = Process.waitpid(pid) rescue nil
rescue => e
  @logger.error("Failed to run Monit: #{e.inspect} #{e.backtrace}")

  # Only a child that was actually spawned and not yet reaped needs cleanup.
  if status.nil? && pid
    Process.kill('TERM', pid) rescue nil
    Process.waitpid(pid)      rescue nil
  end

  raise
ensure
  # Close whichever pipe ends are still open (log_monit_output closes the
  # read ends itself once they hit EOF).
  pipes.each { |fd| fd.close unless fd.closed? }
end

#log_monit_output(stdout, stderr) ⇒ Object



327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
# File 'lib/bosh_agent/monit.rb', line 327

# Forwards everything readable on the two streams to the logger until both
# reach end-of-file.
#
# Each chunk read (up to BUFSIZE bytes) is logged as one "Monit: ..." info
# entry with its trailing newline removed. A stream is closed and dropped
# once it reports EOF; the method returns when no stream is left. Nothing is
# logged for a wake-up that produced no data.
#
# @param stdout [IO] read end connected to monit's standard output
# @param stderr [IO] read end connected to monit's standard error
# @return [nil]
def log_monit_output(stdout, stderr)
  readers = [stdout, stderr]

  until readers.empty?
    ready, = IO.select(readers, nil, readers)
    ready.each do |fd|
      begin
        chunk = fd.readpartial(BUFSIZE)
      rescue Errno::EAGAIN, Errno::EINTR
        # Spurious wake-up: nothing to log, try again on the next select.
        next
      rescue EOFError
        readers.delete(fd)
        fd.close
        next
      end

      @logger.info("Monit: #{chunk.gsub(/\n\Z/, '')}")
    end
  end
end

#runObject



293
294
295
# File 'lib/bosh_agent/monit.rb', line 293

# Runs exec_monit on a new thread.
#
# @return [Thread] the thread executing exec_monit
def run
  Thread.new do
    exec_monit
  end
end