Class: Bosh::Agent::Monit
Overview
A good chunk of this code is lifted from the implementation of POSIX::Spawn::Child
Constant Summary collapse
- BUFSIZE =
(32 * 1024)
- NUM_RETRY_MONIT_INCARNATION =
60
- NUM_RETRY_MONIT_WAIT_INCARNATION =
15
Class Attribute Summary collapse
-
.enabled ⇒ Object
Returns the value of attribute enabled.
Class Method Summary collapse
- .base_dir ⇒ Object
-
.enable ⇒ Object
`enable` is supposed to be called at the very beginning, as it creates sync primitives.
- .get_status(num_retries = 10) ⇒ Object
- .get_system_status(num_retries = 10) ⇒ Object
- .get_vitals(num_retries = 10) ⇒ Object
- .incarnation ⇒ Object
- .logger ⇒ Object
- .monit_alerts_file ⇒ Object
- .monit_api_client ⇒ Object
- .monit_bin ⇒ Object
- .monit_credentials ⇒ Object
- .monit_dir ⇒ Object
- .monit_events_dir ⇒ Object
- .monit_info ⇒ Object
- .monit_reload_cmd ⇒ Object
- .monit_user_file ⇒ Object
- .monitor_services(attempts = 10) ⇒ Object
- .monitrc ⇒ Object
- .random_credential ⇒ Object
- .reload ⇒ Object
- .reload_incarnation_sleep ⇒ Object
- .reload_timeout ⇒ Object
- .retry_monit_request(attempts = 10) ⇒ Object
- .service_group_state(num_retries = 10) ⇒ Object
-
.setup_alerts ⇒ Object
This and other methods could probably be refactored into a separate management class to avoid keeping all this state in a metaclass (as it’s weird to test).
- .setup_monit_dir ⇒ Object
- .setup_monit_user ⇒ Object
- .smtp_port ⇒ Object
- .start ⇒ Object
- .start_services(attempts = 20) ⇒ Object
- .starting?(status) ⇒ Boolean
- .stop_services(attempts = 20) ⇒ Object
- .unmonitor_services(attempts = 10) ⇒ Object
Instance Method Summary collapse
- #exec_monit ⇒ Object
-
#initialize ⇒ Monit
constructor
A new instance of Monit.
- #log_monit_output(stdout, stderr) ⇒ Object
- #run ⇒ Object
Constructor Details
Class Attribute Details
.enabled ⇒ Object
Returns the value of attribute enabled.
12 13 14 |
# File 'lib/bosh_agent/monit.rb', line 12
# Reader for the class-level enabled flag.
#
# @return [Object] whatever is currently stored in @enabled
#   (nil until something assigns it)
def enabled
  @enabled
end
Class Method Details
.base_dir ⇒ Object
25 26 27 |
# File 'lib/bosh_agent/monit.rb', line 25
# Base directory as reported by the agent configuration.
#
# @return [Object] the value of Bosh::Agent::Config.base_dir
def base_dir
  agent_config = Bosh::Agent::Config
  agent_config.base_dir
end
.enable ⇒ Object
`enable` is supposed to be called at the very beginning, as it creates sync primitives. Ideally this class should be refactored to minimize the number of singleton methods that have to keep track of state.
17 18 19 |
# File 'lib/bosh_agent/monit.rb', line 17
# Turns the enabled flag on. Several query methods in this class
# short-circuit while the flag is unset.
#
# @return [true]
def enable
  @enabled = true
end
.get_status(num_retries = 10) ⇒ Object
226 227 228 229 230 231 |
# File 'lib/bosh_agent/monit.rb', line 226
# Asks Monit for the status of the BOSH_APP_GROUP service group.
#
# @param num_retries [Integer] attempt budget handed to retry_monit_request
# @return [Object] an empty Hash when monitoring is not enabled, otherwise
#   whatever retry_monit_request returns for the client's status call
def get_status(num_retries=10)
  return {} unless @enabled

  retry_monit_request(num_retries) { |monit| monit.status(:group => BOSH_APP_GROUP) }
end
.get_system_status(num_retries = 10) ⇒ Object
233 234 235 236 237 238 239 240 |
# File 'lib/bosh_agent/monit.rb', line 233
# Fetches the status entry Monit reports for the :system service type.
#
# @param num_retries [Integer] attempt budget handed to retry_monit_request
# @return [Object] an empty Hash when monitoring is not enabled or the client
#   answers with something other than a Hash; otherwise the first value of
#   the returned Hash (as passed back through retry_monit_request)
def get_system_status(num_retries=10)
  return {} unless @enabled

  retry_monit_request(num_retries) do |monit|
    by_service = monit.status(:type => :system)
    # `return` deliberately exits the whole method, not just the block.
    return {} unless by_service.is_a?(Hash)

    by_service.values.first
  end
end
.get_vitals(num_retries = 10) ⇒ Object
242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 |
# File 'lib/bosh_agent/monit.rb', line 242
# Builds a vitals summary (load, cpu, mem, swap) from the system status.
#
# Missing sections are treated as empty, so the individual readings come back
# as nil instead of raising.
#
# @param num_retries [Integer] forwarded to get_system_status
# @return [Hash] an empty Hash when monitoring is not enabled or the system
#   status is not a Hash; otherwise a Hash with the keys "load", "cpu",
#   "mem" and "swap"
def get_vitals(num_retries=10)
  return {} unless @enabled

  status = get_system_status(num_retries)
  return {} unless status.is_a?(Hash)

  system = (status[:raw] || {})["system"] || {}
  loadavg, cpu, mem, swap = %w[load cpu memory swap].map { |section| system[section] || {} }

  {
    "load" => %w[avg01 avg05 avg15].map { |window| loadavg[window] },
    "cpu"  => { "user" => cpu["user"], "sys" => cpu["system"], "wait" => cpu["wait"] },
    "mem"  => { "percent" => mem["percent"], "kb" => mem["kilobyte"] },
    "swap" => { "percent" => swap["percent"], "kb" => swap["kilobyte"] }
  }
end
.incarnation ⇒ Object
209 210 211 212 213 214 215 216 217 218 219 220 |
# File 'lib/bosh_agent/monit.rb', line 209
# Polls monit_info until it reports an :incarnation value.
#
# Makes up to NUM_RETRY_MONIT_INCARNATION attempts, sleeping one second after
# each unsuccessful one.
#
# @return [Integer] the reported incarnation, converted with to_i
# @raise [StateError] when no attempt yields an incarnation
def incarnation
  NUM_RETRY_MONIT_INCARNATION.times do
    info = monit_info
    return info[:incarnation].to_i if info && info[:incarnation]

    sleep 1
  end

  # Every attempt came back without an incarnation.
  raise StateError, "Failed to get incarnation from Monit"
end
.logger ⇒ Object
29 30 31 |
# File 'lib/bosh_agent/monit.rb', line 29
# Logger taken from the agent configuration.
#
# @return [Object] the value of Bosh::Agent::Config.logger
def logger
  agent_config = Bosh::Agent::Config
  agent_config.logger
end
.monit_alerts_file ⇒ Object
45 46 47 |
# File 'lib/bosh_agent/monit.rb', line 45
# Location of the alerts configuration file inside monit_dir.
#
# @return [String] path ending in "alerts.monitrc"
def monit_alerts_file
  parent = monit_dir
  File.join(parent, 'alerts.monitrc')
end
.monit_api_client ⇒ Object
59 60 61 62 63 64 65 66 67 |
# File 'lib/bosh_agent/monit.rb', line 59
# Builds a MonitClient for the local Monit HTTP endpoint (127.0.0.1:2822),
# authenticating with the credentials read by monit_credentials.
#
# @return [MonitClient]
def monit_api_client
  # Primarily for CI - normally done during configure
  setup_monit_user unless Bosh::Agent::Config.configure

  user, cred = monit_credentials
  endpoint = "https://#{user}:#{cred}@127.0.0.1:2822"
  MonitClient.new(endpoint, :logger => logger)
end
.monit_bin ⇒ Object
120 121 122 |
# File 'lib/bosh_agent/monit.rb', line 120
# Path of the monit executable below base_dir.
#
# @return [String] "<base_dir>/bosh/bin/monit"
def monit_bin
  File.join(base_dir, *%w[bosh bin monit])
end
.monit_credentials ⇒ Object
53 54 55 56 57 |
# File 'lib/bosh_agent/monit.rb', line 53
# Reads the Monit user file and extracts the credential pair from the first
# line that starts with BOSH_APP_GROUP.
#
# @return [Array] two elements: the user name and the credential with
#   surrounding whitespace stripped
def monit_credentials
  entry = File.readlines(monit_user_file).find { |candidate| candidate =~ /\A#{BOSH_APP_GROUP}/ }
  user, cred = entry.split(':')
  [user, cred.strip]
end
.monit_dir ⇒ Object
33 34 35 |
# File 'lib/bosh_agent/monit.rb', line 33
# Directory below base_dir that holds the Monit files managed by this class.
#
# @return [String] "<base_dir>/monit"
def monit_dir
  root = base_dir
  File.join(root, 'monit')
end
.monit_events_dir ⇒ Object
37 38 39 |
# File 'lib/bosh_agent/monit.rb', line 37
# Directory inside monit_dir used as the Monit event queue base directory.
#
# @return [String] "<monit_dir>/events"
def monit_events_dir
  parent = monit_dir
  File.join(parent, 'events')
end
.monit_info ⇒ Object
222 223 224 |
# File 'lib/bosh_agent/monit.rb', line 222
# Calls monit_info on the Monit API client, using retry_monit_request with
# its default attempt budget.
#
# @return [Object] whatever retry_monit_request returns
def monit_info
  retry_monit_request(&:monit_info)
end
.monit_reload_cmd ⇒ Object
154 155 156 157 |
# File 'lib/bosh_agent/monit.rb', line 154
# Shells out to `<monit_bin> reload`.
#
# @return [String] the command's standard output
def monit_reload_cmd
  # Exit code and output has no usable output
  %x(#{monit_bin} reload)
end
.monit_user_file ⇒ Object
41 42 43 |
# File 'lib/bosh_agent/monit.rb', line 41
# Location of the file holding the Monit user/credential entry.
#
# @return [String] "<monit_dir>/monit.user"
def monit_user_file
  parent = monit_dir
  File.join(parent, 'monit.user')
end
.monitor_services(attempts = 10) ⇒ Object
169 170 171 172 173 |
# File 'lib/bosh_agent/monit.rb', line 169
# Asks Monit to monitor every service in BOSH_APP_GROUP.
#
# @param attempts [Integer] attempt budget handed to retry_monit_request
# @return [Object] whatever retry_monit_request returns
def monitor_services(attempts=10)
  retry_monit_request(attempts) { |monit| monit.monitor(:group => BOSH_APP_GROUP) }
end
.monitrc ⇒ Object
124 125 126 |
# File 'lib/bosh_agent/monit.rb', line 124
# Path of the monitrc control file below base_dir.
#
# @return [String] "<base_dir>/bosh/etc/monitrc"
def monitrc
  File.join(base_dir, *%w[bosh etc monitrc])
end
.random_credential ⇒ Object
69 70 71 |
# File 'lib/bosh_agent/monit.rb', line 69
# Generates a random credential: 8 random bytes from OpenSSL rendered as a
# hexadecimal string.
#
# @return [String] 16 hexadecimal characters
def random_credential
  raw = OpenSSL::Random.random_bytes(8)
  raw.unpack("H*").first
end
.reload ⇒ Object
128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 |
# File 'lib/bosh_agent/monit.rb', line 128
# Reloads Monit and waits until it reports a higher incarnation than before.
#
# After issuing the reload command the incarnation is polled, sleeping
# reload_incarnation_sleep seconds between polls, until it increases or more
# than reload_timeout seconds have elapsed since the reload.
#
# @return [nil] once a newer incarnation has been observed
# @raise [StateError] when the incarnation did not increase in time
def reload
  previous = incarnation
  logger.info("Monit: old incarnation #{previous}")

  monit_reload_cmd
  logger.info("Monit: reload")

  started_at = Time.now.to_i

  loop do
    current = incarnation
    if current > previous
      logger.info("Monit: updated incarnation #{current}")
      return
    end

    sleep reload_incarnation_sleep
    break unless Time.now.to_i <= started_at + reload_timeout
  end

  # Timed out without seeing a newer incarnation.
  raise StateError, "Failed to get updated incarnation from Monit"
end
.reload_incarnation_sleep ⇒ Object
159 160 161 |
# File 'lib/bosh_agent/monit.rb', line 159
# Pause, in seconds, between incarnation polls while waiting for a reload.
#
# @return [Integer] always 5
def reload_incarnation_sleep
  5
end
.reload_timeout ⇒ Object
150 151 152 |
# File 'lib/bosh_agent/monit.rb', line 150
# Maximum time, in seconds, that reload waits for a newer incarnation.
#
# @return [Integer] always 300
def reload_timeout
  300
end
.retry_monit_request(attempts = 10) ⇒ Object
187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 |
# File 'lib/bosh_agent/monit.rb', line 187
# Yields the Monit API client to the block, retrying on transient failures.
#
# Two families of failure are retried, each after a one-second sleep:
# * connection refused / timeout: once the attempts are used up the error is
#   swallowed and nil is returned;
# * any other StandardError whose message is exactly "Connection reset by
#   peer" or "Service Unavailable": once the attempts are used up it is
#   re-raised.
# Every other error is re-raised immediately.
#
# @param attempts [Integer] total number of times the block may be run
# @yieldparam client [Object] the object returned by monit_api_client
# @return [Object, nil] the block's value; nil when no block is given or when
#   connection-refused/timeout retries are exhausted
# @raise [StandardError] the original error, for non-retryable failures or
#   exhausted "Service Unavailable" / "Connection reset by peer" retries
def retry_monit_request(attempts=10)
  # Timeout::Error replaces the legacy top-level TimeoutError alias, which
  # modern Ruby no longer defines; load it here so the rescue list resolves.
  require 'timeout'

  # HACK: Monit becomes unresponsive after reload
  begin
    yield monit_api_client if block_given?
  rescue Errno::ECONNREFUSED, Timeout::Error
    sleep 1
    logger.info("Monit Service Connection Refused: retrying")
    retry if (attempts -= 1) > 0
  rescue => e
    messages = [ "Connection reset by peer", "Service Unavailable" ]
    if messages.include?(e.message)
      logger.info("Monit Service Unavailable (#{e.message}): retrying")
      sleep 1
      retry if (attempts -= 1) > 0
    end
    raise e
  end
end
.service_group_state(num_retries = 10) ⇒ Object
262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 |
# File 'lib/bosh_agent/monit.rb', line 262
# Summarises the state of the service group as a single string.
#
# @param num_retries [Integer] forwarded to get_status
# @return [String] "running" when monitoring is not enabled or every service
#   is monitored and reports the message "running"; "starting" when any
#   service satisfies starting?; "failing" when at least one service is
#   neither; "unknown" when a StandardError is raised along the way
def service_group_state(num_retries=10)
  # FIXME: state should be unknown if monit is disabled
  # However right now that would break director interaction
  # (at least in integration tests)
  return "running" unless @enabled

  status = get_status(num_retries)

  # break early if any service is initializing
  return "starting" if status.any? { |_, job_status| starting?(job_status) }

  # at least with monit_api a stopped services is still running
  _healthy, unhealthy = status.partition do |_name, data|
    data[:monitor] == :yes && data[:status][:message] == "running"
  end

  unhealthy.empty? ? "running" : "failing"
rescue => e
  logger.info("Unable to determine job state: #{e}")
  "unknown"
end
.setup_alerts ⇒ Object
This and other methods could probably be refactored into a separate management class to avoid keeping all this state in a metaclass (as it’s weird to test)
89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 |
# File 'lib/bosh_agent/monit.rb', line 89
# Writes the Monit alerts configuration file.
#
# Does nothing unless Config.process_alerts is truthy. Otherwise it makes
# sure monit_dir and monit_events_dir exist and (over)writes
# monit_alerts_file with alert, mail server, event queue and mail format
# settings built from the agent configuration.
#
# @return [nil]
def setup_alerts
  return unless Config.process_alerts

  # NOTE(review): the line breaks and indentation inside this heredoc were
  # reconstructed from a whitespace-collapsed listing - confirm them against
  # the real source file before relying on the exact layout.
  alerts_config = <<-CONFIG
    set alert bosh@localhost
    set mailserver 127.0.0.1 port #{Config.smtp_port}
        username "#{Config.smtp_user}" password "#{Config.smtp_password}"

    set eventqueue
        basedir #{monit_events_dir}
        slots 5000

    set mail-format {
      from: monit@localhost
      subject: Monit Alert
      message: Service: $SERVICE
      Event: $EVENT
      Action: $ACTION
      Date: $DATE
      Description: $DESCRIPTION
    }
  CONFIG

  setup_monit_dir
  FileUtils.mkdir_p(monit_events_dir)

  File.open(monit_alerts_file, 'w') { |file| file.puts(alerts_config) }
end
.setup_monit_dir ⇒ Object
73 74 75 76 |
# File 'lib/bosh_agent/monit.rb', line 73
# Creates monit_dir (including missing parents) and sets its mode to 0700.
#
# @return [Object] the return value of FileUtils.chmod
def setup_monit_dir
  dir = monit_dir
  FileUtils.mkdir_p(dir)
  FileUtils.chmod(0700, dir)
end
.setup_monit_user ⇒ Object
78 79 80 81 82 83 84 85 |
# File 'lib/bosh_agent/monit.rb', line 78
# Creates the Monit user file with a fresh random credential for the "vcap"
# user. An existing file is left untouched.
#
# @return [nil]
def setup_monit_user
  return if File.exist?(monit_user_file)

  setup_monit_dir
  File.open(monit_user_file, 'w') do |file|
    file.puts("vcap:#{random_credential}")
  end
end
.smtp_port ⇒ Object
49 50 51 |
# File 'lib/bosh_agent/monit.rb', line 49
# SMTP port taken from the agent configuration.
#
# @return [Object] the value of Bosh::Agent::Config.smtp_port
def smtp_port
  agent_config = Bosh::Agent::Config
  agent_config.smtp_port
end
.start ⇒ Object
21 22 23 |
# File 'lib/bosh_agent/monit.rb', line 21
# Creates a new instance and calls #run on it.
#
# @return [Object] the return value of #run
def start
  monit = new
  monit.run
end
.start_services(attempts = 20) ⇒ Object
175 176 177 178 179 |
# File 'lib/bosh_agent/monit.rb', line 175
# Asks Monit to start every service in BOSH_APP_GROUP.
#
# @param attempts [Integer] attempt budget handed to retry_monit_request
# @return [Object] whatever retry_monit_request returns
def start_services(attempts=20)
  retry_monit_request(attempts) { |monit| monit.start(:group => BOSH_APP_GROUP) }
end
.starting?(status) ⇒ Boolean
283 284 285 |
# File 'lib/bosh_agent/monit.rb', line 283
# Tells whether a service status entry is still initialising.
#
# @param status [Hash] status entry for a single service
# @return [Boolean] true when status[:monitor] equals :init
def starting?(status)
  monitor_mode = status[:monitor]
  monitor_mode == :init
end
.stop_services(attempts = 20) ⇒ Object
181 182 183 184 185 |
# File 'lib/bosh_agent/monit.rb', line 181
# Asks Monit to stop every service in BOSH_APP_GROUP.
#
# @param attempts [Integer] attempt budget handed to retry_monit_request
# @return [Object] whatever retry_monit_request returns
def stop_services(attempts=20)
  retry_monit_request(attempts) { |monit| monit.stop(:group => BOSH_APP_GROUP) }
end
.unmonitor_services(attempts = 10) ⇒ Object
163 164 165 166 167 |
# File 'lib/bosh_agent/monit.rb', line 163
# Asks Monit to stop monitoring every service in BOSH_APP_GROUP.
#
# @param attempts [Integer] attempt budget handed to retry_monit_request
# @return [Object] whatever retry_monit_request returns
def unmonitor_services(attempts=10)
  retry_monit_request(attempts) { |monit| monit.unmonitor(:group => BOSH_APP_GROUP) }
end
Instance Method Details
#exec_monit ⇒ Object
297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 |
# File 'lib/bosh_agent/monit.rb', line 297
# Spawns the Monit binary as a child process, forwards its output to the
# logger through log_monit_output, and waits for it to exit.
#
# The child's stdout and stderr are connected to pipes and its stdin is
# closed. An at_exit hook is registered that sends TERM to the child and
# reaps it when this Ruby process terminates.
#
# @return [Integer, nil] the value of Process.waitpid for the child, or nil
#   if waiting failed
# @raise [StandardError] any error raised while spawning or logging; it is
#   logged, the child is terminated if it has not been reaped, and the error
#   is re-raised
def exec_monit
  status = nil

  stdout_rd, stdout_wr = IO.pipe
  stderr_rd, stderr_wr = IO.pipe
  pid = Process.spawn("#{Monit.monit_bin} -I -c #{Monit.monitrc}",
                      :in => :close, :out => stdout_wr, :err => stderr_wr)

  # The child now owns its copies of the write ends. Close ours, otherwise
  # the read ends never reach EOF once the child exits.
  [stdout_wr, stderr_wr].each(&:close)

  # Deliberately best-effort: the child may already be gone at exit time.
  at_exit {
    Process.kill('TERM', pid) rescue nil
    Process.waitpid(pid) rescue nil
  }

  log_monit_output(stdout_rd, stderr_rd)

  status = Process.waitpid(pid) rescue nil
rescue => e
  @logger.error("Failed to run Monit: #{e.inspect} #{e.backtrace}")

  # Only try to stop the child if it was spawned and has not been reaped.
  if pid && status.nil?
    Process.kill('TERM', pid) rescue nil
    Process.waitpid(pid) rescue nil
  end

  raise
ensure
  # Release every pipe end this method may have opened. Entries are nil when
  # IO.pipe itself failed, and may already be closed by the logging loop.
  [stdout_rd, stdout_wr, stderr_rd, stderr_wr].each do |fd|
    fd.close if fd && !fd.closed?
  end
end
#log_monit_output(stdout, stderr) ⇒ Object
327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 |
# File 'lib/bosh_agent/monit.rb', line 327
# Copies everything readable from the two streams to the logger until both
# have reached end-of-file.
#
# Each chunk (at most BUFSIZE bytes) is logged at info level with a
# "Monit: " prefix after a trailing newline has been removed. A stream is
# closed and dropped as soon as it reports EOF. Nothing is logged for a read
# that produced no data (EOF, EAGAIN or EINTR).
#
# @param stdout [IO] readable end carrying the child's standard output
# @param stderr [IO] readable end carrying the child's standard error
# @return [nil]
def log_monit_output(stdout, stderr)
  readers = [stdout, stderr]

  while readers.any?
    # No timeout: block until at least one stream has something to report.
    readable, = IO.select(readers, [], readers, nil)

    readable.each do |fd|
      begin
        chunk = fd.readpartial(BUFSIZE)
      rescue Errno::EAGAIN, Errno::EINTR
        # Spurious wake-up; the next select round will try again.
        next
      rescue EOFError
        readers.delete(fd)
        fd.close
        next
      end

      @logger.info("Monit: #{chunk.gsub(/\n\Z/, '')}")
    end
  end
end
#run ⇒ Object
293 294 295 |
# File 'lib/bosh_agent/monit.rb', line 293
# Runs exec_monit on a new thread.
#
# @return [Thread] the thread executing exec_monit
def run
  Thread.new do
    exec_monit
  end
end