Class: Ring::SQA::Alarm

Inherits:
Object
  • Object
show all
Defined in:
lib/ring/sqa/alarm.rb,
lib/ring/sqa/alarm/cfg.rb,
lib/ring/sqa/alarm/exec.rb,
lib/ring/sqa/alarm/email.rb,
lib/ring/sqa/alarm/slack.rb,
lib/ring/sqa/alarm/message.rb,
lib/ring/sqa/alarm/udp2irc.rb,
lib/ring/sqa/alarm/collector.rb

Defined Under Namespace

Classes: Collector, Email, Exec, Slack, UDP2IRC

Constant Summary collapse

# Asetus configuration handle created with name 'sqa' and config file
# 'alarm.conf'; `load: false` is passed to the constructor.
# NOTE(review): presumably this defers loading so the file is read explicitly
# elsewhere, and `Directory` is a constant defined outside this view — confirm.
Config =
Asetus.new name: 'sqa', load: false, usrdir: Directory, cfgfile: 'alarm.conf'
# Shortcut to the configuration object exposed by Config.cfg; the alarm
# methods read it (e.g. CFG.recovery.notify? in #clear).
CFG =
Config.cfg

Instance Method Summary collapse

Instance Method Details

#clear(alarm_buffer) ⇒ Object



24
25
26
27
28
29
30
31
32
# File 'lib/ring/sqa/alarm.rb', line 24

# Clears a currently raised alarm.
#
# Does nothing (and returns nil) unless @alarm is exactly `true`. Otherwise it
# flips @alarm to false, logs a one-line "clearing" notice, and, when
# CFG.recovery.notify? is truthy, hands a 'clear' message to every entry of
# @methods through alarm_send.
#
# @param alarm_buffer [Object] passed through untouched to alarm_send
# @return [Object, nil] the result of @methods.each when notifications are
#   sent, nil otherwise
def clear(alarm_buffer)
  # Strict comparison on purpose: only a literal `true` counts as "raised".
  return unless @alarm == true

  @alarm = false
  notice = "#{@hostname}: clearing #{@afi} alarm"
  # Short and long variants share the very same text for a clear event.
  msg = { short: notice, long: notice }
  Log.info msg[:short]
  return unless CFG.recovery.notify?

  @methods.each do |notifier|
    alarm_send notifier, 'clear', msg, alarm_buffer
  end
end

#message(nodes_list, mtr_list, buffer_list, amount) ⇒ Object



5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
# File 'lib/ring/sqa/alarm/message.rb', line 5

# Renders the long, plain-text body of an alarm notification.
#
# The text interpolates the host name and address family from
# Ring::SQA::CFG, the current UTC time, the supplied lists, and the
# analyzer's relative tolerance and median window from the same config.
#
# @param nodes_list [#to_s] listing of nodes that became unreachable
# @param mtr_list [#to_s] traceroute output to embed
# @param buffer_list [#to_s] rendering of the measurement ring buffer
# @param amount [#to_s] number of nodes reported as newly unreachable
# @return [String] the complete message, terminated by a newline
def message(nodes_list, mtr_list, buffer_list, amount)
  settings = Ring::SQA::CFG
  # Squiggly heredoc: the common two-space indent is stripped, so the
  # resulting text starts at column zero.
  <<~ALERT
    Regarding: #{settings.host.name} #{settings.afi}

    This is an automated alert from the distributed partial outage
    monitoring system 'RING SQA'.

    At #{Time.now.utc} the following measurements were analysed
    as indicating that there is a high probability your NLNOG RING node cannot
    reach the entire internet. This could be down to your RING node, its local
    network, or disruption of peering and/or upstream networks (for example
    instability at an IXP or one of your transit providers).

    The following #{amount} nodes previously were reachable, but became unreachable
    over the course of the last 3 minutes:

    #{nodes_list}

    As a debug starting point 3 traceroutes were launched right after
    detecting the event, they might assist in pinpointing what broke:

    #{mtr_list}

    An alarm is raised under the following conditions: every 30 seconds
    your node pings all other nodes. The amount of nodes that cannot be
    reached is stored in a circular buffer, with each element representing
    a minute of measurements. In the event that the last three minutes are
    #{settings.analyzer.tolerance.relative} above the median of the previous #{settings.analyzer.median_of} measurement slots, a partial
    outage is assumed. The ring buffer's output is as following:

    #{buffer_list}

    Kind regards,

    NLNOG RING
  ALERT
end

#set(alarm_buffer) ⇒ Object



15
16
17
18
19
20
21
22
# File 'lib/ring/sqa/alarm.rb', line 15

# Raises the alarm if it is not already raised.
#
# Does nothing (and returns nil) unless @alarm is exactly `false`. Otherwise
# it flips @alarm to true, builds the message with compose_message, logs its
# :short form, and hands a 'raise' message to every entry of @methods through
# alarm_send.
#
# @param alarm_buffer [Object] passed to compose_message and on to alarm_send
# @return [Object, nil] the result of @methods.each when the alarm fires,
#   nil otherwise
def set(alarm_buffer)
  # Strict comparison on purpose: only a literal `false` counts as "idle".
  return unless @alarm == false

  @alarm = true
  alert = compose_message alarm_buffer
  Log.info alert[:short]
  @methods.each do |notifier|
    alarm_send notifier, 'raise', alert, alarm_buffer
  end
end