Class: Interferon::Destinations::Datadog

Inherits:
Object
  • Object
show all
Includes:
Logging
Defined in:
lib/interferon/destinations/datadog.rb

Constant Summary collapse

ALERT_KEY =
'This alert was created via the alerts framework'

Instance Attribute Summary collapse

Instance Method Summary collapse

Methods included from Logging

configure_logger_for, #log, #statsd

Constructor Details

#initialize(options) ⇒ Datadog

Returns a new instance of Datadog.



14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
# File 'lib/interferon/destinations/datadog.rb', line 14

# Builds a Datadog destination from a string-keyed options hash.
#
# @param options [Hash] must contain 'app_key' and 'api_key'; may contain
#   'api_timeout' (default 15), 'dry_run', 'concurrency' (default 10)
#   and 'retries' (default 3)
# @raise [ArgumentError] when 'app_key' or 'api_key' is missing
def initialize(options)
  absent = %w{app_key api_key}.find { |key| !options[key] }
  raise ArgumentError, "missing required argument #{absent}" if absent

  # Positional parameters of Dogapi::Client#initialize, as of this writing:
  # https://github.com/DataDog/dogapi-rb/blob/master/lib/dogapi/facade.rb#L14
  @dog = Dogapi::Client.new(
    options['api_key'],
    options['app_key'],
    nil,                          # host to talk to
    nil,                          # device
    true,                         # silent?
    options['api_timeout'] || 15  # dogapi timeout, set explicitly
  )

  @existing_alerts = nil
  @dry_run = options['dry_run']

  # create datadog alerts 10 at a time unless told otherwise
  @concurrency = options['concurrency'] || 10
  # number of retries
  @retries = options['retries'] || 3

  # every counter starts at zero; insertion order is the reporting order
  counter_names = [
    :alerts_created,
    :alerts_to_be_created,
    :alerts_updated,
    :alerts_to_be_updated,
    :alerts_deleted,
    :alerts_to_be_deleted,
    :alerts_silenced,
    :api_successes,
    :api_client_errors,
    :api_unknown_errors,
    :manually_created_alerts,
  ]
  @stats = {}
  counter_names.each { |counter| @stats[counter] = 0 }
end

Instance Attribute Details

#concurrency ⇒ Object

Returns the value of attribute concurrency.



11
12
13
# File 'lib/interferon/destinations/datadog.rb', line 11

# Reader for the concurrency attribute; returns whatever @concurrency
# currently holds.
def concurrency; @concurrency; end

Instance Method Details

#api_errors ⇒ Object



60
61
62
# File 'lib/interferon/destinations/datadog.rb', line 60

# Accumulator for API error descriptions. The backing array is created on
# first access and the same array is returned on every later call.
def api_errors
  @api_errors = [] unless @api_errors
  @api_errors
end

#create_alert(alert, people) ⇒ Object



114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
# File 'lib/interferon/destinations/datadog.rb', line 114

# Creates the given alert in Datadog, or updates it when an alert with the
# same name already exists (as reported by #existing_alerts).
#
# In dry-run mode a new alert is always created (never updated) and the
# people to notify are left out of the message that is sent.
#
# @param alert [Hash] reads 'name', 'message', 'notify_no_data',
#   'metric' => { 'datadog_query' }, and the optional 'silenced',
#   'silenced_until', 'no_data_timeframe' and 'timeout' (seconds)
# @param people [Array] handles appended to the message as @-mentions
# @return [Array] [alert name, [id from the response]], or
#   [alert name, nil] when the response carries no body
def create_alert(alert, people)
  # the full message (with notifications) is always used for logging/diffing
  message = generate_message(alert['message'], people)
  datadog_query = alert['metric']['datadog_query'].strip

  # Datadog may have a race condition where alerts created in a bad state may
  # be triggered during the dry-run creation process, so dry-run alerts are
  # sent without any people in the message.
  sent_message = @dry_run ? generate_message(alert['message'], []) : message

  # create the hash of options to send to datadog
  alert_opts = {
    :name => alert['name'],
    :message => sent_message,
    :silenced => false,
    :notify_no_data => alert['notify_no_data'],
    :timeout_h => nil,
  }

  # Silence when explicitly silenced or when silenced_until lies in the
  # future. silenced_until may be absent, so only compare it when present.
  silenced_until = alert['silenced_until']
  if alert['silenced'] || (silenced_until && silenced_until > Time.now)
    alert_opts[:silenced] = true
  end

  # allow an optional timeframe for "no data" alerts to be specified
  # (this feature is supported, even though it's not documented)
  alert_opts[:no_data_timeframe] = alert['no_data_timeframe'] if alert['no_data_timeframe']

  # timeout is in seconds, but set it to 1 hour at least
  alert_opts[:timeout_h] = [1, (alert['timeout'].to_i / 3600)].max if alert['timeout']

  existing_alert = existing_alerts[alert['name']]

  if existing_alert.nil?
    # new alert, create it
    action = :creating
    @stats[:alerts_to_be_created] += 1
    new_alert_text = "Query: #{datadog_query} Message: #{message.split.join(' ')}"
    log.info("creating new alert #{alert['name']}: #{new_alert_text}")

    resp = @dog.alert(datadog_query, alert_opts)
  else
    # existing alert, modify it
    action = :updating
    @stats[:alerts_to_be_updated] += 1
    existing_id = existing_alert['id'][0]

    new_alert_text = "Query:\n#{datadog_query}\nMessage:\n#{message}"
    existing_alert_text = "Query:\n#{existing_alert['query']}\nMessage:\n#{existing_alert['message']}\n"
    diff = Diffy::Diff.new(existing_alert_text, new_alert_text, :context => 1)
    log.info("updating existing alert #{existing_id} (#{alert['name']}): #{diff}")

    if @dry_run
      resp = @dog.alert(datadog_query, alert_opts)
    else
      resp = @dog.update_alert(existing_id, datadog_query, alert_opts)
      # Unmute existing alerts that have been unsilenced.
      # Datadog does not allow updates to silencing via the update_alert API call.
      if existing_alert['silenced'] && !alert_opts[:silenced]
        @dog.unmute_monitor(existing_id)
      end
    end
  end

  # log whenever we've encountered errors
  code = resp[0].to_i
  log_datadog_response_code(resp, code, action, alert)

  # anything that is not an error code is assumed to be a success
  if code < 400 && code != -1
    @stats[:alerts_created] += 1 if action == :creating
    @stats[:alerts_updated] += 1 if action == :updating
    @stats[:alerts_silenced] += 1 if alert_opts[:silenced]
  end

  # alerts are keyed by their name
  response_ids = resp[1].nil? ? nil : [resp[1]['id']]
  [alert['name'], response_ids]
end

#existing_alerts ⇒ Object



78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
# File 'lib/interferon/destinations/datadog.rb', line 78

# Returns the alerts currently in Datadog, keyed by alert name. The result is
# fetched once and cached in @existing_alerts.
#
# A failing fetch is retried up to @retries times before the last error is
# re-raised. Alerts sharing a name are merged into the first one seen, whose
# 'id' becomes an array holding every id with that name. Also records how
# many alerts lack ALERT_KEY in their message (manually created).
#
# @return [Hash] alert name => alert hash with 'id' as an array
def existing_alerts
  return @existing_alerts if @existing_alerts

  attempts_left = @retries
  begin
    fetched = get_existing_alerts
  rescue
    attempts_left -= 1
    raise if attempts_left < 0
    retry
  end

  # key alerts by name, collecting the ids of duplicates
  @existing_alerts = {}
  fetched.each do |alert|
    first_seen = @existing_alerts[alert['name']]
    if first_seen
      first_seen['id'] << alert['id']
    else
      alert['id'] = [alert['id']]
      @existing_alerts[alert['name']] = alert
    end
  end

  # count how many are manually created
  manual_count = @existing_alerts.count do |_name, alert|
    !alert['message'].include?(ALERT_KEY)
  end
  @stats[:manually_created_alerts] = manual_count

  log.info(
    format(
      "datadog: found %d existing alerts; %d were manually created",
      @existing_alerts.length,
      @stats[:manually_created_alerts]
    )
  )

  @existing_alerts
end

#generate_message(message, people) ⇒ Object



64
65
66
# File 'lib/interferon/destinations/datadog.rb', line 64

# Builds the alert text: the message, then ALERT_KEY, then one "@name"
# mention per entry in people, each on its own line.
#
# @param message [String, Array] alert body; nested arrays are flattened
# @param people [Enumerable] handles to mention
# @return [String] newline-joined text
def generate_message(message, people)
  mentions = people.map { |person| "@#{person}" }
  [message, ALERT_KEY, *mentions].flatten.join("\n")
end

#get_existing_alerts ⇒ Object



68
69
70
71
72
73
74
75
76
# File 'lib/interferon/destinations/datadog.rb', line 68

# Fetches every alert from Datadog through @dog.get_all_alerts.
#
# @return the 'alerts' entry of the response body when the status is 200
# @raise [RuntimeError] for any other status; the message carries the code
#   and the inspected response body
def get_existing_alerts
  response = @dog.get_all_alerts
  status = response[0].to_i
  return response[1]['alerts'] if status == 200

  raise "Failed to retrieve existing alerts from datadog. #{status}: #{response[1].inspect}"
end

#log_datadog_response_code(resp, code, action, alert = nil) ⇒ Object



252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
# File 'lib/interferon/destinations/datadog.rb', line 252

# Records the outcome of one Datadog API call in @stats and, when an alert is
# given, in #api_errors, statsd gauges and the error log.
#
# Classification: 400 is a client error; anything above 400 or -1 is an
# unknown (probably Datadog-side) error; everything else counts as success.
#
# @param resp [Array] raw response; resp[0] and resp[1] are echoed in logs
# @param code [Integer] numeric status derived from resp[0]
# @param action [Symbol, String] verb used in log messages (e.g. :creating)
# @param alert [Hash, nil] alert the call was about; when nil only the
#   @stats counter is touched
# @return [nil]
def log_datadog_response_code(resp, code, action, alert=nil)
  # remember every non-200 response that can be attributed to an alert
  api_errors << "#{code} on alert #{alert['name']}" if code != 200 && !alert.nil?

  outcome =
    if code == 400
      'client_error'
    elsif code > 400 || code == -1
      'unknown_error'
    else
      'success'
    end

  counters = {
    'client_error' => :api_client_errors,
    'unknown_error' => :api_unknown_errors,
    'success' => :api_successes,
  }
  @stats[counters[outcome]] += 1
  return if alert.nil?

  # Emit all three gauges on every call so exactly one of them reads 1.
  # Tag with the alert name; interpolating the whole alert hash would put
  # its full dump into the tag value.
  %w{unknown_error client_error success}.each do |metric|
    statsd.gauge(
      "datadog.api.#{metric}",
      metric == outcome ? 1 : 0,
      :tags => ["alert:#{alert['name']}"]
    )
  end
  return if outcome == 'success'

  query = alert['metric']['datadog_query'].strip
  if outcome == 'client_error'
    log.error("client error while #{action} alert '#{alert['name']}';" \
              " query was '#{query}'" \
              " response was #{resp[0]}:'#{resp[1].inspect}'")
  else
    log.error("unknown error while #{action} alert '#{alert['name']}':" \
              " query was '#{query}'" \
              " response was #{resp[0]}:'#{resp[1].inspect}'")
  end
  nil
end

#remove_alert(alert) ⇒ Object



207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
# File 'lib/interferon/destinations/datadog.rb', line 207

# Deletes every Datadog alert id held by the given alert, provided the alert
# was created by this framework (its message contains ALERT_KEY). Manually
# created alerts are only warned about. In dry-run mode the deletion is
# counted and logged but no API call is made.
#
# @param alert [Hash] reads 'message', 'name' and 'id' (a list of ids)
def remove_alert(alert)
  unless alert['message'].include?(ALERT_KEY)
    return log.warn("not deleting manually-created alert #{alert['id']} (#{alert['name']})")
  end

  @stats[:alerts_to_be_deleted] += 1
  log.info("deleting alert: #{alert['name']}")
  return if @dry_run

  alert['id'].each do |alert_id|
    response = @dog.delete_alert(alert_id)
    status = response[0].to_i
    log_datadog_response_code(response, status, :deleting)

    # anything below 300 (other than -1) is assumed to be a success
    @stats[:alerts_deleted] += 1 if status < 300 && status != -1
  end
end

#remove_alert_by_id(alert_id) ⇒ Object



244
245
246
247
248
249
250
# File 'lib/interferon/destinations/datadog.rb', line 244

# Deletes one Datadog alert by id and records the response code.
# This should only be used by dry-run to clean up created dry-run alerts.
#
# @param alert_id id passed straight to @dog.delete_alert
def remove_alert_by_id(alert_id)
  log.debug("deleting alert, id: #{alert_id}")
  response = @dog.delete_alert(alert_id)
  log_datadog_response_code(response, response[0].to_i, :deleting)
end

#report_stats ⇒ Object



229
230
231
232
233
234
235
236
237
238
239
240
241
242
# File 'lib/interferon/destinations/datadog.rb', line 229

# Publishes every counter in @stats as a "datadog.<name>" statsd gauge and
# logs a one-line summary of created, updated and deleted alerts against the
# number that were attempted.
def report_stats
  @stats.each_pair { |name, value| statsd.gauge("datadog.#{name}", value) }

  totals = @stats.values_at(
    :alerts_created,
    :alerts_to_be_created,
    :alerts_updated,
    :alerts_to_be_updated,
    :alerts_deleted,
    :alerts_to_be_deleted
  )
  summary = format(
    "datadog: successfully created (%d/%d), updated (%d/%d), and deleted (%d/%d) alerts",
    *totals
  )
  log.info summary
end