Class: Interferon::Destinations::Datadog

Inherits:
Object
  • Object
show all
Includes:
Logging
Defined in:
lib/interferon/destinations/datadog.rb

Constant Summary collapse

# Marker line that generate_message adds to every message it builds. existing_alerts and
# remove_alert look for it in a monitor's message to tell framework-managed alerts apart
# from manually created ones.
ALERT_KEY =
'This alert was created via the alerts framework'.freeze

Instance Attribute Summary collapse

Class Method Summary collapse

Instance Method Summary collapse

Methods included from Logging

configure_logger_for, #log, #statsd

Constructor Details

#initialize(options) ⇒ Datadog

Returns a new instance of Datadog.



16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
# File 'lib/interferon/destinations/datadog.rb', line 16

# Builds a Datadog destination from a string-keyed options hash.
#
# Recognised keys: 'app_key' and 'api_key' (both required), 'api_timeout' (default 15),
# 'max_mute_minutes', 'dry_run', 'concurrency' (default 10), 'page_size' (default 1000)
# and 'retries' (default 3).
#
# @param options [Hash] configuration, keyed by String
# @raise [ArgumentError] when 'app_key' or 'api_key' is missing
def initialize(options)
  missing = %w(app_key api_key).find { |key| !options[key] }
  raise ArgumentError, "missing required argument #{missing}" if missing

  # Positional arguments follow Dogapi::Client#initialize; its defaults can be looked up
  # here (as of this writing):
  # https://github.com/DataDog/dogapi-rb/blob/master/lib/dogapi/facade.rb#L14
  @dog = Dogapi::Client.new(
    options['api_key'],
    options['app_key'],
    nil,  # host to talk to
    nil,  # device
    true, # silent?
    options['api_timeout'] || 15 # dogapi timeout, always set explicitly
  )

  # Filled on first use by #existing_alerts
  @existing_alerts = nil
  @dry_run = options['dry_run']
  @max_mute_minutes = options['max_mute_minutes']

  # Datadog communication threads, fetch page size and retry count
  @concurrency = options['concurrency'] || 10
  @page_size = options['page_size'] || 1000
  @retries = options['retries'] || 3

  # Every counter starts at zero
  counters = %i(
    alerts_created alerts_to_be_created
    alerts_updated alerts_to_be_updated
    alerts_deleted alerts_to_be_deleted
    alerts_silenced
    api_successes api_client_errors api_unknown_errors
    manually_created_alerts
  )
  @stats = counters.each_with_object({}) { |counter, totals| totals[counter] = 0 }
end

Instance Attribute Details

#concurrency ⇒ Object

Returns the value of attribute concurrency.



13
14
15
# File 'lib/interferon/destinations/datadog.rb', line 13

# Returns the value of @concurrency. #initialize sets it from the 'concurrency' option
# (default 10) and #fetch_existing_alerts passes it to Parallel as the thread count.
def concurrency
  @concurrency
end

Class Method Details

.generate_message(message, people, options = {}) ⇒ Object



70
71
72
73
74
75
76
77
78
79
# File 'lib/interferon/destinations/datadog.rb', line 70

# Builds the full monitor message: the alert text, the ALERT_KEY marker line and the
# @-mentions for +people+, joined with newlines.
#
# @param message [String] the alert's own message text
# @param people [Array<String>] handles to mention; they are sorted before use
# @param options [Hash] :notify_recovery - when truthy each mention goes on its own line;
#   otherwise all mentions are put on one line wrapped in
#   {{^is_recovery}}...{{/is_recovery}}
# @return [String] the assembled message
def self.generate_message(message, people, options = {})
  mentions = people.sort.map { |person| "@#{person}" }

  unless options[:notify_recovery]
    # Only mention on alert/warning. The mentions must be joined explicitly: interpolating
    # the Array itself would embed its inspect form (brackets, quotes and commas) in the
    # message instead of the plain @-handles.
    mentions = "{{^is_recovery}}#{mentions.join(' ')}{{/is_recovery}}"
  end

  [message, ALERT_KEY, mentions].flatten.join("\n")
end

.normalize_monitor_type(monitor_type) ⇒ Object



334
335
336
337
338
339
340
# File 'lib/interferon/destinations/datadog.rb', line 334

# Maps the 'query alert' monitor type onto 'metric alert'; any other value is returned
# unchanged.
#
# The two types can be used interchangeably when submitting monitors to Datadog. Datadog
# converts to 'query alert' by itself for a "complex" query that includes multiple
# metrics/tags, and uses 'metric alert' for monitors with a single scope/metric.
#
# @param monitor_type [String, nil]
# @return [String, nil]
def self.normalize_monitor_type(monitor_type)
  return 'metric alert' if monitor_type == 'query alert'

  monitor_type
end

.same_alerts(alert, people, alert_api_json) ⇒ Object



346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
# File 'lib/interferon/destinations/datadog.rb', line 346

# Tells whether the locally defined +alert+ matches the monitor Datadog already has.
#
# Both sides are reduced to a hash with the same keys (normalized monitor type, query,
# message and a fixed set of options) and compared with ==. 'require_full_window' only
# takes part in the comparison when the local alert sets it.
#
# @param alert [#[]] local alert definition
# @param people [Array<String>] handles used to build the expected message
# @param alert_api_json [Hash] monitor as returned by the Datadog API
# @return [Boolean]
def self.same_alerts(alert, people, alert_api_json)
  api_options = alert_api_json['options']

  # What Datadog currently has; query and message are compared ignoring outer whitespace.
  prev_alert = {
    monitor_type: normalize_monitor_type(alert_api_json['type']),
    query: alert_api_json['query'].strip,
    message: alert_api_json['message'].strip,
  }
  %w(
    evaluation_delay include_tags notify_no_data notify_audit
    no_data_timeframe silenced thresholds timeout_h
  ).each { |option| prev_alert[option.to_sym] = api_options[option] }

  # What the local definition would produce.
  notify = alert['notify']
  expected_message = generate_message(
    alert['message'],
    people,
    notify_recovery: notify['recovery']
  )
  new_alert = {
    monitor_type: normalize_monitor_type(alert['monitor_type']),
    query: alert['metric']['datadog_query'],
    message: expected_message.strip,
    include_tags: notify['include_tags'],
    notify_audit: notify['audit'],
  }
  %w(
    evaluation_delay notify_no_data no_data_timeframe silenced thresholds timeout_h
  ).each { |setting| new_alert[setting.to_sym] = alert[setting] }

  full_window = alert['require_full_window']
  unless full_window.nil?
    prev_alert[:require_full_window] = api_options['require_full_window']
    new_alert[:require_full_window] = full_window
  end

  prev_alert == new_alert
end

.same_monitor_type(monitor_type_a, monitor_type_b) ⇒ Object



342
343
344
# File 'lib/interferon/destinations/datadog.rb', line 342

# Compares two monitor types after passing both through normalize_monitor_type.
#
# @param monitor_type_a [String, nil]
# @param monitor_type_b [String, nil]
# @return [Boolean]
def self.same_monitor_type(monitor_type_a, monitor_type_b)
  normalized_a, normalized_b = [monitor_type_a, monitor_type_b].map do |type|
    normalize_monitor_type(type)
  end
  normalized_a == normalized_b
end

Instance Method Details

#api_errors ⇒ Object



66
67
68
# File 'lib/interferon/destinations/datadog.rb', line 66

# Lazily created list of API error descriptions; log_datadog_response_code appends to it.
#
# @return [Array<String>] the same Array instance on every call
def api_errors
  @api_errors = [] unless @api_errors
  @api_errors
end

#create_alert(alert, people) ⇒ Object



139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
# File 'lib/interferon/destinations/datadog.rb', line 139

# Creates the Datadog monitor for +alert+, or updates it when existing_alerts already has a
# monitor with the same name, and records the outcome in @stats.
#
# NOTE(review): an earlier comment here said people are removed from dry-run alerts to avoid
# a Datadog race condition. This method passes +people+ through unchanged; in dry-run mode
# the create/update helpers call validate_monitor instead.
#
# @param alert [#[]] alert definition; reads 'name', 'message', 'notify', 'metric',
#   'notify_no_data', 'no_data_timeframe', 'silenced', 'timeout_h' and the optional
#   'evaluation_delay', 'require_full_window' and 'thresholds'
# @param people [Array<String>] handles to mention in the monitor message
# @return [Array] [alert name, ids]; ids is nil when the response has no body, otherwise a
#   one-element Array holding the body's 'id'
def create_alert(alert, people)
  notify = alert['notify']

  # create a message which includes the notifications
  message = self.class.generate_message(
    alert['message'],
    people,
    notify_recovery: notify['recovery']
  )

  # create the hash of options to send to datadog
  alert_options = {
    notify_audit: notify['audit'],
    notify_no_data: alert['notify_no_data'],
    no_data_timeframe: alert['no_data_timeframe'],
    silenced: alert['silenced'],
    timeout_h: alert['timeout_h'],
  }

  # These settings are only sent when the alert actually sets them.
  optional_settings = {
    include_tags: notify['include_tags'],
    evaluation_delay: alert['evaluation_delay'],
    require_full_window: alert['require_full_window'],
    thresholds: alert['thresholds'],
  }
  optional_settings.each do |option, value|
    alert_options[option] = value unless value.nil?
  end

  datadog_query = alert['metric']['datadog_query']
  existing_alert = existing_alerts[alert['name']]

  if existing_alert.nil?
    # new alert, create it
    action = :creating
    resp = create_datadog_alert(alert, datadog_query, message, alert_options)
  else
    # existing alert, modify it
    action = :updating
    resp = update_datadog_alert(alert, datadog_query, message, alert_options, existing_alert)
  end

  # log whenever we've encountered errors
  code = resp[0].to_i
  log_datadog_response_code(resp, code, action, alert)

  # anything that is not a 4xx/5xx or the -1 sentinel is assumed to be a success
  unless code >= 400 || code == -1
    @stats[:alerts_created] += 1 if action == :creating
    @stats[:alerts_updated] += 1 if action == :updating

    # 'silenced' may be unset; a nil here must not raise after the API call succeeded
    silenced = alert_options[:silenced]
    @stats[:alerts_silenced] += 1 unless silenced.nil? || silenced.empty?
  end

  id = resp[1].nil? ? nil : [resp[1]['id']]
  # lets key alerts by their name
  [alert['name'], id]
end

#create_datadog_alert(alert, datadog_query, message, alert_options) ⇒ Object



204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
# File 'lib/interferon/destinations/datadog.rb', line 204

# Submits a brand-new monitor to Datadog, or only validates it when @dry_run is set.
#
# Bumps the :alerts_to_be_created counter and logs the query, message and options first.
#
# @param alert [#[]] alert definition; reads 'name' and 'monitor_type'
# @param datadog_query [String] monitor query
# @param message [String] monitor message
# @param alert_options [Hash] monitor options
# @return [Object] whatever the dogapi call returns
def create_datadog_alert(alert, datadog_query, message, alert_options)
  @stats[:alerts_to_be_created] += 1

  summary = "Query:\n#{datadog_query}\nMessage:\n#{message}\nOptions:\n#{alert_options}\n"
  log.info("creating new alert #{alert['name']}: #{summary}")

  payload = { name: alert['name'], message: message, options: alert_options }

  # dry runs only ask Datadog to validate the monitor
  return @dog.validate_monitor(alert['monitor_type'], datadog_query, payload) if @dry_run

  @dog.monitor(alert['monitor_type'], datadog_query, payload)
end

#existing_alerts ⇒ Object



110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
# File 'lib/interferon/destinations/datadog.rb', line 110

# Returns the monitors currently in Datadog, keyed by name. The result is fetched once and
# memoized in @existing_alerts.
#
# Each value is the first monitor seen with that name, with its 'id' replaced by an Array;
# ids of further monitors sharing the name are appended to that Array. The first call also
# stores the number of monitors lacking ALERT_KEY in @stats[:manually_created_alerts] and
# logs a summary.
#
# @return [Hash{String => Hash}]
def existing_alerts
  return @existing_alerts if @existing_alerts

  fetched = fetch_existing_alerts

  # key alerts by name
  @existing_alerts = {}
  fetched.each do |monitor|
    known = @existing_alerts[monitor['name']]
    if known
      known['id'] << monitor['id']
    else
      monitor['id'] = [monitor['id']]
      @existing_alerts[monitor['name']] = monitor
    end
  end

  # count how many are manually created
  @stats[:manually_created_alerts] =
    @existing_alerts.count { |_name, monitor| !monitor['message'].include?(ALERT_KEY) }

  log.info(
    "datadog: found #{@existing_alerts.length} existing alerts; " \
    "#{@stats[:manually_created_alerts]} were manually created"
  )

  @existing_alerts
end

#fetch_existing_alerts ⇒ Object



81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
# File 'lib/interferon/destinations/datadog.rb', line 81

# Fetches all monitors from Datadog page by page and returns them as one flat Array.
#
# Pages are requested through Parallel.map_with_index with @concurrency threads. Each page is
# attempted once per step of @retries.downto(0); any response code other than 200 counts as
# a failed attempt and is logged at info level.
#
# @return [Array] every monitor pushed onto the queue by the page fetches
# @raise [RuntimeError] when a page has no successful attempt
def fetch_existing_alerts
  # Shared between the worker blocks: results queue and the "keep going" flag
  alerts = Queue.new
  has_more = true

  # The lambda yields a truthy value while has_more is set and Parallel::Stop afterwards.
  # NOTE(review): presumably this is the Parallel gem's lambda-as-work-source protocol and
  # the block's second argument is the running item index, used here as the page number -
  # confirm against the Parallel documentation.
  Parallel.map_with_index(-> { has_more || Parallel::Stop },
                          in_threads: @concurrency) do |_, page|
    successful = false
    @retries.downto(0) do
      resp = @dog.get_all_monitors(page: page, page_size: @page_size)
      code = resp[0].to_i
      if code != 200
        log.info("Failed to retrieve existing alerts from datadog. #{code}: #{resp[1].inspect}")
      else
        alerts_page = resp[1]
        # A short page means there is nothing left to fetch after it
        has_more = false if alerts_page.length < @page_size
        alerts_page.map { |alert| alerts.push(alert) }
        successful = true
        break
      end
    end

    unless successful
      # Out of retries
      raise 'Retries exceeded for fetching data from datadog.'
    end
  end
  # Drain the queue into a plain Array
  Array.new(alerts.size) { alerts.pop }
end

#log_datadog_response_code(resp, code, action, alert = nil) ⇒ Object



404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
# File 'lib/interferon/destinations/datadog.rb', line 404

# Records the outcome of one Datadog API call in @stats and, when +alert+ is given, in
# api_errors, statsd gauges and the error log.
#
# Classification: 400 is a client error; anything above 400 or -1 is an unknown (probably
# Datadog-side) error; everything else counts as a success. Independently of that, any code
# other than 200 is appended to api_errors when +alert+ is given.
#
# @param resp [Array] raw response; resp[0] and resp[1] are quoted in error logs
# @param code [Integer] numeric response code
# @param action [Symbol, String] verb used in the log message, e.g. :creating
# @param alert [#[], nil] alert the call was made for; nil suppresses gauges and logging
def log_datadog_response_code(resp, code, action, alert = nil)
  # log whenever we've encountered errors
  api_errors << "#{code} on alert #{alert['name']}" unless code == 200 || alert.nil?

  outcome =
    if code == 400
      :client_error
    elsif code > 400 || code == -1
      :unknown_error
    else
      :success
    end

  counters = {
    client_error: :api_client_errors,
    unknown_error: :api_unknown_errors,
    success: :api_successes,
  }
  @stats[counters[outcome]] += 1
  return if alert.nil?

  # One gauge per outcome: 1 for the outcome that happened, 0 for the other two
  gauges = {
    'datadog.api.unknown_error' => :unknown_error,
    'datadog.api.client_error' => :client_error,
    'datadog.api.success' => :success,
  }
  gauges.each do |metric, kind|
    statsd.gauge(metric, kind == outcome ? 1 : 0, tags: ["alert:#{alert}"])
  end
  return if outcome == :success

  details = " query was '#{alert['metric']['datadog_query']}'" \
            " response was #{resp[0]}:'#{resp[1].inspect}'"
  if outcome == :client_error
    log.error("client error while #{action} alert '#{alert['name']}';#{details}")
  else
    log.error("unknown error while #{action} alert '#{alert['name']}':#{details}")
  end
end

#need_update(alert_people_pair, existing_alerts_from_api) ⇒ Object



328
329
330
331
332
# File 'lib/interferon/destinations/datadog.rb', line 328

# Tells whether an alert has to be sent to Datadog: true when no monitor with its name
# exists yet, or when the existing one differs according to same_alerts.
#
# @param alert_people_pair [Array] two-element Array of [alert, people]
# @param existing_alerts_from_api [Hash] existing monitors keyed by name
# @return [Boolean]
def need_update(alert_people_pair, existing_alerts_from_api)
  alert, people = alert_people_pair
  current = existing_alerts_from_api[alert['name']]
  return true if current.nil?

  !self.class.same_alerts(alert, people, current)
end

#remove_alert(alert) ⇒ Object



305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
# File 'lib/interferon/destinations/datadog.rb', line 305

# Deletes every Datadog monitor id held by +alert+, but only when its message carries
# ALERT_KEY; manually created monitors are left alone and a warning is logged instead.
#
# In dry-run mode the deletion is counted in :alerts_to_be_deleted and logged, but no API
# call is made.
#
# @param alert [Hash] existing monitor; reads 'message', 'name' and 'id' (an Array of ids)
def remove_alert(alert)
  unless alert['message'].include?(ALERT_KEY)
    return log.warn("not deleting manually-created alert #{alert['id']} (#{alert['name']})")
  end

  @stats[:alerts_to_be_deleted] += 1
  log.info("deleting alert: #{alert['name']}")

  # Safety to protect against accidental dry_run deletion
  return if @dry_run

  alert['id'].each do |monitor_id|
    resp = @dog.delete_monitor(monitor_id)
    code = resp[0].to_i
    log_datadog_response_code(resp, code, :deleting)

    # anything below 300, other than the -1 sentinel, is assumed to be a success
    @stats[:alerts_deleted] += 1 if code < 300 && code != -1
  end
end

#report_stats ⇒ Object



387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
# File 'lib/interferon/destinations/datadog.rb', line 387

# Publishes every @stats counter as a statsd gauge named "datadog.<counter>" and logs a
# one-line summary of created, updated and deleted alerts against the attempted totals.
def report_stats
  @stats.each { |counter, total| statsd.gauge("datadog.#{counter}", total) }

  totals = @stats.values_at(
    :alerts_created, :alerts_to_be_created,
    :alerts_updated, :alerts_to_be_updated,
    :alerts_deleted, :alerts_to_be_deleted
  )
  summary = format(
    'datadog: successfully created (%d/%d), updated (%d/%d), and deleted (%d/%d) alerts',
    *totals
  )
  log.info(summary)
end

#update_datadog_alert(alert, datadog_query, message, alert_options, existing_alert) ⇒ Object



237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
# File 'lib/interferon/destinations/datadog.rb', line 237

# Brings an existing Datadog monitor in line with the local alert definition.
#
# Bumps :alerts_to_be_updated and logs a diff of old vs. new query and message. Then:
# * dry run: only validates the monitor;
# * same (normalized) monitor type: updates the first existing id in place and may unmute;
# * different monitor type: deletes the first existing id and, if that worked, creates a
#   new monitor.
#
# @param alert [#[]] local alert definition; reads 'name' and 'monitor_type'
# @param datadog_query [String] new monitor query
# @param message [String] new monitor message
# @param alert_options [Hash] new monitor options; :silenced is read for unmuting
# @param existing_alert [Hash] monitor from #existing_alerts; reads 'id', 'query',
#   'message', 'type' and 'options'
# @return [Object] response of the validate/update/create call, or of the delete call when
#   the delete failed
def update_datadog_alert(alert, datadog_query, message, alert_options, existing_alert)
  @stats[:alerts_to_be_updated] += 1
  id = existing_alert['id'][0]

  # Both sides of the diff use the same layout and the same alert_options, so only query
  # and message differences can show up in it.
  render = lambda do |query, text|
    "Query:\n#{query.strip}\nMessage:\n#{text.strip}\nOptions:\n#{alert_options}\n".strip
  end
  new_alert_text = render.call(datadog_query, message)
  existing_alert_text = render.call(existing_alert['query'], existing_alert['message'])
  diff = Diffy::Diff.new(existing_alert_text, new_alert_text, context: 1)
  log.info("updating existing alert #{id} (#{alert['name']}):\n#{diff}")

  monitor_options = { name: alert['name'], message: message, options: alert_options }

  # dry runs only ask Datadog to validate the monitor
  if @dry_run
    return @dog.validate_monitor(alert['monitor_type'], datadog_query, monitor_options)
  end

  unless self.class.same_monitor_type(alert['monitor_type'], existing_alert['type'])
    # Need to recreate alert with new monitor type
    resp = @dog.delete_monitor(id)
    code = resp[0].to_i
    return resp if code >= 300 || code == -1

    return @dog.monitor(alert['monitor_type'], datadog_query, monitor_options)
  end

  resp = @dog.update_monitor(id, datadog_query, monitor_options)

  # Unmute existing alerts that exceed the max silenced time
  # Datadog does not allow updates to silencing via the update_alert API call.
  silenced = existing_alert['options']['silenced']
  if @max_mute_minutes.nil?
    @dog.unmute_monitor(id) if alert_options[:silenced].empty? && !silenced.empty?
  else
    # keep only mutes with an end time no later than the allowed maximum
    within_limit = silenced.values.reject do |mute_end|
      mute_end.nil? || mute_end == '*' || mute_end > Time.now.to_i + @max_mute_minutes * 60
    end
    @dog.unmute_monitor(id) if alert_options[:silenced].empty? && within_limit.empty?
  end

  resp
end