Class: Interferon::Destinations::Datadog

Inherits:
Object
  • Object
show all
Includes:
Logging
Defined in:
lib/interferon/destinations/datadog.rb

Constant Summary collapse

ALERT_KEY =
'This alert was created via the alerts framework'.freeze

Instance Attribute Summary collapse

Class Method Summary collapse

Instance Method Summary collapse

Methods included from Logging

configure_logger_for, #log, #statsd

Constructor Details

#initialize(options) ⇒ Datadog

Returns a new instance of Datadog.



16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
# File 'lib/interferon/destinations/datadog.rb', line 16

# Builds a Datadog destination from a string-keyed options hash.
#
# Required keys: 'app_key' and 'api_key'.
# Optional keys (with defaults): 'api_timeout' (15), 'dry_run' (none),
# 'concurrency' (10), 'page_size' (1000), 'retries' (3).
#
# Raises ArgumentError naming the first required key that is missing or falsy.
def initialize(options)
  missing = %w(app_key api_key).find { |key| !options[key] }
  raise ArgumentError, "missing required argument #{missing}" if missing

  # Dogapi::Client takes positional arguments; the defaults can be referenced
  # from the link below (as of this writing):
  # https://github.com/DataDog/dogapi-rb/blob/master/lib/dogapi/facade.rb#L14
  @dog = Dogapi::Client.new(
    options['api_key'],
    options['app_key'],
    nil,                         # host to talk to
    nil,                         # device
    true,                        # silent?
    options['api_timeout'] || 15 # API timeout, set explicitly
  )

  # Cache of monitors already in Datadog; populated on first use.
  @existing_alerts = nil
  @dry_run = options['dry_run']

  # Datadog communication threads
  @concurrency = options['concurrency'] || 10
  # Fetch page size
  @page_size = options['page_size'] || 1000
  # Number of retries for failed page fetches
  @retries = options['retries'] || 3

  # Every run counter starts at zero.
  counter_names = %i[
    alerts_created alerts_to_be_created
    alerts_updated alerts_to_be_updated
    alerts_deleted alerts_to_be_deleted
    alerts_silenced
    api_successes api_client_errors api_unknown_errors
    manually_created_alerts
  ]
  @stats = counter_names.each_with_object({}) { |name, counters| counters[name] = 0 }
end

Instance Attribute Details

#concurrency ⇒ Object

Returns the value of attribute concurrency.



13
14
15
# File 'lib/interferon/destinations/datadog.rb', line 13

# Reader for @concurrency: the value handed to Parallel as `in_threads` when
# fetching existing alerts. Set in #initialize from options['concurrency'],
# defaulting to 10.
def concurrency
  @concurrency
end

Class Method Details

.generate_message(message, people) ⇒ Object



69
70
71
# File 'lib/interferon/destinations/datadog.rb', line 69

# Builds the body of a monitor message.
#
# The result is the given message, then ALERT_KEY (the marker identifying
# framework-created alerts), then one "@name" mention per entry of `people`
# in sorted order, all joined with newlines.
def self.generate_message(message, people)
  mentions = people.sort.map { |person| "@#{person}" }
  lines = [message, ALERT_KEY] + mentions
  lines.flatten.join("\n")
end

.normalize_monitor_type(monitor_type) ⇒ Object



307
308
309
310
311
312
313
# File 'lib/interferon/destinations/datadog.rb', line 307

# Maps the 'query alert' monitor type onto 'metric alert'; any other type is
# returned unchanged.
#
# The two types can be used interchangeably when submitting monitors to
# Datadog. Datadog will automatically convert to 'query alert' for a "complex"
# query that includes multiple metrics/tags, while using 'metric alert' for
# monitors that include a single scope/metric.
def self.normalize_monitor_type(monitor_type)
  return 'metric alert' if monitor_type == 'query alert'

  monitor_type
end

.same_alerts(alert, people, alert_api_json) ⇒ Object



319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
# File 'lib/interferon/destinations/datadog.rb', line 319

# Decides whether a locally defined alert matches the monitor Datadog already
# has, so that unchanged alerts need not be re-submitted.
#
# alert          - the local alert definition (read with string keys).
# people         - names to notify; used to regenerate the expected message.
# alert_api_json - the monitor as returned by the Datadog API.
#
# Compares monitor type (normalized), query, message and a fixed set of
# options. `require_full_window` is only compared when the local alert sets
# it explicitly.
#
# Returns true when every compared field is equal, false otherwise.
def self.same_alerts(alert, people, alert_api_json)
  api_options = alert_api_json['options']

  prev_alert = {
    monitor_type: normalize_monitor_type(alert_api_json['type']),
    query: alert_api_json['query'].strip,
    message: alert_api_json['message'].strip,
    evaluation_delay: api_options['evaluation_delay'],
    notify_no_data: api_options['notify_no_data'],
    notify_audit: api_options['notify_audit'],
    no_data_timeframe: api_options['no_data_timeframe'],
    silenced: api_options['silenced'],
    thresholds: api_options['thresholds'],
    timeout_h: api_options['timeout_h'],
  }

  # The API-side query is stripped above, so strip the local one too;
  # otherwise surrounding whitespace alone (e.g. a trailing newline from a
  # heredoc) makes an unchanged alert look modified on every comparison.
  local_query = alert['metric']['datadog_query']
  local_query = local_query.strip if local_query.respond_to?(:strip)

  new_alert = {
    monitor_type: normalize_monitor_type(alert['monitor_type']),
    query: local_query,
    message: generate_message(alert['message'], people).strip,
    evaluation_delay: alert['evaluation_delay'],
    notify_no_data: alert['notify_no_data'],
    notify_audit: alert['notify']['audit'],
    no_data_timeframe: alert['no_data_timeframe'],
    silenced: alert['silenced'],
    thresholds: alert['thresholds'],
    timeout_h: alert['timeout_h'],
  }

  # Only compare require_full_window when the alert definition specifies it.
  unless alert['require_full_window'].nil?
    prev_alert[:require_full_window] = api_options['require_full_window']
    new_alert[:require_full_window] = alert['require_full_window']
  end

  prev_alert == new_alert
end

.same_monitor_type(monitor_type_a, monitor_type_b) ⇒ Object



315
316
317
# File 'lib/interferon/destinations/datadog.rb', line 315

# True when the two monitor types are equal after both have been passed
# through normalize_monitor_type.
def self.same_monitor_type(monitor_type_a, monitor_type_b)
  normalized_a = normalize_monitor_type(monitor_type_a)
  normalized_b = normalize_monitor_type(monitor_type_b)
  normalized_a == normalized_b
end

Instance Method Details

#api_errors ⇒ Object



65
66
67
# File 'lib/interferon/destinations/datadog.rb', line 65

# Lazily created list of API error descriptions; the same array is returned
# on every call so callers can append to it.
def api_errors
  @api_errors = [] unless @api_errors
  @api_errors
end

#create_alert(alert, people) ⇒ Object



131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
# File 'lib/interferon/destinations/datadog.rb', line 131

# Creates the alert in Datadog, or updates it when a monitor with the same
# name already exists, and records the outcome in @stats.
#
# alert  - the alert definition (read with string keys).
# people - names to mention in the monitor message.
#
# Returns [alert name, ids] where ids is a one-element array holding the id
# from the API response body, or nil when the response has no body.
def create_alert(alert, people)
  # Message including the notifications. The dry-run variant (without
  # people) is substituted further down, when the monitor is submitted.
  message = self.class.generate_message(alert['message'], people)

  # Options that are always sent to Datadog...
  alert_options = {
    notify_audit: alert['notify']['audit'],
    notify_no_data: alert['notify_no_data'],
    no_data_timeframe: alert['no_data_timeframe'],
    silenced: alert['silenced'],
    timeout_h: alert['timeout_h'],
  }

  # ...and those that are only sent when the alert sets them.
  %w(evaluation_delay require_full_window thresholds).each do |field|
    alert_options[field.to_sym] = alert[field] unless alert[field].nil?
  end

  datadog_query = alert['metric']['datadog_query']
  existing_alert = existing_alerts[alert['name']]

  # Alerts are keyed by name: unknown name => create, known name => update.
  action, resp =
    if existing_alert.nil?
      [:creating, create_datadog_alert(alert, datadog_query, message, alert_options)]
    else
      [:updating, update_datadog_alert(alert, datadog_query, message, alert_options, existing_alert)]
    end

  # log whenever we've encountered errors
  code = resp[0].to_i
  log_datadog_response_code(resp, code, action, alert)

  # Anything below 400 (other than the -1 sentinel) is assumed to be a success.
  if code < 400 && code != -1
    @stats[:alerts_created] += 1 if action == :creating
    @stats[:alerts_updated] += 1 if action == :updating
    @stats[:alerts_silenced] += 1 unless alert_options[:silenced].empty?
  end

  response_body = resp[1]
  ids = response_body.nil? ? nil : [response_body['id']]
  [alert['name'], ids]
end

#create_datadog_alert(alert, datadog_query, message, alert_options) ⇒ Object



188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
# File 'lib/interferon/destinations/datadog.rb', line 188

# Submits a brand-new monitor to Datadog and counts it in
# @stats[:alerts_to_be_created].
#
# alert         - the alert definition; 'name', 'monitor_type' and 'message'
#                 are read here.
# datadog_query - the monitor query string.
# message       - the full message, including notification mentions.
# alert_options - options hash forwarded to Datadog unchanged.
#
# Returns the response from @dog.monitor.
def create_datadog_alert(alert, datadog_query, message, alert_options)
  @stats[:alerts_to_be_created] += 1
  new_alert_text = <<-EOM
Query:
#{datadog_query}
Message:
#{message}
Options:
#{alert_options}
EOM
  log.info("creating new alert #{alert['name']}: #{new_alert_text}")

  # In dry-run mode the monitor is created without any people mentioned, so
  # that a monitor created in a bad state cannot notify anyone. The message
  # must be rebuilt from the alert's message text: generate_message expects
  # the message, not the alert itself.
  monitor_message =
    if @dry_run
      self.class.generate_message(alert['message'], [])
    else
      message
    end

  @dog.monitor(
    alert['monitor_type'],
    datadog_query,
    name: alert['name'],
    message: monitor_message,
    options: alert_options
  )
end

#existing_alerts ⇒ Object



102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
# File 'lib/interferon/destinations/datadog.rb', line 102

# Returns the monitors currently in Datadog as a hash keyed by monitor name.
#
# The hash is built once, on first call, from fetch_existing_alerts and then
# memoized in @existing_alerts. When several monitors share a name, the first
# one fetched is kept and its 'id' entry (always an array) collects the ids
# of all of them.
#
# Also records in @stats how many of the monitors lack ALERT_KEY in their
# message, i.e. were created manually.
def existing_alerts
  return @existing_alerts if @existing_alerts

  fetched = fetch_existing_alerts

  # key alerts by name
  @existing_alerts = {}
  fetched.each do |alert|
    name = alert['name']
    known = @existing_alerts[name]
    if known
      known['id'] << alert['id']
    else
      alert['id'] = [alert['id']]
      @existing_alerts[name] = alert
    end
  end

  # count how many are manually created
  manual_count = @existing_alerts.count do |_name, alert|
    !alert['message'].include?(ALERT_KEY)
  end
  @stats[:manually_created_alerts] = manual_count

  log.info(
    format(
      'datadog: found %d existing alerts; %d were manually created',
      @existing_alerts.length,
      @stats[:manually_created_alerts]
    )
  )

  @existing_alerts
end

#fetch_existing_alerts ⇒ Object



73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
# File 'lib/interferon/destinations/datadog.rb', line 73

# Downloads every monitor from Datadog, one page of @page_size at a time.
#
# Page numbers are handed out by Parallel.map_with_index (with
# in_threads: @concurrency) until some page comes back shorter than
# @page_size. Each page request is attempted, counting down from @retries to
# 0, until it answers with HTTP 200.
#
# Returns an Array of the monitors collected from all pages.
# Raises RuntimeError when a page never succeeds within its attempts.
def fetch_existing_alerts
  collected = Queue.new
  more_pages = true
  # Keeps yielding work items until a short page flips more_pages to false.
  work_source = -> { more_pages || Parallel::Stop }

  Parallel.map_with_index(work_source, in_threads: @concurrency) do |_, page|
    fetched = @retries.downto(0).any? do
      resp = @dog.get_all_monitors(page: page, page_size: @page_size)
      code = resp[0].to_i
      if code == 200
        page_alerts = resp[1]
        # A short page means this was the last one.
        more_pages = false if page_alerts.length < @page_size
        page_alerts.each { |alert| collected.push(alert) }
        true
      else
        log.info("Failed to retrieve existing alerts from datadog. #{code}: #{resp[1].inspect}")
        false
      end
    end

    # Out of retries
    raise 'Retries exceeded for fetching data from datadog.' unless fetched
  end

  # Drain the queue into a plain array.
  Array.new(collected.size) { collected.pop }
end

#log_datadog_response_code(resp, code, action, alert = nil) ⇒ Object



371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
# File 'lib/interferon/destinations/datadog.rb', line 371

# Records the outcome of a Datadog API call in @stats and, when an alert is
# given, in api_errors, statsd gauges and the error log.
#
# resp   - the raw API response; resp[0] and resp[1] appear in error logs.
# code   - the response code as an Integer.
# action - a label for the operation, interpolated into error logs.
# alert  - the alert concerned, or nil when no alert is associated.
#
# Classification: exactly 400 is a client error; above 400 or -1 is an
# unknown error; everything else counts as a success.
def log_datadog_response_code(resp, code, action, alert = nil)
  # Any non-200 response for a known alert is remembered by alert name.
  api_errors << "#{code} on alert #{alert['name']}" unless code == 200 || alert.nil?

  outcome =
    if code == 400
      :client_error
    elsif code > 400 || code == -1
      :unknown_error # prob. datadog
    else
      :success
    end

  stat_for = {
    client_error: :api_client_errors,
    unknown_error: :api_unknown_errors,
    success: :api_successes,
  }
  @stats[stat_for[outcome]] += 1

  # Without an alert there is nothing to tag or describe.
  return if alert.nil?

  # Emit all three gauges every time: 1 for what happened, 0 for the others.
  # NOTE(review): the tag interpolates the alert object itself rather than
  # alert['name'] - confirm that the alert's string form is what is wanted.
  last_gauge = nil
  %i[unknown_error client_error success].each do |kind|
    last_gauge = statsd.gauge(
      "datadog.api.#{kind}", kind == outcome ? 1 : 0, tags: ["alert:#{alert}"]
    )
  end
  return last_gauge if outcome == :success

  label, punctuation = outcome == :client_error ? ['client', ';'] : ['unknown', ':']
  log.error("#{label} error while #{action} alert '#{alert['name']}'#{punctuation}" \
            " query was '#{alert['metric']['datadog_query']}'" \
            " response was #{resp[0]}:'#{resp[1].inspect}'")
end

#need_update(alert_people_pair, existing_alerts_from_api) ⇒ Object



301
302
303
304
305
# File 'lib/interferon/destinations/datadog.rb', line 301

# Tells whether an alert has to be pushed to Datadog.
#
# alert_people_pair        - a two-element [alert, people] array.
# existing_alerts_from_api - hash of existing monitors keyed by alert name.
#
# Returns true when no monitor exists under the alert's name, or when the
# existing one is not the same according to same_alerts.
def need_update(alert_people_pair, existing_alerts_from_api)
  alert, people = alert_people_pair
  current = existing_alerts_from_api[alert['name']]
  return true if current.nil?

  !self.class.same_alerts(alert, people, current)
end

#remove_alert(alert) ⇒ Object



271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
# File 'lib/interferon/destinations/datadog.rb', line 271

# Deletes a framework-created monitor (every id listed under alert['id']).
#
# Monitors whose message lacks ALERT_KEY are treated as manually created:
# they are never deleted, only reported with a warning. In dry-run mode the
# deletion is counted and logged but no API call is made.
def remove_alert(alert)
  unless alert['message'].include?(ALERT_KEY)
    return log.warn("not deleting manually-created alert #{alert['id']} (#{alert['name']})")
  end

  @stats[:alerts_to_be_deleted] += 1
  log.info("deleting alert: #{alert['name']}")
  return if @dry_run

  alert['id'].each do |monitor_id|
    resp = @dog.delete_monitor(monitor_id)
    code = resp[0].to_i
    log_datadog_response_code(resp, code, :deleting)

    # Anything below 300 (other than the -1 sentinel) is assumed to be a success.
    @stats[:alerts_deleted] += 1 if code < 300 && code != -1
  end
end

#remove_alert_by_id(alert_id) ⇒ Object



293
294
295
296
297
298
299
# File 'lib/interferon/destinations/datadog.rb', line 293

# Deletes the monitor with the given id and records the response code.
#
# This should only be used by dry-run to clean up created dry-run alerts.
def remove_alert_by_id(alert_id)
  log.debug("deleting alert, id: #{alert_id}")
  response = @dog.delete_monitor(alert_id)
  log_datadog_response_code(response, response[0].to_i, :deleting)
end

#report_stats ⇒ Object



354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
# File 'lib/interferon/destinations/datadog.rb', line 354

# Publishes every counter in @stats as a "datadog.<name>" statsd gauge and
# logs a one-line summary of done/attempted creates, updates and deletes.
def report_stats
  @stats.each_pair { |name, value| statsd.gauge("datadog.#{name}", value) }

  summary_counts = @stats.values_at(
    :alerts_created, :alerts_to_be_created,
    :alerts_updated, :alerts_to_be_updated,
    :alerts_deleted, :alerts_to_be_deleted
  )
  summary = format(
    'datadog: successfully created (%d/%d), updated (%d/%d), and deleted (%d/%d) alerts',
    *summary_counts
  )
  log.info(summary)
end

#update_datadog_alert(alert, datadog_query, message, alert_options, existing_alert) ⇒ Object



209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
# File 'lib/interferon/destinations/datadog.rb', line 209

# Brings an existing Datadog monitor in line with the alert definition and
# counts it in @stats[:alerts_to_be_updated].
#
# alert          - the alert definition; 'name', 'monitor_type' and 'message'
#                  are read here.
# datadog_query  - the monitor query string.
# message        - the full message, including notification mentions.
# alert_options  - options hash forwarded to Datadog unchanged.
# existing_alert - the current monitor; its first id is the one updated.
#
# Three paths:
#   * dry-run: a new monitor is created, with no people in the message;
#   * same monitor type: the monitor is updated in place (and unmuted if it
#     was silenced and no longer is);
#   * different monitor type: the monitor is deleted and, if the delete
#     succeeded, created again.
#
# Returns the response of the last create/update/delete call made (the
# unmute call's response is not returned).
def update_datadog_alert(alert, datadog_query, message, alert_options, existing_alert)
  @stats[:alerts_to_be_updated] += 1
  id = existing_alert['id'][0]

  new_alert_text = <<-EOM.strip
Query:
#{datadog_query.strip}
Message:
#{message.strip}
Options:
#{alert_options}
EOM
  # NOTE(review): the "Options" section below prints the NEW options, so the
  # logged diff can never show an options change - confirm this is intended.
  existing_alert_text = <<-EOM.strip
Query:
#{existing_alert['query'].strip}
Message:
#{existing_alert['message'].strip}
Options:
#{alert_options}
EOM
  diff = Diffy::Diff.new(existing_alert_text, new_alert_text, context: 1)
  log.info("updating existing alert #{id} (#{alert['name']}):\n#{diff}")

  if @dry_run
    # Dry-run monitors are created without any people mentioned. The message
    # must be rebuilt from the alert's message text: generate_message expects
    # the message, not the alert itself.
    resp = @dog.monitor(
      alert['monitor_type'],
      datadog_query,
      name: alert['name'],
      message: self.class.generate_message(alert['message'], []),
      options: alert_options
    )
  elsif self.class.same_monitor_type(alert['monitor_type'], existing_alert['type'])
    resp = @dog.update_monitor(
      id,
      datadog_query,
      name: alert['name'],
      message: message,
      options: alert_options
    )

    # Unmute existing alerts that have been unsilenced.
    # Datadog does not allow updates to silencing via the update_alert API call.
    if !existing_alert['options']['silenced'].empty? && alert_options[:silenced].empty?
      @dog.unmute_monitor(id)
    end
  else
    # Need to recreate alert with new monitor type
    resp = @dog.delete_monitor(id)
    code = resp[0].to_i
    # Only recreate when the delete went through; otherwise the delete
    # response itself is returned to the caller.
    unless code >= 300 || code == -1
      resp = @dog.monitor(
        alert['monitor_type'],
        datadog_query,
        name: alert['name'],
        message: message,
        options: alert_options
      )
    end
  end
  resp
end